In [1]:
from langchain.schema import Document
from langchain.vectorstores import Chroma
from langchain.retrievers import ParentDocumentRetriever

## Text Splitting & Docloader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.storage import InMemoryStore
from langchain.document_loaders import TextLoader
import chromadb

import time



### Load the BGE small embeddings

In [2]:
from langchain.embeddings import HuggingFaceBgeEmbeddings

# Hugging Face model id of the BGE embedding model used for the first vector store.
model_name = "BAAI/bge-small-en-v1.5"
# Normalised embeddings, so that similarity scores behave as cosine similarity.
encode_kwargs = dict(normalize_embeddings=True)

# The encoder is pinned to the CPU via model_kwargs.
bge_embeddings = HuggingFaceBgeEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=dict(device='cpu'),
    model_name=model_name,
)

### Load OpenAI embeddings

In [3]:
# Load variables from a local .env file into the environment; a later cell
# reads OPENAI_API_KEY from there with os.getenv.
from dotenv import load_dotenv
load_dotenv()

True

In [4]:
from langchain_openai import OpenAIEmbeddings

# OpenAI embedding model used for the second vector store.
# NOTE: this rebinds `model_name`, which held the BGE model id until now.
model_name = "text-embedding-3-small"

# The two special tokens are explicitly allowed to appear in the embedded text.
openai_embeddings = OpenAIEmbeddings(
    allowed_special={'<|endoftext|>', '<|endofprompt|>'},
    model=model_name,
)

## Load dataset

This dataset contains 108 arXiv papers whose content was parsed with Meta's Nougat model.

In [5]:
from datasets import load_dataset
from datasets import Dataset
import pandas as pd

# Fetch the dataset from the Hugging Face Hub
dataset = load_dataset("deep-learning-analytics/arxiv_small_nougat")

# Turn the train split into a Pandas DataFrame
train_split = dataset['train']
df = train_split.to_pandas()

# Show the first few rows as the cell output
df.head()

Unnamed: 0,doi,id,title,summary,source,authors,categories,comment,journal_ref,primary_category,published,updated,references,content,noref_content
0,2206.02336,2206.02336,Making Large Language Models Better Reasoners ...,Few-shot learning is a challenging task that r...,http://arxiv.org/pdf/2206.02336,['Yifei Li' 'Zeqi Lin' 'Shizhuo Zhang' 'Qiang ...,['cs.CL' 'cs.AI'],,,cs.CL,20220606,20230524,"\n\n* D. Andor, L. He, K. Lee, and E. Pitler (...",# Making Large Language Models Better Reasoner...,# Making Large Language Models Better Reasoner...
1,2206.04615,2206.04615,Beyond the Imitation Game: Quantifying and ext...,Language models demonstrate both quantitative ...,http://arxiv.org/pdf/2206.04615,['Aarohi Srivastava' 'Abhinav Rastogi' 'Abhish...,['cs.CL' 'cs.AI' 'cs.CY' 'cs.LG' 'stat.ML'],"27 pages, 17 figures + references and appendic...","Transactions on Machine Learning Research, May...",cs.CL,20220609,20230612,"\n\n* Wikiquote et al. (2021) Wikiquote, russi...",# Beyond the Imitation Game: Quantifying and e...,# Beyond the Imitation Game: Quantifying and e...
2,2206.05229,2206.05229,Measuring the Carbon Intensity of AI in Cloud ...,By providing unprecedented access to computati...,http://arxiv.org/pdf/2206.05229,['Jesse Dodge' 'Taylor Prewitt' 'Remi Tachet D...,['cs.LG'],"In ACM Conference on Fairness, Accountability,...",,cs.LG,20220610,20220610,\n\n* (1)\n* Anthony et al. (2020) Lasse F. Wo...,[MISSING_PAGE_EMPTY:1]\n\nIntroduction\n\nClim...,[MISSING_PAGE_EMPTY:1]\n\nIntroduction\n\nClim...
3,2206.05802,2206.05802,Self-critiquing models for assisting human eva...,We fine-tune large language models to write na...,http://arxiv.org/pdf/2206.05802,['William Saunders' 'Catherine Yeh' 'Jeff Wu' ...,['cs.CL' 'cs.LG'],,,cs.CL,20220612,20220614,"(RLHP) has become more common [1, 2, 3, 4], d...",# Self-critiquing models for assisting human e...,# Self-critiquing models for assisting human e...
4,2206.06336,2206.06336,Language Models are General-Purpose Interfaces,Foundation models have received much attention...,http://arxiv.org/pdf/2206.06336,['Yaru Hao' 'Haoyu Song' 'Li Dong' 'Shaohan Hu...,['cs.CL'],32 pages. The first three authors contribute e...,,cs.CL,20220613,20220613,"\n\n* Agrawal et al. (2019) Harsh Agrawal, Kar...",# Language Models are General-Purpose Interfac...,# Language Models are General-Purpose Interfac...


### Select subset of data to load into a database

* Our key text column is `noref_content`, which holds the content of the paper without the references
* We will include some metadata fields as well

In [6]:
# Columns carried into the vector store: the text column (`noref_content`)
# plus the metadata fields attached to every chunk.
keep_cols = ['id', 'title', 'authors', 'summary', 'source', 'published', 'noref_content']
# Take an explicit copy so that cleaning df_subset later can never alias `df`
# or trigger pandas' chained-assignment warning.
df_subset = df[keep_cols].copy()

In [7]:
# Drop rows with any missing value, then strip every character that is neither
# a word character nor whitespace from the paper text.
# The pattern is a raw string: '\w' and '\s' are not valid escapes in a normal
# Python string literal and raise a warning on current Python versions.
# `.assign` builds a new frame instead of writing into a possibly shared one,
# so the cell gives the same result however often it is re-run.
df_subset = df_subset.dropna().assign(
    noref_content=lambda frame: frame['noref_content'].str.replace(r'[^\w\s]', '', regex=True)
)

df_subset.shape

(101, 7)

### Load Documents using Langchain

In [8]:
from langchain.document_loaders import DataFrameLoader

# Build one Document per DataFrame row, using `noref_content` as the page text.
loader = DataFrameLoader(
    df_subset,
    page_content_column="noref_content",
)
docs = loader.load()

In [9]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

# Chunks of up to 1000 characters with no overlap between neighbours.
# NOTE(review): punctuation is stripped from `noref_content` in an earlier cell,
# so the "," separator presumably never matches and only "\n" is used - confirm.
text_splitter = RecursiveCharacterTextSplitter(
    separators=[",", "\n"],
    chunk_overlap=0,
    chunk_size=1000,
)
split_docs = text_splitter.split_documents(docs)

In [10]:
# Report how many chunks the splitter produced.
num_split_docs = len(split_docs)
print("Num Split Docs: ", num_split_docs)

Num Split Docs:  7596


In [11]:
# Display one chunk (index 10) to inspect its text and the metadata attached to it.
split_docs[10]

Document(page_content='To obtain the steplevel labels ie textlabel_ij for negative training data with wrong answers we design an algorithm that compares intermediate results among steps in positivenegative reasoning paths Figure 3 illustrates this algorithm This algorithm can not only work on math word problems but also generalize to other reasoning tasks we use an offtheshelf natural language inference model _robertalargemnli_Liu et al 2019 to check whether two reasoning steps are semantically equivalent or not Given a reasoning step if we cannot find any semantically equivalent step in the positive reasoning paths we label it and all the subsequent steps as negative steps\n\n 3 Experimental Setup\n\n Reasoning Tasks\n\nArithmetic ReasoningFollowing Wang et al 2022c we use AsDiv Miao et al 2020 SingleEq KoncelKedziorski et al 2015 MultiArith Roy and Roth 2015 SVAMP Patel et al 2021 and GSM8K Cobbe et al 2021', metadata={'id': 2206.02336, 'title': 'Making Large Language Models Better R

### Setup the ChromaDB vector store

In [12]:
start_time = time.time()
### Load the index persisted in ./chroma_db_bge
db_bge = Chroma(persist_directory="./chroma_db_bge", embedding_function=bge_embeddings)
if db_bge._collection.count() == 0:
    ### Nothing has been persisted yet (e.g. first run on this machine): create the
    ### index from the split documents instead of silently querying an empty store.
    db_bge = Chroma.from_documents(split_docs, bge_embeddings, persist_directory="./chroma_db_bge")
print("Time taken to load BGE Embeddings: ", time.time() - start_time)

Time taken to load BGE Embeddings:  0.32668113708496094


### Setup OpenAI Embedding based Vector Store

In [13]:
start_time = time.time()
### Load the index persisted in ./chroma_db_openai
db_openai = Chroma(persist_directory="./chroma_db_openai", embedding_function=openai_embeddings)
if db_openai._collection.count() == 0:
    ### Nothing has been persisted yet (e.g. first run on this machine): create the
    ### index from the split documents instead of silently querying an empty store.
    ### Note that this embeds every chunk through the OpenAI API.
    db_openai = Chroma.from_documents(split_docs, openai_embeddings, persist_directory="./chroma_db_openai")
print("Time taken to load OpenAI Embeddings: ", time.time() - start_time)

Time taken to load OpenAI Embeddings:  0.006594181060791016


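Load time comparison, in seconds (these figures differ from the outputs above, which reload an already persisted index; they appear to come from the initial run that built each index from scratch):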

* Time taken to load BGE Embeddings:  352.92843294143677
* Time taken to load OpenAI Embeddings:  48.999374866485596


In [14]:
# Number of stored entries in each vector store.
openai_doc_count = db_openai._collection.count()
bge_doc_count = db_bge._collection.count()

print("There are", openai_doc_count, "in the OpenAI collection")
print("There are", bge_doc_count, "in the BGE collection")

There are 7596 in the OpenAI collection
There are 7596 in the BGE collection


### Test both the DBs

#### BGE top 3 retrieved results

In [15]:
query = "What is RLHF? When can it be used?"
matched_docs = db_bge.similarity_search(query, k=8)

# Print the three best matches, ranked from 1. The previous `index <= 3` check
# let four results through, and "/n" printed a literal slash-n instead of a newline.
for pos, doc in enumerate(matched_docs[:3], start=1):
    print(f"Matched doc from BGE Embeddings at {pos} is : ", doc.page_content, "\n ===========")

Matched do from BGE Embeddings at 1 is :  All our RL runs used the same hyperparameters as our prior work Bai et al 2022 However there are some differences The RLHF models for our earlier paper are finetuned from contextdistilled models while our current RLHF models are finetuned directly from pretrained models We didnt see much benefit to using context distillation since the improvement from RL was much more significant Furthermore the pretrained LMs that we use for all our runs have been improved since the prior work

For PM comparison data we used 135296 HF helpfulness comparisons and 182831 constitutionallygenerated harmlessness comparisons one comparison generated for each SLCAI prompt For the purpose of doing controlled tests all the RL runs in this paper use the same set of training prompts which consists of all the HF and modelgenerated prompts used for SLCAI Section 32 plus _additional_ modelgenerated prompts 491142 for red team and 474300 for helpfulness

Matched do from BGE 

#### OpenAI top 3 retrieved results

In [16]:
query = "What is RLHF? When can it be used?"
matched_docs = db_openai.similarity_search(query, k=8)

# Print the three best matches, ranked from 1. The previous `index <= 3` check
# let four results through, and "/n" printed a literal slash-n instead of a newline.
for pos, doc in enumerate(matched_docs[:3], start=1):
    print(f"Matched doc from OpenAI Embeddings at {pos} is : ", doc.page_content, "\n ===========")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Matched do from OpenAI Embeddings at 2 is :  We examine the influence of the amount of RLHF training for two reasons First RLHF 13 57 is an increasingly popular technique for reducing harmful behaviors in large language models 3 21 52 Some of these models are already deployed 52 so we believe the impact of RLHF deserves further scrutiny Second previous work shows that the amount of RLHF training can significantly change metrics on a wide range of personality political preference and harm evaluations for a given model size 41 As a result it is important to control for the amount of RLHF training in the analysis of our experiments

 Experiments

Matched do from OpenAI Embeddings at 3 is :  RLHF has emerged as a powerful strategy for finetuning Large Language Models enabling significant improvements in their performance Christiano et al 2017 The method first showcased by Stiennon et al 2020 in the context of textsummarization tasks has since been extended to a range of other applications 

### Evaluation using Trulens

In [26]:
import os

import openai
from openai import OpenAI

# Pass the API key found in the environment (None if unset) to the openai module.
openai.api_key = os.environ.get("OPENAI_API_KEY")

from trulens_eval import Tru
from trulens_eval.tru_custom_app import instrument

# TruLens handle; a later cell uses it for the leaderboard and the dashboard.
tru = Tru()

### Define a RAG class with trulens embedded in it

In [27]:
class RAG:
    """Minimal retrieval-augmented generation pipeline over a vector store.

    Every call to `query` is independent: the chunks retrieved for the question
    are sent to the chat model together with the system prompt only. Earlier
    questions and their contexts are never re-sent, so the prompt size stays
    bounded and each answer is based solely on the context retrieved for it
    (which is also the context the feedback functions evaluate against).

    Args:
        db: Vector store exposing `similarity_search(query, k=...)` that returns
            documents with a `page_content` attribute.
        k: Number of chunks retrieved per query.
        model: Name of the OpenAI chat model used to generate the answer.
        temperature: Sampling temperature passed to the chat model.
    """

    def __init__(self, db, k: int = 6, model: str = "gpt-3.5-turbo-1106", temperature: float = 0.4):
        self.db = db
        self.k = k
        self.model = model
        self.temperature = temperature
        self.client = OpenAI()
        # Holds the system prompt only. Per-query user messages are built in
        # `generate_completion` and deliberately not stored here.
        self.messages = []
        self.messages.append({"role": "system", "content":"You are a friendly assistant who uses the provided context to answer the user's query"})

    @instrument
    def retrieve(self, query: str) -> list:
        """Return the text of the `k` chunks most similar to `query`.

        The result stays a list (rather than one joined string) so that each
        retrieved chunk remains a separate item in the recorded return value.
        """
        matched_docs = self.db.similarity_search(query, k=self.k)
        return [doc.page_content for doc in matched_docs]

    @instrument
    def generate_completion(self, query: str, context: list) -> str:
        """Ask the chat model to answer `query` using only `context`.

        Args:
            query: The user's question.
            context: Retrieved chunks; interpolated into the prompt as-is.

        Returns:
            The text content of the first completion choice.
        """
        user_message = {"role": "user", "content": f"Use the provided context to answer user's query. \n Context: {context} \n Query: {query} \n Answer:"}
        completion = self.client.chat.completions.create(
            model=self.model,
            # A fresh message list per call: appending to self.messages made every
            # query re-send all previous questions and their contexts.
            messages=[*self.messages, user_message],
            temperature=self.temperature,
        )
        return completion.choices[0].message.content

    @instrument
    def query(self, query: str) -> str:
        """Retrieve context for `query` and return the generated answer."""
        context_chunks = self.retrieve(query)
        return self.generate_completion(query, context_chunks)

### Test responses using the 2 models

In [28]:
# Same question as in the retrieval tests above, now answered end to end.
query = "What is RLHF? When can it be used?"

# RAG pipeline that retrieves from the BGE-embedded store.
rag_bge = RAG(db_bge)
openai_response = rag_bge.query(query)
print(openai_response)

RLHF stands for Reinforcement Learning from Human Feedback. It is a strategy for finetuning Large Language Models (LLMs) based on feedback from human users, aligning the model's responses more closely with human expectations and preferences. RLHF can be used to train models to be both helpful and harmless without human feedback labels for harmlessness. It can also be used to improve model performance in various language generation tasks, such as text summarization, training more helpful, harmless, and accurate assistants, and teaching models to use external tools for tasks like question answering and providing references.


In [29]:
# Same question as in the retrieval tests above, now answered end to end.
query = "What is RLHF? When can it be used?"

# RAG pipeline that retrieves from the OpenAI-embedded store.
rag_openai = RAG(db_openai)
openai_response = rag_openai.query(query)
print(openai_response)



RLHF stands for Reinforcement Learning from Human Feedback, and it is a technique used to align large language models (LLMs) with human preferences in order to make them more useful. RLHF works by using a pretrained language model to generate text, which is then evaluated by humans to learn a reward model that captures human preferences when judging model output. RLHF can be used directly on top of a general-purpose language model pretrained via self-supervised learning. However, for more complex tasks, RLHF is typically applied after an initial supervised fine-tuning phase using a small number of expert demonstrations for the corresponding downstream task. It has been shown to be effective in finetuning LLMs for tasks such as text summarization, addressing issues with factuality, toxicity, and helpfulness, and aligning model responses more closely with human expectations and preferences.


### Evaluate this response using Trulens

#### Setup feedback functions

In [30]:
from trulens_eval import Feedback, Select
from trulens_eval.feedback import Groundedness
from trulens_eval.feedback.provider.openai import OpenAI as fOpenAI

import numpy as np

# Provider that evaluates the feedback functions
fopenai = fOpenAI()

grounded = Groundedness(groundedness_provider=fopenai)

# Selectors into each recorded call of RAG.retrieve: the query argument and
# the collected return value (the retrieved chunks).
retrieve_call = Select.RecordCalls.retrieve
user_query = retrieve_call.args.query
retrieved_chunks = retrieve_call.rets.collect()

# Groundedness: retrieved chunks as the source, the app output as the statement.
f_groundedness = (
    Feedback(grounded.groundedness_measure_with_cot_reasons, name = "Groundedness")
    .on(retrieved_chunks)
    .on_output()
    .aggregate(grounded.grounded_statements_aggregator)
)

# Answer relevance: the question against the final answer.
f_qa_relevance = (
    Feedback(fopenai.relevance_with_cot_reasons, name = "Answer Relevance")
    .on(user_query)
    .on_output()
)

# Context relevance: the question against the retrieved chunks, averaged.
f_context_relevance = (
    Feedback(fopenai.qs_relevance_with_cot_reasons, name = "Context Relevance")
    .on(user_query)
    .on(retrieved_chunks)
    .aggregate(np.mean)
)

✅ In Groundedness, input source will be set to __record__.app.retrieve.rets.collect() .
✅ In Groundedness, input statement will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Answer Relevance, input prompt will be set to __record__.app.retrieve.args.query .
✅ In Answer Relevance, input response will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Context Relevance, input question will be set to __record__.app.retrieve.args.query .
✅ In Context Relevance, input statement will be set to __record__.app.retrieve.rets.collect() .


### Construct the app
Wrap the custom RAG with TruCustomApp, add list of feedbacks for eval

In [31]:
from trulens_eval import TruCustomApp


def _record_app(rag_app, app_id):
    """Wrap `rag_app` in a TruCustomApp that runs the three feedback functions."""
    return TruCustomApp(
        rag_app,
        app_id = app_id,
        feedbacks = [f_groundedness, f_qa_relevance, f_context_relevance],
    )


tru_rag_bge = _record_app(rag_bge, 'RAG BGE')

tru_rag_openai = _record_app(rag_openai, 'RAG OpenAI')

Function <function RAG.generate_completion at 0x2c423e480> was not found during instrumentation walk. Make sure it is accessible by traversing app <__main__.RAG object at 0x38fd91110> or provide a bound method for it as TruCustomApp constructor argument `methods_to_instrument`.
Function <function RAG.query at 0x2c423e520> was not found during instrumentation walk. Make sure it is accessible by traversing app <__main__.RAG object at 0x38fd91110> or provide a bound method for it as TruCustomApp constructor argument `methods_to_instrument`.
Function <function RAG.retrieve at 0x2c423e3e0> was not found during instrumentation walk. Make sure it is accessible by traversing app <__main__.RAG object at 0x38fd91110> or provide a bound method for it as TruCustomApp constructor argument `methods_to_instrument`.
Function <function RAG.generate_completion at 0x2c423e480> was not found during instrumentation walk. Make sure it is accessible by traversing app <__main__.RAG object at 0x38fddaf10> or p

### Define a set of queries and run them through both the models

In [32]:
# Evaluation questions that are run through both RAG pipelines (one per line).
queries = [
    "What is RLHF? When can it be used?",
    "How can the climate change impact of LLMs be estimated?",
    "Tell me more about the scaling laws of LLM training",
    "Explain diffusion models. What variants of them are there?",
    "What kind of model is Stable Diffusion?",
    "Are their LLMs with Retrievel Augmented generation built in them?",
    "Show me different methods of prompting the LLMs and compare their benefits",
    "What is a Collaborative Language Model",
    "Tell me about Audio based LLMs. Can some LLMs understand audio directly?",
    "Explain how multi modal LLMs work",
]

In [33]:
# (recorder, app) pairs, in the order they are run for every query: BGE first, then OpenAI.
evaluated_apps = (
    (tru_rag_bge, rag_bge),
    (tru_rag_openai, rag_openai),
)

for sample_query in queries:
    for recorder, rag_app in evaluated_apps:
        # The app is called inside the recorder's context so the call gets recorded.
        with recorder as recording:
            rag_app.query(sample_query)



In [35]:
from trulens_eval import Tru
tru = Tru()
# Only the last expression of a cell is rendered, so the leaderboard has to be
# displayed explicitly; as a bare statement its result was silently discarded.
# `display` is provided by the IPython/Jupyter kernel.
display(tru.get_leaderboard(app_ids=["RAG BGE", "RAG OpenAI"]))
tru.run_dashboard()

Starting dashboard ...
Config file already exists. Skipping writing process.
Credentials file already exists. Skipping writing process.
Dashboard already running at path:   Network URL: http://192.168.1.147:8501



<Popen: returncode: None args: ['streamlit', 'run', '--server.headless=True'...>