In [1]:
import os
import openai
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from dotenv import load_dotenv
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import pickle
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from operator import itemgetter
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema.runnable import RunnablePassthrough
from langchain.vectorstores import FAISS

In [2]:
load_dotenv()

# Uncomment to use the default DaVinci Model 
# llm = OpenAI(openai_api_key=os.getenv("OPENAI_API_KEY"))
# model = 'text-davinci-003'

# Uncomment to use the gpt-3.5-turbo-instruct model 
llm = OpenAI(model_name='gpt-3.5-turbo-instruct', openai_api_key = os.getenv("OPENAI_API_KEY"))
model = 'gpt-3.5-turbo-instruct'

In [4]:
import ir_datasets


dataset = ir_datasets.load("beir/arguana")


queries = {}
for query in dataset.queries_iter():
    queries[query.query_id] = {"text":query.text}

docs = {}
count = 0
for doc in dataset.docs_iter():
    docs[doc.doc_id] = {"text": doc.text}
    count += 1
    if count >= 2000:
        break

rel_set = {}
for qrel in dataset.qrels_iter():
    if qrel.query_id not in rel_set:
        rel_set[qrel.query_id] = []
    if qrel.relevance > 0: 
        rel_set[qrel.query_id].append(qrel.doc_id)


In [5]:
queries['test-environment-aeghhgwpe-pro02a'], len(queries)

({'text': "Being vegetarian helps the environment  Becoming a vegetarian is an environmentally friendly thing to do. Modern farming is one of the main sources of pollution in our rivers. Beef farming is one of the main causes of deforestation, and as long as people continue to buy fast food in their billions, there will be a financial incentive to continue cutting down trees to make room for cattle. Because of our desire to eat fish, our rivers and seas are being emptied of fish and many species are facing extinction. Energy resources are used up much more greedily by meat farming than my farming cereals, pulses etc. Eating meat and fish not only causes cruelty to animals, it causes serious harm to the environment and to biodiversity. For example consider Meat production related pollution and deforestation  At Toronto’s 1992 Royal Agricultural Winter Fair, Agriculture Canada displayed two contrasting statistics: “it takes four football fields of land (about 1.6 hectares) to feed each C

In [36]:
#### API CALL WARNING ####

client = openai.OpenAI(api_key = os.getenv("OPENAI_API_KEY"))

def get_embedding(text, model="text-embedding-ada-002"):
    text = text.replace("\n", " ")
    response = client.embeddings.create(input=[text], model=model)
    if response and hasattr(response, 'data') and response.data:
        embedding = response.data[0].embedding
        return embedding
    else:
        print("Invalid response or no embedding data received.")
        return None

In [6]:
docs['test-environment-aeghhgwpe-pro02b']

{'text': "You don’t have to be vegetarian to be green. Many special environments have been created by livestock farming – for example chalk down land in England and mountain pastures in many countries. Ending livestock farming would see these areas go back to woodland with a loss of many unique plants and animals. Growing crops can also be very bad for the planet, with fertilisers and pesticides polluting rivers, lakes and seas. Most tropical forests are now cut down for timber, or to allow oil palm trees to be grown in plantations, not to create space for meat production.  British farmer and former editor Simon Farrell also states: “Many vegans and vegetarians rely on one source from the U.N. calculation that livestock generates 18% of global carbon emissions, but this figure contains basic mistakes. It attributes all deforestation from ranching to cattle, rather than logging or development. It also muddles up one-off emissions from deforestation with on-going pollution.”  He also ref

In [39]:
for idx, query in tqdm(queries.items(), desc = 'Generating Embeddings'):
    query_text = query['text']
    queries[idx] = {'text': query_text, 'embedding': get_embedding(query_text)}

Generating Embeddings: 100%|███████████████████████████████████████| 1406/1406 [05:20<00:00,  4.39it/s]


In [41]:
# queries['test-environment-aeghhgwpe-pro02a']

In [9]:
#### API CALL WARNING ####


# Generate embeddings for docs
for doc_id in tqdm(docs, desc = 'Generating Embeddings'):
    combined_text =  docs[doc_id]['text']
    docs[doc_id]['embedding'] = get_embedding(combined_text)

Generating Embeddings: 100%|███████████████████████████████████████| 2000/2000 [06:24<00:00,  5.20it/s]


In [42]:
docs_file_path = './backups/openai_embeddings/doc_embeddings_d1.pkl'
query_file_path = './backups/openai_embeddings/query_embeddings_d1.pkl'

with open(docs_file_path, 'wb') as file:
    pickle.dump(docs, file)

print(f"Embeddings saved to {docs_file_path}")

with open(query_file_path, 'wb') as file:
    pickle.dump(queries, file)

print(f"Embeddings saved to {query_file_path}")

Embeddings saved to ./backups/openai_embeddings/query_embeddings_d1.pkl


In [7]:
docs_file_path = './backups/openai_embeddings/doc_embeddings_d1.pkl'

with open(docs_file_path, 'rb') as file:
    docs = pickle.load(file)

print("Document embeddings loaded successfully.")

Document embeddings loaded successfully.


In [10]:
# Load the query embeddings from the file
query_file_path = './backups/openai_embeddings/query_embeddings_d1.pkl'
with open(query_file_path, 'rb') as file:
    loaded_queries = pickle.load(file)
    
queries = loaded_queries
print("Query embeddings loaded successfully.")

Query embeddings loaded successfully.


In [12]:
# queries['test-environment-aeghhgwpe-pro02a']

In [55]:
docs['test-environment-aeghhgwpe-pro02b']['text']

"You don’t have to be vegetarian to be green. Many special environments have been created by livestock farming – for example chalk down land in England and mountain pastures in many countries. Ending livestock farming would see these areas go back to woodland with a loss of many unique plants and animals. Growing crops can also be very bad for the planet, with fertilisers and pesticides polluting rivers, lakes and seas. Most tropical forests are now cut down for timber, or to allow oil palm trees to be grown in plantations, not to create space for meat production.  British farmer and former editor Simon Farrell also states: “Many vegans and vegetarians rely on one source from the U.N. calculation that livestock generates 18% of global carbon emissions, but this figure contains basic mistakes. It attributes all deforestation from ranching to cattle, rather than logging or development. It also muddles up one-off emissions from deforestation with on-going pollution.”  He also refutes the 

In [15]:
from langchain.vectorstores.utils import DistanceStrategy
annoy_data = []
for doc in docs:
    annoy_data.append((doc, docs[doc]["embedding"]))

faiss_vs = FAISS.from_embeddings(
    text_embeddings=annoy_data, 
    embedding=OpenAIEmbeddings(),
    distance_strategy=DistanceStrategy.DOT_PRODUCT)
faiss_vs.save_local("./backups/faiss/")

In [60]:
# Load index from file
loaded_faiss_vs = FAISS.load_local(
    folder_path="./backups/faiss/",
    embeddings=OpenAIEmbeddings())

retriever = loaded_faiss_vs.as_retriever(search_kwargs={'k': 10})

# Define the RAG pipeline
template = """
Given some additional context: {context} Answer the question: 
Question: {question} 
"""
prompt = ChatPromptTemplate.from_template(template)

In [61]:
def format_docs(_docs):
    ls = []
    count = 0
    for doc in _docs:
        if count > 5:
            break
        count+=1
        if doc.page_content in docs:
            ls.append(docs[doc.page_content]["text"])
        
    return ls
    # return [docs[int(doc.page_content)]["body"] for doc in _docs]

In [62]:
chain = ({"context": retriever | format_docs, "question": RunnablePassthrough()} 
         | prompt 
         | llm 
         | StrOutputParser())

In [63]:
# queries['test-environment-aeghhgwpe-pro02a']

In [64]:
chain.invoke(queries['test-environment-aeghhgwpe-pro02a']['text'])

'\nThe context provided argues that being vegetarian is beneficial for the environment, as meat production contributes to pollution, deforestation, and uses up excessive amounts of energy and water. It also suggests that becoming vegetarian is the only responsible way to eat in a world with a growing population.\n\nHowever, there are counterarguments that should be considered. For example, the context acknowledges that livestock farming can create unique environments, such as chalk down land in England and mountain pastures, which would be lost if farming ceased. Additionally, it argues that humans have a natural right to exploit lower species for food and that there are ways to produce meat without cruelty to animals. Furthermore, it raises concerns about the safety and hygiene of all food, not just meat, and suggests that locally sourced food can have a similar impact as vegetarianism.\n\nIt is important to consider all perspectives and evidence when evaluating the idea that being ve

In [65]:
#### API CALL WARNING #####

rag_responses = {}
loq = []
count  = 0
# Run RAG pipeline for every question
for query_id in tqdm(rel_set.keys(), desc = 'Asking Queries to ChatGPT with RAG'):
    if count > 100:
        break
    query_text = queries[query_id]['text']
    response = chain.invoke(query_text)
    rag_responses[query_id] = response
    count+=1

Asking Queries to ChatGPT with RAG:   7%|█▉                         | 101/1406 [02:23<30:58,  1.42s/it]


In [48]:
# print(queries[61])

In [66]:
#### DUMP OVERWRITE WARNING ####

rag_responses_file_path = './backups/openai_with_rag_responses_d1_' + model + '.pkl'
with open(rag_responses_file_path, 'wb') as file:
    pickle.dump(rag_responses, file)

print(f"RAG responses saved to {rag_responses_file_path}")

RAG responses saved to ./backups/openai_with_rag_responses_d1_gpt-3.5-turbo-instruct.pkl


In [67]:
openai_with_rag_responses_file_path = './backups/openai_with_rag_responses_d1_' + model + '.pkl'
with open(openai_with_rag_responses_file_path, 'rb') as file:
    rag_responses = pickle.load(file)

In [68]:
 # Sanity
rag_responses['test-environment-aeghhgwpe-pro02a']

'\nThe evidence presented in this context supports the argument that being vegetarian is beneficial for the environment. Eating meat and fish contributes to pollution, deforestation, and the depletion of resources like water. The production of meat also requires more energy and produces more greenhouse gas emissions compared to plant-based foods. In addition, the demand for meat drives deforestation, which further contributes to environmental harm. With a growing population, becoming vegetarian is seen as the responsible and sustainable way to eat. However, it is also important to consider the potential impact of a vegetarian diet on unique environments and the issue of food safety and hygiene. Ultimately, the decision to become vegetarian should be based on personal beliefs and values, but the evidence presented suggests that it can have positive effects on the environment.'

In [16]:
docs[0]['body']

".W\n   The present study is a history of the DEWEY Decimal\nClassification.  The first edition of the DDC was published\nin 1876, the eighteenth edition in 1971, and future editions\nwill continue to appear as needed.  In spite of the DDC's\nlong and healthy life, however, its full story has never\nbeen told.  There have been biographies of Dewey\nthat briefly describe his system, but this is the first\nattempt to provide a detailed history of the work that\nmore than any other has spurred the growth of\nlibrarianship in this country and abroad.\n"

In [73]:
# Implement BLEU evaluation function
def compute_bleu(references, candidate):
    smoothing = SmoothingFunction().method5
    return sentence_bleu(references, candidate, smoothing_function=smoothing)

# Implement ROUGE evaluation function
def compute_rouge(references, candidate):
    scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
    total_score = 0

    # Compute ROUGE for each reference
    for reference in references:
        scores = scorer.score(reference, candidate)
        total_score += scores['rouge1'].fmeasure

    # Calculate average score
    average_score = total_score / len(references)
    return average_score

# Evaluate BLEU and ROUGE for each query

K = 30 # Number of most relevant docs to consider for scoring performance
total_bleu_score = 0.0
total_rouge_score = 0.0
num_queries = 0

for query_id, relevant_docs in rel_set.items():
    query_text = queries[query_id]['text']
    response = rag_responses[query_id]

    # print(query_id, "\n\n", query_text, "\n\nResponse:\n", response, "\nTopmost relevant Doc:\n", docs[relevant_docs[0]]['text'], "\n======\n")
    
    # Evaluate using BLEU
    bleu_score = compute_bleu([docs[id]['text'] for id in relevant_docs[:K]], response)
    total_bleu_score += bleu_score

    # Evaluate using ROUGE
    rouge_score = compute_rouge([docs[id]['text'] for id in relevant_docs[:K]], response)
    total_rouge_score += rouge_score

    num_queries += 1
    if num_queries == 101:
        break

# Calculate mean scores
mean_bleu_score = total_bleu_score / num_queries
mean_rouge_score = total_rouge_score / num_queries

print(f"Mean BLEU Score: {mean_bleu_score:.4f}")
print(f"Mean ROUGE Score: {mean_rouge_score:.4f}")

Mean BLEU Score: 0.2906
Mean ROUGE Score: 0.2544


In [18]:
# da-vinci
# Mean BLEU Score: 0.8224
# Mean ROUGE Score: 0.2105

# Mean BLEU Score: 0.8377
# Mean ROUGE Score: 0.2226

# gpt-3.5-turbo-instruct

# Mean BLEU Score: 0.7869
# Mean ROUGE Score: 0.2407

# Mean BLEU Score: 0.7937
# Mean ROUGE Score: 0.2425
