In [1]:
import pandas as pd
pd.set_option('mode.chained_assignment', None)
from question import get_answer, top_k_article
from langchain_core.messages import HumanMessage
from langchain_core.retrievers import BaseRetriever
from langchain_core.callbacks import CallbackManagerForRetrieverRun
from conversation import get_conversation_chain
from sentence_transformers import SentenceTransformer
from utils import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
embedding_model = SentenceTransformer('msmarco-distilbert-base-dot-prod-v3')

In [3]:
df = pd.read_csv('gen_test.csv')
df

Unnamed: 0,id,contexts,questions,answers
0,0,PARIS When the Islamic State was about to be d...,Who got a call from Syria's director?,Yves Ubelmann got the call from the Director o...
1,1,PARIS When the Islamic State was about to be d...,How old is he?,Yves Ubelmann is 36 years old
2,2,engulfed by war. But now there was special urg...,Who destroyed Palmyra?,The Islamists destroyed Palmyra
3,3,engulfed by war. But now there was special urg...,What kind of weapon did they use?,The Islamists destroyed Palmyra using explosives
4,4,was still rigged with explosives. So he and Ho...,Who founded Iconem company?,Yves Ubelmann founded the company Iconem
5,5,was still rigged with explosives. So he and Ho...,What did he say about drones?,He said the drones with four or six rotors can...
6,6,terrorists were uploading videos with them blo...,How many picture did he take at Palmyra?,Yves Ubelmann and his team took 40000 images a...
7,7,to the rising threats to global heritage. To u...,Who is the director of the Lourve?,Martinez is the director of the Lourve
8,8,Mr. Ubelmann dismissed any criticism of collab...,What did Mr. Ubelmann dismiss?,Mr. Ubelmann dismissed that they worked for th...
9,9,can later do this on their own. What is paramo...,How many site has his team flown drones over i...,Mr. Ubelmann's team has flown drones over some...


In [4]:
def get_context(query):
    if query in df['questions'].values:
        return df[df['questions'] == query]['contexts'].values[0]

In [7]:
class CustomRetriever(BaseRetriever):

    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
    ):

        return get_context(query)

retriever = CustomRetriever()


In [8]:
rag_chain = get_conversation_chain(retriever, model_name='gpt-3.5-turbo')

chat_history = []

def handle_qa(query):
    ai_msg = rag_chain.invoke({"question": query, "chat_history": chat_history})
    if len(chat_history) >= 10:
        chat_history.pop(0)
        chat_history.pop(0)
    chat_history.extend([HumanMessage(content=query), ai_msg])
    return chat_history

preds = []
cosin_s_scores = []
for i, row in df.iterrows():
    answer = handle_qa(row[2])
    preds.append(answer[-1].content)
    
    cosin_s_score = cosine_similarity(answer[-1].content, row[3], embedding_model)
    cosin_s_scores.append(cosin_s_score[0])
    
df['preds'] = preds
df['cosin_s_scores'] = cosin_s_scores
df

  answer = handle_qa(row[2])
  cosin_s_score = cosine_similarity(answer[-1].content, row[3], embedding_model)
  answer = handle_qa(row[2])
  cosin_s_score = cosine_similarity(answer[-1].content, row[3], embedding_model)
  answer = handle_qa(row[2])
  cosin_s_score = cosine_similarity(answer[-1].content, row[3], embedding_model)
  answer = handle_qa(row[2])
  cosin_s_score = cosine_similarity(answer[-1].content, row[3], embedding_model)
  answer = handle_qa(row[2])
  cosin_s_score = cosine_similarity(answer[-1].content, row[3], embedding_model)
  answer = handle_qa(row[2])
  cosin_s_score = cosine_similarity(answer[-1].content, row[3], embedding_model)
  answer = handle_qa(row[2])
  cosin_s_score = cosine_similarity(answer[-1].content, row[3], embedding_model)
  answer = handle_qa(row[2])
  cosin_s_score = cosine_similarity(answer[-1].content, row[3], embedding_model)
  answer = handle_qa(row[2])
  cosin_s_score = cosine_similarity(answer[-1].content, row[3], embedding_model)
  answer =

Unnamed: 0,id,contexts,questions,answers,preds,cosin_s_scores
0,0,PARIS When the Islamic State was about to be d...,Who got a call from Syria's director?,Yves Ubelmann got the call from the Director o...,Yves Ubelmann got a call from Syria's director.,0.855694
1,1,PARIS When the Islamic State was about to be d...,How old is he?,Yves Ubelmann is 36 years old,I don't know.,0.095257
2,2,engulfed by war. But now there was special urg...,Who destroyed Palmyra?,The Islamists destroyed Palmyra,ISIS destroyed Palmyra.,0.899902
3,3,engulfed by war. But now there was special urg...,What kind of weapon did they use?,The Islamists destroyed Palmyra using explosives,They used explosives to destroy Palmyra.,0.862068
4,4,was still rigged with explosives. So he and Ho...,Who founded Iconem company?,Yves Ubelmann founded the company Iconem,Yves Ubelmann founded Iconem company.,0.981132
5,5,was still rigged with explosives. So he and Ho...,What did he say about drones?,He said the drones with four or six rotors can...,He said that drones are a great tool for captu...,0.53911
6,6,terrorists were uploading videos with them blo...,How many picture did he take at Palmyra?,Yves Ubelmann and his team took 40000 images a...,"He took around 20,000 pictures at Palmyra.",0.792103
7,7,to the rising threats to global heritage. To u...,Who is the director of the Lourve?,Martinez is the director of the Lourve,The director of the Louvre is Jean-Luc Martinez.,0.688218
8,8,Mr. Ubelmann dismissed any criticism of collab...,What did Mr. Ubelmann dismiss?,Mr. Ubelmann dismissed that they worked for th...,Mr. Ubelmann dismissed any criticism of collab...,0.889039
9,9,can later do this on their own. What is paramo...,How many site has his team flown drones over i...,Mr. Ubelmann's team has flown drones over some...,His team has flown drones over 20 sites in Syria.,0.832916


In [9]:
false_df = df[df['cosin_s_scores'] < 0.5]
true_df = df[df['cosin_s_scores'] >= 0.5]

TP = len(true_df[true_df['answers'] != "I don't know"])
TN = len(true_df[true_df['answers'] == "I don't know"])
FP = len(false_df[false_df['answers'] != "I don't know"])
FN = len(false_df[false_df['answers'] == "I don't know"])

accuracy = (TP + TN) / (TP + TN + FP + FN)
precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1_score = 2 * (precision * recall) / (precision + recall)
# GPT 3.5 Turbo
print("Accuracy: ", round(accuracy, 2))
print("Precision: ", round(precision, 2))
print("Recall: ", round(recall, 2))
print("F1 Score: ", round(f1_score, 2))

Accuracy:  0.8
Precision:  0.79
Recall:  1.0
F1 Score:  0.88


In [21]:
rag_chain = get_conversation_chain(retriever, model_name='flan-t5-base')

chat_history = []

def handle_qa(query):
    ai_msg = rag_chain.invoke({"question": query, "chat_history": chat_history})
    if len(chat_history) >= 10:
        chat_history.pop(0)
        chat_history.pop(0)
    chat_history.extend([HumanMessage(content=query), ai_msg])
    return chat_history

preds = []
cosin_s_scores = []
for i, row in df.iterrows():
    answer = handle_qa(row[2])
    preds.append(answer[-1])
    
    cosin_s_score = cosine_similarity(answer[-1], row[3], embedding_model)
    cosin_s_scores.append(cosin_s_score[0])
    
df['preds'] = preds
df['cosin_s_scores'] = cosin_s_scores
df

  answer = handle_qa(row[2])


HfHubHTTPError: 429 Client Error: Too Many Requests for url: https://api-inference.huggingface.co/models/google/flan-t5-base (Request ID: nxYD9wEnsYkVbSjf8iZob)

Rate limit reached. You reached free usage limit (reset hourly). Please subscribe to a plan at https://huggingface.co/pricing to use the API at this rate

In [15]:
false_df = df[df['cosin_s_scores'] < 0.5]
true_df = df[df['cosin_s_scores'] >= 0.5]

TP = len(true_df[true_df['answers'] != "I don't know"])
TN = len(true_df[true_df['answers'] == "I don't know"])
FP = len(false_df[false_df['answers'] != "I don't know"])
FN = len(false_df[false_df['answers'] == "I don't know"])

accuracy = (TP + TN) / (TP + TN + FP + FN)
precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1_score = 2 * (precision * recall) / (precision + recall)
# FLAN T5 Base
print("Accuracy: ", round(accuracy, 2))
print("Precision: ", round(precision, 2))
print("Recall: ", round(recall, 2))
print("F1 Score: ", round(f1_score, 2))

Accuracy:  0.06
Precision:  0.06
Recall:  0.5
F1 Score:  0.11


In [20]:
rag_chain = get_conversation_chain(retriever, model_name='James449/nlp-t5-qa-model')

chat_history = []

def handle_qa(query):
    ai_msg = rag_chain.invoke({"question": query, "chat_history": chat_history})
    if len(chat_history) >= 10:
        chat_history.pop(0)
        chat_history.pop(0)
    chat_history.extend([HumanMessage(content=query), ai_msg])
    return chat_history

preds = []
cosin_s_scores = []
for i, row in df.iterrows():
    answer = handle_qa(row[2])
    preds.append(answer[-1])
    
    cosin_s_score = cosine_similarity(answer[-1], row[3], embedding_model)
    cosin_s_scores.append(cosin_s_score[0])
    
df['preds'] = preds
df['cosin_s_scores'] = cosin_s_scores
df

  answer = handle_qa(row[2])


HfHubHTTPError: 429 Client Error: Too Many Requests for url: https://api-inference.huggingface.co/models/James449/nlp-t5-qa-model (Request ID: E-iJZfOdfFfZ9cxJCZxT1)

Rate limit reached. You reached free usage limit (reset hourly). Please subscribe to a plan at https://huggingface.co/pricing to use the API at this rate

In [None]:
false_df = df[df['cosin_s_scores'] < 0.5]
true_df = df[df['cosin_s_scores'] >= 0.5]

TP = len(true_df[true_df['answers'] != "I don't know"])
TN = len(true_df[true_df['answers'] == "I don't know"])
FP = len(false_df[false_df['answers'] != "I don't know"])
FN = len(false_df[false_df['answers'] == "I don't know"])

accuracy = (TP + TN) / (TP + TN + FP + FN)
precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1_score = 2 * (precision * recall) / (precision + recall)
# FLAN T5 Base Finetuned
print("Accuracy: ", round(accuracy, 2))
print("Precision: ", round(precision, 2))
print("Recall: ", round(recall, 2))
print("F1 Score: ", round(f1_score, 2))