In [1]:
import os
import openai
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from dotenv import load_dotenv
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import pickle
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer

In [2]:
load_dotenv()
llm = OpenAI(openai_api_key=os.getenv("OPENAI_API_KEY")) # uses legacy da-vinci-003 model
client = openai.OpenAI()

In [3]:
def tokenizer(text):
    return word_tokenize(text.lower())
queries = {}

idx = None
with open("../data/cisi/CISI.QRY") as f:
    for query in f.read().split(".I"):
        for i, line in enumerate(query.split("\n")):
            if not line:
                continue
            elif i == 0:
                idx = int(line)
                queries[idx] = ""
            elif not line.startswith("."):
                queries[idx] += " "+line

In [4]:
# queries

In [5]:
#### API CALL WARNING ####

answers_openai = {}
for idx, qry in tqdm(queries.items(), desc = 'Collecting ChatGPT Responses'):

    ### Uncomment to use text-davinci-003 Model ###
    
    # answers_openai[idx] = llm.invoke(qry)
    # model = 'text-davinci-003'

    ### Uncomment to use gpt-3.5-turbo-instruct Model ###
    answer = client.completions.create(
                              model="gpt-3.5-turbo-instruct",
                              prompt = "Answer this query or Explain the topic in about 150 words:" + qry,
                              max_tokens = 200
                          )
    answers_openai[idx] = answer.choices[0].text
    model = 'gpt-3.5-turbo-instruct'

Collecting ChatGPT Responses: 100%|██████████████████████████████████| 112/112 [06:35<00:00,  3.53s/it]


In [6]:
answers_openai[2]

"\n\nIn order to retrieve pertinent data automatically in response to information requests, it is crucial to have a well-organized and structured database. This database should contain all the relevant information in a standardized format, making it easier for a computer program or algorithm to search and retrieve specific data. The inclusion of metadata, such as keywords and tags, can also aid in the automatic retrieval of relevant data.\n\nAnother important step is to develop a robust search algorithm that is able to understand and interpret the user's information request. This involves training the algorithm using machine learning techniques, so it can accurately understand and interpret natural language queries.\n\nTo further enhance the automatic retrieval of pertinent data, it is also beneficial to have a system of user feedback. This allows the algorithm to learn and improve its understanding based on past user searches and selections.\n\nLastly, integration with advanced data m

In [7]:
#### DUMP OVERWRITE WARNING ####

openai_answers_file_path = './backups/answers_openai_'+model+'.pkl'

with open(openai_answers_file_path, 'wb') as file:
    pickle.dump(answers_openai, file)

print(f"Answers saved to {openai_answers_file_path}")

Answers saved to ./backups/answers_openai_gpt-3.5-turbo-instruct.pkl


In [8]:
docs_file_path = './backups/openai_embeddings/doc_embeddings.pkl'

with open(docs_file_path, 'rb') as file:
    docs = pickle.load(file)

print("Document embeddings loaded successfully.")

Document embeddings loaded successfully.


In [9]:
docs[1]['body']

'.W\nThis report is an analysis of 6300 acts of use\nin 104 technical libraries in the United Kingdom.\nLibrary use is only one aspect of the wider pattern of\ninformation use.  Information transfer in libraries is\nrestricted to the use of documents.  It takes no\naccount of documents used outside the library, still\nless of information transferred orally from person\nto person.  The library acts as a channel in only a\nproportion of the situations in which information is\ntransferred.\nTaking technical information transfer as a whole,\nthere is no doubt that this proportion is not the\nmajor one.  There are users of technical information -\nparticularly in technology rather than science -\nwho visit libraries rarely if at all, relying on desk\ncollections of handbooks, current periodicals and personal\ncontact with their colleagues and with people in other\norganizations.  Even regular library users also receive\ninformation in other ways.\n'

In [10]:
rel_set = {}
with open(os.path.join("../data/cisi/", 'CISI.REL')) as f:
    for l in f.readlines():
        qry_id = l.lstrip(" ").strip("\n").split("\t")[0].split(" ")[0]
        doc_id = l.lstrip(" ").strip("\n").split("\t")[0].split(" ")[-1]

        if qry_id in rel_set:
            rel_set[qry_id].append(doc_id)
        else:
            rel_set[qry_id] = []
            rel_set[qry_id].append(doc_id)

In [11]:
openai_answers_file_path = './backups/answers_openai_'+model+'.pkl'
with open(openai_answers_file_path, 'rb') as file:
    answers_openai = pickle.load(file)

In [12]:
# Implement BLEU evaluation function
def compute_bleu(references, candidate):
    smoothing = SmoothingFunction().method0
    return sentence_bleu(references, candidate, smoothing_function=smoothing)

# Implement ROUGE evaluation function
def compute_rouge(references, candidate):
    scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
    total_score = 0

    # Compute ROUGE for each reference
    for reference in references:
        scores = scorer.score(reference, candidate)
        total_score += scores['rouge1'].fmeasure

    # Calculate average score
    average_score = total_score / len(references)
    return average_score

# Evaluate BLEU and ROUGE for each query

K = 30 # Number of most relevant docs to consider for scoring performance
total_bleu_score = 0.0
total_rouge_score = 0.0
num_queries = 0

for query_id, relevant_docs in tqdm(rel_set.items(), desc = 'Computing scores:'):
    query_text = queries[int(query_id)]
    response = answers_openai[int(query_id)]
    # print(query_id, query_text, "\n\nResponse:\n", response, "\n=========\nTopmost relevant Doc:\n", docs[int(relevant_docs[0])-1]['body'], "\n======\n")
    # print()
    
    # Evaluate using BLEU
    bleu_score = compute_bleu([docs[int(id)]['body'] for id in relevant_docs[:K]], response)
    total_bleu_score += bleu_score

    # Evaluate using ROUGE
    rouge_score = compute_rouge([docs[int(id)]['body'] for id in relevant_docs[:K]], response)
    total_rouge_score += rouge_score

    num_queries += 1
    
# Calculate mean scores
mean_bleu_score = total_bleu_score / num_queries
mean_rouge_score = total_rouge_score / num_queries

print(f"Mean BLEU Score: {mean_bleu_score:.4f}")
print(f"Mean ROUGE Score: {mean_rouge_score:.4f}")

Computing scores:: 100%|███████████████████████████████████████████████| 76/76 [00:06<00:00, 11.82it/s]

Mean BLEU Score: 0.8061
Mean ROUGE Score: 0.2482





In [13]:
# da-vinci
# Mean BLEU Score: 0.6094
# Mean ROUGE Score: 0.1603

# gpt-3.5-turbo-instruct
# Mean BLEU Score: 0.8061
# Mean ROUGE Score: 0.2482