In [1]:
from langchain.embeddings import LlamaCppEmbeddings
from langchain.llms import LlamaCpp
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from langchain.vectorstores import FAISS
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from tqdm import tqdm

import pickle

In [2]:
llm = LlamaCpp(
    model_path="./backups/llama.cpp/models/llama-2-7b-chat.ggmlv3.q4_K_S.gguf.bin", 
    n_ctx=2048)
llm.client.verbose = False

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from ./backups/llama.cpp/models/llama-2-7b-chat.ggmlv3.q4_K_S.gguf.bin (version unknown)
llama_model_loader: - tensor    0:                token_embd.weight q4_K     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:               output_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    2:                    output.weight q6_K     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    3:              blk.0.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.attn_k.weight q4_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    5:              blk.0.attn_v.weight q4_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    6:         blk.0.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    7:           blk.0.attn_norm.weight f32      [

In [3]:
def tokenizer(text):
    return word_tokenize(text.lower())

In [4]:
queries = {}

idx = None
with open("../data/cisi/CISI.QRY") as f:
    for query in f.read().split(".I"):
        for i, line in enumerate(query.split("\n")):
            if not line:
                continue
            elif i == 0:
                idx = int(line)
                queries[idx] = ""
            elif not line.startswith("."):
                queries[idx] += " "+line

In [5]:
# queries

In [11]:
answers_llama = {}
for idx, qry in tqdm(queries.items()):
    answers_llama[idx] = llm(qry)

  0%|                                                                | 0/112 [00:00<?, ?it/s]Llama.generate: prefix-match hit
  1%|▌                                                       | 1/112 [00:13<25:52, 13.98s/it]
llama_print_timings:        load time =  6919.55 ms
llama_print_timings:      sample time =   179.99 ms /   256 runs   (    0.70 ms per token,  1422.32 tokens per second)
llama_print_timings: prompt eval time =     0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time = 13430.61 ms /   256 runs   (   52.46 ms per token,    19.06 tokens per second)
llama_print_timings:       total time = 13979.15 ms
Llama.generate: prefix-match hit
  2%|█                                                       | 2/112 [00:29<27:07, 14.79s/it]
llama_print_timings:        load time =  6919.55 ms
llama_print_timings:      sample time =   180.58 ms /   256 runs   (    0.71 ms per token,  1417.65 tokens per second)
llama_print_timings: 

In [12]:
#### DUMP OVERWRITE WARNING ####

llama_answers_file_path = './backups/answers_llama.pkl'

with open(llama_answers_file_path, 'wb') as file:
    pickle.dump(answers_llama, file)

print(f"Answers saved to {llama_answers_file_path}")

Answers saved to ./backups/answers_llama.pkl


In [6]:
docs_file_path = './backups/openai_embeddings/doc_embeddings.pkl'

with open(docs_file_path, 'rb') as file:
    docs = pickle.load(file)

print("Document embeddings loaded successfully.")

Document embeddings loaded successfully.


In [7]:
docs[1]['body']

'.W\nThis report is an analysis of 6300 acts of use\nin 104 technical libraries in the United Kingdom.\nLibrary use is only one aspect of the wider pattern of\ninformation use.  Information transfer in libraries is\nrestricted to the use of documents.  It takes no\naccount of documents used outside the library, still\nless of information transferred orally from person\nto person.  The library acts as a channel in only a\nproportion of the situations in which information is\ntransferred.\nTaking technical information transfer as a whole,\nthere is no doubt that this proportion is not the\nmajor one.  There are users of technical information -\nparticularly in technology rather than science -\nwho visit libraries rarely if at all, relying on desk\ncollections of handbooks, current periodicals and personal\ncontact with their colleagues and with people in other\norganizations.  Even regular library users also receive\ninformation in other ways.\n'

In [8]:
rel_set = {}
with open("../data/cisi/CISI.REL") as f:
    for l in f.readlines():
        qry_id = l.lstrip(" ").strip("\n").split("\t")[0].split(" ")[0]
        doc_id = l.lstrip(" ").strip("\n").split("\t")[0].split(" ")[-1]

        if qry_id in rel_set:
            rel_set[qry_id].append(doc_id)
        else:
            rel_set[qry_id] = []
            rel_set[qry_id].append(doc_id)

In [9]:
llama_answers_file_path = './backups/answers_llama.pkl'
with open(llama_answers_file_path, 'rb') as file:
    answers_llama = pickle.load(file)

In [10]:
# Implement BLEU evaluation function
def compute_bleu(references, candidate):
    smoothing = SmoothingFunction().method0
    return sentence_bleu(references, candidate, smoothing_function=smoothing)

In [11]:
# Implement ROUGE evaluation function
def compute_rouge(references, candidate):
    scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
    total_score = 0

    # Compute ROUGE for each reference
    for reference in references:
        scores = scorer.score(reference, candidate)
        total_score += scores['rouge1'].fmeasure

    # Calculate average score
    average_score = total_score / len(references)
    return average_score

In [13]:
# Evaluate BLEU and ROUGE for each query

K = 30 # Number of most relevant docs to consider for scoring performance
total_bleu_score = 0.0
total_rouge_score = 0.0
num_queries = 0

for query_id, relevant_docs in tqdm(rel_set.items(), desc = 'Computing scores:'):
    query_text = queries[int(query_id)]
    response = answers_llama[int(query_id)]
    # print(query_id, query_text, "\n\nResponse:\n", response, "\n=========\nTopmost relevant Doc:\n", docs[int(relevant_docs[0])-1]['body'], "\n======\n")
    # print()
    
    # Evaluate using BLEU
    bleu_score = compute_bleu([docs[int(id)]['body'] for id in relevant_docs[:K]], response)
    total_bleu_score += bleu_score

    # Evaluate using ROUGE
    rouge_score = compute_rouge([docs[int(id)]['body'] for id in relevant_docs[:K]], response)
    total_rouge_score += rouge_score

    num_queries += 1
    
# Calculate mean scores
mean_bleu_score = total_bleu_score / num_queries
mean_rouge_score = total_rouge_score / num_queries

print(f"Mean BLEU Score: {mean_bleu_score:.4f}")
print(f"Mean ROUGE Score: {mean_rouge_score:.4f}")

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Computing scores:: 100%|█████████████████████████████████████| 76/76 [00:04<00:00, 16.81it/s]

Mean BLEU Score: 0.7199
Mean ROUGE Score: 0.2061



