# Experiments
### set up

In [5]:
! pip install -r requirements.txt

Collecting langchain (from -r requirements.txt (line 5))
  Downloading langchain-0.1.10-py3-none-any.whl (806 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m806.2/806.2 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
Collecting lancedb (from -r requirements.txt (line 7))
  Downloading lancedb-0.6.1-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (21.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m56.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rank_bm25 (from -r requirements.txt (line 8))
  Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Collecting sentence-transformers (from -r requirements.txt (line 9))
  Downloading sentence_transformers-2.5.1-py3-none-any.whl (156 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m156.5/156.5 kB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes (from -r requirements.txt (line 10))
  Downloading bitsandbytes-0.42.0-py3-none-any

In [6]:
import pandas as pd
import os

import lancedb
from langchain_community.vectorstores.lancedb import LanceDB
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever
from langchain.schema import Document

import transformers
from transformers import AutoTokenizer
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain_core.documents.base import Document
from torch import cuda, bfloat16
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from langchain.chains import RetrievalQA

import os
import urllib.request
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_community.llms import LlamaCpp
from langchain_core.output_parsers import StrOutputParser
from langchain import hub
from langchain_core.runnables import RunnablePassthrough, RunnablePick
from langchain_core.prompts import ChatPromptTemplate

In [10]:
# !rm -rf /app/db #remove db if something changed

In [8]:
# 1. Settings

# --- Data and storage locations ---
path_to_data_csv = 'master_without_embeddings_all.csv'
path_to_database = '/app/db'

# --- Models ---
embedding_model = 'sentence-transformers/all-MiniLM-L6-v2'
model_id = 'llama-2-7b-chat.Q2_K.gguf'  # file name of the GGUF model, run on CPU

# --- Hugging Face environment ---
HF_AUTH = os.environ.get('HF_AUTH')  # None when the variable is not set
os.environ.setdefault('HF_HOME', 'models')  # keep an existing HF_HOME, otherwise use 'models'

### Parameters to be checked during evaluation:

# Both chunking values are used and changed in embed_langchain.ipynb
chunk_size, chunk_overlap = 400, 50

# Number of documents each retriever returns
retrieve_top_k_docs_vector = 1
retrieve_top_k_docs_bm25 = 1

# One chunk per retrieved document plus 200 extra; should not be larger than 2048
context_length_for_llm = 200 + chunk_size * (retrieve_top_k_docs_vector + retrieve_top_k_docs_bm25)

retrievers_weights_bm25 = 0.4  # ensemble weight of the BM25 retriever
llama_temperature = 0.75  # randomness parameter

In [12]:
# 2. Load the data
import csv

df = pd.read_csv(path_to_data_csv)

# Wrap every CSV row in a Document: the 'chunk' column becomes the text and
# the listed columns are carried along as metadata.
documents = [
    Document(
        page_content=row['chunk'],
        metadata={column: row[column] for column in ('id', 'title', 'authors', 'sources')},
    )
    for _, row in df.iterrows()
]

print(f'---\n--- Read {len(documents)} documents from {path_to_data_csv}')

---
--- Read 214381 documents from master_without_embeddings_all.csv


In [None]:
# 3. Create retrievers

print('---\n--- Creating retrievers...')

# Run the embedding model on the GPU when one is available
device = 'cuda' if cuda.is_available() else 'cpu'

embed_model = HuggingFaceEmbeddings(
    model_name=embedding_model,
    model_kwargs={'device': device},
    encode_kwargs={'device': device, 'batch_size': 32}
)

print("--- Trying to connect to LanceDB")
db = lancedb.connect(path_to_database)
try:
    # Reuse the existing table when it is already there
    table = db.open_table("chatmaja_test")
    docsearch = LanceDB(connection=table, embedding=embed_model)
    print("--- LanceDB found, connected successfully")
except Exception as err:
    # Catch Exception (not a bare except) so Ctrl-C / SystemExit are not
    # swallowed, and report the cause before the table is rebuilt.
    print("--- Error connecting to LanceDB, creating new one")
    print(f"--- Reason: {err!r}")
    # Create the table from a single placeholder row (presumably to establish
    # the column layout), then delete that row again before loading real data.
    table = db.create_table("chatmaja_test", data=[
            {"vector": embed_model.embed_query("Hello World"), "text": "Hello World", "id": "1", "authors": "authoors", "sources": "sourcees", "title": "tiitle"}
        ], mode="overwrite")
    print("--- LanceDB created and connected successfully")
    table.delete('authors = "authoors"')
    docsearch = LanceDB.from_documents(documents, embed_model, connection=table)
    print("--- Finished loading documents to LanceDB")



In [14]:
# Keyword (BM25) retriever built over the in-memory documents
bm25_retriever = BM25Retriever.from_documents(documents)
bm25_retriever.k = retrieve_top_k_docs_bm25

# Vector-search retriever backed by the LanceDB store
retriever_lancedb = docsearch.as_retriever(
    search_kwargs={"k": retrieve_top_k_docs_vector}
)

# Combine both; the vector retriever gets the weight that BM25 leaves over
ensemble_retriever = EnsembleRetriever(
    weights=[retrievers_weights_bm25, 1 - retrievers_weights_bm25],
    retrievers=[bm25_retriever, retriever_lancedb],
)

print("---\n--- Created BM25 and vector search retrievers")



---
--- Created BM25 and vector search retrievers


In [10]:
# Create the model directory if it does not exist
os.makedirs(os.getenv('HF_HOME'), exist_ok=True)
device = 'cuda' if cuda.is_available() else 'cpu'

# Download the model unless a complete copy is already present
path_to_model = os.path.join(os.getenv('HF_HOME'), model_id)
link_to_model = f"https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/{model_id}"

if not os.path.isfile(path_to_model):
    print(f"--- Downloading {model_id}...")
    # Fetch into a temporary sibling file and rename it only once the transfer
    # has finished: an interrupted download must not leave a truncated file at
    # path_to_model, because the isfile() check above would accept it next run.
    partial_path = path_to_model + ".part"
    try:
        urllib.request.urlretrieve(link_to_model, partial_path)
        os.replace(partial_path, path_to_model)
    finally:
        # No-op after a successful rename; removes leftovers after a failure
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print(f"--- Downloaded {model_id} successfully.")
else:
    print(f"--- Model {model_id} already downloaded.")


# Stream generated tokens to stdout as they are produced
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

# Make sure the model path is correct for your system!
# n_gpu_layers is -1 when CUDA was detected and 0 otherwise.
n_gpu_layers = 0 if device != 'cuda' else -1

# NOTE(review): temperature is fixed at 0 here; the `llama_temperature`
# setting is not used by this call — confirm this is intended.
llm = LlamaCpp(
    model_path=path_to_model,
    n_gpu_layers=n_gpu_layers,
    n_ctx=min(2048, context_length_for_llm),  # increasing context makes computations longer
    max_tokens=min(4096, context_length_for_llm * 2),
    temperature=0,
    top_p=1,
    callback_manager=callback_manager,
    verbose=True,  # Verbose is required to pass to the callback manager
)


# 4b. Create pipeline

def format_docs(docs):
  """Concatenate the text of the given documents into a single string.

  Args:
    docs: iterable of objects exposing a ``page_content`` attribute.

  Returns:
    The ``page_content`` values in input order, separated by one blank line;
    an empty string when ``docs`` is empty.
  """
  contents = [document.page_content for document in docs]
  separator = "\n\n"
  return separator.join(contents)

# Prompt
# A single "human" message wrapped in [INST] / <<SYS>> markers. The template has
# two placeholders, {question} and {context}, that are filled in when the chain
# runs. The leading spaces on the continuation lines are inside the
# triple-quoted string and therefore are part of the prompt text.

rag_prompt_llama = ChatPromptTemplate.from_messages([
    ("human", """[INST]<<SYS>> You are an assistant for question-answering tasks.
    Use the following pieces of retrieved context to answer the question.
    If you don't know the answer, just say that you don't know.
    Use three sentences maximum and keep the answer concise.<</SYS>> \nQuestion: {question} \nContext: {context} \nAnswer: [/INST]"""),
])


# Chain
# Step that picks the "context" entry of the input and turns the documents
# into one text block via format_docs.
_context_as_text = RunnablePick("context") | format_docs

# Full pipeline: replace "context" with its text form, fill the prompt,
# run the LLM and parse the model output into a plain string.
chain = (
    RunnablePassthrough.assign(context=_context_as_text)
    | rag_prompt_llama
    | llm
    | StrOutputParser()
)

def answer_query(question: str) -> "tuple[str, list]":
    """Answer a question using the retrieved documents as context.

    The question is sent to ``ensemble_retriever``; the documents it returns
    are passed, together with the question, to ``chain``.

    Args:
        question: the question text.

    Returns:
        A ``(answer, docs)`` tuple: the output of ``chain`` and the list of
        retrieved documents that were supplied to it as context.
    """
    docs = ensemble_retriever.get_relevant_documents(question)
    answer = chain.invoke({"context": docs, "question": question})
    return answer, docs

--- Model llama-2-7b-chat.Q2_K.gguf already downloaded.


llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from models/llama-2-7b-chat.Q2_K.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.attention.head_count u32           

## Example usage

In [40]:
# Usage: ask one question; `answer` receives the generated text and `docs`
# the documents that were retrieved for it.

query = "What is the capital of Belgium?"
answer, docs = answer_query(query)

Llama.generate: prefix-match hit


  The capital of Belgium is Brussels.
In the context of capital depends on children and adolescents surviving, thriving, and learning until adulthood, investing in human capital is crucial for the development of enterprises. The main focus of enterprises is to improve personnel quality and enhance their core competitiveness by investing in human capital. However, with the development of market economy, the function of human resource market allocation has been improved, leading to an increase in investment risk of enterprise human capital. This can have a negative impact on enterprises, reducing their income from human capital investment and affecting their growth. Therefore, enterprises need to avoid or minimize the negative impact of human capital investment risk.


llama_print_timings:        load time =     135.85 ms
llama_print_timings:      sample time =     139.51 ms /   247 runs   (    0.56 ms per token,  1770.53 tokens per second)
llama_print_timings: prompt eval time =    3339.81 ms /   307 tokens (   10.88 ms per token,    91.92 tokens per second)
llama_print_timings:        eval time =    8755.76 ms /   246 runs   (   35.59 ms per token,    28.10 tokens per second)
llama_print_timings:       total time =   13443.97 ms /   553 tokens


In [19]:
# Show the 'authors' metadata of the first retrieved document
docs[0].metadata['authors']

'Abdullah S||Rothenberg S||Siegel E||Kim W'

## Generate answers for the 50 questions set

In [20]:
def get_answers(questions):
  """Run ``answer_query`` for every question and collect the results.

  Args:
    questions: DataFrame with an ``id`` and a ``question`` column.

  Returns:
    DataFrame with one row per question and the columns ``og_id`` (the id of
    the question row), ``question``, ``answer``, ``found_id`` (list of the
    ids of the retrieved documents) and ``context`` (list of their texts).
    An empty input yields an empty frame with the same columns.
  """
  columns = ['og_id', 'question', 'answer', 'found_id', 'context']
  # Collect plain records and build the frame once: DataFrame.append was
  # removed in pandas 2.0 and copied the whole frame on every call.
  records = []
  for _, row in questions.iterrows():
    answer, docs = answer_query(row['question'])
    records.append({
        'og_id': row['id'],
        'question': row['question'],
        'answer': answer,
        'found_id': [doc.metadata['id'] for doc in docs],
        'context': [doc.page_content for doc in docs],
    })

  return pd.DataFrame(records, columns=columns)

In [23]:
# Load the question set and answer every question with the RAG pipeline
questions = pd.read_excel(io='questions1.xlsx')
results = get_answers(questions)

## Generate answers for confirmation questions

In [None]:
# Generate answers without retrieved information, for evaluation.
# Same [INST] / <<SYS>> wrapped "human" message as the context prompt, but the
# only placeholder is {question}. The leading spaces on the continuation lines
# are inside the triple-quoted string and therefore part of the prompt text.


rag_prompt_llama_no_context = ChatPromptTemplate.from_messages([
    ("human", """[INST]<<SYS>> You are an assistant for question-answering tasks.
    If you don't know the answer, just say that you don't know.
    Use three sentences maximum and keep the answer concise.<</SYS>> \nQuestion: {question}  \nAnswer: [/INST]"""),
])


# Chain
# No retrieval step: the input goes straight into the prompt, then to the LLM,
# and the model output is parsed into a plain string.
chain_no_context = rag_prompt_llama_no_context | llm | StrOutputParser()

def answer_query_no_context(question: str) -> str:
    """Answer a question without any retrieved context.

    Args:
        question: the question text.

    Returns:
        Whatever ``chain_no_context`` produces for ``{"question": question}``.
    """
    return chain_no_context.invoke({"question": question})

In [None]:
def generate_yes_no(questions):
  """Answer each confirmation question with and without retrieved context.

  Args:
    questions: DataFrame with a ``question`` and an ``answer`` column
      (``answer`` is copied to the output as the expected answer).

  Returns:
    DataFrame with one row per question and the columns ``question``,
    ``answer_correct``, ``answer_given`` (result of ``answer_query``),
    ``found_id`` (ids of the retrieved documents), ``context`` (their texts)
    and ``no_context`` (result of ``answer_query_no_context``).
    An empty input yields an empty frame with the same columns.
  """
  columns = ['question', 'answer_correct', 'answer_given', 'found_id', 'context', 'no_context']
  # Collect plain records and build the frame once: DataFrame.append was
  # removed in pandas 2.0 and copied the whole frame on every call.
  records = []
  for _, row in questions.iterrows():
    answer, docs = answer_query(row['question'])
    records.append({
        'question': row['question'],
        'answer_correct': row['answer'],
        'answer_given': answer,
        'found_id': [doc.metadata['id'] for doc in docs],
        'context': [doc.page_content for doc in docs],
        'no_context': answer_query_no_context(row['question']),
    })

  return pd.DataFrame(records, columns=columns)

In [None]:
# Answer the confirmation questions and store the outcome as CSV
confirmation_q = pd.read_excel(io='confirmation_questions_3.xlsx')
res = generate_yes_no(confirmation_q)
res.to_csv(path_or_buf='results_confirmation.csv', index=False)

## Calculate hit rates

In [None]:
def calculate_chunk_hit_rates(questions, max_doc = 6):
  """Compute the chunk-level hit rate for k = 1 .. max_doc retrieved documents.

  For each k, a BM25 retriever and a vector retriever (both returning k
  documents) are combined in an ensemble. A question counts as a hit when
  the id of its row appears among the ids of the retrieved documents.

  Args:
    questions: DataFrame with an ``id`` and a ``question`` column.
    max_doc: largest k to evaluate.

  Returns:
    Dict mapping each k to hits / number of questions. When ``questions`` is
    empty every rate is 0.0 (instead of raising ZeroDivisionError).
  """
  depths = range(1, max_doc + 1)
  total = len(questions)
  if total == 0 or max_doc < 1:
    return {k: 0.0 for k in depths}

  # The BM25 index does not depend on k, so build it once instead of
  # re-indexing all documents on every iteration; only `.k` changes.
  bm25 = BM25Retriever.from_documents(documents)

  rates = {}
  for k in depths:
    bm25.k = k
    vector = docsearch.as_retriever(search_kwargs={"k": k})
    ensemble = EnsembleRetriever(retrievers=[bm25, vector],
                                 weights=[retrievers_weights_bm25, 1-retrievers_weights_bm25])
    hits = 0
    for _, row in questions.iterrows():
      docs = ensemble.get_relevant_documents(str(row['question']))
      retrieved_ids = [doc.metadata['id'] for doc in docs]
      if row['id'] in retrieved_ids:
        hits += 1
    rates[k] = hits / total

  return rates



In [None]:
def calculate_document_hit_rates(questions, max_doc = 5):
  """Compute the document-level hit rate for k = 1 .. max_doc retrieved documents.

  Works like the chunk-level hit rate, except that ids are compared only by
  the part before the first underscore, on both the question row and the
  retrieved documents.

  Args:
    questions: DataFrame with an ``id`` (string) and a ``question`` column.
    max_doc: largest k to evaluate.

  Returns:
    Dict mapping each k to hits / number of questions. When ``questions`` is
    empty every rate is 0.0 (instead of raising ZeroDivisionError).
  """
  depths = range(1, max_doc + 1)
  total = len(questions)
  if total == 0 or max_doc < 1:
    return {k: 0.0 for k in depths}

  # The BM25 index does not depend on k, so build it once instead of
  # re-indexing all documents on every iteration; only `.k` changes.
  bm25 = BM25Retriever.from_documents(documents)

  rates = {}
  for k in depths:
    bm25.k = k
    vector = docsearch.as_retriever(search_kwargs={"k": k})
    ensemble = EnsembleRetriever(retrievers=[bm25, vector],
                                 weights=[retrievers_weights_bm25, 1-retrievers_weights_bm25])
    hits = 0
    for _, row in questions.iterrows():
      docs = ensemble.get_relevant_documents(str(row['question']))
      retrieved_ids = [doc.metadata['id'].split("_")[0] for doc in docs]
      if row['id'].split("_")[0] in retrieved_ids:
        hits += 1
    rates[k] = hits / total

  return rates

In [None]:
# Load the question set and compute the chunk-level hit rates
questions = pd.read_excel(io='questions1.xlsx')
rates = calculate_chunk_hit_rates(questions=questions)

In [None]:
# Document-level hit rates for the question set loaded above
document_rates = calculate_document_hit_rates(questions=questions)