## Libraries

In [None]:
from datetime import datetime
from langchain_community.llms import Ollama
from langchain_community.document_loaders import WebBaseLoader
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.vectorstores import Chroma
from langchain import hub
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
# from langchain_openai import OpenAIEmbeddings
from sentence_transformers import SentenceTransformer, util

# from utils.console_format import color
# import utils.prompt_templates as prompt_templates
from pathlib import Path
import time
import sys
import shutil
import ssl
import csv
import json
import hashlib
import os
import urllib3
import bs4

USER_AGENT environment variable not set, consider setting it to identify your requests.


## LLM Model and Similarity definition

In [None]:
SPECIAL = False
ONLINE = True

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
ssl._create_default_https_context = ssl._create_unverified_context

USE_PERSIST_DB = False

MODEL = 'deepseek-r1:8b'

CWD = os.getcwd()
print("[INFO] You're using " + MODEL + " model now")

time_str = datetime.now().strftime('%Y-%m-%d-%H%M%S')

CHROMA_PERSIST_DIR = "./chroma_db_" + time_str


ollama = Ollama(
    base_url='http://localhost:11434',
    model=MODEL,
    temperature=0,
)


[INFO] You're using deepseek-r1:8b model now


In [3]:
similarityModel = SentenceTransformer('all-MiniLM-L6-v2')

def semanticSimilarity(str1, str2):
    emb1 = similarityModel.encode(str1, convert_to_tensor=True)
    emb2 = similarityModel.encode(str2, convert_to_tensor=True)
    return util.pytorch_cos_sim(emb1, emb2).item()

ground_truth = "Martin plays volleyball. He performs well."
llm_output = "Andrei is not a good chess player"

print(semanticSimilarity(ground_truth, llm_output))

0.3297243118286133


## Loader

In [None]:
pdf_path = "pdfDocs/OfMiceAndMen.pdf"
loader = PyMuPDFLoader(pdf_path)

docs = [doc for doc in loader.load() if doc.page_content and isinstance(doc.page_content, str)]

  
  
 
 ...


## Embedder

In [None]:
oembed = OllamaEmbeddings(base_url="http://localhost:11434", model="nomic-embed-text")

## Questions and Answers

In [7]:
QUESTIONS = [None] * 60
ANSWERS = [None] * 60

### Questions and Answers Chapter 1

In [8]:
QUESTIONS[0] = "When George and Lennie approach the river, why does George warn Lennie not to drink too much water?"

QUESTIONS[1] = "What has George told Lennie about that he always remembers even when he forgets everything else?"

QUESTIONS[2] = "Why does Lennie have a dead mouse in his pocket?"

QUESTIONS[3] = "Why does George order Lennie not to talk when they get to the ranch?"

QUESTIONS[4] = "What happened to all of the mice that Lennie's Aunt Clara gave him?"

QUESTIONS[5] = "Why have George and Lennie run away from Weed?"

QUESTIONS[6] = "What does Lennie want to eat with his beans?"

QUESTIONS[7] = "Why does George say that migrant workers who travel from farm to farm are the loneliest people in the world?"

QUESTIONS[8] = "What dream do George and Lennie share?"

QUESTIONS[9] = "What does George tell Lennie to do if he gets in trouble at their new job site?"


ANSWERS[0] = "George says Lennie will be sick like he was the night before."

ANSWERS[1] = "Lennie always remembers that he will be the one to tend the rabbits on their dream farm."

ANSWERS[2] = "He is carrying it in his pocket so he can pet it as they walk. He likes to pet soft things."

ANSWERS[3] = "George says that if the boss hears Lennie talk before he sees Lennie work, the two men won't have a chance of getting the job." 

ANSWERS[4] = "He killed the mice by petting them too hard."

ANSWERS[5] = "Lennie tried to feel a girl's dress. He wanted to pet the dress but she thought he was attacking her."

ANSWERS[6] = "Lennie wants ketchup to put on his beans."

ANSWERS[7] = "He says migrant workers are lonely because they don't have any family, they don't belong anywhere, and they have nothing to look forward to."

ANSWERS[8] = "They share the dream of buying a small farm together and working it. On this farm Lennie will tend the rabbits and pet them whenever he wants."

ANSWERS[9] = "George tells Lennie to come to this spot where they are camping and hide in the bushes until George comes for him."

### Questions and Answers Chapter 2

In [9]:
QUESTIONS[10] = "Where do the ranch hands keep their personal belongings such as soap, razors and magazines?"

QUESTIONS[11] = "Candy, the old swamper who shows George and Lennie to their bunks, is missing what limb?"

QUESTIONS[12] = "What evidence does the old swamper give that the ranch boss is a “pretty nice fella”?"

QUESTIONS[13] = "What evidence is there that the boss is not a working man?"

QUESTIONS[14] = "According to the old swamper, what is Curley good at?"

QUESTIONS[15] = "According to the old swamper, why does Curley wear a work glove on his left hand?"

QUESTIONS[16] = "What is the general attitude toward Curley's wife?"

QUESTIONS[17] = "Describe Slim, the jerkline skinner."

QUESTIONS[18] = "Why does Carlson suggest shooting Candy's dog?"

QUESTIONS[19] = "What is the understood question that Lennie wants George to ask Slim?"

ANSWERS[10] = "Each ranch hand keeps his personal items in the apple box nailed over his bunk for that purpose."

ANSWERS[11] = "Candy, the old swamper, is missing a hand."

ANSWERS[12] = "Candy says that the boss brought a whole gallon of whiskey to the men in the bunkhouse for Christmas."

ANSWERS[13] = "The boss wears high-heeled boots and spurs."

ANSWERS[14] = "Candy says Curley is good at boxing."

ANSWERS[15] = "Candy says Curley wears the work glove full of Vaseline to keep his hand soft for his new wife."

ANSWERS[16] = "The men think she is flirting with them. Candy calls her a tart; George calls her a tramp. Lennie thinks she is pretty."

ANSWERS[17] = "Slim is a master craftsman. He is an expert with the mules and his authority is respected more than anyone else's on the ranch."

ANSWERS[18] = "Carlson suggests shooting Candy's dog because it is so old and it stinks."

ANSWERS[19] = "Lennie wants George to ask Slim if Lennie can have one of the puppies Slim's dog has just delivered."

### Questions and Answers Chapter 3

In [10]:
QUESTIONS[20] = "Why does George say Lennie will want to sleep in the barn that Friday night?"

QUESTIONS[21] = "According to George, how did he end up traveling with Lennie?"

QUESTIONS[22] = "What happened that made George stop playing dirty tricks on Lennie?"

QUESTIONS[23] = "Why did George and Lennie have to flee from Weed?"

QUESTIONS[24] = "Who makes the final decision on whether or not Candy's old dog should be shot?"

QUESTIONS[25] = "What is significant about the letter Whit reads from the Western magazine?"

QUESTIONS[26] = "Why does George agree to let Candy come with them to their dream farm?"

QUESTIONS[27] = "Why does Curley attack Lennie in the bunk house?"

QUESTIONS[28] = "Why does Curley agree not to get Lennie fired for crushing his hand?"

QUESTIONS[29] = "What punishment does Lennie fear he will get for hurting Curley?"


ANSWERS[20] = "George says Lennie will want to sleep with the puppy Slim has said Lennie can have when it is weaned."

ANSWERS[21] = "George says that he and Lennie are both from Auburn and that he knew Lennie's Aunt Clara who raised him. He says that when the aunt died Lennie had just come along with him to work."

ANSWERS[22] = "The last time George played a trick on Lennie, he told Lennie to jump into a river and Lennie did even though he couldn’t swim. Before George got him out, he almost drowned. Lennie, however, was thankful to George for getting him out instead of angry for telling him to jump in."

ANSWERS[23] = "George says that he and Lennie had to flee from Weed because Lennie was accused of trying to rape a girl there. In fact, he had only been trying to feel the dress she was wearing."

ANSWERS[24] = "Slim is the one who makes the final decision."

ANSWERS[25] = "The letter was written by a former ranch hand they had known."

ANSWERS[26] = "Candy offers to give George $350, his life's savings, if they will let him come along. With his money they should be able to buy the farm at the end of the next month so George agrees to let him in on their dream."

ANSWERS[27] = "Curley attacks Lennie because he thinks Lennie is laughing at him after Carlson has called him “yella as a frog belly.” In fact, Lennie is smiling at the idea in his head of their farm."

ANSWERS[28] = "Slim convinces Curley that if he tells, everyone will laugh at him for getting beaten up by a retarded man."

ANSWERS[29] = "George has told Lennie that he will not let Lennie tend the rabbits if he does one more bad thing. Lennie is afraid this will be that bad thing."

### Questions and Answers Chapter 4

In [11]:
QUESTIONS[30] = "Why has Crooks been able to accumulate more personal items than the other ranch hands?"

QUESTIONS[31] = "What reason does Crooks first give for Lennie not being welcome in his room?"

QUESTIONS[32] = "According to Crooks, why does a person need a companion?"

QUESTIONS[33] = "What is Crooks's initial response to Candy's account of the dream farm and what evidence is there that his attitude changes?"

QUESTIONS[34] = "According to Curley's wife, why are the men afraid to talk to her when there is more than one present?"

QUESTIONS[35] = "Why doesn't Curley's wife like talking to her husband?"

QUESTIONS[36] = "What reason does Candy give when he says that they are no longer afraid that Curley's wife will get them fired?"

QUESTIONS[37] = "What makes Crooks so bold as to confront Curley's wife and tell her to leave his room?"

QUESTIONS[38] = "How does Candy finally make Curley's wife leave the barn?"

QUESTIONS[39] = "What does George say about Candy and Lennie visiting with Crooks?"

ANSWERS[30] = "Because of the type of job he has and because Crooks is crippled, he is more permanent than the other men, so he can accumulate personal items without having to worry about how he will carry them with him to the next job."

ANSWERS[31] = "Crooks says at first that Lennie is not welcome in his room because Crooks is not welcome in the bunkhouse."

ANSWERS[32] = "Crooks says that a person who stays alone too long goes “nuts.”"

ANSWERS[33] = "Crooks says that the dream will never materialize. He says he has seen hundreds of men chasing the same dream and never catching it. But when he hears that they have the money for the farm in the bank, he becomes more convinced and even offers to work for free if they will let him come with them."

ANSWERS[34] = "Curley's wife says that the men are “scared of each other... scared the rest is going to get something on you.”"

ANSWERS[35] = "Curley's wife doesn't like talking to her husband because all he ever wants to talk about is beating up people."

ANSWERS[36] = "Candy explains that they are no longer afraid because they now have somewhere else to go—their own farm."

ANSWERS[37] = "He forgets his own limitations as a black man of the 1930s because Lennie and Candy have come in and treated him as an equal. For a moment, he later explains, he forgot how powerless he really is there."

ANSWERS[38] = "Candy gets Curley's wife to leave the barn by telling her that he has heard the other men returning from town."

ANSWERS[39] = "George tells them that they should not be in Crooks's room and that they should not have told him about the farm."

### Questions and Answers Chapter 5

In [12]:

QUESTIONS[40] = "What has happened to Lennie's puppy and why?"

QUESTIONS[41] = "What two pieces of information does Curley's wife share with Lennie?"

QUESTIONS[42] = "Why does Curley's wife offer to let Lennie caress her hair?"

QUESTIONS[43] = "How and why does Lennie kill Curley's wife?"

QUESTIONS[44] = "Why does George say that they can't let Lennie escape to live on his own?"

QUESTIONS[45] = "What is Candy's greatest fear?"

QUESTIONS[46] = "When George asks Slim about just trying to catch Lennie instead of killing him, what advice does Slim give George?"

QUESTIONS[47] = "What makes the men think that Lennie is armed?"

QUESTIONS[48] = "Where does Curley plan to aim if he shoots Lennie?"

QUESTIONS[49] = "Who stays with Curley's wife as the others go off in pursuit of Lennie?"


ANSWERS[40] = "Lennie has killed his puppy by bouncing it too hard."

ANSWERS[41] = "Curley's wife tells him about her dream to be an actress, and she tells him her secret that she does not like Curley."

ANSWERS[42] = "Curley's wife says that she shares Lennie's fondness of soft things and since she regards him as “a big baby,” she sees no harm in letting him feel the softness of her hair."

ANSWERS[43] = "Lennie kills Curley's wife by breaking her neck because he is shaking her, trying to make her be quiet so he won't get into trouble."

ANSWERS[44] = "George says that Lennie will starve out on his own."

ANSWERS[45] = "Candy's greatest fear is that they will not get the farm."

ANSWERS[46] = "Slim tells George that if they just catch Lennie, he would be strapped down and caged, which would be worse than death."

ANSWERS[47] = "The men think that Lennie is armed because Carlson comes into the barn and announces that his gun is missing."

ANSWERS[48] = "Curley is planning to shoot Lennie in the stomach."

ANSWERS[49] = "Candy stays with Curley's wife."


### Questions and Answers Chapter 6

In [13]:
QUESTIONS[50] = "What scenes of death does Steinbeck describe in the beginning of Chapter 6 that parallel the events of the previous chapter and foreshadow the event to come?"

QUESTIONS[51] = "How does the chapter bring the book full circle?"

QUESTIONS[52] = "What two imaginary visitors does Lennie have while sitting on the river bank?"

QUESTIONS[53] = "What is the subject of the conversation Lennie has with his first visitor?"

QUESTIONS[54] = "What does his second visitor tell Lennie that recalls an earlier conversation he had with Crooks?"

QUESTIONS[55] = "How is George and Lennie's conversation similar to the one that they had by the pool in Chapter 1?"

QUESTIONS[56] = "Where has George gotten the gun he takes from his front pocket while sitting with Lennie on the river bank?"

QUESTIONS[57] = "What evidence is there that George is having a terribly difficult time bringing himself to shoot Lennie?"

QUESTIONS[58] = "What lie does George tell about the way Lennie died?"

QUESTIONS[59] = "What evidence is there that Slim understands what has really happened there on the river bank?"


ANSWERS[50] = "A water snake gliding in the pool is caught by a heron and eaten while its tail waves frantically, and a strong wind blows into the clearing and dies down."

ANSWERS[51] = "The book begins and ends at the pool by the clearing."

ANSWERS[52] = "While sitting by the clearing Lennie is visited by a hallucination of his Aunt Clara and of a gigantic rabbit."

ANSWERS[53] = "Aunt Clara accuses Lennie of doing bad things. She tells him how George is always doing nice things for Lennie and taking care of him."

ANSWERS[54] = "The rabbit tells Lennie that George isn't going to let Lennie tend the rabbits and that he's going to beat him with a stick. Like Crooks, the gigantic rabbit says that George is going to leave Lennie and never come back."

ANSWERS[55] = "As in the first chapter, George tells Lennie how easy his life would be if he was alone. And Lennie tells George that he will run off to the hills and find a cave to live in by himself."

ANSWERS[56] = "George has taken the gun he has from Carlson's bunk."

ANSWERS[57] = "The first time George raises the gun to the back of Lennie's head, he can't pull the trigger and lays the gun down again. The second time, when he does fire the gun, his hand is shaking violently."

ANSWERS[58] = "George lets the men believe that he took the gun from Lennie and then shot him in the same attitude as they would have."

ANSWERS[59] = "Slim shows that he understands what George has done as he consoles George and tells him that he has only done what he had to do."



## Generation

In [None]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate


system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer,try to infer it "
    "Use at most 1 sentence maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

question_answer_chain = create_stuff_documents_chain(ollama, prompt)


# BENCHMARKS

## Chunking modes and k-most relevant chunks

In [None]:
ChunkSizes = [300, 500, 1000, 2000, 4000]
ChunkOverlaps = [50, 100, 200, 300, 1000]
kk = [2, 6, 10, 25]

In [None]:
n = 10 #nr of questions to answer

for i in range (0,5):    
    
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=ChunkSizes[i], chunk_overlap=ChunkOverlaps[i], add_start_index=True
    )
    all_splits = text_splitter.split_documents(docs)

    vectorstore = Chroma.from_documents(documents=all_splits, embedding=oembed)

    for k in range (0, 5):
        retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": kk[k]})
        rag_chain = create_retrieval_chain(retriever, question_answer_chain)  # Context is filled by the retriever
        
        
        similarityScore = 0
        for j in range (0,n):
            response = rag_chain.invoke({"input": QUESTIONS[j]})
            similarityScore += semanticSimilarity(response["answer"], ANSWERS[j])
            # print(i," :Done with ", j)

        print("for chunk size: ", ChunkSizes[i], ", chunk overlap: ", ChunkOverlaps[i],", number of relevant chunks: ", kk[k], " the similarity is: " , format(similarityScore / n, ".3f"))


for chunk size: 300, chunk overlap: 50, number of relevant chunks: 2 the similarity is: 0.578
for chunk size: 300, chunk overlap: 50, number of relevant chunks: 6 the similarity is: 0.570
for chunk size: 300, chunk overlap: 50, number of relevant chunks: 10 the similarity is: 0.601
for chunk size: 300, chunk overlap: 50, number of relevant chunks: 25 the similarity is: 0.581
for chunk size: 500, chunk overlap: 100, number of relevant chunks: 2 the similarity is: 0.546
for chunk size: 500, chunk overlap: 100, number of relevant chunks: 6 the similarity is: 0.567
for chunk size: 500, chunk overlap: 100, number of relevant chunks: 10 the similarity is: 0.600
for chunk size: 500, chunk overlap: 100, number of relevant chunks: 25 the similarity is: 0.601
for chunk size: 1000, chunk overlap: 200, number of relevant chunks: 2 the similarity is: 0.546
for chunk size: 1000, chunk overlap: 200, number of relevant chunks: 6 the similarity is: 0.557
for chunk size: 1000, chunk overlap: 200, number

## CLEANUP

In [None]:
vectorstore.delete_collection()