In [1]:
#necessary imports
import pymupdf #read pdf
from tqdm.auto import tqdm #progress bar
import pandas as pd #for analysis on sentences, words and token counts of pages
from spacy.lang.en import English #for sentence tokenising
from sentence_transformers import util, SentenceTransformer #for embedding text and queries
import random
import torch
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from config import num_sentence_chunk_size, embed_model_name, collection_name, pdf_path, initial_page, final_page, top_n_results,llm_model_name,  temperature, max_new_tokens, return_answer_only

### 1. Retrieval

#### 1.1 Select, Read and Pre-Process Document

In [2]:
def format_text(text):
    """
    Performs formatting operation on text, removes unncessary characters.

    Parameters:
        text (str): Text input to be formatted.

    Returns:
        str: A formatted string of the text provided.
    """
    clean_text = text.replace("\n", " ").strip()
    return clean_text

In [3]:
# Assumption -- 1 token = 4 chars
def read_pdf(pdf_path, initial_page, final_page):
    """
    Opens a PDF file, reads its text content page by page, and collects statistics.

    Parameters:
        pdf_path (str): The file path to the PDF document to be opened and read.
        initial_page (int): The first page number of the PDF to read
        final_page (int): The last page number of the PDF to read

    Returns:
        list[dict]: A list of dictionaries, each containing the page number,
        character count, word count, sentence count, token count, and the extracted text
        for each page.
    """
    doc = pymupdf.open(pdf_path)
    text_pages = []
    for page_number, page in tqdm(enumerate(doc.pages(initial_page,final_page))):
        #Iterate over each page in the PDF
        #Gets the text in the page and stores various information as dict in a list
        text = page.get_text()
        text = format_text(text)
        text_pages.append({"page_number": page_number + 19,  # adjust page numbers since our PDF starts on page 42
                                "page_char_count": len(text),
                                "page_word_count": len(text.split(" ")),
                                "page_sentence_count_raw": len(text.split(". ")),
                                "page_token_count": len(text) / 4,  # 1 token = ~4 chars, see: https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
                                "text": text})
    return text_pages

In [4]:
#Provide path and page numbers to be extracted
# initial_page = 18
# final_page = 68
# pdf_path = r"E:\workstation\ragQuantiphi\docs\ConceptsofBiology.pdf"
pages_and_texts = read_pdf(pdf_path,initial_page,final_page)

0it [00:00, ?it/s]

In [5]:
pages_and_texts[0]

{'page_number': 19,
 'page_char_count': 1542,
 'page_word_count': 255,
 'page_sentence_count_raw': 10,
 'page_token_count': 385.5,
 'text': 'INTRODUCTION CHAPTER 1 Introduction to Biology 1.1 Themes and Concepts of Biology 1.2 The Process of Science Viewed from space, Earth (Figure 1.1) offers few clues about the diversity of life forms that reside there. The first forms of life on Earth are thought to have been microorganisms that existed for billions of years before plants and animals appeared. The mammals, birds, and flowers so familiar to us are all relatively recent, originating 130 to 200 million years ago. Humans have inhabited this planet for only the last 2.5 million years, and only in the last 300,000 years have humans started looking like we do today. 1.1 Themes and Concepts of Biology LEARNING OBJECTIVES By the end of this section, you will be able to: • Identify and describe the properties of life • Describe the levels of organization among living things • List examples of

In [6]:
df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,19,1542,255,10,385.5,INTRODUCTION CHAPTER 1 Introduction to Biology...
1,20,2234,337,16,558.5,they do not meet the criteria that biologists ...
2,21,1988,306,16,497.0,FIGURE 1.3 The leaves of this sensitive plant ...
3,22,2092,309,14,523.0,"FIGURE 1.4 Although no two look alike, these k..."
4,23,1477,234,15,369.25,FIGURE 1.6 A lot of energy is required for a C...


In [7]:
df.describe()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,50.0,50.0,50.0,50.0,50.0
mean,43.5,2630.72,420.78,22.96,657.68
std,14.57738,1067.538998,173.98373,12.610265,266.884749
min,19.0,213.0,41.0,1.0,53.25
25%,31.25,1969.25,303.0,15.25,492.3125
50%,43.5,2813.0,454.5,22.0,703.25
75%,55.75,3359.25,550.75,28.0,839.8125
max,68.0,4428.0,725.0,65.0,1107.0


In [8]:
nlp = English()
# Add a sentencizer pipeline
nlp.add_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x11ee4357dd0>

In [9]:
for item in tqdm(pages_and_texts):
    item["sentences"] = list(nlp(item["text"]).sents)
    
    # Make sure all sentences are strings
    item["sentences"] = [str(sentence) for sentence in item["sentences"]]
    
    # Count the sentences 
    item["page_sentence_count_spacy"] = len(item["sentences"])

  0%|          | 0/50 [00:00<?, ?it/s]

In [10]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy
count,50.0,50.0,50.0,50.0,50.0,50.0
mean,43.5,2630.72,420.78,22.96,657.68,22.38
std,14.58,1067.54,173.98,12.61,266.88,10.46
min,19.0,213.0,41.0,1.0,53.25,1.0
25%,31.25,1969.25,303.0,15.25,492.31,15.25
50%,43.5,2813.0,454.5,22.0,703.25,22.0
75%,55.75,3359.25,550.75,28.0,839.81,28.0
max,68.0,4428.0,725.0,65.0,1107.0,47.0


In [11]:
# Define split size to turn groups of sentences into chunks
num_sentence_chunk_size = 5 

# Create a function that recursively splits a list into desired sizes
def split_list(input_list, slice_size):
    """
    Splits the input_list into sublists of size slice_size.

    Parameters:
    input_list (list): Input list to slice into sublists
    slice_size (int): Input slicing size

    Return:
    list[list[str]]: List of subsets of the complete input list
    """
    return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]

# Loop through pages and texts and split sentences into chunks
for item in tqdm(pages_and_texts):
    item["sentence_chunks"] = split_list(input_list=item["sentences"],
                                         slice_size=num_sentence_chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

  0%|          | 0/50 [00:00<?, ?it/s]

In [12]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy,num_chunks
count,50.0,50.0,50.0,50.0,50.0,50.0,50.0
mean,43.5,2630.72,420.78,22.96,657.68,22.38,4.82
std,14.58,1067.54,173.98,12.61,266.88,10.46,2.11
min,19.0,213.0,41.0,1.0,53.25,1.0,1.0
25%,31.25,1969.25,303.0,15.25,492.31,15.25,3.25
50%,43.5,2813.0,454.5,22.0,703.25,22.0,5.0
75%,55.75,3359.25,550.75,28.0,839.81,28.0,6.0
max,68.0,4428.0,725.0,65.0,1107.0,47.0,10.0


In [13]:
# Split each chunk into its own item
pages_and_chunks = []
for item in tqdm(pages_and_texts):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]
        
        # Join the sentences together into a paragraph-like structure, aka a chunk (so they are a single string)
        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
        #joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk) # ".A" -> ". A" for any full-stop/capital letter combo  #delete
        chunk_dict["sentence_chunk"] = joined_sentence_chunk

        # Get stats about the chunk
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4 # 1 token = ~4 characters
        
        pages_and_chunks.append(chunk_dict)

# How many chunks do we have?
len(pages_and_chunks)

  0%|          | 0/50 [00:00<?, ?it/s]

241

In [14]:
pages_and_chunks[0]

{'page_number': 19,
 'sentence_chunk': 'INTRODUCTION CHAPTER 1 Introduction to Biology 1.1 Themes and Concepts of Biology 1.2 The Process of Science Viewed from space, Earth (Figure 1.1) offers few clues about the diversity of life forms that reside there.The first forms of life on Earth are thought to have been microorganisms that existed for billions of years before plants and animals appeared.The mammals, birds, and flowers so familiar to us are all relatively recent, originating 130 to 200 million years ago.Humans have inhabited this planet for only the last 2.5 million years, and only in the last 300,000 years have humans started looking like we do today.1.1 Themes and Concepts of Biology LEARNING OBJECTIVES By the end of this section, you will be able to: • Identify and describe the properties of life • Describe the levels of organization among living things • List examples of different sub disciplines in biology Biology is the science that studies life.',
 'chunk_char_count': 93

In [15]:
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,241.0,241.0,241.0,241.0
mean,44.33,541.49,83.78,135.37
std,13.71,410.55,64.49,102.64
min,19.0,38.0,8.0,9.5
25%,33.0,387.0,60.0,96.75
50%,45.0,524.0,80.0,131.0
75%,56.0,633.0,98.0,158.25
max,68.0,4354.0,705.0,1088.5


In [16]:
df.loc[df['chunk_token_count'] > 300]

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count
45,30,These cells have contributed to major medical ...,1247,199,311.75
87,38,Key Terms applied science a form of science th...,3867,578,966.75
223,65,Key Terms acid a substance that donates hydrog...,4354,705,1088.5
224,66,electrons oil an unsaturated fat that is a liq...,2851,455,712.75


#### 1.2 Embedding

In [17]:

device = "cuda" if torch.cuda.is_available() else "cpu"
embedding_model = SentenceTransformer(model_name_or_path=embed_model_name, 
                                      device=device)
embedding_model.to("cuda")

SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [18]:
pages_and_chunks[0]

{'page_number': 19,
 'sentence_chunk': 'INTRODUCTION CHAPTER 1 Introduction to Biology 1.1 Themes and Concepts of Biology 1.2 The Process of Science Viewed from space, Earth (Figure 1.1) offers few clues about the diversity of life forms that reside there.The first forms of life on Earth are thought to have been microorganisms that existed for billions of years before plants and animals appeared.The mammals, birds, and flowers so familiar to us are all relatively recent, originating 130 to 200 million years ago.Humans have inhabited this planet for only the last 2.5 million years, and only in the last 300,000 years have humans started looking like we do today.1.1 Themes and Concepts of Biology LEARNING OBJECTIVES By the end of this section, you will be able to: • Identify and describe the properties of life • Describe the levels of organization among living things • List examples of different sub disciplines in biology Biology is the science that studies life.',
 'chunk_char_count': 93

In [19]:
for item in tqdm(pages_and_chunks):
    item["embedding"] = embedding_model.encode(item["sentence_chunk"])

  0%|          | 0/241 [00:00<?, ?it/s]

In [22]:
# Turn text chunks into a single list
#text_chunks = [item["sentence_chunk"] for item in pages_and_chunks]

In [23]:
# %%time

# # Embed all texts in batches
# text_chunk_embeddings = embedding_model.encode(text_chunks,
#                                                batch_size=32, # you can use different batch sizes here for speed/performance, I found 32 works well for this use case
#                                                convert_to_tensor=True) # optional to return embeddings as tensor instead of array

# text_chunk_embeddings[0]

In [20]:
# Save embeddings to file
text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks)
embeddings_df_save_path = "text_chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index=False)

In [21]:
# Import saved file and view
text_chunks_and_embedding_df = pd.read_csv(embeddings_df_save_path)
text_chunks_and_embedding_df.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,19,INTRODUCTION CHAPTER 1 Introduction to Biology...,934,154,233.5,[ 2.10180786e-02 -6.17712848e-02 -7.68414419e-...
1,19,What exactly is life?This may sound like a sil...,546,86,136.5,[ 3.66447382e-02 -8.72069001e-02 1.25827771e-...
2,19,credit: modification of work by NASA) CHAPTER ...,53,8,13.25,[ 6.44325837e-02 -3.99337001e-02 -4.67345759e-...
3,20,they do not meet the criteria that biologists ...,493,74,123.25,[ 1.34523567e-02 -5.14974296e-02 6.96373824e-...
4,20,"As new organisms are discovered every day, bio...",574,74,143.5,[-2.09128428e-02 2.58600269e-03 -6.61942270e-...


In [23]:
# Convert embedding column back to np.array (it got converted to string when it got saved to CSV)
text_chunks_and_embedding_df["embedding"] = text_chunks_and_embedding_df["embedding"].apply(lambda x: np.fromstring(x.strip("[]"), sep=" "))

In [24]:
# Convert texts and embedding df to list of dicts
pages_and_chunks = text_chunks_and_embedding_df.to_dict(orient="records")

In [25]:
# Convert embeddings to torch tensor and send to device (note: NumPy arrays are float64, torch tensors are float32 by default)
embeddings = torch.tensor(np.array(text_chunks_and_embedding_df["embedding"].tolist()), dtype=torch.float32).to(device)
type(embeddings)

torch.Tensor

In [26]:
def retrieve_relevant_resources(query, embeddings, model=embedding_model, n_resources_to_return=5):
    """
    Embeds a query with model and returns top k scores and indices from embeddings.

    Parameters:
    query (str): User input query
    embeddings (torch.Tensor): Embeddings of text chunks
    model (SentenceTransformer): Sentence Transformer model to use
    n_resources_to_return (int): No. of results to return

    Return:
    list[float]: list of top scores
    list[int]: list of index of top scores
    """

    # Embed the query
    query_embedding = model.encode(query, convert_to_tensor=True) 

    # Get dot product scores on embeddings
    dot_scores = util.dot_score(query_embedding, embeddings)[0]

    scores, indices = torch.topk(input=dot_scores, 
                                 k=n_resources_to_return)

    return scores, indices

In [27]:
query = "What is Biology?"
scores, indices = retrieve_relevant_resources(query=query,
                                              embeddings=embeddings)
scores, indices

(tensor([0.7186, 0.6939, 0.6394, 0.6321, 0.5974], device='cuda:0'),
 tensor([48, 89, 46, 47, 38], device='cuda:0'))

In [28]:
quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(llm_model_name,trust_remote_code=True)
llm_model = AutoModelForCausalLM.from_pretrained(llm_model_name,torch_dtype=torch.bfloat16, device_map="auto",quantization_config=quantization_config, trust_remote_code=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [32]:
def prompt_formatter(query, context_items):
    """
    Combines query and context from retrieval method

    Parameters:
    query (str): The user query
    context_items (list[dict]): The context from retrieval method, top n answers

    Returns:
    str: Prompt for LLM model
    """
    # Join context items into one dotted paragraph
    context = "- " + "\n- ".join([item["sentence_chunk"] for item in context_items])

    # Create a base prompt with examples to help the model
    # Note: this is very customizable, I've chosen to use 3 examples of the answer style we'd like.
    # We could also write this in a txt file and import it in if we wanted.
    base_prompt = """Based on the context passages provided, answer the query.

Answer generation must follow below instructions:
1. Generate the answer by extracting relevant information from the context.
2. Don't return the thinking, only return the answer.
3. Make sure your answers are as explanatory as possible.

Now use the following context items to answer the user query:
{context}

User query: {query}
Answer:"""

    # Update base prompt with context items and query   
    base_prompt = base_prompt.format(context=context, query=query)

    # Create prompt template for instruction-tuned model
    dialogue_template = [
        {"role": "system", "content": "You are an helful assistant to answer queries by finding information in few given passages. Answer the given query by going through passages or context items provided."},
        {"role": "user", "content": base_prompt}
    ]

    # Apply the chat template
    prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
                                          tokenize=False,
                                          add_generation_prompt=True)
    return prompt

In [33]:
def ask(query, 
        temperature=0.7,
        max_new_tokens=512, 
        return_answer_only=True):
    """
    Takes a query, finds relevant resources/context and generates an answer to the query based on the relevant resources.
    """
    
    # Get just the scores and indices of top related results
    scores, indices = retrieve_relevant_resources(query=query,
                                                  embeddings=embeddings)
    
    # Create a list of context items
    context_items = [pages_and_chunks[i] for i in indices]

    # Add score to context item
    for i, item in enumerate(context_items):
        item["score"] = scores[i].cpu() # return score back to CPU 
        
    # Format the prompt with context items
    prompt = prompt_formatter(query=query,
                              context_items=context_items)
    
    # Tokenize the prompt
    model_inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Generate an output of tokens
    outputs = llm_model.generate(model_inputs.input_ids,
                                 temperature=temperature,
                                 do_sample=True,
                                 max_new_tokens=max_new_tokens,
                                 pad_token_id=tokenizer.eos_token_id)

    output_answer = [output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, outputs)]
    
    # Turn the output tokens into text
    response = tokenizer.batch_decode(output_answer, skip_special_tokens=True)[0]

    # Only return the answer without the context items
    if return_answer_only:
        return response
    
    return response, context_items

In [34]:
print(f"Query: {query}")

# Answer query with context and return context 
answer, context_items = ask(query=query, 
                            temperature=temperature,
                            max_new_tokens=max_new_tokens,
                            return_answer_only=False)

print(f"Answer:\n")
print(answer)
print("\n\n")

Query: What is Biology?
Answer:

Biology is the study of life. It is a science that gathers knowledge about the natural world, specifically focusing on the discoveries of life forms and their interactions. Biology, like all sciences, is a social enterprise that requires careful observation, logical reasoning, experimentation, and sharing of conclusions under the scrutiny of others. It encompasses various fields, such as physiology, which studies the workings of cells, tissues, and organs. Biology's discoveries have significant practical implications affecting our health, food sources, and benefits from our ecosystem.





In [35]:
print(f"Query: '{query}'\n")
print("Results:")
# Loop through zipped together scores and indicies from torch.topk
for score, idx in zip(scores, indices):
    print(f"Score: {score:.4f}")
    # Print relevant sentence chunk (since the scores are in descending order, the most relevant chunk will be first)
    print("Text:")
    print(pages_and_chunks[idx]["sentence_chunk"])
    # Print the page number too so we can reference the textbook further (and check the results)
    print(f"Page number: {pages_and_chunks[idx]['page_number']}")
    print("\n")


Query: 'What is Biology?'

Results:
Score: 0.7186
Text:
Whatever its goal, there is no doubt that science, including biology, has transformed human existence and will continue to do so.FIGURE 1.15 Biologists may choose to study Escherichia coli (E. coli), a bacterium that is a normal resident of our digestive tracts but which is also sometimes responsible for disease outbreaks.In this micrograph, the bacterium is visualized using a scanning electron microscope and digital colorization. (credit: Eric Erbe; digital colorization by Christopher Pooley, USDA-ARS) The Nature of Science Biology is a science, but what exactly is science?What does the study of biology share with other scientific disciplines?
Page number: 31


Score: 0.6939
Text:
1.2 The Process of Science Biology is the science that studies living organisms and their interactions with one another and their environments.Science attempts to describe and understand the nature of the universe in whole or in part.Science has many fi