In [1]:
#necessary imports
import pymupdf #read pdf
from tqdm.auto import tqdm #progress bar
import pandas as pd #for analysis on sentences, words and token counts of pages
from spacy.lang.en import English #for sentence tokenising
from sentence_transformers import util, SentenceTransformer #for embedding text and queries
import random
import torch
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from config import num_sentence_chunk_size, embed_model_name, collection_name, pdf_path, initial_page, final_page, top_n_results,llm_model_name,  temperature, max_new_tokens, return_answer_only

### 1. Retrieval

#### 1.1 Select, Read and Pre-Process Document

In [2]:
def format_text(text):
    """
    Performs formatting operation on text, removes unncessary characters.

    Parameters:
        text (str): Text input to be formatted.

    Returns:
        str: A formatted string of the text provided.
    """
    clean_text = text.replace("\n", " ").strip()
    return clean_text

In [3]:
# Assumption -- 1 token = 4 chars
def read_pdf(pdf_path, initial_page, final_page):
    """
    Opens a PDF file, reads its text content page by page, and collects statistics.

    Parameters:
        pdf_path (str): The file path to the PDF document to be opened and read.
        initial_page (int): The first page number of the PDF to read
        final_page (int): The last page number of the PDF to read

    Returns:
        list[dict]: A list of dictionaries, each containing the page number,
        character count, word count, sentence count, token count, and the extracted text
        for each page.
    """
    doc = pymupdf.open(pdf_path)
    text_pages = []
    for page_number, page in tqdm(enumerate(doc.pages(initial_page,final_page))):
        #Iterate over each page in the PDF
        #Gets the text in the page and stores various information as dict in a list
        text = page.get_text()
        text = format_text(text)
        text_pages.append({"page_number": page_number + 19,  # adjust page numbers since our PDF starts on page 42
                                "page_char_count": len(text),
                                "page_word_count": len(text.split(" ")),
                                "page_sentence_count_raw": len(text.split(". ")),
                                "page_token_count": len(text) / 4,  # 1 token = ~4 chars, see: https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
                                "text": text})
    return text_pages

In [4]:
#Provide path and page numbers to be extracted
# initial_page = initial_page
# final_page = 68
#pdf_path = r"E:\workstation\ragQuantiphi\docs\ConceptsofBiology.pdf"
pages_and_texts = read_pdf(pdf_path,initial_page,final_page)

0it [00:00, ?it/s]

In [5]:
pages_and_texts[0]

{'page_number': 19,
 'page_char_count': 1542,
 'page_word_count': 255,
 'page_sentence_count_raw': 10,
 'page_token_count': 385.5,
 'text': 'INTRODUCTION CHAPTER 1 Introduction to Biology 1.1 Themes and Concepts of Biology 1.2 The Process of Science Viewed from space, Earth (Figure 1.1) offers few clues about the diversity of life forms that reside there. The first forms of life on Earth are thought to have been microorganisms that existed for billions of years before plants and animals appeared. The mammals, birds, and flowers so familiar to us are all relatively recent, originating 130 to 200 million years ago. Humans have inhabited this planet for only the last 2.5 million years, and only in the last 300,000 years have humans started looking like we do today. 1.1 Themes and Concepts of Biology LEARNING OBJECTIVES By the end of this section, you will be able to: • Identify and describe the properties of life • Describe the levels of organization among living things • List examples of

In [6]:
df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,19,1542,255,10,385.5,INTRODUCTION CHAPTER 1 Introduction to Biology...
1,20,2234,337,16,558.5,they do not meet the criteria that biologists ...
2,21,1988,306,16,497.0,FIGURE 1.3 The leaves of this sensitive plant ...
3,22,2092,309,14,523.0,"FIGURE 1.4 Although no two look alike, these k..."
4,23,1477,234,15,369.25,FIGURE 1.6 A lot of energy is required for a C...


In [7]:
df.describe()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,50.0,50.0,50.0,50.0,50.0
mean,43.5,2630.72,420.78,22.96,657.68
std,14.57738,1067.538998,173.98373,12.610265,266.884749
min,19.0,213.0,41.0,1.0,53.25
25%,31.25,1969.25,303.0,15.25,492.3125
50%,43.5,2813.0,454.5,22.0,703.25
75%,55.75,3359.25,550.75,28.0,839.8125
max,68.0,4428.0,725.0,65.0,1107.0


In [8]:
nlp = English()
# Add a sentencizer pipeline
nlp.add_pipe("sentencizer")
# Create a document instance as an example
doc = nlp("This is a sentence. This another sentence.")

In [9]:
for item in tqdm(pages_and_texts):
    item["sentences"] = list(nlp(item["text"]).sents)
    
    # Make sure all sentences are strings
    item["sentences"] = [str(sentence) for sentence in item["sentences"]]
    
    # Count the sentences 
    item["page_sentence_count_spacy"] = len(item["sentences"])

  0%|          | 0/50 [00:00<?, ?it/s]

In [10]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy
count,50.0,50.0,50.0,50.0,50.0,50.0
mean,43.5,2630.72,420.78,22.96,657.68,22.38
std,14.58,1067.54,173.98,12.61,266.88,10.46
min,19.0,213.0,41.0,1.0,53.25,1.0
25%,31.25,1969.25,303.0,15.25,492.31,15.25
50%,43.5,2813.0,454.5,22.0,703.25,22.0
75%,55.75,3359.25,550.75,28.0,839.81,28.0
max,68.0,4428.0,725.0,65.0,1107.0,47.0


In [11]:
# Define split size to turn groups of sentences into chunks
num_sentence_chunk_size = num_sentence_chunk_size

# Create a function that recursively splits a list into desired sizes
def split_list(input_list, slice_size):
    """
    Splits the input_list into sublists of size slice_size.

    Parameters:
    input_list (list): Input list to slice into sublists
    slice_size (int): Input slicing size

    Return:
    list[list[str]]: List of subsets of the complete input list
    """
    return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]

# Loop through pages and texts and split sentences into chunks
for item in tqdm(pages_and_texts):
    item["sentence_chunks"] = split_list(input_list=item["sentences"],
                                         slice_size=num_sentence_chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

  0%|          | 0/50 [00:00<?, ?it/s]

In [12]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy,num_chunks
count,50.0,50.0,50.0,50.0,50.0,50.0,50.0
mean,43.5,2630.72,420.78,22.96,657.68,22.38,4.82
std,14.58,1067.54,173.98,12.61,266.88,10.46,2.11
min,19.0,213.0,41.0,1.0,53.25,1.0,1.0
25%,31.25,1969.25,303.0,15.25,492.31,15.25,3.25
50%,43.5,2813.0,454.5,22.0,703.25,22.0,5.0
75%,55.75,3359.25,550.75,28.0,839.81,28.0,6.0
max,68.0,4428.0,725.0,65.0,1107.0,47.0,10.0


In [13]:
# Split each chunk into its own item
documents = []
metadata = []
ids = []
doc_id = 1
for item in tqdm(pages_and_texts):
    for sentence_chunk in item["sentence_chunks"]:
        # Join the sentences together into a paragraph-like structure, aka a chunk (so they are a single string)
        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
        #joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk) # ".A" -> ". A" for any full-stop/capital letter combo  #delete
        documents.append(joined_sentence_chunk)
        metadata.append({"page_no":item["page_number"],
                         "chunk_char_count":len(joined_sentence_chunk),
                         "chunk_word_count":len([word for word in joined_sentence_chunk.split(" ")]),
                         "chunk_token_count":len(joined_sentence_chunk) / 4
                        })
        ids.append(str(doc_id))
        doc_id += 1


  0%|          | 0/50 [00:00<?, ?it/s]

In [14]:
#chromadb
import chromadb
import chromadb.utils.embedding_functions as embedding_functions
chroma_client = chromadb.PersistentClient()

In [15]:
huggingface_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=embed_model_name)

In [16]:
collection_name = collection_name
collection = chroma_client.get_or_create_collection(name=collection_name, embedding_function= huggingface_ef)

In [17]:
collection.add(
    documents=documents, # we embed for you, or bring your own
    metadatas=metadata, # filter on arbitrary metadata!
    ids=ids # must be unique for each doc 
)

Add of existing embedding ID: 1
Add of existing embedding ID: 2
Add of existing embedding ID: 3
Add of existing embedding ID: 4
Add of existing embedding ID: 5
Add of existing embedding ID: 6
Add of existing embedding ID: 7
Add of existing embedding ID: 8
Add of existing embedding ID: 9
Add of existing embedding ID: 10
Add of existing embedding ID: 11
Add of existing embedding ID: 12
Add of existing embedding ID: 13
Add of existing embedding ID: 14
Add of existing embedding ID: 15
Add of existing embedding ID: 16
Add of existing embedding ID: 17
Add of existing embedding ID: 18
Add of existing embedding ID: 19
Add of existing embedding ID: 20
Add of existing embedding ID: 21
Add of existing embedding ID: 22
Add of existing embedding ID: 23
Add of existing embedding ID: 24
Add of existing embedding ID: 25
Add of existing embedding ID: 26
Add of existing embedding ID: 27
Add of existing embedding ID: 28
Add of existing embedding ID: 29
Add of existing embedding ID: 30
Add of existing emb

In [18]:
collection.count()

241

In [19]:
print("LLM model being used: ", llm_model_name)
device = "cuda" if torch.cuda.is_available() else "cpu"
quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(llm_model_name,trust_remote_code=True)
llm_model = AutoModelForCausalLM.from_pretrained(llm_model_name,torch_dtype=torch.bfloat16, device_map="auto",quantization_config=quantization_config, trust_remote_code=True)

LLM model being used:  microsoft/Phi-3-mini-128k-instruct


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [20]:
def prompt_formatter(query, context_items):
    """
    Combines query and context from retrieval method

    Parameters:
    query (str): The user query
    context_items (list[dict]): The context from retrieval method, top n answers

    Returns:
    str: Prompt for LLM model
    """
    # Join context items into one dotted paragraph
    context = "- " + "\n- ".join([item["passage"] for item in context_items])

    # Create a base prompt with examples to help the model
    # Note: this is very customizable, I've chosen to use 3 examples of the answer style we'd like.
    # We could also write this in a txt file and import it in if we wanted.
    base_prompt = """Based on the context passages provided, answer the query.

Answer generation must follow below instructions:
1. Generate the answer by extracting relevant information from the context.
2. Make sure your answers are as explanatory as possible.

Now use the following context items to answer the user query:
{context}

User query: {query}
Answer:"""

    # Update base prompt with context items and query   
    base_prompt = base_prompt.format(context=context, query=query)

    # Create prompt template for instruction-tuned model
    dialogue_template = [
        {"role": "system", "content": "You are an helful assistant to answer queries by finding information in few given passages. Answer the given query by going through passages or context items provided."},
        {"role": "user", "content": base_prompt}
    ]

    # Apply the chat template
    prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
                                          tokenize=False,
                                          add_generation_prompt=True)
    return prompt

In [21]:
def ask(query,context, temperature = temperature, max_new_tokens = max_new_tokens, return_answer_only = return_answer_only):
        """
        Takes a query, finds relevant resources/context and generates an answer to the query based on the relevant resources.

        Parameters:
        query (str): User Query
        context (list[dict]): Contains previously retrieved top n results  from retrieval phase
        temperature (float): Temperature setting for LLM
        max_new_tokens (int): Maximum tokens generated during response
        return_answer_only (boolean): Return only answer  or with context and other metadata

        Returns:
        dict: Returns answer to the query
        dict: Returns metadata to the query
        """
        
        prompt = prompt_formatter(query=query,
                                context_items=context)
        
        # Tokenize the prompt
        model_inputs = tokenizer(prompt, return_tensors="pt").to(device)

        # Generate an output of tokens
        outputs = llm_model.generate(model_inputs.input_ids,
                                    temperature=temperature,
                                    do_sample=True,
                                    max_new_tokens=max_new_tokens,
                                    pad_token_id=tokenizer.eos_token_id)

        output_answer = [output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, outputs)]
        
        # Turn the output tokens into text
        response = tokenizer.batch_decode(output_answer, skip_special_tokens=True)[0]

        # Only return the answer without the context items
        if return_answer_only:
            return response
        
        return response, context

In [22]:
query = "What is biology"
print(f"Query: {query}")

Query: What is biology


In [23]:
results = collection.query(
    query_texts=query,
    n_results=5)

In [24]:
return_results = []
for page, passage in zip(results["metadatas"][0], results["documents"][0]):
    query_results = {}
    query_results["page_no"]= page["page_no"]
    query_results["passage"]= passage
    return_results.append(query_results)

In [25]:
# Answer query with context and return context 
answer, context_items = ask(query=query,
                            context = return_results,
                            temperature=temperature,
                            max_new_tokens=max_new_tokens,
                            return_answer_only=False)

#Printing Answer from LLM
print(f"Answer:\n")
print(answer)
print("\n\n")

#Printing relevant passages from the book fetched by retrival
print("Passages and Page numbers from Book")
for items in context_items:
    print("page_no: ", items["page_no"])
    print("passage: ",items["passage"])
    print("\n")
    

The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
You are not running the flash-attention implementation, expect numerical differences.


Answer:

Biology is the scientific study of life, encompassing various fields such as zoology (the study of animals) and botany (the study of plants). It investigates the physical, chemical, and biological processes that govern life, including how organisms interact with one another and their environments. Biology shares common methods with other scientific disciplines, such as careful observation, logical and mathematical reasoning, and experimentation. It also requires imagination and creativity, particularly in designing experiments, which are often described as elegant or beautiful. Biology has considerable practical implications, including the prevention of diseases, and is driven both by curiosity and practical applications. It also explores the history of life through the study of fossils, and within its broader framework, scientists can specialize in areas like biotechnology, ecology, and physiology.



Passages and Page numbers from Book
page_no:  39
passage:  1.2 The Process 