In [1]:
!pip install PyMuPDF

Collecting PyMuPDF
  Downloading PyMuPDF-1.24.11-cp38-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading PyMuPDF-1.24.11-cp38-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (19.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.6/19.6 MB[0m [31m61.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.24.11


In [2]:
!pip install spacy



In [3]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.2.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.2.1-py3-none-any.whl (255 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.8/255.8 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.2.1


In [4]:
import os
import requests

In [6]:
import fitz
from tqdm.auto import tqdm

def extract_text_from_pdf(pdf_path:str) -> list[dict]:
    doc = fitz.open(pdf_path)
    page_list = []
    for page_no,page in tqdm(enumerate(doc)):
        text = page.get_text()
        # clear text
        text = text.replace('\n', '').strip()

        page_list.append({
            'page_no': page_no,
            'text': text,
            'char_count': len(text),
            'token_count':len(text)/4
        })

    return page_list


In [7]:
from spacy.lang.en import English
nlp = English()
nlp.add_pipe('sentencizer')

# chunking
def make_chunks(page_list:list[dict], chunk_size:int = 10,overlap_sentences:int = 2,min_chunk_size:int = 100) -> list[dict]:
    # text to sentences
    for page in tqdm(page_list):
        page['sentences'] = []
        for sent in nlp(page['text']).sents:
            page['sentences'].append(sent.text)
        page['sentence_count'] = len(page['sentences'])

    chunk_list = []
    # sentences to seperate chunks of max chunk_size
    for page in tqdm(page_list):
        page['chunks'] = []
        for i in range(0, page['sentence_count'], chunk_size-overlap_sentences):
            page['chunks'].append(page['sentences'][i:i+chunk_size])
        # make a chunk a special dict for further operations based on each chunk rather than a page
        for chunk in page['chunks']:
            chunk_item = {
                'page_no': page['page_no'],
                'text': ' '.join(chunk),
                'sentence_count': len(chunk),
            }
            chunk_item['char_count'] = len(chunk_item['text'])
            if chunk_item['char_count'] < min_chunk_size:
                continue
            chunk_item['token_count'] = len(chunk_item['text'])/4
            chunk_list.append(chunk_item)

    return chunk_list


In [8]:
# create embeddings using model all-mpnet-base-v2
from sentence_transformers import SentenceTransformer
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = SentenceTransformer('all-mpnet-base-v2',device=device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [60]:
def create_embeddings(chunk_list:list[dict]) -> list[dict]:
    # leverage GPU
    chunk_texts = [chunk['text'] for chunk in chunk_list]
    chunk_embeddings = model.encode(chunk_texts,batch_size=64,convert_to_tensor=False)
    for i,chunk in tqdm(enumerate(chunk_list)):
        chunk['embedding'] = chunk_embeddings[i]

    return chunk_list

In [61]:
import pandas as pd
import numpy as np

def save_embeddings(chunk_list:list[dict],save_path:str = 'embeddings.csv') -> None:
    df = pd.DataFrame(chunk_list)
    # df['embedding'] = df['embedding'].apply(lambda x: x.tolist())
    df.to_csv(save_path,index=False)


In [67]:
def str_to_array(x:str) -> np.array:
    return np.array(x.strip('[]').replace('\n','').split(),dtype=np.float32)

def load_embeddings(save_path:str = 'embeddings.csv') -> list[dict]:
    df = pd.read_csv(save_path)
    df['embedding'] = df['embedding'].apply(str_to_array)
    # convert to list of dicts
    chunk_list = df.to_dict(orient='records')
    # for chunk in chunk_list:
    #     chunk['embedding'] = torch.tensor(np.array(chunk['embedding'].tolist()),dtype = torch.float32).to(device)
    return chunk_list


In [71]:
from sentence_transformers import util

In [73]:
# creating a function to get top k result from embeddings
def return_relevant_top_k(query:str,
                          embeddings:torch.tensor,
                          embedding_model:SentenceTransformer = model,
                          k:int = 5,
                          device:torch.device = device
):
    """Return a top k result from embeddings based on dot product similarity with the query
    input:
        query: str Query to be searched
        embeddings: torch.tensor Embeddings in which to search
        embedding_model: SentenceTransformer = embedding_model Embedding model
        k: int = 5
        device: torch.device = device
    output:
        top_relevant_results: list[dict]
    """
    # create embedding of query
    query_embedding = embedding_model.encode(query, convert_to_tensor=True)
    # dot product
    dot_scores = util.dot_score(a=query_embedding,b=embeddings)[0]
    # getting top k index and scores
    top_k_dot_products = torch.topk(dot_scores,k=k)

    return top_k_dot_products

In [74]:
def get_relevent_top_k(top_k_dot_products:torch.tensor,chunk_list:list[dict]):
    ans = []
    for score,index in zip(top_k_dot_products.values,top_k_dot_products.indices):
        relevent_chunk = chunk_list[index]
        """{{'page_no': 285,
        'text': 'Insulin has an opposing hormone called glucagon. Glucagon-secreting cells in the pancreas sense the drop in glucose and, in response, release glucagon into the blood. Glucagon communicates to the cells in the body to stop using all the glucose. More specifically, it signals the liver to break down glycogen and release the stored glucose into the blood, so that glucose levels stay within the target range and all cells get the needed fuel to function properly. Figure 4.8 The Regulation of Glucose 244  |  Digestion and Absorption of Carbohydrates',
         'sentence_count': 5, 'char_count': 549, 'token_count': 137.25, 'embedding'
        """
        ans.append({
            "score":score,
            "text":relevent_chunk["text"],
            "page_no":relevent_chunk["page_no"],
            "sentence_count":relevent_chunk["sentence_count"],
            "char_count":relevent_chunk["char_count"],
            "token_count":relevent_chunk["token_count"]
        })
    return ans

In [77]:
!pip install bitsandbytes accelerate huggingface_hub

Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl (122.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.44.1


In [79]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [78]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from transformers.utils import is_flash_attn_2_available
import torch

In [81]:
def prompt_formatter(query: str,
                     context_items: list[dict]) -> str:
    # context = "- " + "\n- ".join([f"({item['score']:.2f}) {item['text']}" for item in context_items])

    # base_prompt = f"""
    # Based on the following context with there match score with the query in the starting parenthesis, Give me the answer for the query.
    # Context : {context}

    # Query : {query}

    # Answer :
    # """
    context = ", \n".join([item['text'] for item in context_items])

    base_prompt = """Please read the following context items, and then answer the question below based on the provided context, If Context is not sufficient then answer question on your own,

Context:
{context}

Question:
{query}

Answer:"""
    base_prompt = base_prompt.format(context=context,query=query)

    # create prompt template
    dialouge_template = [
        {
            "role":"user",
            "content":base_prompt
        }
    ]

    # apply template
    prompt = tokenizer.apply_chat_template(conversation=dialouge_template,
                                          tokenize=False,
                                          add_generation_prompt=True)

    # print(f"Prompt: {prompt}")

    return prompt

In [83]:
def ask(
    query:str,
    chunk_embeddings:torch.tensor,
    chunk_list:list[dict],
    embb_model:SentenceTransformer,
    llm_model:AutoModelForCausalLM,
    tokenizer:AutoTokenizer,
    k:int = 5,
    temperature:float=0.7,
    max_new_tokens:int=512,
    device:torch.device = device if device else torch.device("cpu")
) -> str:
    # get top k results ie retrival
    top_k_results = return_relevant_top_k(query,chunk_embeddings,embb_model,k=k)
    context_items = get_relevent_top_k(top_k_results,chunk_list)

    # make prompt
    prompt = prompt_formatter(query,context_items)
    # print(prompt)

    # tokenize
    input_ids = tokenizer(prompt,return_tensors="pt").to(device)

    # generate output
    outputs = llm_model.generate(**input_ids,temperature=temperature,max_new_tokens=max_new_tokens,do_sample=True)

    # output token to text
    output_text = tokenizer.decode(outputs[0])

    return output_text.replace(prompt,'')

In [90]:
from IPython.display import display, Markdown
def prity_output(query:str,response:str):
    # remove <bos>... tags
    print(f"Query: {query}")
    print("Response: ")
    response = response.replace("<bos>","").replace("<|endoftext|>","").replace("</s>","")
    display(Markdown(response))



In [None]:
# create quantization config
# requires !pip install bitsandbytes accelerate
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)

# 2 falsh attention is available use it for faster attention mechanism
if (is_flash_attn_2_available()) and (torch.cuda.get_device_capability(0)[0] >= 8.0):
    attn_implementation = "flash_attention_2"
else:
    attn_implementation = "sdpa"
print(f"Attention Implementation: {attn_implementation}")

# instantiate tokenizer
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b-it") # This line was causing the error

# instantiate model
llm_model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2-2b-it",
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    attn_implementation=attn_implementation
)

llm_model.to(device)

Attention Implementation: sdpa


tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/838 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

Gemma2ForCausalLM(
  (model): Gemma2Model(
    (embed_tokens): Embedding(256000, 2304, padding_idx=0)
    (layers): ModuleList(
      (0-25): 26 x Gemma2DecoderLayer(
        (self_attn): Gemma2SdpaAttention(
          (q_proj): Linear(in_features=2304, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2304, out_features=1024, bias=False)
          (v_proj): Linear(in_features=2304, out_features=1024, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2304, bias=False)
          (rotary_emb): Gemma2RotaryEmbedding()
        )
        (mlp): Gemma2MLP(
          (gate_proj): Linear(in_features=2304, out_features=9216, bias=False)
          (up_proj): Linear(in_features=2304, out_features=9216, bias=False)
          (down_proj): Linear(in_features=9216, out_features=2304, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)
        (post_attention_layernorm): Gemma2RMSNorm((2304,), 

In [91]:
query = input("Enter Question to PDF:")
prity_output(query,ask(query))

Enter Question to PDF:What fruits are good for eyes?
Query: What fruits are good for eyes?
Response: 


Based on the context, the following fruits are good for eyes:

* **Oranges**
* **Grapefruit**
* **Strawberries** 
* **Sweet red pepper**
* **Tomato** 
* **Orange juice** 
* **Broccoli**
* **Romaine Lettuce** 
* **Cauliflower** 
* **Potato** 

The context specifically mentions that lutein and zeaxanthin, antioxidants that are beneficial for eye health, are found in these fruits and vegetables. It also mentions that the foods should be eaten raw or lightly steamed to maximize vitamin C intake. 
<end_of_turn>

In [92]:
from google.colab import files
uploaded = files.upload()


KeyboardInterrupt: 

In [None]:

# use pdf or use existing embedding
pdf_name = "input.pdf"
embedding_file = "embeddings.csv"

print("Choose option from Below")
print("1. Use PDF")
print("2. Use Existing Embeddings")
choice = input("Enter your choice: ")
if choice == "1":
    # upload pdf as pdf_name


In [None]:
import random
page_list = extract_text_from_pdf(pdf_name)
chunk_list = make_chunks(page_list)

0it [00:00, ?it/s]

  0%|          | 0/1208 [00:00<?, ?it/s]

  0%|          | 0/1208 [00:00<?, ?it/s]

In [None]:
%%time
chunk_list = create_embeddings(chunk_list)

0it [00:00, ?it/s]

CPU times: user 28.4 s, sys: 55.4 ms, total: 28.4 s
Wall time: 28 s


In [None]:
len(chunk_list)

1943

In [None]:
# test save and load
save_embeddings(chunk_list)

In [None]:
chunk_list2 = load_embeddings()

In [None]:
# separate embeddings and send to device
chunk_embeddings = torch.tensor([chunk['embedding'] for chunk in chunk_list],dtype=torch.float32).to(device)
chunk_embeddings.shape

  chunk_embeddings = torch.tensor([chunk['embedding'] for chunk in chunk_list],dtype=torch.float32).to(device)


torch.Size([1943, 768])