In [1]:
# Show the GPU attached to this runtime (model, driver/CUDA version, memory usage)
!nvidia-smi

Sun Apr 14 06:20:13 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   68C    P8              30W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

#### Minimal code to run the QA-RAG pipeline. The complete end-to-end setup is available as a FastAPI service in Docker.

#### Package Installation

In [4]:
!pip install --quiet PyMuPDF==1.24.0 # PDF reader
!pip install --quiet langchain==0.1.16
!pip install --quiet langchain-community==0.0.32

!pip install --quiet transformers==4.39.3
!pip install --quiet tiktoken==0.6.0
!pip install --quiet sentence-transformers==2.6.1
!pip install --quiet chromadb==0.4.24

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.9/3.9 MB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.8/30.8 MB[0m [31m28.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m817.7/817.7 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m287.5/287.5 kB[0m [31m27.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m113.0/113.0 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.0/53.0 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━

### Importing packages

In [2]:
import os
import requests
from tqdm import tqdm
from typing import List

import fitz # PDF reader
import torch

from langchain.docstore.document import Document
from langchain_text_splitters import TokenTextSplitter

from langchain_community.vectorstores import Chroma
from langchain_community.llms import HuggingFacePipeline
from langchain_community.embeddings import HuggingFaceEmbeddings

from langchain.chains import RetrievalQA

from huggingface_hub import notebook_login
from transformers import pipeline
from transformers import AutoModelForCausalLM, AutoTokenizer

## Pipeline Components

#### Downloading file from url

In [3]:
def download_file_from_url(url:str, file_name:str) -> None:
    """Stream the file at `url` to `file_name` on disk, showing download progress.

    Args:
        url: HTTP(S) address of the file to download.
        file_name: Local path the downloaded bytes are written to.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    chunk_size = 4096

    # The request is made (and its status checked) before the file is opened, so a
    # failed download neither creates an empty file nor saves an error page as a "pdf".
    with requests.get(url, stream=True, timeout=60) as response:
        response.raise_for_status()

        # content-length may be missing (e.g. chunked transfer); tqdm then shows a bare counter
        content_length = response.headers.get('content-length')
        total_bytes = int(content_length) if content_length and content_length.isdigit() else None

        with open(file_name, "wb") as f, tqdm(total=total_bytes, unit="B", unit_scale=True) as progress:
            for data in response.iter_content(chunk_size=chunk_size):
                f.write(data)
                progress.update(len(data))

    print(f"\nDownloaded pdf file at {file_name}")

#### Reading PDF pages
Only the pages of the first 2 chapters are read, selected using a start and end page index.

In [4]:
def read_page_text(page) -> str:
    """Return the text of `page`, one line per text block.

    Each block's internal newlines are replaced by spaces and every block is
    prefixed with a newline, so a non-empty result starts with a newline.
    """
    # sort=True requests the blocks in the order they appear to a human reader;
    # element 4 of each block tuple holds the block's text.
    flattened_blocks = [
        "\n" + block[4].replace("\n", " ")
        for block in page.get_text_blocks(sort=True)
    ]

    return "".join(flattened_blocks)

In [5]:
def pdf_to_langchain_docs(file_path:str, page_start_idx:int, page_end_idx:int) -> List[Document]:
    """Extract the text of a range of PDF pages as LangChain documents.

    Args:
        file_path: Path of the PDF file to open.
        page_start_idx: Zero-based index of the first page to read (inclusive).
        page_end_idx: Zero-based index of the last page to read (inclusive).

    Returns:
        One Document per selected page, in page order. Each document's metadata
        holds the one-based "page_number".
    """
    documents = []

    # The context manager closes the PDF even if text extraction fails midway.
    with fitz.open(file_path) as pdf_doc:
        for page_idx, page in enumerate(pdf_doc):
            if page_idx > page_end_idx:
                break  # past the requested range; no need to load the remaining pages
            if page_idx < page_start_idx:
                continue

            page_text = read_page_text(page)
            page_meta = {"page_number": page_idx + 1}
            documents.append(Document(page_content=page_text, metadata=page_meta))

    return documents

#### Chunking documents using a token text splitter

In [6]:
def split_documents(documents: List[Document], size:int=128, overlap:int=28) -> List[Document]:
    """Break `documents` into chunks with a TokenTextSplitter.

    `size` and `overlap` are forwarded to the splitter as `chunk_size` and
    `chunk_overlap`; the resulting list of chunk documents is returned.
    """
    return TokenTextSplitter(chunk_size=size, chunk_overlap=overlap).split_documents(documents)

#### Loading embedding model from Hugging Face

In [7]:
def get_hf_embedder(hf_model_id: str) -> HuggingFaceEmbeddings:
    """Build a HuggingFaceEmbeddings instance for the model named by `hf_model_id`."""
    return HuggingFaceEmbeddings(model_name=hf_model_id)

#### Vector retriever using in-memory Chroma DB

In [8]:
def get_vector_retriver(documents: List[Document], embedder: HuggingFaceEmbeddings, collection_name: str):
    """Index `documents` in a Chroma collection and return a retriever over it.

    The collection is created through `Chroma.from_documents` using `embedder`
    for the embeddings and `collection_name` as the collection's name.
    """
    return Chroma.from_documents(
        documents=documents,
        embedding=embedder,
        collection_name=collection_name,
    ).as_retriever()

#### Hugging Face QA LLM-based pipeline

In [10]:
def get_qa_llm_pipe(llm_qa_model_id:str, max_new_tokens:int=100) -> HuggingFacePipeline:
    """Create a "text-generation" HuggingFacePipeline for `llm_qa_model_id`.

    `max_new_tokens` is passed through in the pipeline kwargs, together with
    `return_full_text=False` and a single returned sequence.
    """
    # Device index 0 when CUDA is available, otherwise -1
    # NOTE(review): -1 is presumably the CPU sentinel — confirm.
    if torch.cuda.is_available():
        target_device = 0
    else:
        target_device = -1

    generation_options = {
        "max_new_tokens": max_new_tokens,
        "return_full_text": False,
        "num_return_sequences": 1,
    }

    return HuggingFacePipeline.from_model_id(
        model_id=llm_qa_model_id,
        task="text-generation",
        device=target_device,
        pipeline_kwargs=generation_options,
    )


## Creating QA-RAG pipeline

Built by combining all of the pipeline components above.

In [11]:
# Local file name the PDF is saved under, and the URL it is downloaded from
pdf_file_path = "ConceptsofBiology-WEB.pdf"
pdf_url = "https://assets.openstax.org/oscms-prodcms/media/documents/ConceptsofBiology-WEB.pdf"

# Zero-based page indexes of the range to read; both ends are inclusive
page_start_idx = 18
page_end_idx = 68

# Passed to the token text splitter as chunk_size / chunk_overlap
chunk_size = 128
chunk_overlap = 32

# Chroma collection name and the Hugging Face model id used for embeddings
vector_collection_name = "Biology1"
hf_embedder_model_id = "sentence-transformers/all-MiniLM-L12-v2"

# Hugging Face model id of the text-generation LLM used for question answering
hf_qa_llm = "google/gemma-1.1-2b-it"

In [12]:
# Download the PDF from pdf_url and save it as pdf_file_path (a path relative to the working directory)

download_file_from_url(pdf_url, pdf_file_path)

37398it [00:01, 25236.53it/s]


Downloaded pdf file at ConceptsofBiology-WEB.pdf





In [13]:
# Interactive Hugging Face Hub login, run before the QA model is loaded.
# NOTE(review): the author states login is required to download the model —
# presumably because the configured model is gated; confirm.

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [14]:
# 1. Read the selected PDF pages, split them into token chunks and load the embedding model
documents = pdf_to_langchain_docs(pdf_file_path, page_start_idx, page_end_idx)
splitted_documents = split_documents(documents, chunk_size, chunk_overlap)
hf_embedder = get_hf_embedder(hf_embedder_model_id)

# 2. Index the chunks in Chroma and expose the collection as a retriever
vector_retriver = get_vector_retriver(splitted_documents, hf_embedder, vector_collection_name)

# 3. Load the text-generation LLM (max_new_tokens left at the helper's default of 100)
hf_qa_pipe = get_qa_llm_pipe(hf_qa_llm)

# 4. Wire retriever and LLM into a RetrievalQA chain.
# return_source_documents=True asks the chain to return the retrieved chunks with the answer.
# NOTE(review): chain_type="stuff" presumably puts all retrieved chunks into one prompt — confirm against LangChain docs.
qa_rag_pipeline = RetrievalQA.from_chain_type(
    llm=hf_qa_pipe,
    chain_type="stuff",
    retriever=vector_retriver,
    verbose=True,
    return_source_documents=True,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [15]:
def extract_answer(qa_pipe, question:str) -> str:
    """Run `question` through `qa_pipe` and return the answer text.

    The answer is read from the "result" key of the value returned by
    `qa_pipe.invoke`, with leading and trailing whitespace stripped.
    """
    return qa_pipe.invoke(question)["result"].strip()

## Ask Question

In [20]:
# Question to answer from the indexed PDF pages
question = "How many American men were diagnosed with syphilis and which year"

# Last expression of the cell, so the returned answer string is shown as the cell output
extract_answer(qa_rag_pipeline, question)



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


'399 African American men were diagnosed with syphilis in 1932.'