In [None]:
# Uploaded custom_logs.py into a folder named 'utils', along with 'config.json' and '.env'.

In [1]:
!pip install gradio langchain accelerate sentence_transformers pypdf tiktoken faiss-gpu-cu11 bitsandbytes python-dotenv



In [2]:
pip install -U langchain-community



In [3]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import TokenTextSplitter
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
import pickle
import os
from utils.custom_logs import logs

class PDFLoader:
    """
    Thin wrapper that reads one PDF file through PyPDFLoader.
    """

    def __init__(self, filepath):
        """
        Remember which PDF this loader is bound to.

        Args:
            filepath (str): Location of the PDF file on disk.
        """
        self.filepath = filepath

    def dataloader(self):
        """
        Read the PDF and hand back whatever PyPDFLoader produces for it.

        Returns:
            list: List of pages from the PDF.
        """
        # Only the base name is logged, not the full path.
        pdf_name = os.path.basename(self.filepath)
        logs.info(f"Reading file {pdf_name} ... ")
        return PyPDFLoader(self.filepath).load()

class Splitter:
    """
    Cuts loaded pages into overlapping chunks with TokenTextSplitter.
    """

    def __init__(self, chunk_size, chunk_overlap):
        """
        Store the chunking parameters.

        Args:
            chunk_size (int): Size of each chunk.
            chunk_overlap (int): Overlap between consecutive chunks.
        """
        self.chunk_size, self.chunk_overlap = chunk_size, chunk_overlap

    def datasplitter(self, pages):
        """
        Run the pages through a freshly built TokenTextSplitter.

        Args:
            pages (list): Pages as returned by the PDF loader.

        Returns:
            list: The chunked documents.
        """
        logs.info(f"Document splitting with chunk_size {self.chunk_size} and chunk_overlap {self.chunk_overlap} ... ")
        # NOTE(review): a token-based splitter presumably measures chunks in
        # tokens, so length_function=len may have no effect here - confirm.
        splitter_options = {
            "chunk_size": self.chunk_size,
            "chunk_overlap": self.chunk_overlap,
            "length_function": len,
        }
        return TokenTextSplitter(**splitter_options).split_documents(pages)

class Embedder:
    """
    A class for managing document embeddings.

    Embeddings are cached on disk as pickle files inside ``EMBEDDING_DIR``.
    The cache key is the base name of the source document only.

    NOTE(review): two different documents that share a base name map to the
    same cache file, and the embedding model / chunking parameters are not
    part of the key - confirm this is acceptable for the intended usage.
    """

    # Directory (relative to the working directory) that holds cached embeddings.
    EMBEDDING_DIR = "embeddings_data"

    def __init__(self, model_name):
        """
        Initialize the Embedder instance.

        Args:
            model_name (str): Name of the embedding model.
        """
        self.model_name = model_name
        logs.info(f"Loading embeddings Model {self.model_name} ... ")
        self.embeddings = HuggingFaceEmbeddings(model_name=self.model_name)
        # Filled in by create_embeddings() or load_embedding().
        self.doc_embedding = None

    def _embedding_path(self, filename):
        """Return the cache file path used for the document ``filename``."""
        return os.path.join(self.EMBEDDING_DIR, os.path.basename(filename) + ".pkl")

    def create_embeddings(self, docs):
        """
        Create embeddings for documents.

        Args:
            docs (list): List of documents.

        Returns:
            FAISS: Document embeddings.
        """
        logs.info(f"Creating document embeddings for {len(docs)} split ... ")
        self.doc_embedding = FAISS.from_documents(docs, self.embeddings)
        return self.doc_embedding

    def save_embedding(self, filename):
        """
        Save document embeddings to a file.

        The pickle is written to a temporary file and moved into place only
        after it has been written completely, so a failed save never leaves a
        truncated cache file that would later be mistaken for a valid one.

        Args:
            filename (str): Name of the file to save the embeddings.

        Raises:
            ValueError: If there are no embeddings to save yet.
        """
        doc_embedding = getattr(self, "doc_embedding", None)
        if doc_embedding is None:
            raise ValueError(
                "No document embeddings to save; call create_embeddings() or load_embedding() first."
            )

        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(self.EMBEDDING_DIR, exist_ok=True)
        filename = os.path.basename(filename)
        logs.info(f"Saving document embeddings: {self.EMBEDDING_DIR + '/' + filename} ... ")

        target_path = self._embedding_path(filename)
        partial_path = target_path + ".tmp"
        try:
            with open(partial_path, "wb") as f:
                pickle.dump(doc_embedding, f)
            os.replace(partial_path, target_path)
        finally:
            # Only still present when pickling or the move failed.
            if os.path.exists(partial_path):
                os.remove(partial_path)

    def load_embedding(self, filename):
        """
        Load document embeddings from a file.

        Args:
            filename (str): Name of the file to load the embeddings.

        Returns:
            FAISS: Loaded document embeddings.
        """
        filename = os.path.basename(filename)
        logs.info(f"Loading document embeddings locally: {self.EMBEDDING_DIR + '/' + filename} ... ")
        # SECURITY: pickle.load executes arbitrary code from the file. Only
        # load cache files that this application wrote itself.
        with open(self._embedding_path(filename), "rb") as f:
            self.doc_embedding = pickle.load(f)
        return self.doc_embedding

    def check_embedding_available(self, filename):
        """
        Check if document embeddings are available in a file.

        Args:
            filename (str): Name of the file to check.

        Returns:
            bool: True if document embeddings are available, False otherwise.
        """
        doc_check = os.path.isfile(self._embedding_path(filename))
        logs.info(f"Is document embedding found: {doc_check}")
        return doc_check

class DocProcessor:
    """
    Orchestrates loading, chunking and embedding of a document, reusing
    previously saved embeddings when the Embedder reports them as available.
    """

    def __init__(self, model_name, chunk_size, chunk_overlap):
        """
        Record the processing parameters and build the Embedder.

        Args:
            model_name (str): Name of the embedding model.
            chunk_size (int): Size of each chunk.
            chunk_overlap (int): Overlap between consecutive chunks.
        """
        logs.info(f"Initializing document processor parameters - embedding model_name: {model_name}, chunk_size: {chunk_size}, chunk_overlap: {chunk_overlap} ... ")
        self.model_name, self.chunk_size, self.chunk_overlap = model_name, chunk_size, chunk_overlap
        self.embedding_manager = Embedder(model_name)

    def process_document(self, filepath):
        """
        Return the embeddings for a document, building them only if needed.

        Args:
            filepath (str): Path to the document file.

        Returns:
            FAISS: Document embeddings.
        """
        manager = self.embedding_manager

        # Saved embeddings exist: skip loading, splitting and embedding.
        if manager.check_embedding_available(filepath):
            return manager.load_embedding(filepath)

        # Otherwise run load -> split -> embed, then save the result.
        pages = PDFLoader(filepath).dataloader()
        chunks = Splitter(self.chunk_size, self.chunk_overlap).datasplitter(pages)
        fresh_embedding = manager.create_embeddings(chunks)
        manager.save_embedding(filepath)
        return fresh_embedding

In [4]:
import torch
from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline

from utils.custom_logs import logs

class ModelRetriever:
    """
    A class responsible for loading the language model.
    """

    def __init__(self, model_id, max_length, temperature, load_int8, hf_token=None):
        """
        Initialize the ModelRetriever instance.

        Args:
            model_id (str): Identifier of the pretrained model.
            max_length (int): Maximum length of generated text.
            temperature (float): Temperature parameter for text generation.
            load_int8 (bool): Whether to load the model in 8-bit.
            hf_token (str, optional): Hugging Face token for authentication. Defaults to None.
        """
        self.model_id = model_id
        self.max_length = max_length
        self.temperature = temperature
        self.load_int8 = load_int8
        self.hf_token = hf_token

    def load_model(self):
        """
        Load the language model using the specified model_id, max_length, and temperature.

        When ``load_int8`` is set the weights are loaded in 8-bit through a
        ``BitsAndBytesConfig``; otherwise they are loaded as bfloat16.

        Returns:
            HuggingFacePipeline: Loaded language model.
        """
        logs.info(f"Loading LLM model {self.model_id} with max_length {self.max_length} and temperature {self.temperature}...\n")
        tokenizer = AutoTokenizer.from_pretrained(self.model_id, token=self.hf_token)

        # Passing load_in_8bit=True straight to from_pretrained() is deprecated
        # in Transformers; the supported route is a quantization_config.
        if self.load_int8:
            precision_kwargs = {"quantization_config": BitsAndBytesConfig(load_in_8bit=True)}
        else:
            precision_kwargs = {"torch_dtype": torch.bfloat16}

        model = AutoModelForCausalLM.from_pretrained(
            self.model_id,
            device_map="auto",
            token=self.hf_token,
            **precision_kwargs,
        )

        logs.info("Model is loaded successfully\n")
        # NOTE(review): temperature presumably only takes effect when sampling
        # is enabled (do_sample=True) - confirm whether greedy decoding is intended.
        pipe = pipeline(
            "text-generation", model=model, tokenizer=tokenizer, max_length=self.max_length, temperature=self.temperature
        )
        return HuggingFacePipeline(pipeline=pipe)

class QASys:
    """
    Question answering over retrieved document chunks, driven by one LLM.
    """

    def __init__(self, llm):
        """
        Keep the LLM and prepare the prompt used by the QA chain.

        Args:
            llm (HuggingFacePipeline): Loaded language model for text generation.
        """
        self.llm = llm

        # The eight leading spaces on the continuation lines are part of the
        # prompt text as originally written and are kept verbatim.
        self.prompt_template = (
            "Use the following pieces of context to answer the question at the end.\n"
            "        If you don't know the answer, just say that you don't know, don't try to make up an answer.\n"
            "\n"
            "        {context}\n"
            "\n"
            "        Question: {question}\n"
            "        Answer :"
        )
        qa_prompt = PromptTemplate(
            input_variables=["context", "question"],
            template=self.prompt_template,
        )
        self.chain_type_kwargs = {"prompt": qa_prompt}

    def setup_retrieval_qa(self, doc_embedding):
        """
        Build a RetrievalQA chain on top of the given document embeddings.

        Args:
            doc_embedding: Object exposing ``as_retriever()``; used for retrieval.

        Returns:
            RetrievalQA: Configured retrieval-based QA system.
        """
        logs.info("Setting up retrieval QA system...\n")
        retriever = doc_embedding.as_retriever()
        # "stuff" is the chain type the original code selected.
        return RetrievalQA.from_chain_type(
            llm=self.llm,
            chain_type="stuff",
            retriever=retriever,
            chain_type_kwargs=self.chain_type_kwargs,
        )

## Hugging Face token (HF_TOKEN), generated from your Hugging Face account.

In [5]:
from huggingface_hub import login
from dotenv import load_dotenv
from google.colab import userdata
import os

# Load variables from the .env file (if it exists)
load_dotenv('.env')

# Access the Hugging Face token from Colab secrets.
# Note: the token passed to login() below comes from the Colab secret named
# "HF_TOKEN", not from a value read out of the .env file in this cell.
hf_token = userdata.get("HF_TOKEN")

# Log in to Hugging Face
login(token=hf_token)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [6]:
import gradio as gr
import json
import re
from utils.custom_logs import logs
from huggingface_hub import login
from dotenv import load_dotenv
from google.colab import userdata
import os

# Read the run configuration. The keys used below are: embedding_model_name,
# chunk_size, chunk_overlap, model_id, max_length, temperature and load_int8.
with open('config.json', 'r') as config_file:
    config = json.load(config_file)

logs.info(f"Loaded config file: {config}")

# Load variables from the .env file (if it exists)
load_dotenv('.env')

# Access the Hugging Face token from Colab secrets
hf_token = userdata.get("HF_TOKEN")

# Login to Hugging Face
login(token=hf_token)

# Loading embedding model (DocProcessor builds its Embedder in __init__)
document_processor = DocProcessor(model_name=config["embedding_model_name"], chunk_size=config["chunk_size"], chunk_overlap=config["chunk_overlap"])

# Load model globally, once, so every request reuses the same LLM
model_loader = ModelRetriever(config["model_id"], config["max_length"], config["temperature"], config['load_int8'], hf_token=hf_token)
llm = model_loader.load_model()

qa_system = QASys(llm)

# Module-level state that chatbot() reads and rebinds through `global`:
#   doc_embedding - embeddings of the currently loaded PDF
#   pdf_filename  - name of the PDF those embeddings belong to
#   qa            - retrieval QA chain built on doc_embedding
doc_embedding = None
pdf_filename = None
qa = None
def chatbot(pdf_file, query):
    """
    Answer a question about the uploaded PDF.

    The embeddings and QA chain are kept in module-level state and rebuilt
    whenever a different PDF is supplied. Previously a new upload was only
    logged as a reset, and answers kept coming from the first document.

    Args:
        pdf_file: Uploaded file; either an object with a ``name`` attribute
            holding its path, or the path itself. ``None`` when nothing has
            been uploaded.
        query (str): Question to ask about the document.

    Returns:
        str: The answer with runs of newlines collapsed to one, or a short
        notice when no PDF has been uploaded.
    """
    global doc_embedding
    global pdf_filename
    global qa

    if pdf_file is None:
        return "Please upload a PDF document first."

    # Accept both file-like uploads (with .name) and plain path strings.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    if pdf_filename != pdf_path or doc_embedding is None or qa is None:
        logs.info("New PDF Found Resetting doc_embedding")
        # Drop the old state first so a failure below cannot leave the
        # previous document's index paired with the new file name.
        doc_embedding = None
        qa = None
        logs.info("Starting for new doc_embedding")
        doc_embedding = document_processor.process_document(pdf_path)
        qa = qa_system.setup_retrieval_qa(doc_embedding)
        pdf_filename = pdf_path

    result = qa({"query": query})
    return re.sub(r'\n+', '\n', result['result'])

# Build the Gradio UI: a PDF upload, an output box, a query box and a Submit
# button that calls chatbot(pdf_file, query) and writes its return value to
# the output box.
with gr.Blocks(theme=gr.themes.Default(primary_hue="red", secondary_hue="pink")) as demo:
    gr.Markdown("# Ask your Question to PDF Document")
    with gr.Row():
        with gr.Column(scale=4):
            pdf_file = gr.File(label="Upload your PDF")
    output = gr.Textbox(label="output",lines=3)
    query = gr.Textbox(label="query")
    btn = gr.Button("Submit")
    btn.click(fn=chatbot, inputs=[pdf_file,query], outputs=[output])
# Close any previously launched Gradio apps before starting this one
# (relevant when the cell is re-run).
gr.close_all()
# share=True asks Gradio for a public share link in addition to the local server.
demo.launch(share=True)

2025-07-31 01:11:55,694 - INFO - ipython-input-4288507256.py:13 - Loaded config file: {'embedding_model_name': 'thenlper/gte-base', 'model_id': 'meta-llama/Llama-2-7b-chat-hf', 'chunk_size': 500, 'chunk_overlap': 50, 'max_length': 2000, 'temperature': 0.05, 'load_int8': True}
INFO:custom_logger:Loaded config file: {'embedding_model_name': 'thenlper/gte-base', 'model_id': 'meta-llama/Llama-2-7b-chat-hf', 'chunk_size': 500, 'chunk_overlap': 50, 'max_length': 2000, 'temperature': 0.05, 'load_int8': True}
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.
2025-07-31 01:11:56,574 - INFO - ipython-input-1544690725.py:160 - Initializing document processor parameters - embedding model_name: thenlper/gte-base, chunk_size: 500, chunk_overlap: 50 ... 
INFO:custom_logger:Initializing document processor parameters - embedding model_name: thenlper/gte-base, chunk_size: 500, chunk_overlap: 50 ... 
2025-07-31 01:11:56,578 - 

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

2025-07-31 01:13:39,607 - INFO - ipython-input-3456520988.py:44 - Model is loaded successfully

INFO:custom_logger:Model is loaded successfully

Device set to use cuda:0
  llm = HuggingFacePipeline(pipeline=pipe)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://79d2482421201d87ca.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [15]:
pip install -U bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-c