<a href="https://colab.research.google.com/github/ratnesh003/HCLTech-Tasks/blob/main/Silver%20Badge%20Assignments/Assignment%201%20/HCLTech_ML_Assignment_2_Task_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Assignment 1

Use an open-source model in place of OpenAI to summarize a PDF document using LangChain and the concept of retrieval-augmented generation (RAG). Use the ChromaDB vector database to store the document embeddings.

### Installing Dependencies

In [None]:
!pip install \
  requests \
  opentelemetry-api \
  opentelemetry-sdk \
  opentelemetry-proto \
  opentelemetry-exporter-otlp-proto-common \
  langchain \
  langchain-community \
  langchain-huggingface \
  chromadb \
  pypdf \
  sentence-transformers \
  transformers \
  accelerate \
  bitsandbytes

### Upload the PDF

In [None]:
from google.colab import files
# PyPDFLoader lives in `langchain_community` (installed above); the old
# `langchain.document_loaders` path is deprecated.
from langchain_community.document_loaders import PyPDFLoader

# Prompt user to upload a PDF
uploaded = files.upload()

# A cancelled upload yields nothing to iterate over; fail with a clear message
# instead of a bare StopIteration from `next(...)`.
if not uploaded:
    raise ValueError("No file was uploaded. Please upload a PDF document.")

# Get the uploaded file name (assumes one PDF; the first entry is used)
pdf_path = next(iter(uploaded))

# Load PDF into a list of documents (`documents` is reported as "pages" below)
loader = PyPDFLoader(pdf_path)
documents = loader.load()

print(f"Loaded PDF: {pdf_path}")
print(f"Total pages loaded: {len(documents)}")

### Create the chunks

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Separators are tried in order: paragraphs, lines, sentences, words, and
# finally individual characters as a last resort.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", " ", ""],
    chunk_overlap=300,   # shared text between neighbouring chunks avoids abrupt cut-offs
    chunk_size=1200,     # big enough to keep a passage semantically coherent
)

# Break every loaded document into overlapping chunks
split_docs = text_splitter.split_documents(documents)

print(f"Total chunks created: {len(split_docs)}")


### Load the embedding model

In [None]:
# Use the dedicated `langchain-huggingface` integration installed above; the
# old `langchain.embeddings` import path is deprecated.
from langchain_huggingface import HuggingFaceEmbeddings

# Sentence-transformers model used to embed the document chunks
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)


### Set up ChromaDB

In [None]:
# The Chroma wrapper lives in `langchain_community` (installed above); the old
# `langchain.vectorstores` import path is deprecated.
from langchain_community.vectorstores import Chroma

# Embed every chunk and store it in a Chroma collection backed by
# `persist_directory`.
vectorstore = Chroma.from_documents(
    documents=split_docs,
    embedding=embedding_model,
    persist_directory="/content/chroma_db"
)

# No explicit `vectorstore.persist()` call: since Chroma 0.4 the collection is
# written to `persist_directory` automatically and the manual method is deprecated.


### Load the Mistral model

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from transformers import BitsAndBytesConfig
import torch

model_id = "mistralai/Mistral-7B-Instruct-v0.2"

tokenizer = AutoTokenizer.from_pretrained(model_id)

# 4-bit NF4 quantization with double quantization; compute runs in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

# Reuse EOS as the padding token when the tokenizer does not define one
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.float16,
)

# Keep the model config in sync with the tokenizer's padding token
model.config.pad_token_id = tokenizer.pad_token_id

text_generation_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    do_sample=True,           # required for `temperature` to take effect
    temperature=0.3,
    repetition_penalty=1.1
)


### Setting up the text generation pipeline

In [None]:
# Use the dedicated `langchain-huggingface` integration installed above; the
# old `langchain.llms` import path is deprecated.
from langchain_huggingface import HuggingFacePipeline

# Wrap the transformers pipeline so LangChain chains can call it as an LLM
llm = HuggingFacePipeline(pipeline=text_generation_pipeline)


### Setting up the retrieval configurations

In [None]:
from langchain.chains import RetrievalQA

# Similarity search over the vector store, returning the 4 closest chunks
retriever = vectorstore.as_retriever(
    search_kwargs={"k": 4},
    search_type="similarity",
)

# "stuff" chain: the retrieved chunks are placed into a single prompt for the
# LLM; the source documents are returned alongside the answer.
qa_chain = RetrievalQA.from_chain_type(
    chain_type="stuff",
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
)


### Adding Query and prompt for the generation

In [None]:
# Ask user for a custom query
query = input(
    "Enter your question about the uploaded PDF:\n"
).strip()

# Reject blank input early rather than retrieving on an empty string
if not query:
    raise ValueError("Query cannot be empty.")

# `invoke` is the supported retriever entry point; `get_relevant_documents`
# is deprecated.
docs = retriever.invoke(query)

# Concatenate the retrieved chunks, separated by blank lines, as the context
context = "\n\n".join(doc.page_content for doc in docs)

prompt = f"""
You are a senior data engineer.

Using the context below, answer the question clearly and technically.
Do NOT repeat the context.
Do NOT include headings.

Context:
{context}

Question:
{query}

Answer:
"""

### Streaming the text generation

In [None]:
from transformers import TextIteratorStreamer
from threading import Thread

MAX_INPUT_TOKENS = 8192

# Create streamer; the prompt and special tokens are excluded from the stream
streamer = TextIteratorStreamer(
    tokenizer,
    skip_prompt=True,
    skip_special_tokens=True
)

# Tokenize input WITH attention mask and move it to the model's device
inputs = tokenizer(
    prompt,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=MAX_INPUT_TOKENS
).to(model.device)

generation_kwargs = {
    "input_ids": inputs["input_ids"],
    "attention_mask": inputs["attention_mask"],
    "streamer": streamer,
    "max_new_tokens": 512,
    "do_sample": True,          # REQUIRED for temperature
    "temperature": 0.3,
    "repetition_penalty": 1.1,
    "pad_token_id": tokenizer.pad_token_id
}


def _generate_and_release_streamer():
    """Run generation in the worker thread.

    If generation fails, close the stream before re-raising so the reader
    loop below is not left blocked waiting for text that will never arrive.
    """
    try:
        model.generate(**generation_kwargs)
    except Exception:
        streamer.end()
        raise


# Background generation: the worker feeds the streamer while this cell prints
thread = Thread(target=_generate_and_release_streamer)

thread.start()

# Stream answer text only (each item is a decoded text fragment)
print("\nAnswer:\n")
for text_chunk in streamer:
    print(text_chunk, end="", flush=True)

# Wait for the worker to finish before the cell completes
thread.join()

### Listing the references used for answer generation

In [None]:
print("\n\nReferences used from the document:\n")

# List the source file and page of every chunk retrieved for the answer
for i, doc in enumerate(docs, 1):
    source = doc.metadata.get("source", "Unknown source")
    page = doc.metadata.get("page", "Unknown page")
    # PyPDFLoader stores a zero-based page index; show a 1-based page number
    if isinstance(page, int):
        page += 1
    print(f"[{i}] Source: {source}, Page: {page}")