<a href="https://colab.research.google.com/github/rajaranjith/HCL-GenAI-Training/blob/main/GenAI-Application-Developer-FY26-SilverBadge-19Nov2025-Ass1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install langchain langchain-core langchain-community langchain-huggingface langchain-text-splitters
!pip install transformers sentence-transformers accelerate bitsandbytes
!pip install chromadb
!pip install pypdf
!pip install --upgrade langchain langchain-core langchain-community

Collecting langchain-community
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting langchain-huggingface
  Downloading langchain_huggingface-1.0.1-py3-none-any.whl.metadata (2.1 kB)
INFO: pip is looking at multiple versions of langchain-community to determine which version is compatible with other requirements. This could take a while.
Collecting langchain-community
  Downloading langchain_community-0.4-py3-none-any.whl.metadata (3.0 kB)
  Downloading langchain_community-0.3.31-py3-none-any.whl.metadata (3.0 kB)
Collecting requests<3,>=2 (from langchain)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting dataclasses-json<0.7.0,>=0.6.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
INFO: pip is looking at multiple versions of langchain-huggingface to determine which version is compatible with other requirements. This could take a while.
Collecting langchain-huggingface
  Downloa

In [None]:
# ---- Imports
import os
os.environ["PYDEVD_DISABLE_FILE_VALIDATION"] = "1"
import shutil
from pathlib import Path
import torch
from google.colab import files

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from transformers import BitsAndBytesConfig

import langchain_community
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings

from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain_huggingface import HuggingFacePipeline
from langchain_core.prompts import ChatPromptTemplate

In [None]:
import langchain
print(langchain.__version__)

# LangChain 1.x no longer ships `langchain.chains`; the legacy chain
# constructors live in the separate `langchain-classic` distribution under the
# same module layout. Try the pre-1.0 location first and fall back, so this
# cell imports on either major version. ModuleNotFoundError is a subclass of
# ImportError, so one handler covers both failure modes.
try:
    from langchain.chains.combine_documents.stuff import create_stuff_documents_chain
    from langchain.chains.retrieval import create_retrieval_chain
except ImportError:
    from langchain_classic.chains.combine_documents.stuff import create_stuff_documents_chain
    from langchain_classic.chains.retrieval import create_retrieval_chain

1.0.5


ModuleNotFoundError: No module named 'langchain.chains'

In [None]:
# ---- Configuration (you can tweak these)
# Hugging Face model id loaded by build_llm().
MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"   # small + good quality; alternatives in comments below
# Other good small open instruct models:
#   "Qwen/Qwen2.5-1.5B-Instruct"        (multilingual, small)
#   "google/gemma-2-2b-it"              (note license constraints for some use cases)
#   "TinyLlama/TinyLlama-1.1B-Chat-v1.0" (very small; lower quality)
MAX_NEW_TOKENS = 800  # passed as max_new_tokens to the text-generation pipeline
TEMPERATURE = 0.2     # sampling temperature passed to the pipeline
TOP_P = 0.9           # nucleus-sampling cutoff passed to the pipeline
CHUNK_SIZE = 1200     # chunk_size handed to RecursiveCharacterTextSplitter
CHUNK_OVERLAP = 150   # chunk_overlap handed to RecursiveCharacterTextSplitter
# NOTE(review): RETRIEVAL_K chunks of up to CHUNK_SIZE each is up to 21,600
# splitter length units of stuffed context, plus MAX_NEW_TOKENS of output.
# MODEL_ID's name suggests a 4k-token window -- confirm the prompt actually
# fits, or lower RETRIEVAL_K / CHUNK_SIZE (or pick a longer-context model).
RETRIEVAL_K = 18    # increase for more coverage across the whole PDF
PERSIST_DIR = "/content/chroma_pdf_index"  # directory where the Chroma index is stored
TARGET_BULLETS = 8  # how many key bullet points you want in the output

# ---- Device & precision helpers
def get_device():
    """Return "cuda" when torch reports an available CUDA device, else "cpu"."""
    return "cuda" if torch.cuda.is_available() else "cpu"

# Resolved once at cell execution time; build_llm() reads this module-level value.
device = get_device()
print(f"[Info] Using device: {device}")

# ---- Build LLM with smart fallback to quantization
def build_llm(model_id=MODEL_ID):
    """Load ``model_id`` and wrap it as a LangChain ``HuggingFacePipeline``.

    Loading strategy, driven by the module-level ``device``:

    * ``"cuda"``: try 4-bit NF4 quantization via bitsandbytes first; if that
      raises for any reason, fall back to a float16 load on the GPU.
    * anything else: plain load with library defaults (CPU).

    Generation settings come from the module-level ``MAX_NEW_TOKENS``,
    ``TEMPERATURE`` and ``TOP_P`` constants.

    Args:
        model_id: Hugging Face model id to load.

    Returns:
        A ``HuggingFacePipeline`` wrapping a ``text-generation`` pipeline.
    """
    print(f"[Info] Loading model: {model_id}")
    # NOTE(security): trust_remote_code=True lets transformers execute Python
    # files shipped in the model repository. Only use model ids you trust, and
    # consider pinning a `revision` so the executed code cannot change silently.
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

    if device == "cuda":
        # Try 4-bit quantization to fit comfortably on Colab GPUs
        try:
            print("[Info] Trying 4-bit quantization (bitsandbytes)...")
            bnb_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_compute_dtype=torch.bfloat16,
                bnb_4bit_quant_type="nf4",
            )
            model = AutoModelForCausalLM.from_pretrained(
                model_id,
                trust_remote_code=True,
                quantization_config=bnb_config,
                device_map="auto",
            )
        except Exception as e:
            # Deliberately broad: any quantization problem (missing or
            # incompatible bitsandbytes, unsupported GPU, ...) should degrade
            # to an unquantized load rather than abort the notebook.
            print(f"[Warn] 4-bit failed: {e}\n[Info] Falling back to float16 on GPU...")
            model = AutoModelForCausalLM.from_pretrained(
                model_id,
                trust_remote_code=True,
                torch_dtype=torch.float16,
                device_map="auto",
            )
    else:
        print("[Warn] No GPU detected. Running on CPU; this will be slower.")
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            trust_remote_code=True,
        )

    # Sampling needs a strictly positive temperature. If TEMPERATURE is tuned
    # down to 0, switch to greedy decoding instead of failing inside generate().
    use_sampling = TEMPERATURE > 0
    generation_kwargs = {
        "max_new_tokens": MAX_NEW_TOKENS,
        "do_sample": use_sampling,
        "repetition_penalty": 1.05,
        # Reuse EOS as the padding id for generation.
        "pad_token_id": tokenizer.eos_token_id,
    }
    if use_sampling:
        generation_kwargs["temperature"] = TEMPERATURE
        generation_kwargs["top_p"] = TOP_P

    gen_pipe = pipeline(
        task="text-generation",
        model=model,
        tokenizer=tokenizer,
        **generation_kwargs,
    )
    return HuggingFacePipeline(pipeline=gen_pipe)

# ---- Build vector store (Chroma) from uploaded PDF
def build_vectorstore(pdf_path: str, persist_dir: str = PERSIST_DIR, rebuild: bool = True) -> Chroma:
    """Index a PDF into a persisted Chroma collection named ``pdf_summary``.

    Steps: load the PDF with ``PyPDFLoader``, split it into overlapping chunks
    (``CHUNK_SIZE`` / ``CHUNK_OVERLAP``), embed the chunks with
    ``sentence-transformers/all-MiniLM-L6-v2`` and store them under
    ``persist_dir``.

    Args:
        pdf_path: Path of the PDF handed to ``PyPDFLoader``.
        persist_dir: Directory used as Chroma's ``persist_directory``.
        rebuild: When true and ``persist_dir`` exists, the directory is deleted
            first. When false the directory is left in place and the new
            chunks are written into it (presumably alongside any chunks already
            stored in the same collection -- verify before relying on it).

    Returns:
        The populated ``Chroma`` vector store.

    Raises:
        ValueError: If the PDF yields no text chunks (for example a scanned,
            image-only PDF), since an empty index cannot be queried.
    """
    persist = Path(persist_dir)
    if rebuild and persist.exists():
        # Best-effort wipe: a stale index that cannot be removed should not
        # abort the run.
        shutil.rmtree(persist, ignore_errors=True)

    # 1) Load PDF
    loader = PyPDFLoader(pdf_path)
    docs = loader.load()  # retains metadata including page numbers

    # 2) Chunk
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        separators=["\n\n", "\n", " ", ""]
    )
    splits = splitter.split_documents(docs)
    if not splits:
        # Fail early with an actionable message instead of letting the vector
        # store choke on an empty document list.
        raise ValueError(
            f"No extractable text found in {pdf_path!r}; "
            "the PDF may be empty or contain only scanned images."
        )

    # 3) Embeddings (open-source)
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

    # 4) Vector store (persisted)
    vectordb = Chroma.from_documents(
        documents=splits,
        embedding=embeddings,
        collection_name="pdf_summary",
        persist_directory=str(persist),
    )
    # Explicit persist() is a legacy API: it is deprecated in langchain_community
    # and not present on every Chroma wrapper. Call it only where it exists so
    # older stacks still flush to disk and newer ones do not crash.
    legacy_persist = getattr(vectordb, "persist", None)
    if callable(legacy_persist):
        legacy_persist()
    return vectordb

[Info] Using device: cuda


In [None]:
# ---- Build RAG chain (retriever + summarization prompt)
def build_rag_chain(vectordb: Chroma, llm: HuggingFacePipeline):
    """Assemble the retrieval + "stuff documents" summarization chain.

    The retriever uses MMR search over ``vectordb``, returning ``RETRIEVAL_K``
    chunks picked from a candidate pool of ``max(RETRIEVAL_K*3, 40)``. The
    prompt template has three placeholders: ``target_bullets``, ``context`` and
    ``input``.

    NOTE(review): the template asks the model to cite page numbers from the
    excerpts' metadata, but no custom document prompt is passed to
    ``create_stuff_documents_chain`` -- presumably only the chunk text reaches
    the model; confirm whether page metadata is actually rendered.

    Args:
        vectordb: Vector store to retrieve chunks from.
        llm: Language model that writes the summary.

    Returns:
        The chain produced by ``create_retrieval_chain``.
    """
    # Maximal Marginal Relevance to reduce redundancy
    mmr_settings = {
        "k": RETRIEVAL_K,
        "fetch_k": max(RETRIEVAL_K*3, 40),
        "lambda_mult": 0.5,
    }
    retriever = vectordb.as_retriever(search_type="mmr", search_kwargs=mmr_settings)

    summary_template = """You are an expert at summarizing long PDFs for busy professionals.\nFollow these rules:\n- Base your summary ONLY on the retrieved context excerpts.\n- Organize with clear section headings and short paragraphs.\n- Include {target_bullets} key bullet points focusing on the most important insights.\n- If page numbers appear in the excerpts' metadata, cite them like (p. X).\n- Finish with a crisp TL;DR.\n\n<context>\n{context}\n</context>\n\nUser request:\n{input}\n"""
    prompt = ChatPromptTemplate.from_template(summary_template)

    # Stuff every retrieved chunk into the prompt, then put retrieval in front.
    combine_docs = create_stuff_documents_chain(llm, prompt)
    return create_retrieval_chain(retriever, combine_docs)

In [None]:
# ---- Let user upload a PDF
print("📄 Please upload your PDF…")
uploaded = files.upload()
if not uploaded:
    # An `if` whose body is only a comment is a syntax error, and without a
    # file there is nothing to index -- stop here with a clear message.
    raise ValueError("No file uploaded.")
# Use the first uploaded file; its key in `uploaded` is the file name.
pdf_path = next(iter(uploaded))
print(f"[Info] Using PDF: {pdf_path}")
# ---- Build components and run RAG summarization
llm = build_llm(MODEL_ID)
vectordb = build_vectorstore(pdf_path, PERSIST_DIR, rebuild=True)
rag_chain = build_rag_chain(vectordb, llm)

user_request = (
    "Provide a comprehensive summary of the entire PDF for a time-pressed reader. "
    f"Include around {TARGET_BULLETS} key bullet points, note major sections/themes, "
    "highlight critical figures/tables if present, and cite page numbers when possible."
)

print("\n[Info] Running RAG summarization… this can take a minute.")
# `input` feeds both the retriever and the prompt; `target_bullets` fills the
# extra placeholder in the summarization prompt.
result = rag_chain.invoke({"input": user_request, "target_bullets": TARGET_BULLETS})

summary = result.get("answer", "").strip()
print("\n" + "="*80)
print("DOCUMENT SUMMARY (RAG + Open-Source LLM)")
print("="*80 + "\n")
print(summary)

# ---- Save and download the summary
out_path = "summary.md"
with open(out_path, "w", encoding="utf-8") as f:
    f.write("# Document Summary (RAG)\n\n")
    f.write(summary + "\n")
print(f"\n[Info] Saved summary to: {out_path}")

print("\n📥 Downloading summary.md …")
files.download(out_path)

[Info] Loading model: microsoft/Phi-3-mini-4k-instruct


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

[Info] Trying 4-bit quantization (bitsandbytes)...


config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

configuration_phi3.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

Device set to use cuda:0


ValueError: File path ['/sample_data/resume.pdf: No such file or directory'] is not a valid file or url