<a href="https://colab.research.google.com/github/shirishagugulothu/LLM_Assignment/blob/main/Untitled3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install necessary libraries: Huggingface Transformers, a CPU version of FAISS (for nearest neighbor search),
# PyMuPDF for PDF file reading, and Sentence‑Transformers for embeddings.
!pip install transformers faiss-cpu pymupdf sentence-transformers

# Upload a PDF file through Colab's file-picker interface
from google.colab import files
uploaded = files.upload()  # Triggers file picker to upload PDF
pdf_path = list(uploaded.keys())[0]  # Get the filename of the uploaded PDF

# Use PyMuPDF (fitz) to open and read the PDF text
import fitz
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one string.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        The text of all pages concatenated in page order, with no
        separator inserted between pages.
    """
    # The context manager closes the document (and its underlying file
    # handle) even if text extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        # join() builds the result in one pass instead of re-allocating the
        # string on every page.
        return "".join(page.get_text() for page in doc)

# Read the uploaded PDF once; the chunking step below works on this single string.
document_text = extract_text_from_pdf(pdf_path)  # Extract all text from the PDF

# Split the full document text into smaller chunks
import textwrap
def chunk_text(text, chunk_size=300):
    """Split *text* into a list of pieces at most *chunk_size* characters long.

    The splitting is done by the standard ``textwrap`` wrapper with its
    default settings, so newlines and tabs are turned into spaces, breaks
    fall on whitespace where possible, and a word longer than *chunk_size*
    is cut to fit. Empty input produces an empty list.
    """
    wrapper = textwrap.TextWrapper(width=chunk_size)
    pieces = wrapper.wrap(text)
    return pieces

chunks = chunk_text(document_text)

print(f"Number of chunks: {len(chunks)}")  # Show how many chunks were created
if not chunks:
    # No chunks means no text came out of the PDF (e.g. an image-only scan).
    # Stop with a clear message instead of an IndexError on chunks[0].
    raise ValueError("No text could be extracted from the PDF, so no chunks were created.")
print("sample_chunk:", chunks[0])           # Print the first chunk as an example

# Load a pre-trained sentence-embedding model that turns text into semantic vectors
from sentence_transformers import SentenceTransformer
embed_model = SentenceTransformer("all-MiniLM-L6-v2")  # Lightweight model

# Generate one embedding (numerical vector) per text chunk
embeddings = embed_model.encode(chunks)

# Set up a FAISS index for fast similarity search
import faiss
import numpy as np
# FAISS works on contiguous float32 matrices. Convert once here, matching the
# float32 cast applied to query vectors at search time; this is a no-op (no copy)
# when the encoder already returned such an array.
embedding_matrix = np.ascontiguousarray(embeddings, dtype="float32")
dim = embedding_matrix.shape[1]       # Vector dimensionality (columns of the matrix)
index = faiss.IndexFlatL2(dim)        # Use an index that computes L2 distance
index.add(embedding_matrix)           # Populate index with all chunk embeddings

# Build the answer generator: a text2text-generation pipeline backed by the
# small FLAN-T5 checkpoint
from transformers import pipeline
generator = pipeline(
    task="text2text-generation",
    model="google/flan-t5-small",
)

# Define a function to retrieve relevant chunks and generate an answer
def retrieve_and_answer(query, top_k=1, max_length=80):
    """Answer *query* using the most similar document chunks as context.

    The query is embedded with ``embed_model``, its nearest neighbours are
    looked up in the FAISS ``index``, the matching entries of ``chunks`` are
    joined into a context string, and ``generator`` is prompted with that
    context plus the question.

    Args:
        query: The question to answer.
        top_k: Number of nearest chunks to request from the index.
        max_length: ``max_length`` value forwarded to the generator; the
            default of 80 is the value that used to be hard-coded.

    Returns:
        The ``generated_text`` of the first generator result.
    """
    # The index is searched with a 2-D float32 array: one row per query.
    query_embedding = np.asarray(embed_model.encode([query]), dtype="float32")
    _, indices = index.search(query_embedding, top_k)
    # When the index holds fewer than top_k vectors FAISS pads the result with
    # -1. Skip those slots; otherwise chunks[-1] would silently add the last
    # chunk to the context.
    retrieved_texts = [chunks[i] for i in indices[0] if i >= 0]
    context = " ".join(retrieved_texts)  # Join chunks into one context string
    prompt = f"context: {context} \n\n Question: {query}\nAnswer:"  # Build prompt
    result = generator(prompt, max_length=max_length)
    return result[0]['generated_text']

# Demo: send one question about "temperature tuning" through retrieval and
# generation, then show the question and the generated answer
Question = "What is temperature tuning"
Answer = retrieve_and_answer(Question)
print(f"Q: {Question}")
print(f"A: {Answer}")
