# Hybrid PDF Question Answering (TF-IDF + FAISS retrieval, OpenAI answer generation)

In [None]:
%pip install fitz langchain sentence_transformers scikit-learn openai  scikit-learn  faiss-cpu tools pdfplumber
!pip install openai==0.28

Collecting fitz
  Downloading fitz-0.0.1.dev2-py2.py3-none-any.whl.metadata (816 bytes)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting tools
  Downloading tools-1.0.2-py3-none-any.whl.metadata (1.4 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.6-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting configobj (from fitz)
  Downloading configobj-5.0.9-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting configparser (from fitz)
  Downloading configparser-7.2.0-py3-none-any.whl.metadata (5.5 kB)
Collecting nipype (from fitz)
  Downloading nipype-1.10.0-py3-none-any.whl.metadata (7.1 kB)
Collecting pyxnat (from fitz)
  Downloading pyxnat-1.6.3-py3-none-any.whl.metadata (5.4 kB)
Collecting pdfminer.six==20250327 (from pdfplumber)
  Downloading pdfminer_six-20250327-py3-none-any.whl.metadata (4.1 kB

## Reading PDF

In [None]:
import os

import faiss
import fitz  # PyMuPDF
import numpy as np
import openai
import pdfplumber # Pdf extraction
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Extract plain text from a PDF
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of all pages of the PDF at ``pdf_path``.

    Pages for which ``extract_text()`` yields nothing (``None`` or an empty
    string) are skipped. Every kept page is followed by a newline, and the
    final result has leading/trailing whitespace stripped.
    """
    with pdfplumber.open(pdf_path) as document:
        page_texts = [page.extract_text() for page in document.pages]
    return "".join(content + "\n" for content in page_texts if content).strip()

In [None]:
# Split text into overlapping chunks
def chunk_text(text, chunk_size=500, overlap=50):
    """Split ``text`` with a ``RecursiveCharacterTextSplitter``.

    ``chunk_size`` and ``overlap`` are forwarded to the splitter as its
    ``chunk_size`` and ``chunk_overlap`` settings. Returns whatever the
    splitter's ``split_text`` produces for ``text``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)


In [None]:
# Embedding model (SentenceTransformer), loaded once at module level.
# It is shared by embed_texts() for the document chunks and by
# hybrid_retrieve() for the query, so both are encoded with the same model.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
def embed_texts(texts):
    """Encode ``texts`` with the module-level ``model``.

    A progress bar is shown while encoding. The return value is exactly
    what ``model.encode`` returns for ``texts``.
    """
    vectors = model.encode(texts, show_progress_bar=True)
    return vectors


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Keyword-based retrieval index
def bm25_index(text_chunks):
    """Fit a keyword index over ``text_chunks``.

    Note: despite the function and variable names, the weighting used here
    is a default-configured ``TfidfVectorizer`` (TF-IDF), not BM25.

    Returns a ``(vectorizer, matrix)`` tuple: the fitted vectorizer and the
    matrix produced by ``fit_transform`` on ``text_chunks``.
    """
    tfidf = TfidfVectorizer()
    return tfidf, tfidf.fit_transform(text_chunks)


In [None]:
# Semantic search: meaning-based retrieval index
def build_faiss_index(embeddings):
    """Create a flat L2 FAISS index containing ``embeddings``.

    The index dimensionality is read from ``embeddings.shape[1]``, and the
    embeddings are added to the index before it is returned.
    """
    flat_index = faiss.IndexFlatL2(embeddings.shape[1])
    flat_index.add(embeddings)
    return flat_index


In [None]:
def hybrid_retrieve(query, text_chunks, faiss_index, embeddings, vectorizer, bm25_matrix, top_k=5):
    """Return up to ``top_k`` chunks ranked by a combined keyword + semantic score.

    Args:
        query: Question text.
        text_chunks: Sequence of chunk strings; positions must line up with the
            rows of ``bm25_matrix`` and the vectors stored in ``faiss_index``.
        faiss_index: Index exposing ``search(vectors, k)``.
        embeddings: Unused; kept so existing callers keep working.
        vectorizer: Fitted vectorizer used to transform the query.
        bm25_matrix: Keyword matrix for the chunks (as returned by ``bm25_index``).
        top_k: Maximum number of chunks to return.

    Returns:
        A list of at most ``top_k`` chunks, best first. Empty when there are
        no chunks or ``top_k`` is not positive.

    Scoring: each chunk gets its cosine similarity to the query in keyword
    space, plus 1 if it is among the semantic nearest neighbours.
    """
    n_chunks = len(text_chunks)
    # Never ask for more results than there are chunks. Also guards against
    # top_k <= 0, where the slice `[-0:]` would otherwise select *every* chunk.
    k = min(top_k, n_chunks)
    if k <= 0:
        return []

    # Keyword side
    query_bm25 = vectorizer.transform([query])
    bm25_scores = cosine_similarity(query_bm25, bm25_matrix).flatten()

    # Embedding side (FAISS works on float32 vectors)
    query_embed = np.asarray(model.encode([query]), dtype=np.float32)
    _, faiss_indices = faiss_index.search(query_embed, k)
    faiss_scores = np.zeros(n_chunks)
    for idx in faiss_indices[0]:
        # FAISS pads missing neighbours with -1; without this check the -1
        # would index from the end and wrongly boost the last chunk.
        if 0 <= idx < n_chunks:
            faiss_scores[idx] += 1  # simple boosting

    # Combine scores and take the k best, highest first
    combined_scores = bm25_scores + faiss_scores
    top_indices = combined_scores.argsort()[-k:][::-1]
    return [text_chunks[i] for i in top_indices]


In [None]:

# Read the OpenAI key from the environment instead of hardcoding it in the notebook.
# Falls back to an empty string (the previously hardcoded value) when
# OPENAI_API_KEY is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

def query_chatgpt_4o(query, context_chunks, model_name="gpt-4o-mini"):
    """Ask an OpenAI chat model to answer ``query`` from ``context_chunks``.

    Args:
        query: Question text, appended to the prompt after the context.
        context_chunks: Iterable of strings; joined with blank lines to form
            the context section of the prompt.
        model_name: Chat model passed to ``openai.ChatCompletion.create``.
            Defaults to ``"gpt-4o-mini"``, the value that was previously
            hard-coded, so existing callers are unaffected.

    Returns:
        The message content of the first choice in the response.
    """
    context = "\n\n".join(context_chunks)
    prompt = f"""Use the following context to answer the question:\n\n{context}\n\nQuestion: {query}"""

    response = openai.ChatCompletion.create(
        model=model_name,
        messages=[
            {"role": "system", "content": "You're a helpful assistant."},
            {"role": "user", "content": prompt}
        ]
    )
    return response['choices'][0]['message']['content']


In [None]:
# Step 1: Load and chunk
PDF_PATH = "/content/Sachin_bio.pdf"  # change this to point at a different document
text = extract_text_from_pdf(PDF_PATH)
chunks = chunk_text(text)
# A PDF with no extractable text produces no chunks; stop here with a clear
# message instead of failing later inside the indexing step.
if not chunks:
    raise ValueError(f"No extractable text found in {PDF_PATH}")

# Step 2: Indexing (semantic embeddings + keyword matrix over the same chunks)
embeddings = embed_texts(chunks)
vectorizer, bm25_matrix = bm25_index(chunks)
faiss_index = build_faiss_index(np.array(embeddings))

# Step 3: Hybrid retrieval and answer generation
query = "What he did at age of 15?"
top_chunks = hybrid_retrieve(query, chunks, faiss_index, embeddings, vectorizer, bm25_matrix)
answer = query_chatgpt_4o(query, top_chunks)

print("Answer:\n", answer)




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Answer:
 At the age of 15, Sachin Tendulkar made his debut in first-class cricket for Mumbai in the Ranji Trophy, scoring a century in his debut match against Gujarat. This achievement made him the youngest Indian to score a century in the Ranji Trophy at that time.
