<a href="https://colab.research.google.com/github/prajapati-oss/AI-Book-Recommendation-System/blob/main/Catboat.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# src/ingest_pdf.py
import json
from pypdf import PdfReader
from tqdm import tqdm
import os
from pathlib import Path
from transformers import AutoTokenizer

def read_pdf(path):
    reader = PdfReader(path)
    pages = []
    for i, page in enumerate(reader.pages):
        text = page.extract_text() or ""
        pages.append({"page": i+1, "text": text})
    return pages

def chunk_text(pages, chunk_chars=3000, overlap_chars=500):
    chunks = []
    for p in pages:
        text = p['text']
        start = 0
        while start < len(text):
            end = min(len(text), start + chunk_chars)
            chunk_text = text[start:end]
            chunks.append({
                "page": p['page'],
                "text": chunk_text,
                "start_char": start,
                "end_char": end
            })
            start = end - overlap_chars
            if start < 0:
                start = 0
    return chunks

def save_chunks(chunks, out_path):
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    with open(out_path, 'w', encoding='utf-8') as f:
        for c in chunks:
            f.write(json.dumps(c, ensure_ascii=False) + "\n")

# Removed the if __name__ == "__main__": block and argparse
# Call the functions directly
pdf_path ="/content/Chatboat.pdf"  # Use a path that works in Colab
out_path = "../data/rbi_chunks.jsonl"
pages = read_pdf(pdf_path)
chunks = chunk_text(pages)
save_chunks(chunks, out_path)
print(f"Saved {len(chunks)} chunks to {out_path}")



PdfReadError: Cannot find Root object in pdf

In [1]:
!pip install pypdf

Collecting pypdf
  Downloading pypdf-6.1.1-py3-none-any.whl.metadata (7.1 kB)
Downloading pypdf-6.1.1-py3-none-any.whl (323 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/323.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m323.5/323.5 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-6.1.1


In [None]:
!pip install tqdm

In [None]:
# ingest.py
import os
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document
import pickle
from dotenv import load_dotenv
load_dotenv()

PDF_PATH = "/content/Chatboat.pdf"

# --- text loading & chunking ---
loader = PyPDFLoader(PDF_PATH)
docs = loader.load()  # list[Document], with page content in .page_content

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(docs)

print(f"Loaded {len(docs)} pages, produced {len(chunks)} chunks.")

# --- embeddings: wrapper to call Gemini embeddings or fallback to sentence-transformers ---
def embed_texts(batch_texts):
    """Return list[list[float]] embeddings for batch_texts."""
    # Simple fallback: local sentence-transformers
    from sentence_transformers import SentenceTransformer
    model = SentenceTransformer("all-MiniLM-L6-v2")
    return model.encode(batch_texts, show_progress_bar=False).tolist()

# Compute embeddings and build FAISS
texts = [d.page_content for d in chunks]
metadatas = [d.metadata for d in chunks]

embs = embed_texts(texts)

# Build FAISS
import numpy as np
import faiss
vecs = np.array(embs).astype("float32")
index = faiss.IndexFlatL2(vecs.shape[1])
index.add(vecs)

# Save index + metadata + texts
faiss.write_index(index, "rbi_faiss.index")
with open("rbi_chunks.pkl", "wb") as f:
    pickle.dump({"texts": texts, "metadatas": metadatas}, f)

print("Saved FAISS index and chunk metadata.")


Loaded 330 pages, produced 855 chunks.
Saved FAISS index and chunk metadata.


In [None]:
# gemini_embeddings_template.py (snippet)
import os, requests
API_KEY = os.environ.get("GEMINI_API_KEY")  # set in .env

def gemini_embed_texts(texts, model="textembedding-gecko-001"):
    url = "https://api.generative.googleapis.com/v1beta2/embeddings:embed"  # check current URL in Gemini docs
    headers = {"Authorization": f"Bearer {API_KEY}", "Content-Type":"application/json"}
    payload = {"model": model, "input": texts}
    r = requests.post(url, json=payload, headers=headers)
    r.raise_for_status()
    data = r.json()
    # map to vector list depending on returned json shape
    return [item["embedding"] for item in data["data"]]


In [None]:
# chatbot.py
import os, pickle, faiss, numpy as np, requests
from dotenv import load_dotenv
load_dotenv()

FAISS_INDEX_PATH = "rbi_faiss.index"
CHUNKS_PATH = "rbi_chunks.pkl"
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
GEMINI_MODEL = "gemini-ultra-1.0"  # example. pick a model per pricing/docs.

# load faiss + chunks
index = faiss.read_index(FAISS_INDEX_PATH)
with open(CHUNKS_PATH, "rb") as f:
    store = pickle.load(f)
texts = store["texts"]

def retrieve(query, k=4):
    # embed query using same embedding method as used for index
    # if you used sentence-transformers:
    from sentence_transformers import SentenceTransformer
    model = SentenceTransformer("all-MiniLM-L6-v2")
    q_emb = model.encode([query]).astype("float32")
    D, I = index.search(q_emb, k)
    results = [texts[i] for i in I[0]]
    return results

def call_gemini_prompt(prompt, max_tokens=512):
    # minimal REST call example - adjust per docs
    url = f"https://generativelanguage.googleapis.com/v1beta2/models/{GEMINI_MODEL}:generateText"
    headers = {"Authorization": f"Bearer {GEMINI_API_KEY}", "Content-Type": "application/json"}
    body = {
        "prompt": prompt,
        "max_output_tokens": max_tokens
    }
    # Added verify=False as a potential workaround for SSL errors. Not recommended for production.
    r = requests.post(url, headers=headers, json=body, verify=False)
    r.raise_for_status()
    return r.json()["candidates"][0]["content"]  # adjust path to response shape

if __name__ == "__main__":
    while True:
        q = input("\nQuestion (or 'exit'): ")
        if q.strip().lower() in ("exit","quit"): break
        print("\nAnswer:\n", answer_question(q))

def answer_question(question):
    snippets = retrieve(question, k=4)
    context = "\n\n---\n\n".join(snippets)
    prompt = f"You are a helpful assistant. Use the context below from RBI Master Direction to answer the question. If the answer is not found, reply 'I don't find a direct answer in RBI Master Direction.'\n\nContext:\n{context}\n\nQuestion: {question}\nAnswer:"
    resp = call_gemini_prompt(prompt)
    return resp


Question (or 'exit'): What are the four regulatory layers for NBFCs?




HTTPError: 401 Client Error: Unauthorized for url: https://generativelanguage.googleapis.com/v1beta2/models/gemini-ultra-1.0:generateText

In [None]:
import os
print(os.environ.get("GEMINI_API_KEY"))

None


In [None]:
import requests
import os
from dotenv import load_dotenv
load_dotenv()

API_KEY = os.environ.get("https://ai.google.dev/gemini-api/docs/pricing") # Correctly get API key from environment variable
MODEL = "gemini-pro"
url = f"https://generativelanguage.googleapis.com/v1beta2/models/{GEMINI_MODEL}:generateText"

def gemini_answer(prompt):
    data = {
        "contents": [{"parts":[{"text": prompt}]}]
    }
    resp = requests.post(url, json=data)
    resp.raise_for_status()
    return resp.json()["candidates"][0]["content"]["parts"][0]["text"]

print(gemini_answer("What are the four regulatory layers for NBFCs?"))

HTTPError: 403 Client Error: Forbidden for url: https://generativelanguage.googleapis.com/v1beta2/models/gemini-ultra-1.0:generateText

In [35]:
# ingest.py
import os
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document
import pickle
from dotenv import load_dotenv
load_dotenv()

PDF_PATH = "/content/Chatboat.pdf"

# --- text loading & chunking ---
loader = PyPDFLoader(PDF_PATH)
docs = loader.load()  # list[Document], with page content in .page_content

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(docs)

print(f"Loaded {len(docs)} pages, produced {len(chunks)} chunks.")

# --- embeddings: wrapper to call Gemini embeddings or fallback to sentence-transformers ---
def embed_texts(batch_texts):
    """Return list[list[float]] embeddings for batch_texts."""
    # Simple fallback: local sentence-transformers
    from sentence_transformers import SentenceTransformer
    model = SentenceTransformer("all-MiniLM-L6-v2")
    return model.encode(batch_texts, show_progress_bar=False).tolist()

# Compute embeddings and build FAISS
texts = [d.page_content for d in chunks]
metadatas = [d.metadata for d in chunks]

embs = embed_texts(texts)

# Build FAISS
import numpy as np
import faiss
vecs = np.array(embs).astype("float32")
index = faiss.IndexFlatL2(vecs.shape[1])
index.add(vecs)

# Save index + metadata + texts
faiss.write_index(index, "rbi_faiss.index")
with open("rbi_chunks.pkl", "wb") as f:
    pickle.dump({"texts": texts, "metadatas": metadatas}, f)

print("Saved FAISS index and chunk metadata.")


Loaded 330 pages, produced 855 chunks.
Saved FAISS index and chunk metadata.


In [37]:
# chatbot.py
import os, pickle, faiss, numpy as np, requests
from google.colab import userdata

FAISS_INDEX_PATH = "rbi_faiss.index"
CHUNKS_PATH = "rbi_chunks.pkl"

# Get API key from Colab secrets (make sure you saved it as 'GEMINI_API_KEY')
GEMINI_API_KEY = userdata.get('GEMINI_API_KEY'=='AIzaSyD5kjhdQi_nle9FlyTVTGmkCINN9Bkgen8')
print(f"Retrieved API Key (first 5 chars): {GEMINI_API_KEY[:5] if GEMINI_API_KEY else None}")

GEMINI_MODEL = "gemini-pro"

# load faiss + chunks
index = faiss.read_index(FAISS_INDEX_PATH)
with open(CHUNKS_PATH, "rb") as f:
    store = pickle.load(f)
texts = store["texts"]

def retrieve(query, k=4):
    from sentence_transformers import SentenceTransformer
    model = SentenceTransformer("all-MiniLM-L6-v2")
    q_emb = model.encode([query]).astype("float32")
    D, I = index.search(q_emb, k)
    results = [texts[i] for i in I[0]]
    return results

def call_gemini_prompt(prompt, max_tokens=512):
    url = f"https://generativelanguage.googleapis.com/v1beta/models/{GEMINI_MODEL}:generateContent?key={GEMINI_API_KEY}"
    headers = {"Content-Type": "application/json"}
    body = {
        "contents": [{"parts":[{"text": prompt}]}],
        "generationConfig": {
            "maxOutputTokens": max_tokens,
        }
    }
    r = requests.post(url, headers=headers, json=body)
    r.raise_for_status()
    return r.json()["candidates"][0]["content"]["parts"][0]["text"]

def answer_question(question):
    snippets = retrieve(question, k=4)
    context = "\n\n---\n\n".join(snippets)
    prompt = f"You are a helpful assistant. Use the context below from RBI Master Direction to answer the question. If the answer is not found, reply 'I don't find a direct answer in RBI Master Direction.'\n\nContext:\n{context}\n\nQuestion: {question}\nAnswer:"
    resp = call_gemini_prompt(prompt)
    return resp

if __name__ == "__main__":
    while True:
        q = input("\nQuestion (or 'exit'): ")
        if q.strip().lower() in ("exit","quit"):
            break
        print("\nAnswer:\n", answer_question(q))


ValueError: Please enter a valid secret name