In [3]:
import os
import numpy as np
from PyPDF2 import PdfReader
from sklearn.neighbors import NearestNeighbors
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

# Path to the single PDF file that this script indexes.
# NOTE(review): hard-coded absolute Windows path; anyone else running this
# must edit it to point at their own copy of the document.
pdf_path = r"C:\Users\LAKSHMI_SRAVANTHI\Downloads\sithafalpdf.pdf"

# Fail fast with a clear message before the embedding model is loaded,
# rather than erroring later inside the PDF reader.
if not os.path.isfile(pdf_path):
    raise FileNotFoundError(f"The file '{pdf_path}' does not exist. Please provide a valid PDF file.")

# Load the Sentence-BERT model once at module level; the same instance embeds
# both the document chunks and, inside query_knn, the user's query.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to extract text from specific pages of a PDF
def extract_text_from_pdf(pdf_path, pages=(2, 6)):
    """Return the extracted text of the requested pages of a PDF.

    Args:
        pdf_path: Path of the PDF file to read.
        pages: Iterable of 1-based page numbers to extract. Numbers outside
            ``1..total_pages`` are skipped silently.

    Returns:
        A list with one string per requested page that yielded non-empty
        text, in the order the pages were requested.
    """
    reader = PdfReader(pdf_path)
    total_pages = len(reader.pages)

    text_chunks = []
    for page_num in pages:
        # Enforce the lower bound as well as the upper one: without it,
        # page 0 or a negative page would turn into a negative index and
        # silently return a page counted from the end of the document.
        if not 1 <= page_num <= total_pages:
            continue
        text = reader.pages[page_num - 1].extract_text()  # Page numbers are 1-based
        # extract_text() can come back empty (e.g. image-only pages).
        if text:
            text_chunks.append(text)
    return text_chunks

# Function to chunk text into smaller parts for processing
def chunk_text(text, max_chunk_size=500):
    """Split *text* into space-joined chunks of at most *max_chunk_size* words.

    Words are whatever ``str.split()`` yields, so runs of whitespace collapse
    and the original spacing is not preserved. The final chunk holds the
    remaining words and may be shorter than the others.

    Args:
        text: The string to split.
        max_chunk_size: Word count at which the current chunk is closed.

    Returns:
        A list of chunk strings; empty when *text* contains no words.
    """
    def _word_groups(tokens):
        # Yield consecutive groups of words, closing a group as soon as it
        # reaches the size limit.
        bucket = []
        for token in tokens:
            bucket.append(token)
            if len(bucket) >= max_chunk_size:
                yield bucket
                bucket = []
        # Whatever is left over forms the final, possibly shorter, group.
        if bucket:
            yield bucket

    return [" ".join(group) for group in _word_groups(text.split())]

# Function to store embeddings in a NearestNeighbors model
def store_embeddings_in_knn(embeddings, metadata):
    """Fit a cosine-metric NearestNeighbors index over *embeddings*.

    Args:
        embeddings: The embedding vectors to index, one per row.
        metadata: Per-embedding metadata; handed back unchanged so callers
            can keep it paired with the fitted index.

    Returns:
        A ``(knn, metadata)`` tuple, where ``knn`` is the fitted model.
    """
    # The default neighbour count is capped at 5, or fewer when the index
    # holds fewer embeddings than that.
    neighbor_count = min(5, len(embeddings))
    index = NearestNeighbors(n_neighbors=neighbor_count, algorithm='auto', metric='cosine')
    index.fit(embeddings)
    return index, metadata

# Build the embedding index for the single PDF file
all_embeddings = []
metadata = []

# Pull the text of the selected pages (the extractor's defaults), then chunk
# and embed each page's text separately.
text_chunks = extract_text_from_pdf(pdf_path)
for text in text_chunks:
    chunks = chunk_text(text)
    embeddings = model.encode(chunks)  # Sentence-BERT embeddings, one per chunk
    all_embeddings += list(embeddings)
    # Keep (file name, chunk text) in the same order as the embeddings so a
    # neighbour index can be mapped straight back to its source text.
    metadata.extend((os.path.basename(pdf_path), chunk) for chunk in chunks)

# Report how many embeddings were produced
print(f"Total embeddings generated: {len(all_embeddings)}")

# Nothing to index means the extraction produced no usable text
if not all_embeddings:
    raise ValueError("No embeddings were generated. Please check the PDF content and extraction process.")

# Stack the per-chunk vectors into a single numpy array
all_embeddings = np.array(all_embeddings)

# Fit the nearest-neighbour index over the stacked embeddings
knn, metadata = store_embeddings_in_knn(all_embeddings, metadata)

# Query Handling function
def query_knn(query, knn, metadata, top_k=5):
    """Return the indexed chunks nearest to *query*.

    The query is embedded with the module-level ``model`` and looked up in
    the fitted index *knn*.

    Args:
        query: The query string to embed and search for.
        knn: A fitted ``NearestNeighbors`` index.
        metadata: Sequence aligned with the rows *knn* was fitted on; entry
            ``i`` describes indexed embedding ``i``.
        top_k: Maximum number of neighbours to return.

    Returns:
        A list of ``(metadata_entry, distance)`` tuples in the order
        ``kneighbors`` reports them. The distance uses the index's metric
        (cosine for indexes built by ``store_embeddings_in_knn``).
    """
    query_embedding = model.encode([query]).reshape(1, -1)

    # Cap the request at the number of samples in the index that was passed
    # in, not at the size of a module-level array, so the function also
    # works for indexes other than the global one.
    n_neighbors = min(top_k, knn.n_samples_fit_)

    distances, indices = knn.kneighbors(query_embedding, n_neighbors=n_neighbors)
    # A single query was submitted, so only row 0 of each result is relevant.
    return [(metadata[idx], dist) for idx, dist in zip(indices[0], distances[0])]

# Example query: print the indexed chunks closest to a sample question
query = "What is Task 2?"
results = query_knn(query, knn, metadata)
print(query)
# Each result pairs a (file name, chunk text) metadata tuple with its distance.
for (source_file, chunk), score in results:
    print(f"File: {source_file}, Chunk: {chunk}, Score: {score}")


Total embeddings generated: 1
What is Task 2?
File: sithafalpdf.pdf, Chunk: Task 2: Chat with Website Using RAG Pipeline Overview The goal is to implement a Retrieval-Augmented Generation (RAG) pipeline that allows users to interact with structured and unstructured data extracted from websites. The system will crawl, scrape, and store website content, convert it into embeddings, and store it in a vector database. Users can query the system for information and receive accurate, context-rich responses generated by a selected LLM. Functional Requirements 1. Data Ingestion • Input: URLs or list of websites to crawl/scrape. • Process: o Crawl and scrape content from target websites. o Extract key data ﬁelds, metadata, and textual content. o Segment content into chunks for better granularity. o Convert chunks into vector embeddings using a pre-trained embedding model. o Store embeddings in a vector database with associated metadata for eFicient retrieval. 2. Query Handling • Input: User's na