# Import Libraries and Datasets

In [1]:
from langchain.document_loaders import PyPDFLoader

from langchain_google_genai import GoogleGenerativeAIEmbeddings
import google.generativeai  as genai

import faiss
import numpy as np

import nltk
from nltk.tokenize import sent_tokenize

import os
import pickle
from pathlib import Path
from tqdm import tqdm

import glob
from datetime import datetime

import re
import json

## API KEY SETUP

In [2]:
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

## NLTK Downloads

In [3]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Deep\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Deep\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Deep\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Deep\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Deep\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [4]:
def create_project_directories():
    directories = [
        'data/raw',
        'data/processed',
        'data/faiss_index',
        'logs',
        'config'
    ]

    for directory in directories:
        Path(directory).mkdir(parents = True, exist_ok = True)
    print("✅ Project directories created !")

create_project_directories()

✅ Project directories created !


In [5]:
def validate_setup():
    # Test Gemini
    try:
        model = genai.GenerativeModel('gemini-2.5-pro')
        response = model.generate_content("Test")
        print("✅ Gemini Working")
    except:
        print("❌ Gemini Failed")
    
    # Test NLTK
    try:
        sent_tokenize("Test sentence.")
        print("✅ NLTK Working")
    except:
        print("❌ NLTK Failed")
    
    # Test FAISS
    try:
        index = faiss.IndexFlatIP(384)
        print("✅ FAISS working")
    except:
        print("❌ FAISS Failed")

In [6]:
validate_setup()

✅ Gemini Working
✅ NLTK Working
✅ FAISS working


# Document Loading & Initial Processing

In [7]:
def load_single_pdf(file_path):
    "Load a single PDF"
    try:
        loader = PyPDFLoader(file_path)
        pages = loader.load()

        full_text = '\n'.join([page.page_content for page in pages])

        # Extract
        metadata = {
            'filename': os.path.basename(file_path),
            'file_path': file_path,
            'total_pages': len(pages),
            'total_chars': len(full_text)
        }

        return full_text, metadata
    except Exception as e:
        print(f"Error loading {file_path}: {e}")
        return None, None

In [8]:
def load_all_documents(document_folder):
    "Load all PDFs from a folder"
    pdf_files = glob.glob(f"{document_folder}/*.pdf")

    all_documents = []
    total_pages = 0

    print(f"Found {len(pdf_files)} PDF files")

    for file_path in tqdm(pdf_files, desc='Loading Documents'):
        text, metadata = load_single_pdf(file_path)
        if text:
            all_documents.append({
                'text': text,
                'metadata': metadata
            })
            total_pages += metadata['total_pages']
    print(f"✅ Loaded {len(all_documents)} documents, {total_pages} total_pages")
    return all_documents

In [9]:
documents = load_all_documents("data/raw")

Found 4 PDF files


Loading Documents: 100%|██████████| 4/4 [01:17<00:00, 19.47s/it]

✅ Loaded 4 documents, 2578 total_pages





# Document Analysis Functions

In [10]:
def analyze_document_collection(documents):
    "Analyze your document collection"

    total_docs = len(documents)
    total_pages = sum(doc['metadata']['total_pages'] for doc in documents)
    total_chars = sum(len(doc['text']) for doc in documents)

    # Calculate averages
    avg_pages = total_pages / total_docs if total_docs > 0 else 0
    avg_chars = total_chars / total_docs if total_chars > 0 else 0

    print("📊 DOCUMENT COLLECTION ANALYSIS")
    print(f"Total Documents: {total_docs}")
    print(f"Total Pages: {total_pages}")
    print(f"Total Characters: {total_chars:,}")
    print(f"Average Pages per Doc: {avg_pages:,.1f}")
    print(f"Average Characters per Doc: {avg_chars:,.0f}")

    return {
        'total_docs': total_docs,
        'total_pages': total_pages,
        'total_chars': total_chars,
        'avg_pages': avg_pages,
        'avg_chars': avg_chars
    }

In [11]:
stats = analyze_document_collection(documents)

📊 DOCUMENT COLLECTION ANALYSIS
Total Documents: 4
Total Pages: 2578
Total Characters: 5,405,247
Average Pages per Doc: 644.5
Average Characters per Doc: 1,351,312


# Document Preview Function

In [12]:
def preview_document(document, preview_length = 500):
    "Preview first few characters of a document"

    filename = document['metadata']['filename']
    text = document['text']

    print(f"\n📄 DOCUMENT: {filename}")
    print(f"Pages: {document['metadata']['total_pages']}")
    print(f"Characters: {len(text):,}")
    print("\n--- Preview ---")
    print(text[:preview_length] + "..." if len(text) > preview_length else text)
    print("--- END ---\n")

In [13]:
for i in range(len(documents)):
    preview_document(documents[i])


📄 DOCUMENT: Bishop-Pattern-Recognition-and-Machine-Learning-2006.pdf
Pages: 758
Characters: 1,684,466

--- Preview ---

Information Science and Statistics
Series Editors:
M. Jordan
J. Kleinberg
B. Scho¨lkopf
Information Science and Statistics 
Akaike and Kitagawa: The Practice of Time Series Analysis. 
Bishop:  Pattern Recognition and Machine Learning. 
Cowell, Dawid, Lauritzen, and Spiegelhalter: Probabilistic Networks and
Expert Systems. 
Doucet, de Freitas, and Gordon: Sequential Monte Carlo Methods in Practice. 
Fine: Feedforward Neural Network Methodology. 
Hawkins and Olwell: Cumulative Sum Charts and Char...
--- END ---


📄 DOCUMENT: Deep Learning by Ian Goodfellow, Yoshua Bengio, Aaron Courville.pdf
Pages: 801
Characters: 1,769,202

--- Preview ---

Deep Learning
Ian Goodfellow
Yoshua Bengio
Aaron Courville
Contents
Website vii
Acknowledgments viii
Notation xi
1 Introduction 1
1.1 Who Should Read This Book? . . . . . . . . . . . . . . . . . . . . 8
1.2 Historical Trends in Dee

# Text Cleaning Functions

In [14]:
def clean_document_text(text):
    "Clean and normalize document text"

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text)

    # Remove page number and header/footers
    text = re.sub(r'\n\d+\n', '\n', text)

    # Remove excessive newlines
    text = re.sub(r'\n+', '\n', text)

    # Remove special characters that causes issues
    text = text.replace('\x00', '')

    # Strip Leading/trailing whitespace
    text = text.strip()

    return text

In [15]:
for doc in documents:
    doc['text'] = clean_document_text(doc['text'])

print("✅ All document cleaned!")

✅ All document cleaned!


# Save Processed Documents

In [16]:
def save_processed_documents(documents, file_path = 'data/processed/processed_documents.pkl'):
    "Save processed documents for later use"
    with open(file_path, 'wb') as f:
        pickle.dump(documents, f)
    print(f"✅ Saved {len(documents)} documents to {file_path}")

In [17]:
def load_processed_documents(file_path = 'data/processed/processed_documents.pkl'):
    "Load previously processed documents"
    with open(file_path, 'rb') as f:
        documents = pickle.load(f)
    print(f"✅ Loaded {len(documents)} documents from {file_path}")
    return documents

In [18]:
save_processed_documents(documents)

✅ Saved 4 documents to data/processed/processed_documents.pkl


# NLTK Text Processing & Chunking

In [19]:
def create_intelligent_chunks(text, target_chunk_size=800, overlap_sentences=2):
    """Create chunks based on sentence boundaries using NLTK"""
    
    # Tokenize into sentences
    sentences = sent_tokenize(text)
    
    chunks = []
    current_chunk = []
    current_length = 0
    
    for sentence in sentences:
        sentence_length = len(sentence)
        
        # If adding this sentence exceeds target size, finalize current chunk
        if current_length + sentence_length > target_chunk_size and current_chunk:
            # Join sentences into chunk
            chunk_text = " ".join(current_chunk)
            chunks.append(chunk_text)
            
            # Start new chunk with overlap
            if len(current_chunk) > overlap_sentences:
                current_chunk = current_chunk[-overlap_sentences:]  # Keep last N sentences
                current_length = sum(len(s) for s in current_chunk)
            else:
                current_chunk = []
                current_length = 0
        
        # Add current sentence
        current_chunk.append(sentence)
        current_length += sentence_length
    
    # Add final chunk if exists
    if current_chunk:
        chunk_text = " ".join(current_chunk)
        chunks.append(chunk_text)
    
    return chunks

In [20]:
def process_all_documents_to_chunks(documents):
    """Convert all documents into chunks with metadata"""
    
    all_chunks = []
    chunk_metadata = []
    
    for doc_idx, document in enumerate(tqdm(documents, desc="Processing documents")):
        text = document['text']
        metadata = document['metadata']
        
        # Create chunks for this document
        chunks = create_intelligent_chunks(text)
        
        # Add each chunk with metadata
        for chunk_idx, chunk in enumerate(chunks):
            all_chunks.append(chunk)
            
            # Create metadata for this chunk
            chunk_meta = {
                'doc_index': doc_idx,
                'chunk_index': chunk_idx,
                'source_file': metadata['filename'],
                'chunk_length': len(chunk),
                'doc_total_pages': metadata['total_pages']
            }
            chunk_metadata.append(chunk_meta)
    
    print(f"✅ Created {len(all_chunks)} total chunks from {len(documents)} documents")
    return all_chunks, chunk_metadata

In [21]:
all_chunks, chunk_metadata = process_all_documents_to_chunks(documents)

Processing documents: 100%|██████████| 4/4 [00:00<00:00,  7.65it/s]

✅ Created 12561 total chunks from 4 documents





# Advanced text Processing

In [22]:
import string

In [23]:
def advanced_text_preprocessing(chunk):
    "Advanced preprocessing for better embeddings"

    # Convert to lowercase
    text = chunk.lower()

    # Remove excessive whitespace and newlines
    text = re.sub(r'\s+',' ', text)

    # Remove URLs and email address
    text = re.sub(r'http[s]?:\\(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F]))+', '', text)
    text = re.sub(r'\S+@\S+', '', text)

    # Remove excessive puctuation
    text = re.sub(r'[^\w\s]', ' ', text)

    # Remove extra spaces agian
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [24]:
preprocessed_chunks = [advanced_text_preprocessing(chunk) for chunk in all_chunks]
preprocessed_chunks

['information science and statistics series editors m jordan j kleinberg b scho lkopf information science and statistics akaike and kitagawa the practice of time series analysis bishop pattern recognition and machine learning cowell dawid lauritzen and spiegelhalter probabilistic networks and expert systems doucet de freitas and gordon sequential monte carlo methods in practice fine feedforward neural network methodology hawkins and olwell cumulative sum charts and charting for quality improvement jensen bayesian networks and decision graphs marchette computer intrusion detection and network monitoring a statistical viewpoint rubinstein and kroese the cross entropy method a unified approach to combinatorial optimization monte carlo simulation and machine learning',
 'marchette computer intrusion detection and network monitoring a statistical viewpoint rubinstein and kroese the cross entropy method a unified approach to combinatorial optimization monte carlo simulation and machine learn

In [25]:
def analyze_chunks(chunks, metadata):
    """Analyze chunk quality and distribution"""
    
    chunk_lengths = [len(chunk) for chunk in chunks]
    
    print("📊 CHUNK ANALYSIS")
    print(f"Total Chunks: {len(chunks)}")
    print(f"Average Length: {np.mean(chunk_lengths):.0f} characters")
    print(f"Min Length: {min(chunk_lengths)} characters")
    print(f"Max Length: {max(chunk_lengths)} characters")
    print(f"Median Length: {np.median(chunk_lengths):.0f} characters")
    
    # Show distribution by document
    doc_chunk_counts = {}
    for meta in metadata:
        doc_name = meta['source_file']
        doc_chunk_counts[doc_name] = doc_chunk_counts.get(doc_name, 0) + 1
    
    print(f"\nTop documents by chunk count:")
    sorted_docs = sorted(doc_chunk_counts.items(), key=lambda x: x[1], reverse=True)
    for doc, count in sorted_docs[:5]:
        print(f"  {doc}: {count} chunks")

In [26]:
analyze_chunks(all_chunks, chunk_metadata)

📊 CHUNK ANALYSIS
Total Chunks: 12561
Average Length: 760 characters
Min Length: 53 characters
Max Length: 39826 characters
Median Length: 741 characters

Top documents by chunk count:
  Bishop-Pattern-Recognition-and-Machine-Learning-2006.pdf: 4281 chunks
  Deep Learning by Ian Goodfellow, Yoshua Bengio, Aaron Courville.pdf: 3981 chunks
  Hands_On_Machine_Learning_with_Scikit_Learn_and_TensorFlow.pdf: 2404 chunks
  understanding-machine-learning-theory-algorithms.pdf: 1895 chunks


In [27]:
def preview_chunks(chunks, metadata, num_chunks=3):
    """Preview some chunks to verify quality"""
    
    for i in range(min(num_chunks, len(chunks))):
        chunk = chunks[i]
        meta = metadata[i]
        
        print(f"\n--- CHUNK {i+1} ---")
        print(f"Source: {meta['source_file']}")
        print(f"Length: {len(chunk)} chars")
        print(f"Content: {chunk[:200]}...")
        print("--- END CHUNK ---")

In [28]:
preview_chunks(all_chunks, chunk_metadata)


--- CHUNK 1 ---
Source: Bishop-Pattern-Recognition-and-Machine-Learning-2006.pdf
Length: 802 chars
Content: Information Science and Statistics Series Editors: M. Jordan J. Kleinberg B. Scho¨lkopf Information Science and Statistics Akaike and Kitagawa: The Practice of Time Series Analysis. Bishop: Pattern Re...
--- END CHUNK ---

--- CHUNK 2 ---
Source: Bishop-Pattern-Recognition-and-Machine-Learning-2006.pdf
Length: 526 chars
Content: Marchette: Computer Intrusion Detection and Network Monitoring: A Statistical Viewpoint. Rubinstein and Kroese: The Cross-Entropy Method: A Unified Approach to Combinatorial Optimization, Monte Carlo ...
--- END CHUNK ---

--- CHUNK 3 ---
Source: Bishop-Pattern-Recognition-and-Machine-Learning-2006.pdf
Length: 777 chars
Content: Wallace: Statistical and Inductive Inference by Minimum Massage Length. Christopher M. Bishop Pattern Recognition and Machine Learning Christopher M. Bishop F.R.Eng. Assistant Director Microsoft Resea...
--- END CHUNK ---


# Save Chunks for Next Phase

In [29]:
def save_chunks_and_metadata(chunks, metadata, chunks_file = 'data/processed/chunks.pkl', metadata_file = 'data/processed/metadata.pkl'):
    "Save chunks and metadata for next phase"

    # Save chunks
    with open(chunks_file, 'wb') as f:
        pickle.dump(chunks, f)

    # Save metadata
    with open(metadata_file, 'wb') as f:
        pickle.dump(metadata, f)
    
    print(f"✅ Saved {len(chunks)} chunks and metadata")
    print(f"   Chunks: {chunks_file}")
    print(f"   Metadata: {metadata_file}")


In [30]:
# Save processed chunks
save_chunks_and_metadata(all_chunks, chunk_metadata)

✅ Saved 12561 chunks and metadata
   Chunks: data/processed/chunks.pkl
   Metadata: data/processed/metadata.pkl


# Embedding Generation and FAISS Database Creation

In [31]:
def initialize_gemini_embeddings():
    "Initialize GEMINI Embeddings model"
    try:
        embedding_model = GoogleGenerativeAIEmbeddings(
            model = 'models/embedding-001',
            google_api_key = GEMINI_API_KEY,
            output_dimensionality=1536
        )

        test_embeddings = embedding_model.embed_query("This is a test sentence.")
        embedding_dimension = len(test_embeddings)

        print(f"✅ Gemini embedding initialized !")
        print(f"Embedding dimension: {embedding_dimension}")
        return embedding_model, embedding_dimension
    except Exception as e:
        print(f"❌ Failed to initialize Gemini embeddings: {e}")
        return None, None

In [32]:
embedding_model, embedding_dim = initialize_gemini_embeddings()

✅ Gemini embedding initialized !
Embedding dimension: 768


# Batch Embedding Generation

In [33]:
def generate_embeddings_batch(chunks, embeddings_model, batch_size = 50):
    "Generate embeddings for chunks in batches"
    all_embeddings = []
    total_batches = (len(chunks) + batch_size - 1) // batch_size

    print(f"Generating embeddings for {len(chunks)} chunks in {total_batches} batches...")

    for i in tqdm(range(0, len(chunks), batch_size), desc = "Embedding batches"):
        batch_chunks = chunks[i:i + batch_size]

        try:
            # Generate embeddings for batch
            batch_embeddings = embeddings_model.embed_documents(batch_chunks)
            all_embeddings.extend(batch_embeddings)
        
        except Exception as e:
            print(f"Error in batch {i//batch_size + 1}: {e}")
            # Add empty Embeddings as placeholder
            all_embeddings.extend([None] * len(batch_chunks))
    print(f"✅ Generated {len(all_embeddings)} embeddings")
    return all_embeddings

In [34]:
chunk_embeddings = generate_embeddings_batch(all_chunks, embedding_model)

Generating embeddings for 12561 chunks in 252 batches...


Embedding batches:  67%|██████▋   | 169/252 [08:06<04:26,  3.21s/it]

Error in batch 170: 'utf-8' codec can't encode character '\ud835' in position 384: surrogates not allowed


Embedding batches:  75%|███████▌  | 190/252 [09:04<02:57,  2.86s/it]

Error in batch 191: 'utf-8' codec can't encode character '\ud835' in position 123: surrogates not allowed


Embedding batches:  76%|███████▌  | 192/252 [09:07<02:12,  2.21s/it]

Error in batch 193: 'utf-8' codec can't encode character '\ud835' in position 563: surrogates not allowed


Embedding batches:  84%|████████▍ | 212/252 [09:51<01:32,  2.32s/it]

Error in batch 213: 'utf-8' codec can't encode character '\ud835' in position 240: surrogates not allowed


Embedding batches: 100%|██████████| 252/252 [11:32<00:00,  2.75s/it]

✅ Generated 12561 embeddings





# FAISS Database Creation

In [35]:
def create_faiss_index(embeddings, embedding_dimensions):
    "Create and populate FAISS index"

    # Fileter out None embeddings (failed embeddings)
    valid_embeddings = [emb for emb in embeddings if emb is not None]
    valid_indices = [i for i, emb in enumerate(embeddings) if emb is not None]

    print(f"Creating FAISS index with {len(valid_embeddings)} valid embeddings")

    # Create FAISS index (Inner product for cosine similarity)
    index = faiss.IndexFlatIP(embedding_dimensions)

    # Convert to numpy index and normalize for cosine similarity
    embeddings_array = np.array(valid_embeddings).astype('float32')
    faiss.normalize_L2(embeddings_array)

    # Add embeddings to index
    index.add(embeddings_array)

    print(f"✅ FAISS index created with {index.ntotal} vectors")
    return index, valid_indices

In [36]:
faiss_index, valid_chunk_indices = create_faiss_index(chunk_embeddings, embedding_dim)

Creating FAISS index with 12361 valid embeddings
✅ FAISS index created with 12361 vectors


# Test Retrival System

In [37]:
def test_faiss_retrieval(query, faiss_index, embeddings_model, chunks, metadata, valid_indices, k=5):
    """Test FAISS retrieval with a query"""
    
    # Generate query embedding
    query_embedding = embeddings_model.embed_query(query)
    query_vector = np.array([query_embedding]).astype('float32')
    faiss.normalize_L2(query_vector)
    
    # Search FAISS index
    scores, indices = faiss_index.search(query_vector, k)
    
    print(f"\n🔍 QUERY: {query}")
    print(f"📊 Retrieved {len(indices[0])} chunks:\n")
    
    results = []
    for i, (score, idx) in enumerate(zip(scores[0], indices[0])):
        if idx < len(valid_indices):
            original_idx = valid_indices[idx]
            chunk = chunks[original_idx]
            meta = metadata[original_idx]
            
            print(f"--- RESULT {i+1} (Score: {score:.3f}) ---")
            print(f"Source: {meta['source_file']}")
            print(f"Content: {chunk[:200]}...")
            print()
            
            results.append({
                'chunk': chunk,
                'metadata': meta,
                'score': score
            })
    
    return results

# Test retrieval with sample queries
test_queries = [
    "What is machine learning?",
    "How do neural networks work?",
    "What is backpropagation?",
    "Explain supervised learning"
]

for query in test_queries[:2]:  # Test first 2 queries
    results = test_faiss_retrieval(query, faiss_index, embedding_model, 
                                 all_chunks, chunk_metadata, valid_chunk_indices)


🔍 QUERY: What is machine learning?
📊 Retrieved 5 chunks:

--- RESULT 1 (Score: 0.745) ---
Source: Hands_On_Machine_Learning_with_Scikit_Learn_and_TensorFlow.pdf
Content: Machine Learning is the science (and art) of programming computers so they can learn from data. Here is a slightly more general definition: [Machine Learning is the] field of study that gives computer...

--- RESULT 2 (Score: 0.743) ---
Source: understanding-machine-learning-theory-algorithms.pdf
Content: We are surrounded by a machine learning based technology: search engines learn how to bring us the best results (while placing proﬁtable ads), anti-spam software learns to ﬁlter our email messages, an...

--- RESULT 3 (Score: 0.740) ---
Source: understanding-machine-learning-theory-algorithms.pdf
Content: In a sense, machine learning can be viewed as a branch of AI (Artiﬁcial Intelligence), since, after all, the ability to turn expe- rience into expertise or to detect meaningful patterns in complex sen...

--- RESULT

# Save FAISS Database

In [38]:
def save_faiss_database(faiss_index, chunks, metadata, valid_indices, base_path = 'data/faiss_index'):
    "Save FAISS index and associated data"

    # Create Directory
    Path(base_path).mkdir(parents = True, exist_ok = True)

    # Save FAISS index
    faiss.write_index(faiss_index, f"{base_path}/intellect_engine.index")

    # Save chunks and metadata
    with open(f"{base_path}/chunks.pkl", 'wb') as f:
        pickle.dump(chunks, f)
    
    with open(f"{base_path}/metadata.pkl", 'wb') as f:
        pickle.dump(metadata, f)
    
    with open(f"{base_path}/valid_indices.pkl", 'wb') as f:
        pickle.dump(valid_indices, f)

    # Save index info
    index_info = {
        'total_chunks': len(chunks),
        'valid_chunks': len(valid_indices),
        'embedding_dimension': faiss_index.d,
        'index_type': 'IndexFlatIP',
        'created_at': datetime.now().isoformat()
    }

    with open(f"{base_path}/index_info.json", 'w') as f:
        json.dump(index_info, f, indent = 2)
    
    print(f"✅ FAISS database saved to {base_path}")
    print(f"   Index file: intellect_engine.index")
    print(f"   Total Vectors: {faiss_index.ntotal}")

In [39]:
save_faiss_database(faiss_index, all_chunks, chunk_metadata, valid_chunk_indices)

✅ FAISS database saved to data/faiss_index
   Index file: intellect_engine.index
   Total Vectors: 12361
