# Import Libraries and Datasets

In [None]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

from langchain_google_genai import GoogleGenerativeAI
import google.generativeai  as genai

import faiss
import numpy as np

from sentence_transformers import SentenceTransformer

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

import os
import pickle
from pathlib import Path
from tqdm import tqdm
from dotenv import load_dotenv

import glob
from datetime import datetime

import re

## API KEY SETUP

In [2]:
# Read the Gemini API key from the process environment (None when unset).
# NOTE(review): load_dotenv is imported above but never called in the visible
# code, so a key stored only in a .env file would not be picked up here.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")

## NLTK Downloads

In [3]:
# Fetch the NLTK resource packages listed below, one download call per package.
for resource_name in (
    'punkt',
    'stopwords',
    'wordnet',
    'omw-1.4',
    'averaged_perceptron_tagger',
):
    nltk.download(resource_name)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Deep\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Deep\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Deep\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Deep\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Deep\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [4]:
def create_project_directories():
    """Create the project's working folders relative to the current directory.

    Folders that already exist are left as they are, so the function can be
    called repeatedly. Prints a confirmation line when finished.
    """
    folder_names = (
        'data/raw',
        'data/processed',
        'data/faiss_index',
        'logs',
        'config',
    )

    for folder_name in folder_names:
        # parents=True also creates the intermediate 'data' folder on demand.
        Path(folder_name).mkdir(parents=True, exist_ok=True)

    print("✅ Project directories created !")

# Create the folder layout now; data/raw and data/processed are used by later cells.
create_project_directories()

✅ Project directories created !


In [5]:
def validate_setup():
    """Smoke-test the external dependencies: Gemini, NLTK and FAISS.

    Runs one small operation against each library in turn and prints a
    ✅ line on success or a ❌ line on failure. A failing check never raises;
    the exception text is appended to its ❌ line so the cause can be
    diagnosed. Returns None.
    """
    # Test Gemini
    try:
        # A single short generation request exercises the configured client.
        genai.GenerativeModel('gemini-2.5-pro').generate_content("Test")
        print("✅ Gemini Working")
    except Exception as error:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt / SystemExit still propagate.
        print(f"❌ Gemini Failed: {error}")

    # Test NLTK
    try:
        sent_tokenize("Test sentence.")
        print("✅ NLTK Working")
    except Exception as error:
        print(f"❌ NLTK Failed: {error}")

    # Test FAISS
    try:
        # NOTE(review): 384 is presumably the embedding dimension used later
        # in the pipeline — confirm against the embedding model.
        faiss.IndexFlatIP(384)
        print("✅ FAISS working")
    except Exception as error:
        print(f"❌ FAISS Failed: {error}")

In [6]:
# Run the dependency checks; each one prints its own ✅/❌ status line.
validate_setup()

✅ Gemini Working
✅ NLTK Working
✅ FAISS working


# Document Loading & Initial Processing

In [7]:
def load_single_pdf(file_path):
    """Read one PDF and return its text together with basic metadata.

    Returns a ``(full_text, metadata)`` tuple. ``full_text`` is the content
    of every page joined with newlines; ``metadata`` holds the file name, the
    path as given, the page count and the character count of ``full_text``.
    If anything goes wrong the error is printed and ``(None, None)`` is
    returned instead of raising.
    """
    try:
        pdf_pages = PyPDFLoader(file_path).load()
        combined_text = '\n'.join(pdf_page.page_content for pdf_page in pdf_pages)

        # Summary of the file, computed from the joined text
        info = dict(
            filename=os.path.basename(file_path),
            file_path=file_path,
            total_pages=len(pdf_pages),
            total_chars=len(combined_text),
        )
    except Exception as e:
        print(f"Error loading {file_path}: {e}")
        return None, None

    return combined_text, info

In [8]:
def load_all_documents(document_folder):
    """Load every ``*.pdf`` file located directly inside *document_folder*.

    Files are processed in sorted path order so the returned list has the
    same order on every run (``glob`` returns matches in arbitrary order).
    Files that fail to load or yield no text are left out of the result and
    listed in a warning line at the end.

    Returns a list of ``{'text': str, 'metadata': dict}`` dictionaries, one
    per successfully loaded document.
    """
    pdf_files = sorted(glob.glob(f"{document_folder}/*.pdf"))

    all_documents = []
    skipped_files = []
    total_pages = 0

    print(f"Found {len(pdf_files)} PDF files")

    for file_path in tqdm(pdf_files, desc='Loading Documents'):
        text, metadata = load_single_pdf(file_path)
        if not text:
            # None means the loader failed; '' means no text was extracted.
            skipped_files.append(file_path)
            continue
        all_documents.append({
            'text': text,
            'metadata': metadata
        })
        total_pages += metadata['total_pages']

    print(f"✅ Loaded {len(all_documents)} documents, {total_pages} total_pages")
    if skipped_files:
        print(
            f"⚠️ Skipped {len(skipped_files)} file(s) that failed to load "
            f"or contained no text: {', '.join(skipped_files)}"
        )
    return all_documents

In [9]:
# Load every PDF found in data/raw as a list of {'text', 'metadata'} dicts.
documents = load_all_documents("data/raw")

Found 4 PDF files


Loading Documents: 100%|██████████| 4/4 [02:07<00:00, 31.81s/it]

✅ Loaded 4 documents, 2578 total_pages





# Document Analysis Functions

In [10]:
def analyze_document_collection(documents):
    """Print and return summary statistics for a list of loaded documents.

    Each item of *documents* must provide ``doc['text']`` and
    ``doc['metadata']['total_pages']``. Character counts are taken from the
    current ``text`` value, not from the metadata.

    Returns a dict with the keys ``total_docs``, ``total_pages``,
    ``total_chars``, ``avg_pages`` and ``avg_chars``. For an empty collection
    every value is 0.
    """
    total_docs = len(documents)
    total_pages = sum(doc['metadata']['total_pages'] for doc in documents)
    total_chars = sum(len(doc['text']) for doc in documents)

    # Both averages divide by the document count, so that is the value to guard.
    if total_docs > 0:
        avg_pages = total_pages / total_docs
        avg_chars = total_chars / total_docs
    else:
        avg_pages = 0
        avg_chars = 0

    print("📊 DOCUMENT COLLECTION ANALYSIS")
    print(f"Total Documents: {total_docs}")
    print(f"Total Pages: {total_pages}")
    print(f"Total Characters: {total_chars:,}")
    print(f"Average Pages per Doc: {avg_pages:,.1f}")
    print(f"Average Characters per Doc: {avg_chars:,.0f}")

    return {
        'total_docs': total_docs,
        'total_pages': total_pages,
        'total_chars': total_chars,
        'avg_pages': avg_pages,
        'avg_chars': avg_chars
    }

In [11]:
# Print the collection summary and keep the returned totals/averages dict in `stats`.
stats = analyze_document_collection(documents)

📊 DOCUMENT COLLECTION ANALYSIS
Total Documents: 4
Total Pages: 2578
Total Characters: 5,405,247
Average Pages per Doc: 644.5
Average Characters per Doc: 1,351,312


# Document Preview Function

In [12]:
def preview_document(document, preview_length = 500):
    """Print a short summary and the opening characters of one document.

    *document* must provide ``document['text']`` and the metadata keys
    ``filename`` and ``total_pages``. At most *preview_length* characters of
    the text are shown; when the text is longer, ``...`` is appended to mark
    the cut.
    """
    info = document['metadata']
    body = document['text']

    if len(body) > preview_length:
        snippet = body[:preview_length] + "..."
    else:
        snippet = body

    # One print call; sep="\n" puts each piece on its own line.
    print(
        f"\n📄 DOCUMENT: {info['filename']}",
        f"Pages: {info['total_pages']}",
        f"Characters: {len(body):,}",
        "\n--- Preview ---",
        snippet,
        "--- END ---\n",
        sep="\n",
    )

In [13]:
# Show the header and opening characters of every loaded document.
for document in documents:
    preview_document(document)


📄 DOCUMENT: Bishop-Pattern-Recognition-and-Machine-Learning-2006.pdf
Pages: 758
Characters: 1,684,466

--- Preview ---

Information Science and Statistics
Series Editors:
M. Jordan
J. Kleinberg
B. Scho¨lkopf
Information Science and Statistics 
Akaike and Kitagawa: The Practice of Time Series Analysis. 
Bishop:  Pattern Recognition and Machine Learning. 
Cowell, Dawid, Lauritzen, and Spiegelhalter: Probabilistic Networks and
Expert Systems. 
Doucet, de Freitas, and Gordon: Sequential Monte Carlo Methods in Practice. 
Fine: Feedforward Neural Network Methodology. 
Hawkins and Olwell: Cumulative Sum Charts and Char...
--- END ---


📄 DOCUMENT: Deep Learning by Ian Goodfellow, Yoshua Bengio, Aaron Courville.pdf
Pages: 801
Characters: 1,769,202

--- Preview ---

Deep Learning
Ian Goodfellow
Yoshua Bengio
Aaron Courville
Contents
Website vii
Acknowledgments viii
Notation xi
1 Introduction 1
1.1 Who Should Read This Book? . . . . . . . . . . . . . . . . . . . . 8
1.2 Historical Trends in Dee

# Text Cleaning Functions

In [14]:
def clean_document_text(text):
    """Clean and normalize document text.

    Steps, in order:
      1. remove NUL characters,
      2. convert CRLF / CR line endings to a plain newline,
      3. collapse every run of non-newline whitespace into one space,
      4. blank out lines that contain only digits (page numbers),
      5. collapse blank lines and whitespace around line breaks into a
         single newline,
      6. strip leading/trailing whitespace.

    Line breaks are kept, so the result has one non-empty line per source
    line. Collapsing all whitespace (newlines included) first, as a single
    pass would, leaves nothing for the page-number and newline rules to
    match.
    """
    # Remove special characters that cause issues (done first so that no
    # double spaces are left behind where a NUL sat between two spaces)
    text = text.replace('\x00', '')

    # Normalize line endings so the rules below only have to deal with '\n'
    text = re.sub(r'\r\n?', '\n', text)

    # Remove extra whitespace within lines; [^\S\n] is "whitespace except newline"
    text = re.sub(r'[^\S\n]+', ' ', text)

    # Remove page numbers: lines made up of digits only. MULTILINE anchors
    # also catch consecutive number lines and ones at the start/end of the text.
    text = re.sub(r'^ ?\d+ ?$', '', text, flags=re.MULTILINE)

    # Remove excessive newlines together with spaces hugging the line break
    text = re.sub(r'\s*\n\s*', '\n', text)

    # Strip leading/trailing whitespace
    return text.strip()

In [15]:
# Clean every loaded document in place; only its 'text' entry is replaced.
for doc in documents:
    doc.update(text=clean_document_text(doc['text']))

print("✅ All document cleaned!")

✅ All document cleaned!


# Save Processed Documents

In [16]:
def save_processed_documents(documents, file_path = 'data/processed/processed_documents.pkl'):
    """Pickle *documents* to *file_path* for later use.

    The destination folder is created when it does not exist yet, so the
    function also works before the project folders have been set up or with
    a custom path. An existing file at *file_path* is overwritten. Prints a
    confirmation line and returns None.
    """
    # Without this, open() raises FileNotFoundError for a missing folder.
    Path(file_path).parent.mkdir(parents=True, exist_ok=True)

    with open(file_path, 'wb') as f:
        pickle.dump(documents, f)
    print(f"✅ Saved {len(documents)} documents to {file_path}")

In [17]:
def load_processed_documents(file_path = 'data/processed/processed_documents.pkl'):
    """Read back a document list previously written as a pickle file.

    Prints how many documents were read and returns the unpickled object.

    NOTE: unpickling can execute arbitrary code, so only load files that
    this project wrote itself.
    """
    with open(file_path, 'rb') as pickle_file:
        restored = pickle.load(pickle_file)

    print(f"✅ Loaded {len(restored)} documents from {file_path}")
    return restored

In [18]:
# Persist the cleaned documents to the default path data/processed/processed_documents.pkl.
save_processed_documents(documents)

✅ Saved 4 documents to data/processed/processed_documents.pkl
