1. Extract PDFs

In [7]:
import fitz  # PyMuPDF
import os

def extract_text_from_pdf(pdf_path, start_page, end_page):
    """Extracts text from a given PDF file within a specific page range."""
    doc = fitz.open(pdf_path)
    total_pages = len(doc)  # Count total pages
    print(f"📌 Extracting from {pdf_path} ({total_pages} pages) | Pages {start_page} to {end_page}")

    full_text = []
    for page_num in range(start_page - 1, min(end_page, total_pages)):  # Adjusting for 0-based index
        page = doc[page_num]
        text = page.get_text("text")
        if text.strip():  # Only count non-empty pages
            full_text.append(text)
        else:
            print(f"⚠️ Empty text extracted from page {page_num + 1}")

    extracted_text = "\n".join(full_text)
    print(f"✅ Extracted {len(extracted_text)} characters from {pdf_path}\n")
    return extracted_text

# Define PDFs and their required page ranges
pdf_folder = "cs101-pdfs/"
documents = {}

pdf_configs = {
    "C++.pdf": (22, 803),
    "The_C++_Programming_Language_4th_Edition_Bjarne_Stroustrup.pdf": (23, 1278)
}

for pdf_file, (start_page, end_page) in pdf_configs.items():
    pdf_path = os.path.join(pdf_folder, pdf_file)
    if os.path.exists(pdf_path):
        text = extract_text_from_pdf(pdf_path, start_page, end_page)
        documents[pdf_file] = text
    else:
        print(f"❌ File not found: {pdf_path}")

# You can now process `documents` as needed, e.g., save to text files or store in a database.


📌 Extracting from cs101-pdfs/C++.pdf (846 pages) | Pages 22 to 803
✅ Extracted 965353 characters from cs101-pdfs/C++.pdf

📌 Extracting from cs101-pdfs/The_C++_Programming_Language_4th_Edition_Bjarne_Stroustrup.pdf (1366 pages) | Pages 23 to 1278
✅ Extracted 2509426 characters from cs101-pdfs/The_C++_Programming_Language_4th_Edition_Bjarne_Stroustrup.pdf



2. Clean and Preprocess

In [25]:
import re

def clean_text(text):
    """Removes everything before the first chapter title and cleans the text."""
    # Find the first occurrence of a chapter heading
    match = re.search(r'(?i)(chapter\s*\d+|1[. ]\s+|CHAPTER ONE|CHAPTER 1|1[.]\d+)', text)

    if match:
        start_index = match.start()  # Get where the first chapter starts
        text = text[start_index:]  # Keep only content after the match

    # Remove excessive whitespace
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

# Apply new cleaning function
for pdf_name, content in documents.items():
    documents[pdf_name] = clean_text(content)

# Debug: Check cleaned text size
for pdf_name, content in documents.items():
    print(f"📌 After Cleaning: {len(content)} characters from {pdf_name}")


📌 After Cleaning: 1579 characters from The_C++_Programming_Language_4th_Edition_Bjarne_Stroustrup.pdf
📌 After Cleaning: 6221 characters from C++.pdf


3. Chunking

In [12]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
import tiktoken  # For accurate token-based chunking

# Load OpenAI's tokenizer for GPT-4 to ensure we stay within limits
tokenizer = tiktoken.get_encoding("cl100k_base")  # GPT-4 tokenizer

# Pinecone max vector size (keeping it safe under 4MB)
MAX_TOKENS_PER_CHUNK = 3000  # Adjust based on your model's token limit

def chunk_text_safely(text, chunk_size=MAX_TOKENS_PER_CHUNK, overlap=200):
    """Splits text into smaller chunks while keeping token count safe for Pinecone."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap,  
        separators=["\n\n", "\n", " ", "."],  # Keeps natural breaks
        length_function=lambda x: len(tokenizer.encode(x))  # Count tokens instead of characters
    )
    return text_splitter.split_text(text)

# Apply chunking to all cleaned documents
chunked_data = []
for pdf_name, content in documents.items():
    chunks = chunk_text_safely(content)
    print(f"✅ {len(chunks)} chunks created for {pdf_name}")  # Debugging output
    for chunk in chunks:
        chunked_data.append({
            "text": chunk,
            "source": pdf_name  # Metadata for retrieval
        })

print(f"\n📌 Total Chunks Created: {len(chunked_data)}")


AttributeError: init is no longer a top-level attribute of the pinecone package.

Please create an instance of the Pinecone class instead.

Example:

    import os
    from pinecone import Pinecone, ServerlessSpec

    pc = Pinecone(
        api_key=os.environ.get("PINECONE_API_KEY")
    )

    # Now do stuff
    if 'my_index' not in pc.list_indexes().names():
        pc.create_index(
            name='my_index', 
            dimension=1536, 
            metric='euclidean',
            spec=ServerlessSpec(
                cloud='aws',
                region='us-west-2'
            )
        )



4. Embeddings

In [9]:
import requests
import os
import json
import time
from dotenv import load_dotenv
from tqdm import tqdm

load_dotenv()

# Load Hugging Face API key from environment
hf_api_key = os.getenv("HF_API_KEY")

# Define Hugging Face Inference API URL (Correct Endpoint for Feature Extraction)
API_URL = "https://api-inference.huggingface.co/pipeline/feature-extraction/sentence-transformers/all-MiniLM-L6-v2"
headers = {"Authorization": f"Bearer {hf_api_key}"}

def get_hf_embedding(texts):
    """Fetches embeddings for a batch of text chunks from Hugging Face Inference API."""
    payload = {"inputs": texts}
    
    try:
        response = requests.post(API_URL, headers=headers, json=payload, timeout=30)
        response.raise_for_status()
        
        embeddings = response.json()
        
        if isinstance(embeddings, list) and all(isinstance(i, list) for i in embeddings):
            return embeddings  # Return list of embeddings
        else:
            print("❌ API returned unexpected format:", embeddings)
            return None
    
    except requests.exceptions.RequestException as e:
        print(f"❌ Request failed: {e}")
        return None

def process_embeddings(chunked_data, batch_size=30):
    """Processes embeddings in batches synchronously with a progress bar."""
    total_batches = len(chunked_data) // batch_size + (1 if len(chunked_data) % batch_size != 0 else 0)
    
    with tqdm(total=total_batches, desc="Processing Embeddings", unit="batch") as pbar:
        for i in range(0, len(chunked_data), batch_size):
            batch = chunked_data[i:i+batch_size]
            texts = [data["text"] for data in batch]
            
            embeddings = get_hf_embedding(texts)
            
            if embeddings:
                for idx, embedding in enumerate(embeddings):
                    chunked_data[i + idx]["embedding"] = embedding
            
            pbar.update(1)  # Update progress bar
            
            # Save progress every batch
            save_progress(chunked_data)
            
            # Respect Hugging Face rate limits (adjust if needed)
            time.sleep(1.5)  # Small delay to prevent rate limit issues

    print("\n✅ Hugging Face embeddings generated for all chunks!")

def save_progress(chunked_data):
    """Saves current progress to a JSON file."""
    with open("embeddings_progress.json", "w") as f:
        json.dump(chunked_data, f, indent=4)
    print("💾 Progress saved to embeddings_progress.json")

# Run the embedding extraction
if __name__ == "__main__":
    process_embeddings(chunked_data)


Processing Embeddings:   8%|▊         | 1/12 [00:11<02:10, 11.85s/batch]

💾 Progress saved to embeddings_progress.json


Processing Embeddings:  17%|█▋        | 2/12 [00:21<01:43, 10.32s/batch]

💾 Progress saved to embeddings_progress.json


Processing Embeddings:  25%|██▌       | 3/12 [00:30<01:28,  9.89s/batch]

💾 Progress saved to embeddings_progress.json


Processing Embeddings:  33%|███▎      | 4/12 [00:33<00:58,  7.35s/batch]

❌ Request failed: 503 Server Error: Service Unavailable for url: https://api-inference.huggingface.co/pipeline/feature-extraction/sentence-transformers/all-MiniLM-L6-v2
💾 Progress saved to embeddings_progress.json


Processing Embeddings:  42%|████▏     | 5/12 [00:39<00:47,  6.83s/batch]

❌ Request failed: 503 Server Error: Service Unavailable for url: https://api-inference.huggingface.co/pipeline/feature-extraction/sentence-transformers/all-MiniLM-L6-v2
💾 Progress saved to embeddings_progress.json


Processing Embeddings:  50%|█████     | 6/12 [00:44<00:36,  6.03s/batch]

❌ Request failed: 503 Server Error: Service Unavailable for url: https://api-inference.huggingface.co/pipeline/feature-extraction/sentence-transformers/all-MiniLM-L6-v2
💾 Progress saved to embeddings_progress.json


Processing Embeddings:  58%|█████▊    | 7/12 [00:56<00:40,  8.20s/batch]

💾 Progress saved to embeddings_progress.json


Processing Embeddings:  67%|██████▋   | 8/12 [01:03<00:31,  7.79s/batch]

💾 Progress saved to embeddings_progress.json


Processing Embeddings:  75%|███████▌  | 9/12 [01:14<00:26,  8.79s/batch]

💾 Progress saved to embeddings_progress.json


Processing Embeddings:  83%|████████▎ | 10/12 [01:31<00:22, 11.06s/batch]

💾 Progress saved to embeddings_progress.json


Processing Embeddings:  92%|█████████▏| 11/12 [01:40<00:10, 10.44s/batch]

💾 Progress saved to embeddings_progress.json


Processing Embeddings: 100%|██████████| 12/12 [01:42<00:00,  8.11s/batch]

💾 Progress saved to embeddings_progress.json


Processing Embeddings: 100%|██████████| 12/12 [01:44<00:00,  8.70s/batch]


✅ Hugging Face embeddings generated for all chunks!





5. Store to DB

In [21]:
import os
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec

# Load environment variables
load_dotenv()

# Retrieve Pinecone API key from environment variables
api_key = os.getenv("PINECONE_API_KEY")
if not api_key:
    raise ValueError("PINECONE_API_KEY is not set in the environment variables.")

# Initialize the Pinecone client
pc = Pinecone(api_key=api_key)

6. Create and Connect

In [22]:
# Define index parameters
index_name = "cs101-rag"
dimension = 384  # Dimensionality of MiniLM embeddings

# Check if the index already exists
if not pc.has_index(index_name):
    # Create a new index
    pc.create_index(
        name=index_name,
        dimension=dimension,
        metric="cosine",  # Using cosine similarity
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

# Connect to the index
index = pc.Index(index_name)

7. Upsert data

In [16]:
import json
import time
from tqdm import tqdm

PROGRESS_FILE = "embeddings_progress.json"

def load_embeddings():
    """Loads saved embeddings from progress file."""
    if not os.path.exists(PROGRESS_FILE):
        raise FileNotFoundError(f"❌ {PROGRESS_FILE} not found. Run embedding extraction first.")
    
    with open(PROGRESS_FILE, "r") as f:
        return json.load(f)

def upsert_embeddings(batch_size=100):
    """Upserts embeddings into Pinecone in batches."""
    embeddings_data = load_embeddings()
    upserted_count = 0

    total_batches = len(embeddings_data) // batch_size + (1 if len(embeddings_data) % batch_size != 0 else 0)

    with tqdm(total=total_batches, desc="Upserting into Pinecone", unit="batch") as pbar:
        for i in range(0, len(embeddings_data), batch_size):
            batch = embeddings_data[i:i + batch_size]

            # Filter out entries without embeddings
            valid_vectors = [
                {
                    "id": str(hash(data["text"])),  # Unique ID for each text
                    "values": data["embedding"],  # Embedding vector
                    "metadata": {
                        "source": data.get("source", "unknown"),  # Default to "unknown" if missing
                        "text": data["text"]
                    }
                }
                for data in batch if "embedding" in data and isinstance(data["embedding"], list)
            ]

            if not valid_vectors:
                pbar.update(1)
                continue
            
            # Upsert with retries
            success = upsert_with_retries(valid_vectors)
            if success:
                upserted_count += len(valid_vectors)

            pbar.update(1)  # Update progress bar
            time.sleep(0.5)  # Prevent hitting rate limits

    print(f"\n✅ Successfully upserted {upserted_count} vectors into Pinecone!")

def upsert_with_retries(vectors, retries=3):
    """Attempts to upsert into Pinecone with retry logic."""
    for attempt in range(retries):
        try:
            index.upsert(vectors=vectors)
            return True  # Success
        except Exception as e:
            print(f"⚠️ Upsert attempt {attempt + 1}/{retries} failed: {e}")
            time.sleep(2)  # Small delay before retrying
    
    print("❌ Upsert failed after all retries.")
    return False

# Run upsertion
if __name__ == "__main__":
    upsert_embeddings()


Upserting into Pinecone: 100%|██████████| 4/4 [00:07<00:00,  1.79s/batch]


✅ Successfully upserted 241 vectors into Pinecone!





8. Verify Data

In [25]:
import random
import os
from dotenv import load_dotenv
from pinecone import Pinecone

# Load environment variables
load_dotenv()

# Retrieve Pinecone API key from environment variables
pinecone_api = os.getenv("PINECONE_API_KEY")

# Initialize the Pinecone client
pc = Pinecone(api_key=pinecone_api)

# Connect to your index
index = pc.Index('cs101-rag')

def list_vector_count():
    """Lists the total number of vectors in the index."""
    stats = index.describe_index_stats()
    total_vectors = stats['total_vector_count']
    print(f"Total number of vectors in the index: {total_vectors}")

def fetch_random_vector():
    """Fetches a random vector ID from Pinecone to verify data integrity."""
    # List all vector IDs (this may need to be adjusted for large datasets)
    vector_ids = []
    for ids in index.list():
        vector_ids.extend(ids)
    
    if not vector_ids:
        print("No vectors found in the index.")
        return

    # Select a random vector ID
    random_id = random.choice(vector_ids)
    
    # Fetch the vector data
    vector_data = index.fetch(ids=[random_id])
    
    if vector_data and random_id in vector_data.vectors:
        vector_info = vector_data.vectors[random_id]
        print("\n✅ Sample Vector Retrieved:")
        print(f"ID: {random_id}")
        print(f"Values: {vector_info.values}")
        print(f"Metadata: {vector_info.metadata}")
    else:
        print("Vector not found.")

# Run verification
list_vector_count()
fetch_random_vector()


Total number of vectors in the index: 241

✅ Sample Vector Retrieved:
ID: 4668887154771494313
Values: [-0.0561828651, 0.039083492, 0.0347332954, -0.0338819, -0.0035442498, -0.0463199802, -0.0232972614, 0.0666727796, -0.0610934347, -0.0591209084, 0.040451929, 0.0116449352, -0.0427011177, 0.0234538037, -0.0651997477, -0.023923818, -0.055158861, 0.0596844964, 0.0120901512, -0.0279157441, 0.0945512578, 0.0292616468, -0.0812079385, 0.0168211386, 0.0416575447, -0.000626907102, -0.0589060634, 0.00426082546, 0.0620645173, -0.0142065203, 0.0886903778, 0.00592241203, 0.096780315, 0.0287935808, -0.00703592459, 0.0132592367, -0.0142796757, -0.0604704805, -0.0362280495, 0.0713307485, 0.048421666, 0.0144479871, -0.00939351134, 0.0382677913, 0.0035734491, 0.0347920842, -0.0164370481, -0.0183145572, -0.0890184864, 0.0489395373, -0.0500382781, 0.0920795947, -0.0782784373, 0.0781494603, 0.071398966, -0.0265639517, -0.0230601523, -0.0621126033, 0.0348485708, 0.0069211293, -0.00854103081, -0.033824306, 0.