In [5]:
pip install fitz

Note: you may need to restart the kernel to use updated packages.


In [1]:
pip install frontend

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install pymupdf

Collecting pymupdf
  Obtaining dependency information for pymupdf from https://files.pythonhosted.org/packages/ae/76/0757056bdcf273de4934681b84acde6e0b61a46b1755038e8d786ac6b368/pymupdf-1.25.4-cp39-abi3-macosx_11_0_arm64.whl.metadata
  Downloading pymupdf-1.25.4-cp39-abi3-macosx_11_0_arm64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.4-cp39-abi3-macosx_11_0_arm64.whl (18.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.6/18.6 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.25.4
Note: you may need to restart the kernel to use updated packages.


In [12]:
#4300 Notes is where the PDFs are stored, processed_texts is where txt files are saved to
PDF_FOLDER = "onboarding Documents"
OUTPUT_FOLDER = "processed_texts"

#Adjust chunks and chunk size here
CHUNK_SIZE = 50  
OVERLAP_SIZE = 30  

In [13]:
import os
import fitz

os.makedirs(OUTPUT_FOLDER, exist_ok=True)

#Extracting the text from PDF
def extract_text_from_pdf(pdf_path):
    text = ""
    doc = fitz.open(pdf_path)
    for page in doc:
        text += page.get_text("text") + "\n"
    return text.strip()

#Remove extra spaces, newlines and non-ASCII
def clean_text(text): 
    text = text.replace("\n", " ").replace("\t", " ") 
    text = " ".join(text.split())  
    return text

#Split text into chunks with overlap
def chunk_text(text, chunk_size=500, overlap=50):
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size - overlap):
        chunk = " ".join(words[i:i + chunk_size])
        chunks.append(chunk)
    return chunks

def process_pdfs():
    for filename in os.listdir(PDF_FOLDER):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(PDF_FOLDER, filename)
            text = extract_text_from_pdf(pdf_path)
            text = clean_text(text) 
            chunks = chunk_text(text, chunk_size=CHUNK_SIZE, overlap=OVERLAP_SIZE) 
            
            #Save chunks as separate file
            for idx, chunk in enumerate(chunks):
                chunk_filename = f"{filename.replace('.pdf', '')}_chunk{idx}.txt"
                output_path = os.path.join(OUTPUT_FOLDER, chunk_filename)
                
                with open(output_path, "w", encoding="utf-8") as f:
                    f.write(chunk)
                
            print(f"Processed {filename} → {len(chunks)} chunks saved.")

process_pdfs()

Processed B-trees.pdf → 42 chunks saved.
Processed Document DBs and Mongo.pdf → 55 chunks saved.
Processed NoSQL Documentation.pdf → 67 chunks saved.
Processed AWS Intro.pdf → 39 chunks saved.
Processed MongoDB Documentation.pdf → 85 chunks saved.
Processed BST_hw.pdf → 30 chunks saved.
Processed EC2 & Lambda.pdf → 29 chunks saved.
Processed MongoDB Examples.pdf → 36 chunks saved.
Processed Neo4j.pdf → 33 chunks saved.
Processed MongoDB Aggregation.pdf → 16 chunks saved.
Processed B-Trees — CS3 Data Structures & Algorithms.pdf → 180 chunks saved.
Processed Redis + Python.pdf → 22 chunks saved.
Processed Introduction to Graph Data Model.pdf → 30 chunks saved.
Processed BST.pdf → 92 chunks saved.
Processed Data Replication.pdf → 47 chunks saved.
Processed NoSQL Intro + KV DBs.pdf → 89 chunks saved.
Processed B+Tree Walkthrough.pdf → 22 chunks saved.
Processed Foundations.pdf → 41 chunks saved.
Processed Moving Beyond the Relational Model.pdf → 60 chunks saved.
Processed PyMongo.pdf → 9 c

# USING TEST TRANSFORMER TO TEST QUERIES FOR CHUNK SIZE OPTIMIZATION

In [14]:
pip install sentence-transformers scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [15]:
import os
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load your text chunks (from processed_texts folder)
OUTPUT_FOLDER = "processed_texts"  # Folder containing the chunks
chunks = []

for filename in os.listdir(OUTPUT_FOLDER):
    if filename.endswith(".txt"):
        with open(os.path.join(OUTPUT_FOLDER, filename), 'r', encoding='utf-8') as f:
            chunks.append(f.read())

# Use a sentence transformer model for embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for all text chunks
chunk_embeddings = model.encode(chunks, convert_to_tensor=True)

# Function to run queries and get the most similar chunks
def run_query(query):
    # Encode the query into a vector
    query_embedding = model.encode([query], convert_to_tensor=True)
    
    # Compute cosine similarity between the query and the text chunks
    similarities = cosine_similarity(query_embedding.cpu().detach().numpy(), chunk_embeddings.cpu().detach().numpy())
    
    # Get the index of the most similar chunk
    most_similar_idx = np.argmax(similarities)
    most_similar_chunk = chunks[most_similar_idx]
    
    print(f"Most similar chunk for query '{query}':\n")
    print(most_similar_chunk)
    print(f"\nSimilarity Score: {similarities[0][most_similar_idx]}")
    
# Example query
run_query("How to write a NoSQL Query")

RuntimeError: Failed to import transformers.integrations.integration_utils because of the following error (look up to see its traceback):
Failed to import transformers.modeling_tf_utils because of the following error (look up to see its traceback):
Your currently installed version of Keras is Keras 3, but this is not yet supported in Transformers. Please install the backwards-compatible tf-keras package with `pip install tf-keras`.