In [5]:
pip install fitz

Note: you may need to restart the kernel to use updated packages.


In [1]:
pip install frontend

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install pymupdf

Collecting pymupdf
  Obtaining dependency information for pymupdf from https://files.pythonhosted.org/packages/ae/76/0757056bdcf273de4934681b84acde6e0b61a46b1755038e8d786ac6b368/pymupdf-1.25.4-cp39-abi3-macosx_11_0_arm64.whl.metadata
  Downloading pymupdf-1.25.4-cp39-abi3-macosx_11_0_arm64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.4-cp39-abi3-macosx_11_0_arm64.whl (18.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.6/18.6 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.25.4
Note: you may need to restart the kernel to use updated packages.


In [12]:
#4300 Notes is where the PDFs are stored, processed_texts is where txt files are saved to
PDF_FOLDER = "4300 Notes"
OUTPUT_FOLDER = "processed_texts"

#Adjust chunks and chunk size here
CHUNK_SIZE = 50  
OVERLAP_SIZE = 30  

In [13]:
import os
import fitz

os.makedirs(OUTPUT_FOLDER, exist_ok=True)

#Extracting the text from PDF
def extract_text_from_pdf(pdf_path):
    text = ""
    doc = fitz.open(pdf_path)
    for page in doc:
        text += page.get_text("text") + "\n"
    return text.strip()

#Remove extra spaces, newlines and non-ASCII
def clean_text(text): 
    text = text.replace("\n", " ").replace("\t", " ") 
    text = " ".join(text.split())  
    return text

#Split text into chunks with overlap
def chunk_text(text, chunk_size=500, overlap=50):
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size - overlap):
        chunk = " ".join(words[i:i + chunk_size])
        chunks.append(chunk)
    return chunks

def process_pdfs():
    for filename in os.listdir(PDF_FOLDER):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(PDF_FOLDER, filename)
            text = extract_text_from_pdf(pdf_path)
            text = clean_text(text) 
            chunks = chunk_text(text, chunk_size=CHUNK_SIZE, overlap=OVERLAP_SIZE) 
            
            #Save chunks as separate file
            for idx, chunk in enumerate(chunks):
                chunk_filename = f"{filename.replace('.pdf', '')}_chunk{idx}.txt"
                output_path = os.path.join(OUTPUT_FOLDER, chunk_filename)
                
                with open(output_path, "w", encoding="utf-8") as f:
                    f.write(chunk)
                
            print(f"Processed {filename} → {len(chunks)} chunks saved.")

process_pdfs()

Processed B-trees.pdf → 42 chunks saved.
Processed Document DBs and Mongo.pdf → 55 chunks saved.
Processed NoSQL Documentation.pdf → 67 chunks saved.
Processed AWS Intro.pdf → 39 chunks saved.
Processed MongoDB Documentation.pdf → 85 chunks saved.
Processed BST_hw.pdf → 30 chunks saved.
Processed EC2 & Lambda.pdf → 29 chunks saved.
Processed MongoDB Examples.pdf → 36 chunks saved.
Processed Neo4j.pdf → 33 chunks saved.
Processed MongoDB Aggregation.pdf → 16 chunks saved.
Processed B-Trees — CS3 Data Structures & Algorithms.pdf → 180 chunks saved.
Processed Redis + Python.pdf → 22 chunks saved.
Processed Introduction to Graph Data Model.pdf → 30 chunks saved.
Processed BST.pdf → 92 chunks saved.
Processed Data Replication.pdf → 47 chunks saved.
Processed NoSQL Intro + KV DBs.pdf → 89 chunks saved.
Processed B+Tree Walkthrough.pdf → 22 chunks saved.
Processed Foundations.pdf → 41 chunks saved.
Processed Moving Beyond the Relational Model.pdf → 60 chunks saved.
Processed PyMongo.pdf → 9 c

# USING TEST TRANSFORMER TO TEST QUERIES FOR CHUNK SIZE OPTIMIZATION

In [14]:
pip install sentence-transformers scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [15]:
import os
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load your text chunks (from processed_texts folder)
OUTPUT_FOLDER = "processed_texts"  # Folder containing the chunks
chunks = []

for filename in os.listdir(OUTPUT_FOLDER):
    if filename.endswith(".txt"):
        with open(os.path.join(OUTPUT_FOLDER, filename), 'r', encoding='utf-8') as f:
            chunks.append(f.read())

# Use a sentence transformer model for embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for all text chunks
chunk_embeddings = model.encode(chunks, convert_to_tensor=True)

# Function to run queries and get the most similar chunks
def run_query(query):
    # Encode the query into a vector
    query_embedding = model.encode([query], convert_to_tensor=True)
    
    # Compute cosine similarity between the query and the text chunks
    similarities = cosine_similarity(query_embedding.cpu().detach().numpy(), chunk_embeddings.cpu().detach().numpy())
    
    # Get the index of the most similar chunk
    most_similar_idx = np.argmax(similarities)
    most_similar_chunk = chunks[most_similar_idx]
    
    print(f"Most similar chunk for query '{query}':\n")
    print(most_similar_chunk)
    print(f"\nSimilarity Score: {similarities[0][most_similar_idx]}")
    
# Example query
run_query("How to write a NoSQL Query")

RuntimeError: Failed to import transformers.integrations.integration_utils because of the following error (look up to see its traceback):
Failed to import transformers.modeling_tf_utils because of the following error (look up to see its traceback):
Your currently installed version of Keras is Keras 3, but this is not yet supported in Transformers. Please install the backwards-compatible tf-keras package with `pip install tf-keras`.

In [8]:
import ollama
import redis
import numpy as np
import os
from redis.commands.search.query import Query
from transformers import AutoModel

# Initialize Redis connection
redis_client = redis.Redis(host="localhost", port=6379, db=0)

# set constants
VECTOR_DIM = 768
INDEX_NAME = "embedding_index"
DOC_PREFIX = "doc:"
DISTANCE_METRIC = "COSINE"
TEXT_FOLDER = "processed_texts"  
selected_model = None
jina_model = None
selected_llm_model = None

#clear redis database if reindexing
def create_hnsw_index():
    try:
        redis_client.execute_command(f"FT.DROPINDEX {INDEX_NAME} DD")
    except redis.exceptions.ResponseError:
        pass
    
    redis_client.execute_command(
        f"""
        FT.CREATE {INDEX_NAME} ON HASH PREFIX 1 {DOC_PREFIX}
        SCHEMA text TEXT
        embedding VECTOR HNSW 6 DIM {VECTOR_DIM} TYPE FLOAT32 DISTANCE_METRIC {DISTANCE_METRIC}
        """
    )
    print("Index created successfully.")



def get_embedding(text: str) -> list:
    """
    Generate an embedding for the given text using the selected embedding model.

    This function uses either the Jina embeddings model or the Ollama embeddings
    model based on the global EMBEDDING_MODEL setting.

    Parameters:
    text (str): The input text to be embedded.

    Returns:
    list: A list of floats representing the embedding vector for the input text.
    """
    if EMBEDDING_MODEL == "jina-embeddings-v2-base-en":
        return jina_model.encode([text])[0].tolist()
    else:
        response = ollama.embeddings(model=EMBEDDING_MODEL, prompt=text)
        return response["embedding"]


def store_embedding(doc_id: str, text: str, embedding: list):
    key = f"{DOC_PREFIX}{doc_id}"
    redis_client.hset(
        key,
        mapping={
            "text": text,
            "embedding": np.array(embedding, dtype=np.float32).tobytes(),  # Store as byte array
        },
    )
    print(f"Stored embedding for: {doc_id}")

def process_text_files():
    """
    This function processes all text files in the specified folder, reads their content,
    generates embeddings for the text using the selected embedding model, and stores the
    embeddings along with the text content in Redis.

    """
    if not os.path.exists(TEXT_FOLDER):
        print(f"Folder '{TEXT_FOLDER}' not found.")
        return

    text_files = [f for f in os.listdir(TEXT_FOLDER) if f.endswith(".txt")]
    if not text_files:
        print("No text files found.")
        return

    for filename in text_files:
        filepath = os.path.join(TEXT_FOLDER, filename)
        with open(filepath, "r", encoding="utf-8") as file:
            text = file.read()
            embedding = get_embedding(text)
            store_embedding(filename, text, embedding)

def query_llm(query: str, matching_chunks: list) -> str:
    """
    Query the Language Model (LLM) with a given question and relevant context.

    This function prepares a prompt by combining the user's query and relevant context
    from matching chunks. It then sends this prompt to the LLM for processing and returns
    the model's response.

    Parameters:
    query (str): The user's question or input to be answered by the LLM.
    matching_chunks (list): A list of text chunks that provide relevant context for the query.
    """
    context = "\n\n".join([f"Chunk {i+1}: {chunk}" for i, chunk in enumerate(matching_chunks)])
    prompt_to_send = (
        f"User's Question: {query}\n\n"
        f"Relevant Context (if applicable):\n{context}\n\n"
        "Your task: Answer the user's question as clearly and accurately as possible."
        "If the question is unclear or not actually a question, state that explicitly."
    )
    response = ollama.chat(
        model=LLM_MODEL,
        messages=[
            {"role": "system", "content": "You are an AI assistant with expertise in computer science."},
            {"role": "user", "content": prompt_to_send}
        ],
    )
    return response["message"]["content"]

def perform_knn_search(query_text: str, k: int = 2):
    """
    Perform a K-Nearest Neighbors (KNN) search on the Redis index using the given query text.

    This function embeds the query text, searches for similar embeddings in the Redis index,
    retrieves matching text chunks, and generates a response using a language model.

    Parameters:
    query_text (str): The text query to search for in the index.
    k (int, optional): The number of nearest neighbors to retrieve. Defaults to 2.

    """
    embedding = get_embedding(query_text)
    q = (
        Query(f"*=>[KNN {k} @embedding $vec AS vector_distance]")
        .sort_by("vector_distance")
        .return_fields("text", "vector_distance")
        .dialect(2)
    )
    res = redis_client.ft(INDEX_NAME).search(
        q, query_params={"vec": np.array(embedding, dtype=np.float32).tobytes()}
    )
    matching_chunks = [doc.text for doc in res.docs]
    if not matching_chunks:
        print("No relevant matches found.")
        return
    print(f"\nTop {len(matching_chunks)} matching chunks retrieved:")
    for i, chunk in enumerate(matching_chunks):
        print(f"\nChunk {i+1}: {chunk[:300]}...")  # Display first 300 characters
    response = query_llm(query_text, matching_chunks)
    print(f"\nResponse from {LLM_MODEL}:\n{response}\n")


# Prompt user to select an embedding model
embedding_models = {
    "1": "nomic-embed-text",
    "2": "jina-embeddings-v2-base-en",
    "3": "granite-embedding:278m",
}

print("Select an embedding model:")
for key, model in embedding_models.items():
    print(f"{key}: {model}")

while selected_model not in embedding_models:
    selected_model = input("Enter the number corresponding to your choice: ")

EMBEDDING_MODEL = embedding_models[selected_model]

# If Jina embeddings are selected, load the model
if EMBEDDING_MODEL == "jina-embeddings-v2-base-en":
    jina_model = AutoModel.from_pretrained("jinaai/jina-embeddings-v2-base-en", trust_remote_code=True)

# Prompt user to select an LLM model
llm_models = {
    "1": "llama3.2:latest",
    "2": "mistral",
}

print("Select an LLM model:")
for key, model in llm_models.items():
    print(f"{key}: {model}")

while selected_llm_model not in llm_models:
    selected_llm_model = input("Enter the number corresponding to your choice: ")

LLM_MODEL = llm_models[selected_llm_model]
print(f"Using LLM model: {LLM_MODEL}")

if __name__ == "__main__":
    create_hnsw_index()
    process_text_files()
    query = "What is redis?"
    perform_knn_search(query)


None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


Select an embedding model:
1: nomic-embed-text
2: jina-embeddings-v2-base-en
3: granite-embedding:278m
Enter the number corresponding to your choice: 1
Select an LLM model:
1: llama3.2:latest
2: mistral
Enter the number corresponding to your choice: 1
Using LLM model: llama3.2:latest


ResponseError: unknown command 'FT.CREATE', with args beginning with: 'embedding_index' 'ON' 'HASH' 'PREFIX' '1' 'doc:' 'SCHEMA' 'text' 'TEXT' 'embedding' 'VECTOR' 'HNSW' '6' 'DIM' '768' 'TYPE' 'FLOA' 

In [7]:
pip install transformers

Collecting transformers
  Downloading transformers-4.50.0-py3-none-any.whl (10.2 MB)
     --------------------------------------- 10.2/10.2 MB 22.5 MB/s eta 0:00:00
Collecting safetensors>=0.4.3
  Downloading safetensors-0.5.3-cp38-abi3-win_amd64.whl (308 kB)
     ------------------------------------- 308.9/308.9 kB 18.7 MB/s eta 0:00:00
Collecting huggingface-hub<1.0,>=0.26.0
  Downloading huggingface_hub-0.29.3-py3-none-any.whl (468 kB)
     ------------------------------------- 469.0/469.0 kB 28.7 MB/s eta 0:00:00
Collecting tokenizers<0.22,>=0.21
  Downloading tokenizers-0.21.1-cp39-abi3-win_amd64.whl (2.4 MB)
     ---------------------------------------- 2.4/2.4 MB 51.6 MB/s eta 0:00:00
Collecting fsspec>=2023.5.0
  Downloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
     ---------------------------------------- 193.6/193.6 kB ? eta 0:00:00
Installing collected packages: safetensors, fsspec, huggingface-hub, tokenizers, transformers
  Attempting uninstall: fsspec
    Found exist