### Part 1
This section covers everything you need to set up your environment, process your PDF files, and load them into a Chroma vector database.

In [None]:
!pip install -U transformers accelerate bitsandbytes -q
!pip install -U pypdf -q
!pip install -U chromadb -q
!pip install -U sentence-transformers -q
!pip install -U langchain -q

import os
import chromadb
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer

In [None]:
# ==============================================================================
# SECTION 1: SETUP, PDF PROCESSING, AND VECTOR DATABASE POPULATION
# ==============================================================================
# This script follows Phases 1 and 2 of the project roadmap.
# It handles:
# 1. Installation of required libraries.
# 2. Setting up a directory for your PDF files.
# 3. Processing the PDFs: loading, chunking text.
# 4. Initializing ChromaDB.
# 5. Generating vector embeddings and storing them in ChromaDB.
# ==============================================================================

# --- 1.2 Set Up Data Directory ---
# Folder the PDF files are read from; it is created the first time this cell runs.
PDF_DIRECTORY = "/content/hdm_pdfs/"
if os.path.exists(PDF_DIRECTORY):
    print(f"Directory already exists: {PDF_DIRECTORY}")
else:
    os.makedirs(PDF_DIRECTORY)
    print(f"Directory created: {PDF_DIRECTORY}")


In [None]:
# --- 2.2 Process PDFs ---
def process_all_pdfs(directory_path):
    """
    Loads all PDF files from a directory, extracts text, and splits it into chunks.

    Files are matched on the ``.pdf`` extension case-insensitively and are
    processed in sorted filename order, so the order of the returned chunks is
    the same on every run. A file that raises while being read or split is
    reported and skipped; the remaining files are still processed.

    Args:
        directory_path (str): The path to the directory containing PDF files.

    Returns:
        list: A list of text chunks (documents).
        list: A list of corresponding metadata for each chunk. Each entry is a
            dict with 'source' (the PDF file name) and 'chunk_id' (the chunk's
            index within that file).
    """
    # Sort for reproducible ordering (os.listdir order is arbitrary) and match
    # the extension case-insensitively so "REPORT.PDF" is not silently skipped.
    pdf_files = sorted(
        f for f in os.listdir(directory_path) if f.lower().endswith(".pdf")
    )

    if not pdf_files:
        print(f"No PDF files found in {directory_path}.")
        return [], []

    print(f"Found {len(pdf_files)} PDF(s).")

    # Built only once we know there is something to split.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=150   # The number of characters to overlap between chunks
    )

    all_chunks = []
    all_metadata = []

    for pdf_file in pdf_files:
        try:
            file_path = os.path.join(directory_path, pdf_file)

            # Read the PDF page by page; extract_text() may yield nothing for a page.
            reader = PdfReader(file_path)
            page_texts = [page.extract_text() or "" for page in reader.pages]

            # Join pages with a newline so the last word of one page is not
            # glued to the first word of the next.
            pdf_text = "\n".join(text for text in page_texts if text)

            if not pdf_text.strip():
                print(f"Warning: no extractable text found in {pdf_file}; skipping.")
                continue

            # Split the text into chunks
            chunks = text_splitter.split_text(pdf_text)

            # Create metadata for each chunk
            for i, chunk in enumerate(chunks):
                all_chunks.append(chunk)
                all_metadata.append({'source': pdf_file, 'chunk_id': i})

        except Exception as e:
            # Best effort: one unreadable file must not abort the whole batch.
            print(f"Error processing {pdf_file}: {e}")

    print(f"\n PDF processing complete. Total chunks created: {len(all_chunks)}")
    return all_chunks, all_metadata

# NOTE: Make sure you have uploaded your PDFs into PDF_DIRECTORY before running this cell.
# `documents` and `metadatas` are the two lists returned by process_all_pdfs
# (text chunks and their per-chunk metadata); the ChromaDB indexing step consumes them.
documents, metadatas = process_all_pdfs(PDF_DIRECTORY)


In [None]:
# --- 2.1 & 2.4 Set Up ChromaDB and Index Data ---
def setup_and_populate_chromadb(docs, metadata_list, batch_size=1000):
    """
    Initializes ChromaDB, generates embeddings for documents, and populates the database.

    Records are written with ``upsert`` under the ids ``doc_0 .. doc_{n-1}``, so
    re-running this function after changing the PDFs overwrites the previously
    stored entries that share an id instead of leaving the old text in place.
    NOTE(review): ids beyond the new ``len(docs)`` from an earlier, larger run
    are not removed here; delete the collection first if a clean rebuild is needed.

    Args:
        docs (list): The list of text chunks.
        metadata_list (list): The list of metadata corresponding to the chunks.
            Must have the same length as ``docs``.
        batch_size (int): Maximum number of records sent to ChromaDB per write.

    Returns:
        The populated ChromaDB collection, or None when ``docs`` is empty.

    Raises:
        ValueError: If ``docs`` and ``metadata_list`` differ in length, or if
            ``batch_size`` is smaller than 1.
    """
    if not docs:
        print("No documents to process for ChromaDB.")
        return None

    # Fail before the expensive model load rather than midway through indexing.
    if len(docs) != len(metadata_list):
        raise ValueError(
            f"docs and metadata_list must have the same length "
            f"(got {len(docs)} and {len(metadata_list)})."
        )
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1.")

    print("\n--- Setting up ChromaDB and Populating ---")

    # Initialize ChromaDB client. Using a persistent client to save data in the Colab environment.
    client = chromadb.PersistentClient(path="/content/chroma_db")

    # --- 2.3 Generate Embeddings ---
    print("Loading sentence-transformer model 'all-MiniLM-L6-v2'...")
    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

    embeddings = embedding_model.encode(docs, show_progress_bar=True)
    # Hand ChromaDB plain Python lists, the same form the query path uses.
    if hasattr(embeddings, "tolist"):
        embeddings = embeddings.tolist()

    # Create a ChromaDB collection
    # Using get_or_create to avoid errors if the collection already exists
    collection_name = "hdm_collection"
    collection = client.get_or_create_collection(name=collection_name)

    # --- 2.4 Index Data in Chroma ---
    # We need to create unique IDs for each entry.
    ids = [f"doc_{i}" for i in range(len(docs))]

    # Write in slices so a large corpus does not exceed the per-call record limit.
    for start in range(0, len(ids), batch_size):
        end = start + batch_size
        collection.upsert(
            embeddings=embeddings[start:end],
            documents=docs[start:end],
            metadatas=metadata_list[start:end],
            ids=ids[start:end]
        )

    # Verify the number of items in the collection
    count = collection.count()
    print(f"\n ChromaDB setup complete. Collection '{collection_name}' now contains {count} items.")
    return collection

# Build the index only when PDF processing actually produced chunks.
if not documents:
    print("\nSkipping ChromaDB setup because no documents were processed.")
else:
    hdm_collection = setup_and_populate_chromadb(documents, metadatas)

## SECTION 2: LLM INTEGRATION AND RAG CORE

This script follows Phase 3 of the project roadmap.
 It handles:
 1. Installation of the 'transformers' library for the LLM.
 2. Loading the Qwen chat model (`Qwen/Qwen1.5-1.8B-Chat`) and its tokenizer.
 3. Building the core RAG logic to query the database and generate answers.

In [None]:
# --- 1.1 Install Additional Libraries ---
# We need the transformers library from Hugging Face to run the Qwen model,
# as well as accelerate and bitsandbytes for memory optimization.
#!pip install -U transformers accelerate -q
#pip install -U bitsandbytes # Added this line to ensure the latest version is installed

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import chromadb
from sentence_transformers import SentenceTransformer

# --- 3.1 Set Up the Qwen LLM ---
# We will load the model and tokenizer from Hugging Face.
# The checkpoint used here is Qwen1.5-1.8B-Chat (see model_name below).

model_name = "Qwen/Qwen1.5-1.8B-Chat"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Define the quantization configuration:
# weights are loaded in 4-bit, with float16 as the compute dtype.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)

# device_map="auto" will automatically use the GPU if available.
# NOTE(review): 4-bit loading via bitsandbytes presumably needs a GPU runtime — confirm
# before running this cell on a CPU-only session.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto",
    quantization_config=quantization_config # Pass the config object here
)
print("model loaded successfully.")

In [None]:
# --- Re-initialize necessary components from Part 1 ---
# This ensures that if you run this cell independently, it can reconnect
# to the database and reload the embedding model.
#
# On failure the error is printed and execution continues: any of `client`,
# `hdm_collection` or `embedding_model` not yet assigned in this cell keeps whatever
# value (if any) an earlier cell gave it. ask_hdm_expert checks for the last two
# before using them.

try:
    # Connect to the existing ChromaDB client (same path and collection name as Part 1)
    client = chromadb.PersistentClient(path="/content/chroma_db")
    hdm_collection = client.get_collection(name="hdm_collection")

    # Load the embedding model (must be the same model that was used for indexing)
    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

except Exception as e:
    print(f"Error re-initializing: {e}")
    print("ensure you have run Part 1 successfully before this cell.")

In [None]:
# --- 3.2 Build the RAG Pipeline ---
def ask_hdm_expert(query, k=5):
    """
    Performs the full RAG pipeline: embeds query, retrieves context, and generates an answer.

    Relies on notebook-level globals created by earlier cells: ``hdm_collection``
    and ``embedding_model`` for retrieval, ``model`` and ``tokenizer`` for generation.

    Args:
        query (str): The user's question about the HDM model.
        k (int): The maximum number of relevant chunks to retrieve from the database.
            It is capped at the number of items stored in the collection.

    Returns:
        dict: Always a dictionary with two keys, so callers can index the result
        the same way on success and on failure:
            "answer" (str): The generated answer, or a message starting with
                "Error:" when a prerequisite is missing.
            "sources" (list): Metadata dicts of the retrieved chunks
                (empty when no retrieval took place).
    """
    if 'hdm_collection' not in globals() or 'embedding_model' not in globals():
        return {
            "answer": "Error: ChromaDB collection or embedding model not initialized. Please run Part 1.",
            "sources": [],
        }

    if 'model' not in globals() or 'tokenizer' not in globals():
        return {
            "answer": "Error: LLM model or tokenizer not loaded. Please run the model loading cell first.",
            "sources": [],
        }

    # Never ask for more neighbours than the collection holds.
    available = hdm_collection.count()
    if available == 0:
        return {
            "answer": "Error: The ChromaDB collection is empty. Please index your PDFs in Part 1.",
            "sources": [],
        }

    # 1. & 2. Embed the user's query
    query_embedding = embedding_model.encode(query)

    # 3. Similarity Search in ChromaDB
    results = hdm_collection.query(
        query_embeddings=[query_embedding.tolist()],
        n_results=min(k, available),
        include=['documents', 'metadatas'] # Ensure metadata is included
    )
    # Results are nested per query; a single query was sent, so take index 0.
    retrieved_docs = results['documents'][0]
    retrieved_metadatas = results['metadatas'][0]

    # Format the retrieved documents into a single context string
    context = "\n\n---\n\n".join(retrieved_docs)

    # 4. Context Augmentation: Create the prompt for the LLM

    # TODO: MAKE SURE TO CHANGE TEMPLATE BACK TO HDM CONTEXT!!!
    prompt_template = f"""
You are an expert research assistant specializing in american football.
Answer the following question based *only* on the provided context in the vector database.
If the context provided does not explicitly contain the answer, state that you cannot answer based on the provided information.

**Context:**
{context}

**Question:**
{query}

**Answer:**
"""

    # 5. LLM Generation
    print("Generating an answer...")

    # Use the chat template for better performance with chat-tuned models
    # TODO: REMEMBER TO CHANGE BACK TO HDM CONTEXT!!!!!
    messages = [
        {"role": "system", "content": "You are a helpful expert on american football."},
        {"role": "user", "content": prompt_template}
    ]

    # Set the pad_token to the eos_token if it's not already set
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Render the chat messages into one prompt string (not yet tokenized).
    formatted_prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Tokenize the formatted string and move the tensors to the model's device.
    model_inputs = tokenizer(
        formatted_prompt,
        return_tensors="pt"
    ).to(model.device)

    # model_inputs is unpacked as keyword arguments for generate()
    outputs = model.generate(
        **model_inputs,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        top_p=0.95
    )

    # Drop the prompt tokens so only the newly generated text is decoded.
    prompt_length = model_inputs['input_ids'].shape[1]
    response_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    return {"answer": response_text, "sources": retrieved_metadatas}


In [None]:
# --- Example Usage ---
# Runs one question through the RAG pipeline and prints the answer followed by the
# source file and chunk id of every retrieved chunk.
example_query = "What are the different ways i can score points in football?"
result = ask_hdm_expert(example_query)

print("\n" + "="*50)
print(f"Query: {example_query}")
print("\n" + "-"*50)
# `result` is indexed by key here, so ask_hdm_expert must return a dict.
print(f"Answer: {result['answer']}")
print("\n" + "-"*50)
print("Sources Used by the Model:")
# Each source is the metadata dict stored with the chunk ('source', 'chunk_id').
for i, source in enumerate(result['sources']):
    print(f"  Source {i+1}: From '{source['source']}' (Chunk ID: {source['chunk_id']})")
print("="*50)