<a href="https://colab.research.google.com/github/sebastianruizsebas/NeuroAI-HackNation/blob/main/mixtral_8x7b.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import json
import os
from html.parser import HTMLParser
from typing import List

import faiss
import fitz  # PyMuPDF for PDFs
import numpy as np
import openai

# Prefer the OPENAI_API_KEY environment variable so the secret never has to be
# written into (and committed with) this file. The placeholder is kept as the
# fallback so the previous "edit the literal" workflow still works.
openai.api_key = os.environ.get("OPENAI_API_KEY", "YOUR_OPENAI_API_KEY")

DATA_DIR = "./courses"  # root folder that is walked for PDF/HTML course files
EMBEDDINGS_FILE = os.path.join(DATA_DIR, "external_embeddings.jsonl")  # one JSON entry per line
INDEX_FILE = "courses.index"  # serialized FAISS index
METADATA_FILE = "courses_metadata.json"  # chunk metadata, parallel to the index rows
CHUNK_SIZE = 3000  # maximum chunk length, in characters
CHUNK_OVERLAP = 200  # characters shared by two consecutive chunks
EMBEDDING_MODEL = "text-embedding-ada-002"
EMBEDDING_DIM = 1536  # ada-002 embeddings
TOP_K = 5  # no. chunks to retrieve per query

# Extraction Functions

def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the concatenated text of every page of the PDF at *pdf_path*.

    Extraction is best-effort: on any failure (missing file, unreadable
    document, ...) the error is printed and an empty string is returned so
    that a batch run can skip the file and continue.
    """
    try:
        # The context manager closes the document, and the file handle behind
        # it, even when extracting a page raises.
        with fitz.open(pdf_path) as doc:
            return "".join(page.get_text() for page in doc)
    except Exception as e:  # deliberate catch-all: one bad file must not abort the batch
        print(f"Failed to extract {pdf_path}: {e}")
        return ""

class _HTMLTextExtractor(HTMLParser):
    """Collect the human-readable text of an HTML document.

    Markup is dropped, character references are decoded, and everything inside
    ``<script>`` and ``<style>`` elements is ignored.
    """

    _IGNORED_TAGS = frozenset({"script", "style"})
    # Elements that visually separate text. A newline is emitted around them
    # so that neighbouring blocks do not fuse into a single word.
    _BREAKING_TAGS = frozenset({
        "address", "article", "aside", "blockquote", "br", "dd", "div", "dl",
        "dt", "figcaption", "figure", "footer", "h1", "h2", "h3", "h4", "h5",
        "h6", "header", "hr", "li", "main", "nav", "ol", "p", "pre",
        "section", "table", "td", "th", "title", "tr", "ul",
    })

    def __init__(self) -> None:
        super().__init__(convert_charrefs=True)
        self._parts: List[str] = []
        self._ignored_depth = 0  # > 0 while inside <script>/<style>

    def handle_starttag(self, tag, attrs):
        if tag in self._IGNORED_TAGS:
            self._ignored_depth += 1
        elif tag in self._BREAKING_TAGS:
            self._parts.append("\n")

    def handle_endtag(self, tag):
        if tag in self._IGNORED_TAGS:
            # Guard against a stray closing tag driving the depth negative.
            if self._ignored_depth:
                self._ignored_depth -= 1
        elif tag in self._BREAKING_TAGS:
            self._parts.append("\n")

    def handle_data(self, data):
        if not self._ignored_depth:
            self._parts.append(data)

    def get_text(self) -> str:
        """Return all text collected so far as one string."""
        return "".join(self._parts)


def extract_text_from_html(html_path: str) -> str:
    """Return the visible text of the UTF-8 HTML file at *html_path*.

    Tags are removed, character references such as ``&amp;`` are decoded and
    the contents of ``<script>``/``<style>`` are skipped, so downstream
    chunking and embedding operate on prose rather than on markup.

    Extraction is best-effort: on any failure (missing file, undecodable
    bytes, ...) the error is printed and an empty string is returned.
    """
    try:
        with open(html_path, 'r', encoding='utf-8') as f:
            markup = f.read()
        extractor = _HTMLTextExtractor()
        extractor.feed(markup)
        # close() flushes text the parser is still buffering at end of input.
        extractor.close()
        return extractor.get_text()
    except Exception as e:  # deliberate catch-all: one bad file must not abort the batch
        print(f"Failed to extract {html_path}: {e}")
        return ""

# Chunking

def chunk_text(text: str, max_len: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[str]:
    """Split *text* into overlapping character windows.

    Args:
        text: The string to split.
        max_len: Maximum length of a chunk, in characters. Must be positive.
        overlap: Number of characters shared by two consecutive chunks.
            Must satisfy ``0 <= overlap < max_len``.

    Returns:
        The chunks in document order; an empty list for empty *text*.

    Raises:
        ValueError: If *max_len* or *overlap* is out of range. An overlap at
            least as large as the window would otherwise never advance and
            loop forever.
    """
    if max_len <= 0:
        raise ValueError(f"max_len must be positive, got {max_len}")
    if not 0 <= overlap < max_len:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < max_len, got overlap={overlap}, max_len={max_len}"
        )

    step = max_len - overlap
    chunks = []
    start = 0
    while start < len(text):
        end = min(start + max_len, len(text))
        chunks.append(text[start:end])
        if end == len(text):
            # The text is fully covered. Advancing again would only emit a
            # tail that is already contained in this chunk's overlap region.
            break
        start += step
    return chunks

# Embedding

def get_embedding(text: str) -> List[float]:
    """Embed *text* with the configured OpenAI embedding model.

    Issues one request to ``openai.Embedding.create`` using EMBEDDING_MODEL
    and returns the ``embedding`` field of the first item in the response.
    """
    api_result = openai.Embedding.create(model=EMBEDDING_MODEL, input=text)
    first_item = api_result['data'][0]
    return first_item['embedding']

# Processing raw files into embeddings

def process_and_embed_data(data_dir: str):
    """Embed every PDF/HTML file under *data_dir* and save the result as JSONL.

    The directory tree is walked recursively. Each supported file is converted
    to text, split with ``chunk_text`` and every chunk is embedded with
    ``get_embedding``. One JSON object per chunk (``source_file``,
    ``chunk_index``, ``text``, ``embedding``) is written to EMBEDDINGS_FILE,
    replacing any previous content. Files with other extensions, and files
    that yield no text, are skipped.

    Args:
        data_dir: Root directory to scan for course material.
    """
    # Make sure the output directory exists *before* any embedding call is
    # made; otherwise the final write could fail after all the API work is done.
    output_dir = os.path.dirname(EMBEDDINGS_FILE)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    entries = []
    for root, _, files in os.walk(data_dir):
        for file in files:
            path = os.path.join(root, file)
            # splitext only reports a genuine extension, so an extensionless
            # file literally named "pdf" (or a dotfile ".pdf") is not
            # mistaken for a PDF document.
            ext = os.path.splitext(file)[1].lower()

            if ext == ".pdf":
                text = extract_text_from_pdf(path)
            elif ext in (".html", ".htm"):
                text = extract_text_from_html(path)
            else:
                continue

            if not text.strip():
                continue

            chunks = chunk_text(text)
            print(f"Processing {file}: {len(chunks)} chunks")

            for i, chunk in enumerate(chunks):
                entries.append({
                    "source_file": path,
                    "chunk_index": i,
                    "text": chunk,
                    "embedding": get_embedding(chunk),
                })

    print(f"Total chunks embedded: {len(entries)}")

    with open(EMBEDDINGS_FILE, "w", encoding='utf-8') as f:
        f.writelines(json.dumps(entry) + "\n" for entry in entries)
    print(f"Saved embeddings to {EMBEDDINGS_FILE}")

# Build FAISS index from saved embeddings

def build_faiss_index(embeddings_file: str):
    """Build a FAISS index from a JSONL embeddings file and persist it.

    Every non-blank line of *embeddings_file* must be a JSON object with the
    keys ``embedding``, ``source_file``, ``chunk_index`` and ``text``. The
    vectors go into a ``faiss.IndexFlatIP`` of dimension EMBEDDING_DIM, which
    is written to INDEX_FILE. The remaining fields are written, in the same
    order as the index rows, to METADATA_FILE.

    Args:
        embeddings_file: Path of the JSONL file to read.
    """
    # NOTE(review): IndexFlatIP ranks by inner product; this equals cosine
    # similarity only if the stored vectors are unit-length - confirm for the
    # embedding model in use.
    index = faiss.IndexFlatIP(EMBEDDING_DIM)
    metadata = []
    vectors = []

    with open(embeddings_file, "r", encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline added by an editor).
                continue
            entry = json.loads(line)
            vectors.append(entry["embedding"])
            metadata.append({
                "source_file": entry["source_file"],
                "chunk_index": entry["chunk_index"],
                "text": entry["text"]
            })

    if vectors:
        # A single batched add replaces one FAISS call per vector.
        index.add(np.asarray(vectors, dtype='float32'))

    faiss.write_index(index, INDEX_FILE)
    with open(METADATA_FILE, "w", encoding='utf-8') as f:
        json.dump(metadata, f)
    print(f"Saved FAISS index to {INDEX_FILE} and metadata to {METADATA_FILE}")

# Query / Retrieval pipeline

def embed_query(query: str) -> np.ndarray:
    """Embed *query* and return it as a ``(1, dim)`` float32 array.

    Delegates to ``get_embedding`` so queries and indexed chunks always go
    through the same model and request code. The result is reshaped into a
    single-row matrix, the form ``search_index`` passes to ``index.search``.
    """
    return np.array(get_embedding(query), dtype='float32').reshape(1, -1)

def load_index_and_metadata():
    """Load the persisted FAISS index and its chunk metadata.

    Returns:
        A ``(index, metadata)`` tuple: the index read from INDEX_FILE and the
        JSON content of METADATA_FILE.
    """
    faiss_index = faiss.read_index(INDEX_FILE)
    with open(METADATA_FILE, "r", encoding='utf-8') as meta_fh:
        return faiss_index, json.load(meta_fh)

def search_index(query: str, top_k=TOP_K):
    """Return the stored chunks that best match *query*.

    The index and metadata are loaded from disk, the query is embedded and the
    index is searched for *top_k* neighbours. Each hit is a dict with
    ``source_file``, ``chunk_index``, ``text`` and a float ``score``.
    Positions reported as ``-1`` by the index are dropped.
    """
    index, metadata = load_index_and_metadata()
    scores, positions = index.search(embed_query(query), top_k)

    copied_fields = ("source_file", "chunk_index", "text")
    return [
        {
            **{field: metadata[position][field] for field in copied_fields},
            "score": float(score),
        }
        for position, score in zip(positions[0], scores[0])
        if position != -1
    ]

# Example lesson content generation integrating retrieval

def _strip_code_fence(content: str) -> str:
    """Remove an optional Markdown code fence (``` or ```json) around *content*."""
    text = content.strip()
    if text.startswith("```"):
        text = text[3:]
        if text[:4].lower() == "json":
            text = text[4:]
        # Only drop a closing fence that is really there; slicing blindly
        # would cut the last characters of the JSON when the model omits it.
        if text.endswith("```"):
            text = text[:-3]
    return text.strip()


def generate_lesson_with_context(topic: str, user_profile: dict):
    """Generate a structured lesson on *topic*, grounded in retrieved course text.

    The TOP_K most relevant chunks are retrieved with ``search_index`` and
    placed in the prompt as context. The learner's competency for the topic is
    read from ``user_profile['competency_scores'][topic]`` (0 when absent).
    The model is asked to answer with a JSON object only.

    Args:
        topic: Subject of the lesson; also used as the retrieval query.
        user_profile: Dict that may contain a ``competency_scores`` mapping.

    Returns:
        The lesson parsed from the model's JSON reply.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON after an optional
            Markdown code fence has been removed.
    """
    # Retrieve relevant chunks from vector DB to provide context
    retrieved_chunks = search_index(topic, top_k=TOP_K)
    context_texts = "\n\n".join(chunk['text'] for chunk in retrieved_chunks)

    competency = user_profile.get('competency_scores', {}).get(topic, 0)

    prompt = f"""
You are a knowledgeable AI tutor.

Topic: {topic}

Context from course materials:
{context_texts}

Create a lesson for someone with competency level {competency}/10.
Structure:
1. Brief overview
2. Four main learning chunks (2-3 mins each)
3. Key takeaways

Return ONLY a JSON object like this:
{{
  "topic": "{topic}",
  "overview": "...",
  "chunks": [
    {{
      "title": "...",
      "content": "...",
      "key_point": "..."
    }}
  ],
  "key_takeaways": ["...", "..."]
}}
"""

    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.7
    )
    content = response.choices[0].message.content

    # The model sometimes wraps its JSON in a Markdown code block.
    return json.loads(_strip_code_fence(content))

# Main entry point for full pipeline

if __name__ == "__main__":
    # Step 1: Extract + embed all course files (run once or when updating data)
    print("Starting data processing and embedding...")
    process_and_embed_data(DATA_DIR)

    # Step 2: Build FAISS index from saved embeddings
    print("Building FAISS index...")
    build_faiss_index(EMBEDDINGS_FILE)

    # Step 3: Example interactive query (replace with API integration)
    while True:
        user_topic = input("\nEnter your learning topic or question (or 'exit'): ").strip()
        if not user_topic:
            # Nothing to look up; ask again instead of embedding an empty query.
            continue
        if user_topic.lower() == 'exit':
            break

        # Dummy user profile example; replace with real user data
        user_profile = {"competency_scores": {user_topic: 3}}

        print("Generating lesson based on your topic...")
        try:
            lesson = generate_lesson_with_context(user_topic, user_profile)
        except json.JSONDecodeError as e:
            # One malformed model reply should not end the whole session.
            print(f"Could not parse the generated lesson as JSON: {e}")
            continue
        print(json.dumps(lesson, indent=2))
