In [None]:
# Secrets are read from the environment when present so they never have to be
# saved inside the notebook; each one falls back to "" so the value can still
# be filled in by hand exactly as before.
import os

hf_token = os.environ.get("HF_TOKEN", "")  # Hugging Face access token
PASSKEY = os.environ.get("API_PASSKEY", "")  # Change this to your secure passkey
NGROK_TOKEN = os.environ.get("NGROK_AUTHTOKEN", "")  # Replace with your actual ngrok token

In [None]:
# Base packages
!pip install -q transformers accelerate huggingface_hub torch flask flask_cors pyngrok
!pip install -q langchain langchain-community sentence-transformers pypdf python-docx
!pip install -q faiss-cpu ddgs beautifulsoup4 requests lxml pypdf2


In [None]:
from flask import Flask, request, jsonify, Response, stream_with_context
from flask_cors import CORS
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
from huggingface_hub import login
from pyngrok import ngrok
import torch
import threading
import base64
import io
import os
import json
from typing import List, Dict, Tuple, Optional

# Document processing
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
import PyPDF2
from docx import Document

# Web search
from ddgs import DDGS
from bs4 import BeautifulSoup
import requests
from urllib.parse import urlparse
import time



In [None]:
# Authenticate this kernel with the Hugging Face Hub using the token from the
# configuration cell, before any model is downloaded below.
login(token=hf_token)

In [None]:
# Embedding model used to vectorise text chunks for the FAISS stores.
# It runs on CUDA when torch reports a GPU, otherwise on CPU, and returns
# normalised embeddings.
print("‚è≥ Loading embeddings model...")
embeddings_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={'device': 'cuda' if torch.cuda.is_available() else 'cpu'},
    encode_kwargs={'normalize_embeddings': True}
)
print("‚úÖ Embeddings model loaded!")

# Chat model and its tokenizer. Both are module-level globals that
# llm_generate() and generate_stream() read directly.
print("‚è≥ Loading Mistral model...")
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,  # half precision weights
    device_map="auto"  # let the loader decide device placement
)
print("‚úÖ Mistral model loaded successfully!")


In [None]:
# Storage
vector_stores = {}  # session_id -> FAISS vector store built by create_vector_store()
episodic_memory = {}  # NEW: session_id -> list of {query, summary} objects
USE_GPU = False
gpu_resources = None

# Probe for a working FAISS GPU build by moving a tiny throw-away index to
# device 0. The flags set here are informational: nothing else in this
# section reads USE_GPU or gpu_resources.
try:
    if torch.cuda.is_available():
        try:
            import faiss
            test_res = faiss.StandardGpuResources()
            test_index = faiss.IndexFlatL2(128)
            gpu_test = faiss.index_cpu_to_gpu(test_res, 0, test_index)
            USE_GPU = True
            gpu_resources = test_res
            print(f"‚úÖ FAISS-GPU enabled on {torch.cuda.get_device_name(0)}")
        except Exception as faiss_err:
            # Was a bare `except:`, which also swallowed KeyboardInterrupt and
            # SystemExit and hid why the probe failed (typically faiss-cpu,
            # which has no GPU symbols).
            print("‚ö†Ô∏è FAISS GPU not available, using CPU")
            print(f"   ({type(faiss_err).__name__}: {faiss_err})")
except Exception as e:
    print(f"‚ö†Ô∏è GPU check failed: {str(e)}")


In [None]:
# ============================================
# UTILITY FUNCTIONS
# ============================================

def llm_generate(prompt: str, max_tokens: int = 512, temperature: float = 0.1) -> str:
    """Generate a complete (non-streamed) reply from the global Mistral model.

    Args:
        prompt: Text sent as a single user turn through the chat template.
        max_tokens: Upper bound on newly generated tokens.
        temperature: Sampling temperature. A value that is zero, negative or
            None selects greedy decoding, because sampling requires a strictly
            positive temperature and would otherwise make `generate` raise.

    Returns:
        The decoded continuation only (prompt tokens removed), stripped of
        surrounding whitespace.
    """
    messages = [{"role": "user", "content": prompt}]
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt"
    ).to(model.device)

    generation_kwargs = {
        "max_new_tokens": max_tokens,
        "repetition_penalty": 1.2,
    }
    if temperature and temperature > 0:
        generation_kwargs.update(do_sample=True, temperature=temperature, top_p=0.95)
    else:
        # Greedy decoding: sampling-only options are left out on purpose.
        generation_kwargs["do_sample"] = False

    with torch.inference_mode():
        outputs = model.generate(**inputs, **generation_kwargs)

    # generate() returns prompt + continuation; keep only the continuation.
    prompt_length = inputs['input_ids'].shape[1]
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return response.strip()

def extract_text_from_txt(file_content):
    """Decode raw bytes of a text file into a string.

    UTF-8 is tried first. If the bytes are not valid UTF-8 the content is
    decoded as Latin-1, which accepts every byte value and so cannot fail.

    Args:
        file_content: The file's raw bytes.

    Returns:
        The decoded text.
    """
    try:
        return file_content.decode('utf-8')
    except UnicodeDecodeError:
        # Only a decoding failure triggers the fallback; the previous bare
        # `except:` also intercepted KeyboardInterrupt and unrelated errors.
        return file_content.decode('latin-1')

def extract_text_from_pdf(file_content):
    """Extract the text of every page of a PDF supplied as bytes.

    Args:
        file_content: The PDF file's raw bytes.

    Returns:
        The page texts concatenated in page order, each followed by a newline.
        A page for which the reader yields no text (None or "") contributes
        just the newline instead of raising on `None + str`.
    """
    pdf_file = io.BytesIO(file_content)
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    # join() avoids repeated string concatenation on long documents.
    return "".join((page.extract_text() or "") + "\n" for page in pdf_reader.pages)

def extract_text_from_docx(file_content):
    """Return the paragraph text of a .docx file supplied as bytes.

    Each paragraph's text is emitted in document order and terminated with a
    newline. Only `doc.paragraphs` is read.
    """
    document = Document(io.BytesIO(file_content))
    pieces = []
    for para in document.paragraphs:
        pieces.append(para.text + "\n")
    return "".join(pieces)

def process_document(file_content, file_type):
    """Route raw file bytes to the extractor matching `file_type`.

    Args:
        file_content: The file's raw bytes.
        file_type: Extension without the dot: 'txt', 'pdf', 'docx' or 'doc'.

    Returns:
        The extracted text.

    Raises:
        ValueError: If `file_type` is not one of the supported extensions.
    """
    if file_type in ['docx', 'doc']:
        return extract_text_from_docx(file_content)
    if file_type == 'pdf':
        return extract_text_from_pdf(file_content)
    if file_type == 'txt':
        return extract_text_from_txt(file_content)
    raise ValueError(f"Unsupported file type: {file_type}")

def create_vector_store(text, session_id, chunk_size=1000, chunk_overlap=200):
    """Split `text` into overlapping chunks, embed them and store the FAISS index.

    The index is stored in the module-level `vector_stores` dict under
    `session_id`, replacing any store already held for that id.

    Args:
        text: Full text to index.
        session_id: Key under which the resulting store is registered.
        chunk_size: Maximum characters per chunk (default 1000, as before).
        chunk_overlap: Characters shared by neighbouring chunks (default 200).

    Returns:
        The number of chunks indexed.

    Raises:
        ValueError: If splitting yields no chunks (e.g. empty text). This
            fails early with a clear message rather than inside FAISS.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )
    chunks = text_splitter.split_text(text)
    print(f"üìÑ Created {len(chunks)} chunks")

    if not chunks:
        raise ValueError("No text chunks to index")

    vectorstore = FAISS.from_texts(texts=chunks, embedding=embeddings_model)
    vector_stores[session_id] = vectorstore
    return len(chunks)

def retrieve_relevant_chunks(query, session_id, k=3):
    """Return the text of the `k` chunks most similar to `query` for a session.

    Looks the session up in the module-level `vector_stores` dict. An unknown
    session yields an empty list rather than an error.
    """
    if session_id in vector_stores:
        matches = vector_stores[session_id].similarity_search(query, k=k)
        texts = []
        for match in matches:
            texts.append(match.page_content)
        return texts
    return []


In [None]:
# ============================================
# EPISODIC MEMORY SYSTEM (NEW!)
# ============================================

def init_episodic_memory(session_id: str):
    """Ensure `episodic_memory` has an (initially empty) list for the session.

    Does nothing when the session already has an entry; otherwise creates the
    empty list and logs the initialisation.
    """
    if session_id in episodic_memory:
        return
    episodic_memory[session_id] = []
    print(f"üß† Initialized episodic memory for {session_id}")

def add_to_episodic_memory(session_id: str, query: str, response: str):
    """Summarise a query/response exchange and append it to the session's memory.

    The first 500 characters of the response are sent to the LLM for a short
    summary. The first three non-blank lines of that summary are joined with
    spaces and stored next to the query. Only the 15 most recent entries are
    kept per session. Any failure while summarising or storing is logged and
    swallowed so that memory updates never break a request.
    """
    init_episodic_memory(session_id)

    # Generate 3-line summary of response
    summary_prompt = f"""Summarize this response in exactly 3 short lines (max 50 words total).

Query: {query}

Response: {response[:500]}

Write 3 concise lines:"""

    try:
        raw_summary = llm_generate(summary_prompt, max_tokens=80, temperature=0.2)

        # Drop blank lines, keep at most three, and flatten to one line.
        kept_lines = []
        for piece in raw_summary.split('\n'):
            piece = piece.strip()
            if piece:
                kept_lines.append(piece)
        summary = ' '.join(kept_lines[:3])

        history = episodic_memory[session_id]
        history.append({"query": query, "summary": summary})

        # Cap the history at the 15 newest entries.
        if len(history) > 15:
            episodic_memory[session_id] = history[-15:]

        print(f"üíæ Added to episodic memory: '{query[:30]}...' ‚Üí '{summary[:50]}...'")

    except Exception as e:
        print(f"‚ö†Ô∏è Failed to add episodic memory: {str(e)}")

def get_episodic_memory(session_id: str) -> str:
    """Render the session's episodic memory as text for inclusion in a prompt.

    Returns the sentinel "No previous conversation history." when the session
    has no entries; otherwise a header followed by numbered Q/A pairs.
    """
    init_episodic_memory(session_id)

    entries = episodic_memory[session_id]
    if not entries:
        return "No previous conversation history."

    numbered = [
        f"{position}. Q: {item['query']}\n   A: {item['summary']}\n\n"
        for position, item in enumerate(entries, 1)
    ]
    return ("Previous Conversation Summary:\n" + "".join(numbered)).strip()

def get_last_query(session_id: str) -> str:
    """Return the most recent stored query for the session, or "" if none exists."""
    init_episodic_memory(session_id)

    history = episodic_memory[session_id]
    return history[-1]['query'] if history else ""

def clear_episodic_memory(session_id: str):
    """Reset a known session's episodic memory to an empty list.

    Sessions that were never initialised are left untouched: no entry is
    created and nothing is logged.
    """
    if session_id not in episodic_memory:
        return
    episodic_memory[session_id] = []
    print(f"üóëÔ∏è Cleared episodic memory for {session_id}")


In [None]:
# ============================================
# WEB SEARCH FUNCTIONS
# ============================================

def search_web(query, num_results=5):
    """Search DuckDuckGo and return result URLs.

    Each result's 'href' (or, failing that, 'url') is kept when it starts with
    'http'. Iteration stops once `num_results` URLs have been collected. Any
    error during the search is logged and an empty list is returned.
    """
    try:
        print(f"üîç Searching DuckDuckGo for: {query}")
        found = []
        with DDGS() as client:
            for hit in client.text(query, max_results=num_results):
                link = hit.get('href') or hit.get('url')
                if link and link.startswith('http'):
                    found.append(link)
                if len(found) >= num_results:
                    break
        print(f"‚úÖ Found {len(found)} URLs")
        return found
    except Exception as e:
        print(f"‚ùå DuckDuckGo search error: {str(e)}")
        return []

def scrape_website(url, timeout=10):
    """Download a page and return its visible text, capped at 8000 characters.

    Script, style, navigation and similar boilerplate elements are removed
    before text extraction, and blank lines are dropped. Any failure (network,
    HTTP status, parsing) is logged and results in an empty string.
    """
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    try:
        page = requests.get(url, headers=browser_headers, timeout=timeout, allow_redirects=True)
        page.raise_for_status()
        soup = BeautifulSoup(page.content, 'lxml')

        # Strip elements that carry no article content.
        for tag in soup(['script', 'style', 'nav', 'footer', 'header', 'aside', 'iframe']):
            tag.decompose()

        raw_text = soup.get_text(separator='\n', strip=True)
        stripped = (segment.strip() for segment in raw_text.splitlines())
        text = '\n'.join(segment for segment in stripped if segment)[:8000]

        print(f"‚úÖ Scraped {len(text)} characters from {urlparse(url).netloc}")
        return text
    except Exception as e:
        print(f"‚ö†Ô∏è Error scraping {url[:40]}: {str(e)}")
        return ""

def search_and_scrape(query, num_sites=3):
    """Search the web for `query` and concatenate text scraped from the results.

    Up to `num_sites * 2` candidate URLs are requested so that failed or
    near-empty pages can be skipped. Scraping stops once `num_sites` pages
    with more than 200 characters have been collected.

    Args:
        query: Search query.
        num_sites: Number of successfully scraped pages wanted.

    Returns:
        `(combined_text, None)` on success, where `combined_text` starts with
        a "# Web Results" header followed by one "Source N: domain" section
        per page; or `(None, error_message)` when the search returns nothing
        or no page could be scraped.
    """
    print(f"\n{'='*60}\nüåê Web Search: {query}\n{'='*60}\n")
    urls = search_web(query, num_results=num_sites * 2)

    if not urls:
        return None, "No search results found."

    combined_text = f"# Web Results: {query}\n\n"
    successful_scrapes = 0

    for attempt, url in enumerate(urls[:num_sites * 2]):
        if successful_scrapes >= num_sites:
            break

        # Pause only BETWEEN requests. Sleeping after every scrape added a
        # useless 1.5 s to each call once the last page had been fetched.
        if attempt > 0:
            time.sleep(1.5)

        text = scrape_website(url, timeout=15)
        if text and len(text) > 200:
            domain = urlparse(url).netloc
            combined_text += f"\nSource {successful_scrapes + 1}: {domain}\n{text}\n\n"
            successful_scrapes += 1

    if successful_scrapes == 0:
        return None, "Failed to scrape websites."

    print(f"‚úÖ Scraped {successful_scrapes} sites")
    return combined_text, None


In [None]:
# ============================================
# ADVANCED MODE: CAG + SELF-RAG + AGENTIC RAG + EPISODIC MEMORY
# ============================================

def classify_intent(episodic_mem: str, query: str) -> str:
    """Classify a user query as casual, new_question, follow_up or clarification.

    A cheap heuristic runs first: the query is "casual" when it contains one
    of the casual phrases as a whole word/phrase, or has three words or fewer.
    Phrases are matched on word boundaries; plain substring matching used to
    flag ordinary questions as casual (e.g. 'ok' inside "token", 'fine' inside
    "definition", 'hey' inside "they"), which skipped retrieval for them.

    Everything else is classified by the LLM using the episodic memory as
    context. An unrecognised LLM answer falls back to "new_question".

    Args:
        episodic_mem: Formatted memory text, or the sentinel
            "No previous conversation history.".
        query: The user's current message.

    Returns:
        One of "casual", "follow_up", "new_question", "clarification".
    """
    import re  # local import: `re` is not provided by the notebook's import cell

    # Check if query is casual/simple
    casual_patterns = [
        'how are you', 'hey', 'hello', 'hi ', 'thanks', 'thank you',
        'good morning', 'good night', 'bye', 'okay', 'ok', 'nice',
        'great', 'cool', 'awesome', 'fine', 'alright'
    ]
    casual_regex = re.compile(
        r"\b(?:" + "|".join(re.escape(p.strip()) for p in casual_patterns) + r")\b"
    )

    query_lower = query.lower().strip()
    if casual_regex.search(query_lower) or len(query.split()) <= 3:
        print(f"üéØ Intent: casual (no retrieval needed)")
        return "casual"

    prompt = f"""Analyze the query and classify intent.

{episodic_mem if episodic_mem != "No previous conversation history." else "No previous conversation."}

Current query: "{query}"

Classify the query as ONE word:
- casual: Greetings, thanks, simple yes/no
- new_question: Completely new topic needing information
- follow_up: Continues current topic
- clarification: Asks to explain previous answer differently

Classification:"""

    intent = llm_generate(prompt, max_tokens=5, temperature=0.05).strip().lower()

    # The model may wrap the label in extra text, so look for it as a substring.
    valid_intents = ["casual", "follow_up", "new_question", "clarification"]
    for valid in valid_intents:
        if valid in intent:
            print(f"üéØ Intent: {valid}")
            return valid

    print(f"üéØ Default: new_question")
    return "new_question"

def rewrite_query_with_context(query: str, last_query: str) -> str:
    """Rewrite a query for search, borrowing context from the previous query.

    A query counts as vague when it has fewer than six words and one of its
    first three words is a pronoun/question word. Vague queries with a known
    previous query are rewritten by the LLM using both; all others only get a
    couple of keywords added. Quotes are stripped from the LLM output, and the
    original query is returned when the rewrite is shorter than 3 characters
    or more than four times the original's length.
    """
    # Check if query is vague (contains pronouns, "it", "that", etc.)
    vague_indicators = ['it', 'this', 'that', 'these', 'those', 'what', 'how', 'why', 'where']
    leading_words = query.lower().split()[:3]
    starts_vague = any(indicator in leading_words for indicator in vague_indicators)
    is_vague = starts_vague and len(query.split()) < 6

    if is_vague and last_query:
        # Query is vague, use context from last query
        token_budget = 30
        prompt = f"""Rewrite this vague query using context from the previous question.

Previous query: "{last_query}"

Current vague query: "{query}"

Rewrite as a clear, specific search query combining both contexts:"""
    else:
        # Query is clear, just add keywords
        token_budget = 20
        prompt = f"""Add 1-2 relevant keywords to this search query. Keep it concise.

Query: "{query}"

Enhanced query:"""

    rewritten = llm_generate(prompt, max_tokens=token_budget, temperature=0.1).strip()

    # Clean and validate
    rewritten = rewritten.replace('"', '').replace("'", "").strip()

    # If rewrite fails or is too different, use original
    usable = 3 <= len(rewritten) <= len(query) * 4
    if not usable:
        print(f"üìù Using original: '{query}'")
        return query

    print(f"üìù Rewritten: '{query}' ‚Üí '{rewritten}'")
    return rewritten

def decide_retrieval_strategy(intent: str) -> Dict:
    """Agentic RAG: map a classified intent to a retrieval plan.

    The plan is a fresh dict with keys "action", "doc_k", "web_k", "use_web"
    and "reason". Casual and clarification intents skip retrieval; follow-ups
    retrieve fewer chunks than new questions; any unknown intent gets the
    full-retrieval default.
    """
    def plan(action, doc_k, web_k, use_web, reason):
        # Single place that fixes the shape of the returned strategy.
        return {
            "action": action,
            "doc_k": doc_k,
            "web_k": web_k,
            "use_web": use_web,
            "reason": reason
        }

    if intent == "casual":
        return plan("no_retrieve", 0, 0, False, "Casual chat - no retrieval needed")
    if intent == "new_question":
        return plan("retrieve", 3, 3, True, "New topic - full retrieval")
    if intent == "follow_up":
        return plan("retrieve_constrained", 2, 2, True, "Follow-up - focused retrieval")
    if intent == "clarification":
        return plan("no_retrieve", 0, 0, False, "Clarification - memory only")
    return plan("retrieve", 3, 3, True, "Default strategy")

def retrieve_with_advanced_strategy(
    query: str,
    session_id: str,
    strategy: Dict
) -> Tuple[List[str], List[str]]:
    """Run document and/or web retrieval as dictated by `strategy`.

    Returns `(doc_chunks, web_chunks)`. Both are empty when the strategy's
    action is "no_retrieve". Web results are scraped, indexed into a
    per-query store named "web_adv_<session>_<hash>" and queried immediately.
    """
    if strategy["action"] == "no_retrieve":
        print("üö´ Skipping retrieval (clarification mode)")
        return [], []

    from_docs: List[str] = []
    from_web: List[str] = []

    # Document Retrieval
    wants_docs = session_id in vector_stores and strategy["doc_k"] > 0
    if wants_docs:
        from_docs = retrieve_relevant_chunks(query, session_id, k=strategy["doc_k"])
        if from_docs:
            print(f"üìö Retrieved {len(from_docs)} document chunks")

    # Web Retrieval
    wants_web = strategy["use_web"] and strategy["web_k"] > 0
    if wants_web:
        print("üåê Performing web search...")
        scraped, failure = search_and_scrape(query, num_sites=2)

        if scraped and not failure:
            store_key = f"web_adv_{session_id}_{abs(hash(query))}"
            create_vector_store(scraped, store_key)
            from_web = retrieve_relevant_chunks(query, store_key, k=strategy["web_k"])
            if from_web:
                print(f"üåê Retrieved {len(from_web)} web chunks")

    return from_docs, from_web

def build_prompt_with_episodic_memory(
    episodic_mem: str,
    query: str,
    doc_chunks: List[str],
    web_chunks: List[str],
    intent: str
) -> str:
    """Assemble the final LLM prompt from memory, retrieved chunks and intent.

    Args:
        episodic_mem: Formatted memory text, or the sentinel
            "No previous conversation history." when there is none.
        query: The user's question.
        doc_chunks: Chunks retrieved from the uploaded document (may be empty).
        web_chunks: Chunks retrieved from the web (may be empty).
        intent: "casual", "clarification", or any other value for an
            information-seeking question.

    Returns:
        The prompt string. Casual and clarification prompts use memory only.
        Other prompts include the most recent memory lines (at most 4, i.e.
        the last two Q/A pairs) plus whichever context sections are
        available, each chunk truncated to 250 characters.
    """

    # For casual queries, only use memory
    if intent == "casual":
        return f"""You are a friendly AI assistant. Respond naturally to casual conversation.

{episodic_mem if episodic_mem != "No previous conversation history." else ""}

User: {query}

Respond in a natural, conversational way (1-2 sentences):"""

    # For clarification, use memory only
    if intent == "clarification":
        # Extract last 2 interactions for context
        if episodic_mem != "No previous conversation history.":
            lines = episodic_mem.split('\n')
            relevant = [l for l in lines if l.strip() and not l.startswith('Previous')]
            if len(relevant) > 4:
                recent_context = '\n'.join(relevant[-4:])
            else:
                recent_context = episodic_mem
        else:
            recent_context = episodic_mem

        return f"""Clarify or rephrase the previous explanation.

{recent_context}

Question: {query}

Provide a clearer explanation:"""

    # Build context sections only if data is available
    context_parts = []

    if doc_chunks:
        doc_text = "\n".join([f"- {chunk[:250]}" for chunk in doc_chunks])
        context_parts.append(f"Document Information:\n{doc_text}")

    if web_chunks:
        web_text = "\n".join([f"- {chunk[:250]}" for chunk in web_chunks])
        context_parts.append(f"Web Information:\n{web_text}")

    # Add memory for continuity
    memory_section = ""
    if episodic_mem != "No previous conversation history.":
        # Use only last 2 interactions for short-term context
        lines = episodic_mem.split('\n')
        relevant = [l for l in lines if l.strip() and not l.startswith('Previous')]
        # The section used to be added only when there were MORE than 4
        # memory lines, so follow-ups after the first one or two exchanges
        # had no conversational context at all. Include it whenever any
        # memory exists.
        if relevant:
            memory_section = "Recent Context:\n" + '\n'.join(relevant[-4:])

    # Build final prompt
    prompt_parts = ["Answer the question accurately and concisely."]

    if memory_section:
        prompt_parts.append(memory_section)

    if context_parts:
        prompt_parts.append("\n".join(context_parts))
        prompt_parts.append(f"\nQuestion: {query}")
        prompt_parts.append("\nInstructions:\n- Use only the relevant information provided\n- Don't mix unrelated sources\n- Be direct and concise\n- If info insufficient, state clearly")
    else:
        prompt_parts.append(f"\nQuestion: {query}")
        prompt_parts.append("\nAnswer based on your knowledge:")

    prompt_parts.append("\nAnswer:")

    return "\n\n".join(prompt_parts)


In [None]:
# ============================================
# STREAMING GENERATION
# ============================================

def generate_stream(prompt: str):
    """Yield the model's reply to `prompt` piece by piece as it is generated.

    Generation runs in a background thread that feeds a TextIteratorStreamer;
    this generator relays the streamer's decoded text to the caller.

    If generation raises, the error is logged and the streamer is ended
    explicitly. The streamer is only ended by `generate` on success, so
    without this the consuming loop (and the HTTP response built on it) would
    block forever.

    Args:
        prompt: Text sent as a single user turn through the chat template.

    Yields:
        Decoded text fragments, excluding the prompt and special tokens.
    """
    messages = [{"role": "user", "content": prompt}]
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt"
    ).to(model.device)

    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    def run_generation():
        try:
            with torch.inference_mode():
                model.generate(
                    **inputs,
                    max_new_tokens=600,
                    use_cache=True,
                    streamer=streamer,
                    temperature=0.7,
                    do_sample=True,
                    top_p=0.9,
                    repetition_penalty=1.2
                )
        except Exception as e:
            print(f"Error: generation failed: {str(e)}")
            # Unblock the consumer: push the end-of-stream signal ourselves.
            streamer.end()

    # daemon=True: an abandoned generation must not keep the process alive.
    thread = threading.Thread(target=run_generation, daemon=True)
    thread.start()

    for token in streamer:
        yield token

def episodic_memory_streamer(generator, session_id: str, user_query: str):
    """Relay tokens from `generator` and record the full reply afterwards.

    Every token is yielded unchanged. Once the generator is exhausted, the
    concatenated reply is passed to add_to_episodic_memory(), unless it is
    empty or whitespace-only.
    """
    pieces = []
    for token in generator:
        pieces.append(token)
        yield token

    # Add to episodic memory after response complete
    full_reply = "".join(pieces)
    if full_reply.strip():
        add_to_episodic_memory(session_id, user_query, full_reply)


In [None]:
# ============================================
# FLASK APP
# ============================================

# Flask application that hosts every route defined below.
app = Flask(__name__)
# Apply flask_cors with its default settings to the whole app so browser
# clients on other origins can call the API.
CORS(app)

@app.route("/upload-document", methods=["POST", "OPTIONS"])
def upload_document():
    """Accept a base64-encoded document, extract its text and index it.

    Expected JSON body: `passkey`, `file_content` (base64), `file_name`, and
    optional `session_id` (default "default").

    Responses:
        200: document indexed; returns `num_chunks` and `session_id`.
        400: missing fields, invalid base64, unsupported file type, or no
             extractable text.
        403: wrong or missing passkey (also for a missing/non-JSON body).
        500: any other failure.
    """
    if request.method == "OPTIONS":
        return jsonify({"status": "ok"}), 200

    try:
        # silent=True returns None instead of raising on a missing or invalid
        # JSON body; treat anything that is not an object as an empty payload
        # so the request is rejected cleanly below instead of crashing.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            data = {}

        if data.get("passkey") != PASSKEY:
            return jsonify({"error": "Unauthorized"}), 403

        file_base64 = data.get("file_content")
        file_name = data.get("file_name")
        session_id = data.get("session_id", "default")

        if not file_base64 or not file_name:
            return jsonify({"error": "File content and name required"}), 400

        try:
            file_content = base64.b64decode(file_base64)
        except (ValueError, TypeError):
            # binascii.Error (bad padding/characters) is a ValueError subclass.
            return jsonify({"error": "file_content is not valid base64"}), 400

        file_type = file_name.split('.')[-1].lower()

        print(f"üì• Processing {file_name}")
        try:
            text = process_document(file_content, file_type)
        except ValueError as e:
            # Unsupported extension: a client error, not a server fault.
            return jsonify({"error": str(e)}), 400

        if not text.strip():
            return jsonify({"error": "No text extracted"}), 400

        print(f"üìù Extracted {len(text)} characters")
        num_chunks = create_vector_store(text, session_id)

        return jsonify({
            "message": "Document processed",
            "num_chunks": num_chunks,
            "session_id": session_id
        }), 200

    except Exception as e:
        print(f"Error: {str(e)}")
        return jsonify({"error": str(e)}), 500

def _stream_answer(prompt: str, session_id: str, query: str) -> Response:
    """Stream the model's answer to `prompt` and record it in episodic memory when done."""
    return Response(
        stream_with_context(
            episodic_memory_streamer(generate_stream(prompt), session_id, query)
        ),
        mimetype='text/event-stream',
        headers={'Cache-Control': 'no-cache', 'X-Accel-Buffering': 'no'}
    )


@app.route("/query", methods=["POST", "OPTIONS"])
def query_model():
    """Answer a user query in one of four modes and stream the reply.

    Expected JSON body: `passkey`, `query`, optional `mode` ("advanced",
    "document", "web", anything else = plain model; default "model") and
    optional `session_id` (default "default").

    Modes:
        advanced: intent classification, optional query rewriting, then
            document and/or web retrieval chosen by the strategy.
        document: retrieval from the session's uploaded document only.
        web: web search + scrape, then retrieval from the scraped text.
        model: no retrieval; the prompt is built from episodic memory only.

    Responses:
        200: streamed text (`text/event-stream`).
        400: missing query, no document for document mode, or web failure.
        403: wrong or missing passkey (also for a missing/non-JSON body).
        500: any other failure before streaming starts.
    """
    import re  # local import: `re` is not provided by the notebook's import cell

    if request.method == "OPTIONS":
        return jsonify({"status": "ok"}), 200

    try:
        # silent=True returns None instead of raising on a missing or invalid
        # JSON body; a non-object body is treated as an empty payload so the
        # request is rejected below instead of crashing on `.get`.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            data = {}

        if data.get("passkey") != PASSKEY:
            return jsonify({"error": "Unauthorized"}), 403

        query = data.get("query", "")
        mode = data.get("mode", "model")
        session_id = data.get("session_id", "default")

        if not query:
            return jsonify({"error": "Query missing"}), 400

        print(f"\n{'='*60}")
        print(f"üîç Query: {query}")
        print(f"üìã Mode: {mode}")
        print(f"{'='*60}\n")

        # Initialize episodic memory
        init_episodic_memory(session_id)

        # Get episodic memory
        episodic_mem = get_episodic_memory(session_id)

        # ADVANCED MODE
        if mode == "advanced":
            print("üöÄ ADVANCED MODE (Episodic Memory)")

            # Classify intent
            intent = classify_intent(episodic_mem, query)

            # Get last query for context
            last_query = get_last_query(session_id)

            # Rewrite query with context
            search_query = query
            if intent == "new_question" or intent == "follow_up":
                search_query = rewrite_query_with_context(query, last_query)

            # Get retrieval strategy
            strategy = decide_retrieval_strategy(intent)
            print(f"üéØ Strategy: {strategy['reason']}")

            # Retrieve
            doc_chunks, web_chunks = retrieve_with_advanced_strategy(
                search_query,
                session_id,
                strategy
            )

            # Build prompt (with the user's original wording, not the rewrite)
            prompt = build_prompt_with_episodic_memory(
                episodic_mem,
                query,
                doc_chunks,
                web_chunks,
                intent
            )

            # Stream response
            return _stream_answer(prompt, session_id, query)

        # DOCUMENT MODE
        elif mode == "document":
            context_chunks = retrieve_relevant_chunks(query, session_id, k=3)

            if not context_chunks:
                return jsonify({"error": "No document uploaded"}), 400

            context = "\n\n".join(context_chunks)
            print(f"üìö Retrieved {len(context_chunks)} chunks")

            # Build prompt with episodic memory
            prompt = f"""Answer using the provided information.

{episodic_mem}

Document Context:
{context}

Question: {query}

Answer:"""

            return _stream_answer(prompt, session_id, query)

        # WEB MODE
        elif mode == "web":
            # Get last query for context
            last_query = get_last_query(session_id)

            # Rewrite query with context
            search_query = rewrite_query_with_context(query, last_query)

            web_text, error = search_and_scrape(search_query, num_sites=3)

            if error:
                return jsonify({"error": error}), 400

            web_session_id = f"web_{session_id}_{abs(hash(query))}"
            num_chunks = create_vector_store(web_text, web_session_id)
            print(f"üìÑ Created {num_chunks} chunks")

            context_chunks = retrieve_relevant_chunks(query, web_session_id, k=4)
            context = "\n\n".join(context_chunks)

            # Build prompt with episodic memory
            prompt = f"""Answer using web information.

{episodic_mem}

Web Context:
{context}

Question: {query}

Answer:"""

            return _stream_answer(prompt, session_id, query)

        # MODEL MODE
        else:
            # Check if it's a casual query. Phrases are matched on word
            # boundaries: plain substring matching treated ordinary questions
            # as casual ('ok' inside "token", 'hey' inside "they", ...).
            casual_patterns = ['how are you', 'hey', 'hello', 'hi ', 'thanks', 'thank you', 'bye', 'okay', 'ok']
            casual_regex = re.compile(
                r"\b(?:" + "|".join(re.escape(p.strip()) for p in casual_patterns) + r")\b"
            )
            is_casual = bool(casual_regex.search(query.lower())) or len(query.split()) <= 3

            if is_casual:
                prompt = f"""Respond naturally to this casual message.

{episodic_mem if episodic_mem != "No previous conversation history." else ""}

User: {query}

Respond in 1-2 sentences:"""
            else:
                prompt = f"""{episodic_mem}

Question: {query}

Answer naturally:"""

            return _stream_answer(prompt, session_id, query)

    except Exception as e:
        print(f"Error: {str(e)}")
        import traceback
        traceback.print_exc()
        return jsonify({"error": str(e)}), 500

@app.route("/clear-document", methods=["POST", "OPTIONS"])
def clear_document():
    """Drop a session's document vector store and reset its episodic memory.

    Expected JSON body: optional `session_id` (default "default"). A missing
    or non-JSON body now clears the default session instead of failing with
    a 500.

    NOTE(review): unlike /upload-document and /query, this endpoint performs
    no passkey check, so anyone who knows the URL can clear any session.
    Enforcing the passkey here would break clients that do not send one, so
    it is only flagged.
    """
    if request.method == "OPTIONS":
        return jsonify({"status": "ok"}), 200

    try:
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            data = {}
        session_id = data.get("session_id", "default")

        # pop() tolerates the key vanishing between check and delete when
        # requests are handled on several threads.
        vector_stores.pop(session_id, None)

        clear_episodic_memory(session_id)

        return jsonify({"message": "Document and memory cleared"}), 200
    except Exception as e:
        return jsonify({"error": str(e)}), 500

@app.route("/clear-web-cache", methods=["POST", "OPTIONS"])
def clear_web_cache():
    """Delete every cached web vector store that belongs to a session.

    Expected JSON body: optional `session_id` (default "default").

    Web stores are keyed "web_<session>_<digits>" (web mode) or
    "web_adv_<session>_<digits>" (advanced mode). Keys are matched against
    that exact shape: the earlier prefix test `startswith("web_<session>")`
    also removed other sessions' caches, e.g. clearing session "a" deleted
    "web_ab_123".

    NOTE(review): like /clear-document, this endpoint performs no passkey
    check.
    """
    import re  # local import: `re` is not provided by the notebook's import cell

    if request.method == "OPTIONS":
        return jsonify({"status": "ok"}), 200

    try:
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            data = {}
        session_id = data.get("session_id", "default")

        key_pattern = re.compile(rf"web_(?:adv_)?{re.escape(str(session_id))}_\d+")
        # Snapshot the keys first: the dict is mutated while clearing.
        web_sessions = [k for k in list(vector_stores.keys()) if key_pattern.fullmatch(k)]

        for ws in web_sessions:
            vector_stores.pop(ws, None)

        return jsonify({"message": f"Cleared {len(web_sessions)} entries"}), 200
    except Exception as e:
        return jsonify({"error": str(e)}), 500

@app.route("/health", methods=["GET"])
def health():
    """Report service status, model name, store counts and GPU availability.

    Vector-store keys that start with "web_" are counted as web cache; all
    other keys are counted as document sessions.
    """
    web_count = sum(1 for key in vector_stores.keys() if key.startswith("web_"))
    doc_count = len(vector_stores) - web_count

    session_counts = {
        "documents": doc_count,
        "web_cache": web_count,
        "episodic_memory": len(episodic_memory)
    }
    feature_flags = [
        "episodic_memory_system",
        "context_aware_rewriting",
        "hybrid_memory",
        "all_modes_memory_enabled"
    ]

    return jsonify({
        "status": "ok",
        "model": MODEL_NAME,
        "active_sessions": session_counts,
        "gpu": torch.cuda.is_available(),
        "features": feature_flags
    })


In [None]:
# Start server
# Open an ngrok tunnel to local port 5000 and print the public URL that
# clients should use as the API base.
ngrok.set_auth_token(NGROK_TOKEN)
public_url = ngrok.connect(5000).public_url
print(f"üîó API Endpoint: {public_url}")
# NOTE(review): this writes the passkey into the notebook's saved output;
# clear the cell output before sharing or committing the notebook.
print(f"üîë Passkey: {PASSKEY}")

# Serve the Flask app on the same port the tunnel forwards to.
# threaded=True handles each request in its own thread.
if __name__ == "__main__":
    app.run(port=5000, threaded=True)