In [2]:

!pip install -q langchain_community faiss-cpu sentence-transformers python-dotenv
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from IPython.display import display, Markdown
import os
from dotenv import load_dotenv
import pickle

# Load variables from a .env file (if python-dotenv finds one) into the process
# environment. NOTE(review): none of the visible cells read an environment
# variable afterwards — confirm this call is still needed.
load_dotenv()

def prepare_sample_data():
    """Return the hard-coded mock course documents used to seed the index.

    Returns:
        list: Three ``Document`` objects, each with ``page_content`` text and a
        ``metadata`` dict holding ``"source"`` and ``"type"`` keys.
    """
    # One (page_content, source, type) tuple per mock record, in output order.
    records = (
        (
            "Model Requirement: Use gpt-3.5-turbo-0125 for all assignments",
            "course-guidelines",
            "policy",
        ),
        (
            "Submission Deadline: Every Friday by 5PM IST",
            "course-schedule",
            "temporal",
        ),
        (
            "Grading: Assignments count for 40% of final grade",
            "grading-policy",
            "policy",
        ),
    )

    sample_docs = []
    for text, origin, category in records:
        sample_docs.append(
            Document(
                page_content=text,
                metadata={"source": origin, "type": category},
            )
        )
    return sample_docs

class VectorStoreManager:
    """Owns the embedding model and the FAISS vector store built with it.

    Attributes are plain instance fields:
      * ``embeddings``   - the HuggingFace embedding wrapper used for indexing
        and for loading a saved index.
      * ``vector_store`` - the current FAISS store, or ``None`` until
        ``create_store`` or ``load_store`` succeeds.
    """

    def __init__(self, model_name="all-MiniLM-L6-v2", device="cpu"):
        """Create the embedding model; no vector store exists yet.

        Args:
            model_name: Sentence-transformers model to embed with.
            device: Device passed through ``model_kwargs`` (CPU by default).
        """
        self.embeddings = HuggingFaceEmbeddings(
            model_name=model_name,
            model_kwargs={'device': device}
        )
        self.vector_store = None

    def create_store(self, documents, chunk_size=300, chunk_overlap=50):
        """Split ``documents`` into chunks and build a new FAISS index from them.

        Args:
            documents: Iterable of ``Document`` objects to index.
            chunk_size: Maximum chunk length, measured with ``len``.
            chunk_overlap: Number of characters shared by adjacent chunks.

        Returns:
            The newly created vector store (also kept on ``self.vector_store``).

        Raises:
            ValueError: If splitting produces no chunks, i.e. there is nothing
                to index.
        """
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len
        )
        texts = text_splitter.split_documents(documents)
        if not texts:
            # Fail early with a clear message instead of an opaque error from
            # inside the FAISS index construction.
            raise ValueError("No documents to index: cannot create an empty vector store")
        self.vector_store = FAISS.from_documents(texts, self.embeddings)
        return self.vector_store

    def save_store(self, path="faiss_index"):
        """Persist the current vector store to ``path``; no-op if none exists."""
        # Explicit None check: do not rely on the truthiness of the store object.
        if self.vector_store is not None:
            self.vector_store.save_local(path)

    def load_store(self, path="faiss_index", allow_dangerous_deserialization=True):
        """Load a previously saved vector store from ``path``.

        Args:
            path: Directory written by ``save_store``.
            allow_dangerous_deserialization: Forwarded to ``FAISS.load_local``.
                Recent langchain_community releases refuse to load an index
                unless this is True, because the docstore is pickled.

        Returns:
            The loaded vector store (also kept on ``self.vector_store``).

        SECURITY: loading unpickles data from disk, which can execute arbitrary
        code. The default of True is only appropriate for an index this
        notebook wrote itself; pass False for any index of unknown origin.
        """
        self.vector_store = FAISS.load_local(
            path,
            self.embeddings,
            allow_dangerous_deserialization=allow_dangerous_deserialization,
        )
        return self.vector_store


class VirtualTA:
    """Retrieval-only teaching assistant backed by a ``VectorStoreManager``.

    On construction it tries to load a saved index and, if that fails, builds
    one from ``prepare_sample_data()`` and saves it.
    """

    def __init__(self, index_path="faiss_index"):
        """Load the index at ``index_path`` or create it from the sample data.

        Args:
            index_path: Directory used both for loading and for saving the
                index when it has to be created.
        """
        self.vs_manager = VectorStoreManager()

        # Try loading existing store first
        try:
            self.vs_manager.load_store(index_path)
            print("Loaded existing vector store")
        except Exception as exc:
            # Catch Exception (not a bare except) so KeyboardInterrupt and
            # SystemExit still propagate, and report why loading failed
            # instead of hiding it.
            print(f"Could not load existing vector store: {exc}")
            print("Creating new vector store")
            docs = prepare_sample_data()
            self.vs_manager.create_store(docs)
            self.vs_manager.save_store(index_path)

    def ask(self, question, k=2, return_raw=False):
        """Retrieve the ``k`` most similar documents for ``question``.

        Args:
            question: Query text passed to the similarity search.
            k: Number of documents to retrieve.
            return_raw: When True, skip the Markdown rendering and only return
                the documents (for non-notebook callers such as an API handler).

        Returns:
            The list of retrieved documents.

        Raises:
            ValueError: If no vector store has been created or loaded.
        """
        if self.vs_manager.vector_store is None:
            raise ValueError("Vector store not initialized")

        docs = self.vs_manager.vector_store.similarity_search(question, k=k)

        if not return_raw:
            display(Markdown(self._format_answer(docs)))
        return docs

    @staticmethod
    def _format_answer(docs):
        """Build the Markdown text (answer bullets plus source list) for ``docs``."""
        answer_lines = [f"- {doc.page_content}" for doc in docs]

        source_lines = []
        for doc in docs:
            snippet = doc.page_content[:60]
            # Only mark the snippet as truncated when text was actually cut.
            if len(doc.page_content) > 60:
                snippet += "..."
            # .get() so a document without "source"/"url" metadata still renders.
            source = doc.metadata.get('source', 'unknown')
            url = doc.metadata.get('url', '#')
            source_lines.append(f"- [{source}]({url}): {snippet}")

        return (
            "## Answer\n"
            + "\n".join(answer_lines)
            + "\n\n## Sources\n"
            + "\n".join(source_lines)
        )


# Notebook-wide assistant instance used by the later cells. Constructing it
# either loads a saved index or builds and saves one from the sample data.
ta = VirtualTA()




[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Creating new vector store


In [None]:
def load_real_data(materials_path='course_materials.json',
                   forum_url="https://discourse.onlinedegree.iitm.ac.in",
                   timeout=10):
    """Build documents from a local JSON file and the forum's latest topics.

    Args:
        materials_path: JSON file holding a list of objects with ``title``,
            ``content``, ``url`` and ``type`` keys.
        forum_url: Base URL of the Discourse forum; ``/latest.json`` is fetched
            from it.
        timeout: Seconds to wait for the forum request before giving up.

    Returns:
        list: ``Document`` objects, course materials first, then forum topics.

    Raises:
        OSError: If ``materials_path`` cannot be opened.
        KeyError: If a course-material item lacks one of the expected keys.
        requests.RequestException: If the forum request fails, times out, or
            returns an HTTP error status.
    """
    import json
    import requests

    documents = []

    # Example: Load from JSON files
    # JSON is read as UTF-8 explicitly; the platform default encoding
    # (e.g. cp1252 on Windows) would corrupt non-ASCII course text.
    with open(materials_path, encoding="utf-8") as f:
        for item in json.load(f):
            documents.append(Document(
                page_content=f"{item['title']}\n{item['content']}",
                metadata={"source": item['url'], "type": item['type']}
            ))

    # Add Discourse forum scraping
    # A timeout keeps the notebook from hanging on an unresponsive server, and
    # raise_for_status() surfaces HTTP errors instead of parsing an error page.
    response = requests.get(f"{forum_url}/latest.json", timeout=timeout)
    response.raise_for_status()
    data = response.json()

    for topic in data.get('topic_list', {}).get('topics', []):
        # Not every topic in the listing is guaranteed to carry an excerpt;
        # fall back to an empty string rather than raising KeyError.
        excerpt = topic.get('excerpt') or ""
        documents.append(Document(
            page_content=f"Forum: {topic['title']}\n{excerpt}",
            metadata={"source": f"{forum_url}/t/{topic['id']}", "type": "forum"}
        ))

    return documents

# Reinitialize with real data
# Replaces the store held by the existing `ta` with one built from
# load_real_data(), then saves it to disk.
real_docs = load_real_data()
ta.vs_manager.create_store(real_docs)
# NOTE(review): the index is saved under "real_data_index", while load_store()
# defaults to "faiss_index", so a fresh VirtualTA() will not pick this index up.
ta.vs_manager.save_store("real_data_index")

In [None]:
!pip install -q fastapi uvicorn

from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
import nest_asyncio
import uvicorn

# Create the FastAPI application and attach CORS middleware that accepts any
# origin, method, and header.
# NOTE(review): wildcard CORS is convenient for local experiments; restrict
# allow_origins before exposing this service beyond a development machine.
app = FastAPI()
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

@app.post("/ask")
async def ask_question(question: str):
    """Return the documents most similar to ``question`` as JSON.

    Args:
        question: Query text; declared as a plain ``str``, so FastAPI reads it
            from the query string of the POST request.

    Returns:
        dict: ``"answer"`` is the retrieved page contents joined by newlines,
        ``"sources"`` is the list of each document's metadata dict.

    Raises:
        HTTPException: 503 if the vector store has not been initialized.
    """
    from fastapi import HTTPException  # local import: only this handler needs it

    # Query the store directly: VirtualTA.ask() renders Markdown for notebook
    # use, which is not wanted when serving an HTTP request.
    store = ta.vs_manager.vector_store
    if store is None:
        raise HTTPException(status_code=503, detail="Vector store not initialized")

    docs = store.similarity_search(question, k=2)
    return {
        "answer": "\n".join(doc.page_content for doc in docs),
        "sources": [doc.metadata for doc in docs]
    }

# Run in Jupyter
# nest_asyncio.apply() is called before starting the server; presumably this is
# what lets uvicorn run inside the notebook's already-running event loop.
nest_asyncio.apply()
# Serves `app` on port 8000 with host "0.0.0.0" (i.e. not limited to localhost).
# NOTE(review): uvicorn.run() is expected to block this cell until the server
# is stopped — confirm that is the intended notebook workflow.
uvicorn.run(app, host="0.0.0.0", port=8000)