In [1]:
import os
import ast
from typing import List
from langchain.schema import Document
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

def extract_functions_and_classes_from_code(code: str, file_path: str, module: str) -> List[Document]:
    tree = ast.parse(code)
    lines = code.splitlines()
    chunks = []

    for node in ast.walk(tree):
        if isinstance(node, (ast.FunctionDef, ast.ClassDef)):
            start_line = node.lineno - 1
            end_line = node.end_lineno if hasattr(node, 'end_lineno') else start_line + 1
            source = "\n".join(lines[start_line:end_line])
            name = node.name
            kind = "function" if isinstance(node, ast.FunctionDef) else "class"
            
            # Add a helpful descriptive header to the chunk
            enriched_content = f"{kind} `{name}` in module `{module}`:\n\n{source}"
            doc = Document(
                page_content=enriched_content,
                metadata={
                    "name": name,
                    "type": kind,
                    "file": os.path.basename(file_path),
                    "module": module,
                    "start_line": start_line + 1,
                    "end_line": end_line
                }
            )
            chunks.append(doc)

    return chunks

def path_to_module(repo_root: str, file_path: str) -> str:
    rel_path = os.path.relpath(file_path, repo_root)
    no_ext = os.path.splitext(rel_path)[0]
    return no_ext.replace(os.sep, ".")

def crawl_repo(repo_path: str) -> List[Document]:
    all_doc = []
    for root, _, files in os.walk(repo_path):
        for file in files:
            if file.endswith(".py"):
                full_path = os.path.join(root, file)
                try:
                    with open(full_path, "r", encoding="utf-8") as f:
                        code = f.read()
                        module = path_to_module(repo_path, full_path)
                        all_doc.extend(extract_functions_and_classes_from_code(code, full_path, module))
                except Exception as e:
                    print(f" Error parsing {file}: {e}")
    return all_doc

embedding_model = HuggingFaceEmbeddings(
    model_name="bge-code-v1",  
    model_kwargs={"device": "cpu"}
)


repo_path = "repo"  
documents = crawl_repo(repo_path)
print(documents)

  embedding_model = HuggingFaceEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 27.48it/s]


[Document(metadata={'name': 'get_db', 'type': 'function', 'file': 'database.py', 'module': 'database', 'start_line': 14, 'end_line': 19}, page_content='function `get_db` in module `database`:\n\ndef get_db():\n    db = SessionLocal()\n    try:\n        yield db\n    finally:\n        db.close()'), Document(metadata={'name': 'get_current_user', 'type': 'function', 'file': 'authmiddleware.py', 'module': 'auth.authmiddleware', 'start_line': 14, 'end_line': 39}, page_content='function `get_current_user` in module `auth.authmiddleware`:\n\ndef get_current_user(\n    token: str = Depends(oauth2_scheme),\n    db: Session = Depends(get_db)   # Add DB session\n) -> DBUser:\n    payload = verify_access_token(token)\n    if payload is None:\n        raise HTTPException(\n            status_code=status.HTTP_401_UNAUTHORIZED,\n            detail="Invalid or expired token",\n            headers={"WWW-Authenticate": "Bearer"},\n        )\n    username = payload.get("sub")\n    if username is None:\n 

In [None]:

chroma_dir = "./chroma_db"
vectordb = Chroma.from_documents(
    documents,
    embedding=embedding_model,
    persist_directory=chroma_dir
)
vectordb.persist()

print(f"✅ {len(documents)} chunks stored in Chroma at {chroma_dir}")


In [2]:

vectordb = Chroma(
    persist_directory= "./chroma_db",
    embedding_function=embedding_model
)
retriever = vectordb.as_retriever(
    search_type="mmr",
    search_kwargs={
        "k": 5,
        "lambda_mult" : 0.7
    }
)


  vectordb = Chroma(


In [11]:
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainFilter 
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    disable_streaming=False,
    callbacks=[StreamingStdOutCallbackHandler()],                    
)

base_retriever = vectordb.as_retriever(
    search_type="mmr",
    search_kwargs={
        "k": 5,
        "lambda_mult": 0.7
    }
)

multi_query_retriever = MultiQueryRetriever.from_llm(
    retriever=base_retriever,
    llm=llm,
)

compressor = LLMChainFilter.from_llm(llm) 

combined_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=multi_query_retriever
)

In [4]:
from dotenv import load_dotenv
load_dotenv()

True

In [5]:
from langchain.memory import ConversationBufferMemory
from langchain.chains import LLMChain


memory = ConversationBufferMemory(
    memory_key="history",
    input_key="query"
)


  memory = ConversationBufferMemory(


In [None]:
from langchain.prompts import PromptTemplate

prompt = PromptTemplate(
    input_variables=["history", "query", "context"],
    template="""
You are a highly intelligent GitHub assistant.

Conversation so far:
{history}

Relevant code context from the repository:
{context}

User question:
{query}

---

Instructions:
1. Begin with a direct answer.
2. Then provide a detailed explanation using specific references to function names and file/modules (e.g., `auth.routes`, `auth.utils`).
3. Use code blocks where helpful.
4. If the feature is missing, respond: "There is no such feature implemented in this repo."
5. Suggest implementation ideas if possible.
6. Use clear, structured formatting (markdown-friendly).

Your Answer:
"""
)


classifier_prompt = PromptTemplate(
    input_variables=["query"],
    template="""
You are a classifier. Decide if this user query requires additional code context to answer.

If the query is about a specific repo feature, implementation, or functionality, respond with "yes".
If it's a casual question or memory-based (like follow-up or clarification), respond with "no".

Query: {query}
Answer:
"""
)

classifier_chain = LLMChain(llm=llm, prompt=classifier_prompt)

chain = LLMChain(
    llm=llm,
    prompt=prompt,
    memory=memory,
)


def github_assistant_chat(query: str):
    classification = classifier_chain.run(query).strip().lower()

    if classification == "yes":
        docs = combined_retriever.invoke(query)
        context = "\n\n".join([doc.page_content for doc in docs])
    else:
        context = ""

    response = chain.run({
        "query": query,
        "context": context
    })

    return response


In [None]:
print(github_assistant_chat("How Manim Videos are generated in this repo ?"))

In [None]:
print(github_assistant_chat("Can you explain how scene planing is done ?"))

In [13]:
print(llm.invoke("hello"))

content='Hello! How can I help you today?' additional_kwargs={} response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'model_name': 'gemini-2.0-flash', 'safety_ratings': []} id='run--cff5b5d9-c4b2-47ad-a059-c1ba51982f99-0' usage_metadata={'input_tokens': 1, 'output_tokens': 10, 'total_tokens': 11, 'input_token_details': {'cache_read': 0}}
