In [1]:
# Import necessary libraries for our RAG application and teaching agent
from dotenv import load_dotenv
import os
from langchain_groq import ChatGroq
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings
from langchain.schema import Document
from langgraph.graph import StateGraph, END
from typing import TypedDict, List
import glob
import matplotlib.pyplot as plt  # For plotting graphs
import time  # For timing stages

In [2]:
import os 
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
# Step 1: Setup Environment and LLM
# This is like setting up your classroom with the right tools
def setup_environment():
    load_dotenv()
    if not os.getenv("GROQ_API_KEY"):
        raise ValueError("Please set GROQ_API_KEY in your .env file!")
    llm = ChatGroq(model="llama-3.1-8b-instant")
    print("Environment setup complete. Your AI teacher is ready!")
    return llm

In [4]:
llm = setup_environment()

Environment setup complete. Your AI teacher is ready!


In [5]:
import os
import glob
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
import re

def generate_tree_structure(directory):
    """Generate a textual representation of the directory tree."""
    tree = []
    for root, dirs, files in os.walk(directory):
        level = root.replace(directory, '').count(os.sep)
        indent = '  ' * level
        tree.append(f"{indent}{os.path.basename(root)}/")
        for f in files:
            tree.append(f"{indent}  {f}")
    return "\n".join(tree)

def infer_code_flow(files, directory):
    """Basic inference of code flow by looking at imports/requires."""
    flow = []
    import_patterns = {
        'py': r"import\s+[\w.]+\s*(?:as\s+\w+)?|from\s+[\w.]+\s+import\s+[\w.*]+",
        'js': r"import\s+.*?\s+from\s+['\"].*?['\"]|require\(['\"].*?['\"]\)",
        'jsx': r"import\s+.*?\s+from\s+['\"].*?['\"]|require\(['\"].*?['\"]\)"
    }
    
    for file_path in files:
        ext = file_path.rsplit('.', 1)[-1] if '.' in file_path else ''
        if ext in import_patterns:
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                imports = re.findall(import_patterns[ext], content)
                if imports:
                    rel_path = os.path.relpath(file_path, directory)
                    flow.append(f"{rel_path} depends on:")
                    for imp in imports:
                        flow.append(f"  - {imp.strip()}")
            except Exception as e:
                flow.append(f"Error analyzing {file_path}: {e}")
    return "\n".join(flow) if flow else "No detectable dependencies found."

def load_documents(directory="code_folder"):
    """Load code files, tree, and flow from the directory."""
    # Verify directory exists
    abs_dir = os.path.abspath(directory)
    if not os.path.isdir(abs_dir):
        print(f"Error: Directory '{abs_dir}' does not exist!")
        return []
    print(f"Scanning directory: {abs_dir}")

    # Define supported extensions
    extensions = ("py", "js", "jsx", "ts", "java", "c", "cpp", "cs", "go", "rs", 
                  "php", "rb", "sh", "txt", "md", "html", "css", "yaml", "yml", "conf")
    glob_pattern = "**/*.{" + ",".join(extensions) + "}"
    print(f"Using glob pattern: {glob_pattern}")

    # Find all matching files
    matching_files = []
    for ext in extensions:
        matching_files.extend(glob.glob(os.path.join(directory, f"**/*.{ext}"), recursive=True))
    print(f"Found {len(matching_files)} files:")
    for file in matching_files[:5]:
        print(f"  {file}")
    if len(matching_files) > 5:
        print(f"  ...and {len(matching_files) - 5} more")

    # Load documents
    documents = []


    # Add tree structure
    tree_content = generate_tree_structure(directory)
    documents.append(Document(page_content=tree_content, metadata={"source": "files tree"}))

    # Add code flow
    flow_content = infer_code_flow(matching_files, directory)
    documents.append(Document(page_content=flow_content, metadata={"source": "code flow"}))


    for file_path in matching_files:
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            doc = Document(page_content=content, metadata={"source": os.path.relpath(file_path, directory)})
            documents.append(doc)
        except Exception as e:
            print(f"Failed to load {file_path}: {e}")


    # # Debug: Count files per subfolder
    # subfolders = set()
    # for doc in documents:
    #     source = doc.metadata.get("source", "")
    #     if source not in ["directory_tree.txt", "code_flow.txt"]:
    #         subfolder = os.path.dirname(source)
    #         subfolders.add(subfolder if subfolder else "root")

    # print(f"Loaded {len(documents)} files across {len(subfolders)} subfolders:")
    # for subfolder in sorted(subfolders):
    #     if subfolder == "root":
    #         file_count = sum(1 for doc in documents 
    #                         if "/" not in doc.metadata.get("source", "") 
    #                         and doc.metadata["source"] not in ["directory_tree.txt", "code_flow.txt"])
    #     else:
    #         file_count = sum(1 for doc in documents 
    #                         if doc.metadata.get("source", "").startswith(subfolder + "/") 
    #                         and doc.metadata["source"] not in ["directory_tree.txt", "code_flow.txt"])
    #     if file_count > 0:
    #         print(f"  {subfolder}: {file_count} files")

    # # Save all documents to a single file
    # with open("codes_file.txt", "w", encoding='utf-8') as f:
    #     for doc in documents:
    #         print(doc)  # Print the full Document object
    #         print(doc.page_content)  # Print the content
    #         f.write(f"Source: {doc.metadata['source']}\n\n{doc.page_content}\n\n{'='*50}\n\n")

    # Split into chunks with optimized size
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
    split_docs = text_splitter.split_documents(documents)
    print(f"Loaded {len(documents)} files, split into {len(split_docs)} chunks")
    return split_docs

In [6]:
def create_vector_store(docs, persist_dir="./chroma_db"):
    embeddings = OllamaEmbeddings(
        model="mxbai-embed-large",
    )
    vector_store = Chroma.from_documents(
        documents=docs,
        embedding=embeddings,
        persist_directory=persist_dir
    )
    print(f"Vector store created at {persist_dir}")
    return vector_store

In [7]:
from langchain.tools.retriever import create_retriever_tool

def load_chroma_retriever(persist_dir: str = "./chroma_db"):
    embeddings = OllamaEmbeddings(
        model="mxbai-embed-large",
    )
    vector_store = Chroma(
        persist_directory=persist_dir,
        embedding_function=embeddings
    )
    return vector_store.as_retriever()

In [8]:
code_directory="otter-detection"
persist_dir="./chroma_db"
if not os.path.exists(persist_dir):
    docs = load_documents(code_directory)
    if docs:
        create_vector_store(docs, persist_dir)
    else:
        print("No code files found!")
# Initialize the retriever tool
retriever = load_chroma_retriever()
retriever_tool = create_retriever_tool(
    retriever,
    "codebase_retriever",
    "Search for information about the codebase related componets codes"
)

Scanning directory: /home/saurabh-nitro/projects/AgenticAI/CodeExplorerRAG/otter-detection
Using glob pattern: **/*.{py,js,jsx,ts,java,c,cpp,cs,go,rs,php,rb,sh,txt,md,html,css,yaml,yml,conf}
Found 71 files:
  otter-detection/src/main.py
  otter-detection/src/decoder.py
  otter-detection/src/server.py
  otter-detection/src/analyse.py
  otter-detection/src/tracker.py
  ...and 66 more
Loaded 73 files, split into 1076 chunks
Vector store created at ./chroma_db


In [9]:
# Search the database with a sample query
query = "directory_tree"
retrieved_docs = retriever.invoke(query)

In [10]:
retrieved_docs

[Document(id='4f0bb4cb-47be-4a19-a7f4-ce414eeae3a6', metadata={'source': 'static/recorder/unused/lib/mwc.min.js'}, page_content='n=r.shadowRoot;if(n)return void c(n,t);if("content"==r.localName){for(var o=r,a=o.getDistributedNodes?o.getDistributedNodes():[],d=0;d<a.length;d++)c(a[d],t);return}if("slot"==r.localName){for(var s=r,l=s.assignedNodes?s.assignedNodes({flatten:!0}):[],u=0;u<l.length;u++)c(l[u],t);return}}for(var p=e.firstChild;null!=p;)c(p,t),p=p.nextSibling}function d(e){if(!e.querySelector("style#inert-style, link#inert-style")){var t=document.createElement("style");t.setAttribute("id","inert-style"),t.textContent="\\n[inert] {\\n  pointer-events: none;\\n  cursor: default;\\n}\\n\\n[inert], [inert] * {\\n  -webkit-user-select: none;\\n  -moz-user-select: none;\\n  -ms-user-select: none;\\n  user-select: none;\\n}\\n",e.appendChild(t)}}}();'),
 Document(id='c127169d-fd84-49ef-b117-ece25e982c1a', metadata={'source': 'static/assets/mwc.min.js'}, page_content="inertSubroot.man

In [11]:
# Search the database with a sample query
query = "code_flow"
retrieved_docs = retriever.invoke(query)


In [12]:
retrieved_docs

[Document(id='5711ab98-5fa7-4aaa-af7f-96539cad6310', metadata={'source': 'static/recorder/unused/lib/msgpack.min.js'}, page_content='o[1];return{value:o[0]?o[1]:void 0,done:!0}}([o,a])}}},j=function(e){if(!Symbol.asyncIterator)throw new TypeError("Symbol.asyncIterator is not defined.");var t,n=e[Symbol.asyncIterator];return n?n.call(e):(e="function"==typeof __values?__values(e):e[Symbol.iterator](),t={},r("next"),r("throw"),r("return"),t[Symbol.asyncIterator]=function(){return this},t);function r(n){t[n]=e[n]&&function(t){return new Promise((function(r,i){!function(e,t,n,r){Promise.resolve(r).then((function(t){e({value:t,done:n})}),t)}(r,i,(t=e[n](t)).done,t.value)}))}}},F=function(e){return this instanceof F?(this.v=e,this):new F(e)},W=function(e,t,n){if(!Symbol.asyncIterator)throw new TypeError("Symbol.asyncIterator is not defined.");var r,i=n.apply(e,t||[]),o=[];return r={},s("next"),s("throw"),s("return"),r[Symbol.asyncIterator]=function(){return this},r;function s(e){i[e]&&(r[e]=f

In [13]:
# Search the database with a sample query
query = "react"
retrieved_docs = retriever.get_relevant_documents(query)

  retrieved_docs = retriever.get_relevant_documents(query)


In [14]:
retrieved_docs

[Document(id='bc22e86a-de30-4f18-b127-c4a9443102dd', metadata={'source': 'server/static/devel/src/live/unused/live-video-player.jsx'}, page_content='"use strict"\n\nimport React, { useState, useEffect, useRef } from "react"\n\nexport function LivePage({\n    selectedDevice\n}) {\n    <GridContainer isGridView={selectedDevice == 0} style={{\n        flex: 1,\n        width: "100%",\n    }}>\n        {deviceData.map((x, i) => {\n            return (selectedDevice == 0 || selectedDevice == i + 1) && <VideoPlayer\n                key={i}\n                ref={x => video_player_refs.current[i] = x}\n                device_number={x.device_number}\n                ptz={selectedDevice == i + 1}\n                overlay={enableOverlay}\n                onClick={_ => {\n                    console.log("Clicked video player", i)\n                    list_item_refs.current[i].click()\n                }}\n            ></VideoPlayer>\n        })}\n    </GridContainer>\n}'),
 Document(id='2406db5e-8

In [15]:
# Print the most relevant chunk retrieved
print("Top Retrieved Document:\n", retrieved_docs[0].page_content)

Top Retrieved Document:
 "use strict"

import React, { useState, useEffect, useRef } from "react"

export function LivePage({
    selectedDevice
}) {
    <GridContainer isGridView={selectedDevice == 0} style={{
        flex: 1,
        width: "100%",
    }}>
        {deviceData.map((x, i) => {
            return (selectedDevice == 0 || selectedDevice == i + 1) && <VideoPlayer
                key={i}
                ref={x => video_player_refs.current[i] = x}
                device_number={x.device_number}
                ptz={selectedDevice == i + 1}
                overlay={enableOverlay}
                onClick={_ => {
                    console.log("Clicked video player", i)
                    list_item_refs.current[i].click()
                }}
            ></VideoPlayer>
        })}
    </GridContainer>
}


In [16]:
from langchain.chains import RetrievalQA
# Initialize ChatGPT with RAG (Retrieval-Augmented Generation)
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever
)

In [17]:
# from langchain.chains import RetrievalQA
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

# Set up message history storage
store = {}

def get_session_history(session_id: str) -> BaseChatMessageHistory:
    if session_id not in store:
        store[session_id] = ChatMessageHistory()
    return store[session_id]

# Wrap your chain with message history support
qa_with_history = RunnableWithMessageHistory(
    qa_chain,
    get_session_history,
    input_messages_key="query",  # Changed from "question" to "query"
    history_messages_key="chat_history"
)

In [18]:
# Example usage
session_id = "user123"

# First question - notice we use "query" as the key
response = qa_with_history.invoke(
    {"query": """explain react code. how its connected with python tornado websocket 
                explain briefly each and every code by writing it step by step"""},
    config={"configurable": {"session_id": session_id}}
)
print(response["result"])

I'll explain the React code step by step, and then explain how it connects with the Python Tornado Websocket.

**React Code**

```javascript
// WebSocketVideoPlayer component
export function WebSocketVideoPlayer({
  ws_ref, // optional
  url,
  videoPath, // optional
  onFirstFrame,
  onFrame,
  showOverlay,
  verbose,
  extraText,
  style,
  ...props
}) {
  // State variables
  const [videoState, setVideoState] = useState(DISCONNECTED);
  const [urlAndPayload, setUrlAndPayload] = useState({});
  const [decoderKey, setDecoderKey] = useState();

  // References
  const fg_fast_ref = useRef();
  const fg_slow_ref = useRef();
  const bg_ref = useRef();
  const decoder_ref = useRef();
  ws_ref = ws_ref || useRef();

  // Effect hook to initialize WebSocket connection when URL changes
  useEffect(() => {
    if (!url) return; // No WebSocket connection if URL is empty
    // Create a unique key to prevent identical payloads from interfering with each other
    const x = { url, key: +new Dat

In [19]:
# First question
# First question - notice we use "query" as the key
response = qa_with_history.invoke(
    {"query": """fetch all React base code and write it down"""},
    config={"configurable": {"session_id": session_id}}
)
print(response["result"])

Here is the React base code as per the provided context:

```javascript
"use strict"

import { createRoot } from "react-dom/client"
import React, { useState, useEffect, useRef } from "react"

import { Drawer } from "./mwc/drawer"
import { List, ListItem, Divider } from "./mwc/list"
import { Icon, IconButton } from "./mwc/icon"
import { TabBar, Tab } from "./mwc/tab-bar"

import { LeftAndRightLogo, WhiteLogo } from "./logo"
import { useState2 } from "./util"
import { DeviceList } from "./device-list"
import { LivePage } from "./live/live-page"
import { RecorderPage } from "./recorder/recorder-page"

import { getTXRX } from "./recorder/websocket"

import { DataGridAndChart } from "./monitor/data-grid-and-chart"

function App() {
    const [drawerOpen, setDrawerOpen] = useState(false)
    const [selectAllDisabled, setSelectAllDisabled] = useState()
    const [compactLayout, setCompactLayout] = useState(false)

    // URL hash states
    const [tabValue, setTabValue] = useState2("tab", "Li

In [20]:
response = qa_with_history.invoke(
    {"query": """fetch tree of files in codebase provided"""},
    config={"configurable": {"session_id": session_id}}
)
print(response["result"])

Based on the code you provided, I don't see any information about the file structure of the codebase. However, I can try to infer the file structure based on the code.

Here's a possible file structure for the codebase:

```
.
inert_subroot.py
inert_node.py
pipeline_handler.py
gstreamer_utils.py
common.py
utils.py
main.py
requirements.txt
glob.py (imported, likely a custom implementation)
disk_usage.py (imported, likely a custom implementation)
IOLoop.py (imported, likely a custom implementation)
Gst.py (imported, likely a custom implementation)
influx_query.py (imported, likely a custom implementation)
config.py (possible, not shown in the code)
EXTERNAL_ROOT (path, likely a configuration)
INTERNAL_ROOT (path, likely a configuration)
```

To fetch the actual tree of files in the codebase, you can use the `os` module in Python:

```python
import os

def get_file_tree():
    for root, dirs, files in os.walk("."):
        for dir in dirs:
            print(f"  {dir}")
        for file in

In [None]:
# Ask a question and get an answer
query = "explain react code. how its connected with python tornado wesocket"
response = qa_chain.run(query)

# Print the generated response
print("\nGenerated Answer:\n", response)

In [None]:
# Ask a question and get an answer
query = """explain react code. how its connected with python tornado wesocket
explain brifly each and every code
"""
response = qa_chain.run(query)

# Print the generated response
print("\nGenerated Answer:\n", response)

In [None]:
# Import necessary libraries for our RAG application and teaching agent
from dotenv import load_dotenv
import os
from langchain_groq import ChatGroq
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings
from langchain.schema import Document
from langgraph.graph import StateGraph, END
from typing import TypedDict, List
import glob
import matplotlib.pyplot as plt  # For plotting graphs
import time  # For timing stages

import os 
from dotenv import load_dotenv
load_dotenv()

# Step 1: Setup Environment and LLM
# This is like setting up your classroom with the right tools
def setup_environment():
    load_dotenv()
    if not os.getenv("GROQ_API_KEY"):
        raise ValueError("Please set GROQ_API_KEY in your .env file!")
    llm = ChatGroq(model="llama-3.1-8b-instant")
    print("Environment setup complete. Your AI teacher is ready!")
    return llm

llm = setup_environment()

import os
import glob
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
import re

def generate_tree_structure(directory):
    """Generate a textual representation of the directory tree."""
    tree = []
    for root, dirs, files in os.walk(directory):
        level = root.replace(directory, '').count(os.sep)
        indent = '  ' * level
        tree.append(f"{indent}{os.path.basename(root)}/")
        for f in files:
            tree.append(f"{indent}  {f}")
    return "\n".join(tree)

def infer_code_flow(files, directory):
    """Basic inference of code flow by looking at imports/requires."""
    flow = []
    import_patterns = {
        'py': r"import\s+[\w.]+\s*(?:as\s+\w+)?|from\s+[\w.]+\s+import\s+[\w.*]+",
        'js': r"import\s+.*?\s+from\s+['\"].*?['\"]|require\(['\"].*?['\"]\)",
        'jsx': r"import\s+.*?\s+from\s+['\"].*?['\"]|require\(['\"].*?['\"]\)"
    }
    
    for file_path in files:
        ext = file_path.rsplit('.', 1)[-1] if '.' in file_path else ''
        if ext in import_patterns:
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                imports = re.findall(import_patterns[ext], content)
                if imports:
                    rel_path = os.path.relpath(file_path, directory)
                    flow.append(f"{rel_path} depends on:")
                    for imp in imports:
                        flow.append(f"  - {imp.strip()}")
            except Exception as e:
                flow.append(f"Error analyzing {file_path}: {e}")
    return "\n".join(flow) if flow else "No detectable dependencies found."

def load_documents(directory="code_folder"):
    """Load code files, tree, and flow from the directory."""
    # Verify directory exists
    abs_dir = os.path.abspath(directory)
    if not os.path.isdir(abs_dir):
        print(f"Error: Directory '{abs_dir}' does not exist!")
        return []
    print(f"Scanning directory: {abs_dir}")

    # Define supported extensions
    extensions = ("py", "js", "jsx", "ts", "java", "c", "cpp", "cs", "go", "rs", 
                  "php", "rb", "sh", "txt", "md", "html", "css", "yaml", "yml", "conf")
    glob_pattern = "**/*.{" + ",".join(extensions) + "}"
    print(f"Using glob pattern: {glob_pattern}")

    # Find all matching files
    matching_files = []
    for ext in extensions:
        matching_files.extend(glob.glob(os.path.join(directory, f"**/*.{ext}"), recursive=True))
    print(f"Found {len(matching_files)} files:")
    for file in matching_files[:5]:
        print(f"  {file}")
    if len(matching_files) > 5:
        print(f"  ...and {len(matching_files) - 5} more")

    # Load documents
    documents = []


    # Add tree structure
    tree_content = generate_tree_structure(directory)
    documents.append(Document(page_content=tree_content, metadata={"source": "files tree"}))

    # Add code flow
    flow_content = infer_code_flow(matching_files, directory)
    documents.append(Document(page_content=flow_content, metadata={"source": "code flow"}))


    for file_path in matching_files:
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            doc = Document(page_content=content, metadata={"source": os.path.relpath(file_path, directory)})
            documents.append(doc)
        except Exception as e:
            print(f"Failed to load {file_path}: {e}")


    # Split into chunks with optimized size
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
    split_docs = text_splitter.split_documents(documents)
    print(f"Loaded {len(documents)} files, split into {len(split_docs)} chunks")
    return split_docs



def create_vector_store(docs, persist_dir="./chroma_db"):
    embeddings = OllamaEmbeddings(
        model="mxbai-embed-large",
    )
    vector_store = Chroma.from_documents(
        documents=docs,
        embedding=embeddings,
        persist_directory=persist_dir
    )
    print(f"Vector store created at {persist_dir}")
    return vector_store


from langchain.tools.retriever import create_retriever_tool

def load_chroma_retriever(persist_dir: str = "./chroma_db"):
    embeddings = OllamaEmbeddings(
        model="mxbai-embed-large",
    )
    vector_store = Chroma(
        persist_directory=persist_dir,
        embedding_function=embeddings
    )
    return vector_store.as_retriever()


code_directory="otter-detection"
persist_dir="./chroma_db"
if not os.path.exists(persist_dir):
    docs = load_documents(code_directory)
    if docs:
        create_vector_store(docs, persist_dir)
    else:
        print("No code files found!")
# Initialize the retriever tool
retriever = load_chroma_retriever()
retriever_tool = create_retriever_tool(
    retriever,
    "codebase_retriever",
    "Search for information about the codebase related componets codes"
)


from langchain.chains import RetrievalQA
# Initialize ChatGPT with RAG (Retrieval-Augmented Generation)
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever
)


# from langchain.chains import RetrievalQA
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

# Set up message history storage
store = {}

def get_session_history(session_id: str) -> BaseChatMessageHistory:
    if session_id not in store:
        store[session_id] = ChatMessageHistory()
    return store[session_id]

# Wrap your chain with message history support
qa_with_history = RunnableWithMessageHistory(
    qa_chain,
    get_session_history,
    input_messages_key="query",  # Changed from "question" to "query"
    history_messages_key="chat_history"
)

# Example usage
session_id = "user123"

# First question - notice we use "query" as the key
response = qa_with_history.invoke(
    {"query": """explain react code. how its connected with python tornado websocket 
                explain briefly each and every code by writing it step by step"""},
    config={"configurable": {"session_id": session_id}}
)
print(response["result"])