In [None]:
from langchain_text_splitters import Language, RecursiveCharacterTextSplitter
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers import LanguageParser
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain_community.vectorstores import FAISS
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains.retrieval import create_retrieval_chain
from langchain_core.prompts import ChatPromptTemplate
from git import Repo
import os
from dotenv import load_dotenv

In [15]:
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the Google API key needed by the
# embedding and chat model clients created later — confirm the .env contents.
load_dotenv()

True

In [4]:
# Create the checkout directory from Python rather than via a shell `mkdir`:
# exist_ok=True turns a re-run of this cell into a no-op instead of an error,
# and the call does not depend on which shell the kernel host provides.
os.makedirs("repo_data", exist_ok=True)

In [5]:
# Fetch the repository to analyse. `git clone` refuses to write into a
# non-empty directory, so an unconditional clone fails on every run after the
# first; when a checkout is already present (it has a .git directory) it is
# opened in place instead of being cloned again.
if os.path.isdir(os.path.join("repo_data", ".git")):
    repo = Repo("repo_data/")
else:
    repo = Repo.clone_from("https://github.com/saurav-sabu/MediSage",to_path="repo_data/")

In [7]:
# Parser applied to every matched file. NOTE(review): per the LanguageParser
# docs, parser_threshold is the minimum line count before a file is split into
# per-definition documents — confirm 500 is the intended cut-off.
python_parser = LanguageParser(language=Language.PYTHON, parser_threshold=500)

# Walk the cloned checkout recursively, keeping only files ending in ".py".
loader = GenericLoader.from_filesystem(
    "repo_data/",
    glob="**/*",
    suffixes=[".py"],
    parser=python_parser,
)

In [8]:
# Read and parse every matched file from disk into a list of Document objects.
documents = loader.load()

In [9]:
# Inspect the loaded documents (each carries `source`/`language` metadata and
# the file text as page_content). This prints the whole list; slice it
# (e.g. documents[:1]) if the output grows too large.
documents

[Document(metadata={'source': 'repo_data\\app.py', 'language': <Language.PYTHON: 'python'>}, page_content='from flask import Flask, request, jsonify, render_template\nfrom src.helper import *\nfrom langchain_pinecone import PineconeVectorStore\nfrom langchain.chains import create_retrieval_chain, create_history_aware_retriever\nfrom langchain.chains.combine_documents import create_stuff_documents_chain\nfrom langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder\nfrom langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings\nfrom src.prompt import *\nfrom langchain_core.messages import *\nimport os\nfrom dotenv import load_dotenv\n\nload_dotenv()\n\napp = Flask(__name__)\n\nembedding = initialize_embedding()\n\ndocsearch = PineconeVectorStore.from_existing_index(\n    index_name="medisage",\n    embedding=embedding\n)\n\nretriever = docsearch.as_retriever(search_type="similarity",search_kwargs={"k":3})\n\nchat_history = []\n\nmodel = ChatGoogleG

In [10]:
# Sanity check: number of documents produced by the loader.
len(documents)

7

In [11]:
# Chunking parameters: 500-unit chunks, with 200 units shared between
# neighbouring chunks so a definition cut at a boundary still appears with
# some of its surrounding code.
splitter_options = {"chunk_size": 500, "chunk_overlap": 200}

# Splitter configured for the Python language.
splitter = RecursiveCharacterTextSplitter.from_language(
    Language.PYTHON,
    **splitter_options,
)

In [12]:
# Break the loaded documents into the smaller chunks that will be embedded.
chunks = splitter.split_documents(documents)

In [13]:
# Sanity check: number of chunks produced by the splitter.
len(chunks)

25

In [16]:
# Embedding client used to vectorise the code chunks.
# NOTE(review): presumably authenticates with the API key loaded from .env by
# load_dotenv() — confirm the expected environment variable is set.
embedding = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")

In [23]:
# Build a FAISS vector store from the chunks using the embedding client.
# NOTE(review): this presumably calls the remote embedding API for every
# chunk, so re-running the cell has a cost — confirm.
vector_db = FAISS.from_documents(chunks,embedding)

In [24]:
# Persist the vector store to the local "faiss-index" path for later reuse.
vector_db.save_local("faiss-index")

In [25]:
# Expose the vector store as a retriever for the RAG chain. No search options
# are passed, so the library defaults for search type and result count apply.
retriever = vector_db.as_retriever()

In [18]:
# Chat model that answers questions over the retrieved code chunks.
# NOTE(review): presumably uses the same API key from .env as the embedding
# client — confirm.
llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash")

In [32]:
# System message for the RAG chain, built from adjacent string literals.
# `{context}` is the template placeholder that receives the retrieved code
# chunks when the prompt is formatted by the documents chain.
# Everything inside the literals is sent to the model verbatim, so the
# placeholder follows the header line directly, with no separator characters
# (such as a leftover " + ") in between.
system_prompt = (
    "You are an advanced AI with expert-level understanding of all programming languages, frameworks, "
    "and best practices. You have access to the entire codebase provided by the user and can analyze it in depth. "
    "Your role is to assist the user by answering queries related to the code, explaining functionalities, "
    "suggesting improvements, debugging issues, and providing best practices. "
    "Ensure your responses are clear, concise, and technically accurate. "
    "If additional context is required, ask the user for clarification.\n\n"
    "Here is the provided code:\n{context}"
)


In [33]:
# Conversation template: the system instructions (which carry the {context}
# placeholder) followed by the user's question in {input}.
chat_messages = [
    ("system", system_prompt),
    ("human", "{input}"),
]
prompt = ChatPromptTemplate.from_messages(chat_messages)

# Chain that passes the supplied documents plus the question to the LLM.
question_answer_chain = create_stuff_documents_chain(llm, prompt)

# Full RAG pipeline: retrieve relevant chunks, then answer using them.
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=question_answer_chain,
)

In [36]:
# Smoke-test the pipeline with one question and show only the answer text.
question = {"input":"What embedding model is being used"}
response = rag_chain.invoke(question)
response["answer"]

'The embedding model being used is "models/text-embedding-004" from Google Generative AI, as initialized in the `initialize_embedding` function:\n\n```python\ndef initialize_embedding():\n    embedding = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")\n    return embedding\n```'