Import Required Libraries

In [120]:
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
import os
from git import Repo
from openai import OpenAI
from langchain.schema import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv

In [None]:
# Load variables from a local .env file into the process environment.
# OPENAI_API_KEY is read from the environment further down in this notebook.
load_dotenv()

In [122]:
# Directory under which cloned repositories are stored.
# Set the CODEBASE_RAG_CLONE_DIR environment variable (for example in .env) to
# use a different location; the original hard-coded path remains the default,
# and an empty value falls back to that default as well.
CLONE_DIR = (
    os.environ.get("CODEBASE_RAG_CLONE_DIR")
    or r"C:\Users\Rohit\Documents\MyProjects\codebase_rag"
)  # repos will be stored here.

File Processing

In [123]:
# File extensions (including the leading dot) that are indexed; the comparison
# against os.path.splitext output is exact, so it is case-sensitive.
SUPPORTED_EXTENSIONS = [".py", ".js", ".tsx", ".ts", ".java", ".cpp"]

# Directory names that are pruned while walking a repository (matched by
# name at any depth, not by full path).
IGNORED_DIRS = [".git", "node_modules", "dist", "__pycache__", ".next", ".vscode", ".env", "venv"]

Clone the repository if it does not already exist locally

In [124]:
def clone_repo(repo_url):
    """Clone a repository into CLONE_DIR and return its local path.

    The local directory name is the last path segment of ``repo_url`` with a
    trailing ``.git`` suffix removed. If that directory already exists, no
    clone is performed and the existing path is returned.

    Args:
        repo_url: URL of the git repository, with or without a trailing
            ``/`` or ``.git``.

    Returns:
        The path ``CLONE_DIR/<repo name>``.

    Raises:
        ValueError: If no repository name can be derived from ``repo_url``.
    """
    # Ignore trailing slashes so "https://host/org/repo/" still yields "repo"
    # instead of an empty final segment.
    repo_name = repo_url.rstrip("/").split("/")[-1]
    # Strip only a trailing ".git". str.replace would also remove ".git" from
    # the middle of names such as "my.github.io".
    if repo_name.endswith(".git"):
        repo_name = repo_name[: -len(".git")]
    if not repo_name:
        # An empty name would make repo_path resolve to CLONE_DIR itself.
        raise ValueError(f"Cannot derive a repository name from URL: {repo_url!r}")
    repo_path = os.path.join(CLONE_DIR, repo_name)

    if not os.path.exists(repo_path):
        Repo.clone_from(repo_url, repo_path)
        print(f"Cloned {repo_name} to {repo_path}")
    else:
        print(f"Repository {repo_name} already exists at {repo_path}")

    return repo_path


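The function below reads a file's content and returns a dictionary containing the repository name, the file's path relative to the repository root, and the file's content. It returns `None` if the file cannot be read.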

In [125]:
def get_file_content(file_path, repo_path, repo_name):
    """Read one file and describe it as a dictionary.

    Args:
        file_path: Path of the file to read.
        repo_path: Root of the repository; the returned name is relative to it.
        repo_name: Label stored under the "repo" key.

    Returns:
        A dict with keys "repo", "name" (path relative to ``repo_path``) and
        "content" (file text decoded as UTF-8, undecodable bytes replaced),
        or ``None`` if reading the file or computing the relative path fails.
    """
    try:
        with open(file_path, "r", encoding="utf-8", errors="replace") as source:
            text = source.read()
        relative_name = os.path.relpath(file_path, repo_path)
    except Exception as exc:
        # Best effort: report the problem and let the caller skip this file.
        print(f"Error reading file {file_path}: {exc}")
        return None

    return dict(repo=repo_name, name=relative_name, content=text)

This function walks a repository, skips ignored directories, and gathers the contents of all supported code files. It returns a list of dictionaries, each containing the repository name, the file's relative path, and the file's content. (EXTRACTS CONTENT)

In [126]:

def get_main_files_content(repo_path, repo_name):
    """Collect the contents of every supported source file under a repository.

    Args:
        repo_path: Root directory to walk.
        repo_name: Label passed through to ``get_file_content``.

    Returns:
        A list of the dictionaries returned by ``get_file_content`` for files
        whose extension is in SUPPORTED_EXTENSIONS. Files that could not be
        read are omitted. If an exception interrupts the walk, it is printed
        and whatever was collected so far is returned.
    """
    collected = []
    try:
        for current_dir, subdirs, filenames in os.walk(repo_path):
            # Prune in place so os.walk does not descend into ignored directories.
            subdirs[:] = [name for name in subdirs if name not in IGNORED_DIRS]

            for filename in filenames:
                _, extension = os.path.splitext(filename)
                if extension not in SUPPORTED_EXTENSIONS:
                    continue
                record = get_file_content(
                    os.path.join(current_dir, filename), repo_path, repo_name
                )
                if record:
                    collected.append(record)
    except Exception as exc:
        print(f"Error processing {repo_name}: {str(exc)}")

    return collected


In [127]:
def process_codebases(codebase_inputs):
    """Process multiple codebases (either URLs or local paths).

    Args:
        codebase_inputs: Iterable of strings. Entries starting with
            ``http://`` or ``https://`` are cloned via ``clone_repo``; every
            other entry is treated as a path to a local directory.

    Returns:
        A single list with the file dictionaries of all codebases, in input
        order. Each codebase is labelled with the final component of its path.
    """
    all_files = []

    for input_path in codebase_inputs:
        # Match the full scheme so a local directory whose name merely starts
        # with "http" (e.g. "httpd-conf") is not mistaken for a URL.
        if input_path.startswith(("http://", "https://")):
            # It's a repository URL - clone it
            repo_path = clone_repo(input_path)
        else:
            # It's a local path - use directly. normpath drops trailing
            # separators, which would otherwise make basename return "".
            repo_path = os.path.normpath(input_path)

        # abspath resolves inputs like "." to a real directory name.
        repo_name = os.path.basename(os.path.abspath(repo_path))

        # Get files from this codebase
        all_files.extend(get_main_files_content(repo_path, repo_name))

    return all_files


In [128]:
# Embedding model shared by the vector store; created with default settings.
# NOTE(review): presumably picks up OPENAI_API_KEY from the environment — confirm.
embedding_model = OpenAIEmbeddings()

Convert file contents into LangChain `Document` objects

In [129]:
def create_documents(files_content):
    """Turn file dictionaries into ``Document`` objects.

    Args:
        files_content: Iterable of dicts with the keys "repo", "name" and
            "content".

    Returns:
        A list with one ``Document`` per input dict. The page content embeds
        the repo name, file name and file text; the metadata carries the repo
        name under "repo" and the file name under "source".
    """
    return [
        Document(
            page_content=f"REPO: {entry['repo']}\nFILE: {entry['name']}\nCONTENT:\n{entry['content']}",
            metadata={"repo": entry['repo'], "source": entry['name']},
        )
        for entry in files_content
    ]


Store embeddings in chromaDB

In [130]:

def create_vector_store(documents):
    """Build a Chroma vector store from the given documents.

    Args:
        documents: The documents to embed and index.

    Returns:
        The store returned by ``Chroma.from_documents``, created with the
        module-level ``embedding_model`` and the collection name
        "multi-codebase-rag".
    """
    store = Chroma.from_documents(
        documents=documents,
        embedding=embedding_model,
        collection_name="multi-codebase-rag",
    )
    return store


Codebase inputs (repository URLs or local paths)

In [None]:
# Codebases to index. Entries starting with "http" are cloned by
# process_codebases; any other entry is used as a local directory path.
CODEBASE_INPUTS = [
    "https://github.com/Aider-AI/aider.git",
    "https://github.com/langchain-ai/langchain.git",
    "https://github.com/evershopcommerce/evershop.git"
    ]
    

In [None]:
# Run the indexing pipeline: gather files, wrap them as documents, then embed
# them into the vector store.
all_files = process_codebases(CODEBASE_INPUTS)
documents = create_documents(all_files)
vectorstore = create_vector_store(documents) 

In [133]:
# Raw OpenAI client, configured from the OPENAI_API_KEY environment variable.
# NOTE(review): `client` is not referenced by any cell visible here — confirm it is still needed.
client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))
# Chat model used to answer questions in the chain below.
llm = ChatOpenAI(model="gpt-4-turbo")

In [134]:
# Question used both for retrieval and as the human message of the prompt.
query = "How does the Stripe payment module work?"

In [135]:
# Retrieve top 5 docs
# Retrieve the 5 documents most similar to the query.
relevant_docs = vectorstore.similarity_search(query=query, k=5)
# Keep only the text of each hit and join the texts with a visible divider
# so they can be inserted into the prompt as one context string.
contexts = [doc.page_content for doc in relevant_docs]
formatted_context = "\n\n-------\n\n".join(contexts)

In [136]:
# System message template; {context} is filled with the retrieved documents.
SYSTEM_TEMPLATE = """Answer questions using information from these codebases:
    {context}
    
    Guidelines:
    - Specify which repository (REPO) the information comes from
    - If information comes from multiple repos, note that.
    - If unsure, say which repos you checked.
    - Don't invent anything not in the context.
    - Use three sentences maximum and keep the answer concise.
    """
    
# Two-message chat prompt: the system instructions above, followed by the
# user's question inserted through the {question} placeholder.
prompt = ChatPromptTemplate.from_messages([
        ("system", SYSTEM_TEMPLATE),
        ("human", "Question: {question}")
    ])

In [137]:
# The pipe operator (|) composes the prompt template with the chat model, so
# the formatted prompt is passed to the model when the chain is invoked.
chain = prompt | llm
# Fill both template placeholders and run the chain once.
response = chain.invoke({"context": formatted_context, "question": query})

In [138]:
# Show the text of the model's reply.
print("\nAnswer:")
print(response.content)


Answer:
From the Evershop repository, the Stripe payment module is integrated in a way that validates the payment method during the checkout process and manages the lifecycle of payment statuses. Specifically, it includes functionality to push 'payment_method' into cart fields with a validator that checks if Stripe is enabled as a payment method. Moreover, it adjusts payment statuses such as 'authorized', 'failed', 'refunded', and 'partial_refunded', and handles canceling payment intents if the payment status changes to 'canceled'. Additionally, it processes the response from the Stripe API after a payment attempt, updating the order's payment status based on the 'succeeded' or 'requires_capture' status of the Stripe payment intent.
