# Install Necessary Libraries

In [1]:
! pip install pygithub langchain langchain-community openai tiktoken pinecone-client langchain_pinecone sentence-transformers

Collecting pygithub
  Downloading PyGithub-2.5.0-py3-none-any.whl.metadata (3.9 kB)
Collecting langchain
  Downloading langchain-0.3.13-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.13-py3-none-any.whl.metadata (2.9 kB)
Collecting openai
  Downloading openai-1.58.1-py3-none-any.whl.metadata (27 kB)
Collecting tiktoken
  Downloading tiktoken-0.8.0-cp311-cp311-win_amd64.whl.metadata (6.8 kB)
Collecting pinecone-client
  Downloading pinecone_client-5.0.1-py3-none-any.whl.metadata (19 kB)
Collecting langchain_pinecone
  Downloading langchain_pinecone-0.2.0-py3-none-any.whl.metadata (1.7 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Collecting Deprecated (from pygithub)
  Downloading Deprecated-1.2.15-py2.py3-none-any.whl.metadata (5.5 kB)
Collecting langchain-core<0.4.0,>=0.3.26 (from langchain)
  Downloading langchain_core-0.3.28-py3-none-any.whl.metadata (6.3 kB)
C

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
anaconda-cloud-auth 0.1.4 requires pydantic<2.0, but you have pydantic 2.10.4 which is incompatible.
streamlit 1.30.0 requires packaging<24,>=16.8, but you have packaging 24.2 which is incompatible.


In [3]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from langchain_pinecone import PineconeVectorStore
from langchain.embeddings import OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
from pinecone import Pinecone
import os
import tempfile
from github import Github, Repository
from git import Repo
from openai import OpenAI
from pathlib import Path
from langchain.schema import Document
from pinecone import Pinecone

In [5]:
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()

# Retrieve API keys
openai_api_key = os.getenv("OPENAI_API_KEY")
groq_api_key = os.getenv("GROQ_API_KEY")
pinecone_api_key = os.getenv("PINECONE_API_KEY")
pinecone_environment = os.getenv("PINECONE_ENVIRONMENT")

# Validate that all keys are loaded
if not all([openai_api_key, groq_api_key, pinecone_api_key, pinecone_environment]):
    raise ValueError("One or more API keys are missing in the .env file.")


# Clone a GitHub Repo locally

In [7]:
def clone_repository(repo_url):
    """Clones a GitHub repository to a temporary directory.

    Args:
        repo_url: The URL of the GitHub repository.

    Returns:
        The path to the cloned repository.
    """
    repo_name = repo_url.split("/")[-1]  # Extract repository name from URL
    repo_path = f"/content/{repo_name}"
    Repo.clone_from(repo_url, str(repo_path))
    return str(repo_path)

In [8]:
path = clone_repository("https://github.com/CoderAgent/SecureAgent")

In [10]:
print(path)

/content/SecureAgent


In [11]:
SUPPORTED_EXTENSIONS = {'.py', '.js', '.tsx', '.jsx', '.ipynb', '.java',
                         '.cpp', '.ts', '.go', '.rs', '.vue', '.swift', '.c', '.h'}

IGNORED_DIRS = {'node_modules', 'venv', 'env', 'dist', 'build', '.git',
                '__pycache__', '.next', '.vscode', 'vendor'}

In [16]:
def get_file_content(file_path, repo_path):
    """
    Get content of a single file.

    Args:
        file_path (str): Path to the file

    Returns:
        Optional[Dict[str, str]]: Dictionary with file name and content
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Get relative path from repo root
        rel_path = os.path.relpath(file_path, repo_path)

        return {
            "name": rel_path,
            "content": content
        }
    except Exception as e:
        print(f"Error processing file {file_path}: {str(e)}")
        return None


def get_main_files_content(repo_path: str):
    """
    Get content of supported code files from the local repository.

    Args:
        repo_path: Path to the local repository

    Returns:
        List of dictionaries containing file names and contents
    """
    files_content = []

    try:
        for root, _, files in os.walk(repo_path):
            # Skip if current directory is in ignored directories
            if any(ignored_dir in root for ignored_dir in IGNORED_DIRS):
                continue

            # Process each file in current directory
            for file in files:
                file_path = os.path.join(root, file)
                if os.path.splitext(file)[1] in SUPPORTED_EXTENSIONS:
                    file_content = get_file_content(file_path, repo_path)
                    if file_content:
                        files_content.append(file_content)

    except Exception as e:
        print(f"Error reading repository: {str(e)}")

    return files_content

In [17]:
file_content = get_main_files_content(path)

In [18]:
file_content

[{'name': 'src\\app.ts',
  'content': 'import { Octokit } from "@octokit/rest";\nimport { createNodeMiddleware } from "@octokit/webhooks";\nimport { WebhookEventMap } from "@octokit/webhooks-definitions/schema";\nimport * as http from "http";\nimport { App } from "octokit";\nimport { Review } from "./constants";\nimport { env } from "./env";\nimport { processPullRequest } from "./review-agent";\nimport { applyReview } from "./reviews";\n\n// This creates a new instance of the Octokit App class.\nconst reviewApp = new App({\n  appId: env.GITHUB_APP_ID,\n  privateKey: env.GITHUB_PRIVATE_KEY,\n  webhooks: {\n    secret: env.GITHUB_WEBHOOK_SECRET,\n  },\n});\n\nconst getChangesPerFile = async (payload: WebhookEventMap["pull_request"]) => {\n  try {\n    const octokit = await reviewApp.getInstallationOctokit(\n      payload.installation.id\n    );\n    const { data: files } = await octokit.rest.pulls.listFiles({\n      owner: payload.repository.owner.login,\n      repo: payload.repository.n

# Embeddings

In [19]:
def get_huggingface_embeddings(text, model_name="sentence-transformers/all-mpnet-base-v2"):
    model = SentenceTransformer(model_name)
    return model.encode(text)

In [20]:
text = "I am a programmer"

embeddings = get_huggingface_embeddings(text)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [21]:
embeddings

array([ 1.81737803e-02, -3.02661373e-03, -4.77465801e-02,  1.86379105e-02,
        3.14538032e-02,  1.87255293e-02, -1.52534032e-02, -6.77293390e-02,
       -1.26903364e-02,  1.28427437e-02,  5.80701306e-02,  4.00234871e-02,
        3.27073447e-02,  7.12998286e-02,  5.56373484e-02,  1.68628637e-02,
        6.97603747e-02, -5.02620079e-02,  6.13143807e-03, -1.46559048e-02,
       -4.51960601e-03,  4.82934676e-02, -2.53051352e-02, -1.97859993e-03,
       -4.36902605e-02, -2.41507199e-02,  1.29505824e-02, -3.78610799e-03,
       -2.05718204e-02,  1.09819286e-01,  3.07674892e-03, -2.80443635e-02,
       -1.55807342e-02, -1.24789970e-02,  1.75239131e-06, -2.93752039e-03,
       -1.43048353e-02,  4.88386378e-02, -6.21114224e-02,  2.95061208e-02,
       -1.40470555e-02,  2.20708400e-02,  1.13067729e-02,  4.70893271e-02,
        7.58308312e-03, -8.30223798e-05,  6.67821094e-02, -1.21320402e-02,
        4.39384161e-03,  2.47453619e-02,  1.02528920e-02, -6.54434413e-03,
       -5.53150754e-03, -

# Setting up Pinecone
**1. Create an account on [Pinecone.io](https://app.pinecone.io/)**

**2. Create a new index called "codebase-rag" and set the dimensions to 768. Leave the rest of the settings as they are.**

![Screenshot 2024-11-24 at 10 58 50 PM](https://github.com/user-attachments/assets/f5fda046-4087-432a-a8c2-86e061005238)



**3. Create an API Key for Pinecone**

![Screenshot 2024-11-24 at 10 44 37 PM](https://github.com/user-attachments/assets/e7feacc6-2bd1-472a-82e5-659f65624a88)


**4. Store your Pinecone API Key within Google Colab's secrets section, and then enable access to it (see the blue checkmark)**

![Screenshot 2024-11-24 at 10 45 25 PM](https://github.com/user-attachments/assets/eaf73083-0b5f-4d17-9e0c-eab84f91b0bc)



In [22]:
# Set the PINECONE_API_KEY as an environment variable
pinecone_api_key = os.getenv("PINECONE_API_KEY")
os.environ['PINECONE_API_KEY'] = pinecone_api_key

# Initialize Pinecone
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"),)

# Connect to your Pinecone index
pinecone_index = pc.Index("codebase-rag")

In [23]:
vectorstore = PineconeVectorStore(index_name="codebase-rag", embedding=HuggingFaceEmbeddings())

  vectorstore = PineconeVectorStore(index_name="codebase-rag", embedding=HuggingFaceEmbeddings())
  vectorstore = PineconeVectorStore(index_name="codebase-rag", embedding=HuggingFaceEmbeddings())


In [24]:
documents = []

for file in file_content:
    doc = Document(
        page_content=f"{file['name']}\n{file['content']}",
        metadata={"source": file['name']}
    )

    documents.append(doc)


vectorstore = PineconeVectorStore.from_documents(
    documents=documents,
    embedding=HuggingFaceEmbeddings(),
    index_name="codebase-rag",
    namespace="https://github.com/CoderAgent/SecureAgent"
)

  embedding=HuggingFaceEmbeddings(),


In [25]:
client = OpenAI(
    base_url="https://api.groq.com/openai/v1",
    api_key=os.getenv("GROQ_API_KEY")
)

In [26]:
query = "How are python files parsed?"

In [27]:
raw_query_embedding = get_huggingface_embeddings(query)

raw_query_embedding

array([ 5.29357232e-02, -6.24647290e-02, -2.87437774e-02,  1.83179714e-02,
       -4.33826295e-04,  4.03239094e-02, -7.76650710e-03, -2.74392799e-03,
        2.53445338e-02, -8.10820088e-02, -8.44583753e-03, -6.59269001e-03,
        4.16187569e-02,  3.98627333e-02,  2.82911714e-02,  2.84344796e-02,
        2.65303440e-02, -2.60126591e-02,  4.16299067e-02,  3.92820686e-02,
       -5.15580289e-02,  5.83349802e-02,  5.88830002e-03,  3.46064828e-02,
       -2.46876152e-03,  2.72809193e-02,  1.07212691e-02,  4.55760658e-02,
       -1.69188920e-02, -4.85301316e-02, -3.02424338e-02, -3.29698250e-02,
        2.46010181e-02,  3.23601924e-02,  1.16030503e-06,  9.71378293e-03,
       -3.70800123e-02,  1.84200685e-02, -1.39834126e-02,  4.25723083e-02,
        6.78141043e-02, -6.66246414e-02,  2.11651661e-02, -1.11712434e-03,
       -1.80115551e-02, -7.90139660e-02,  5.93152642e-02, -5.23733422e-02,
        5.63013367e-02,  4.31280099e-02,  7.77091645e-03, -2.30586808e-02,
       -2.94572730e-02,  

In [28]:
# Feel free to change the "top_k" parameter to be a higher or lower number
top_matches = pinecone_index.query(vector=raw_query_embedding.tolist(), top_k=5, include_metadata=True, namespace="https://github.com/CoderAgent/SecureAgent")

In [29]:
top_matches

{'matches': [{'id': '08e24cd6-ae3b-42c9-a100-7c288d93446c',
              'metadata': {'source': 'src\\context\\language\\python-parser.ts',
                           'text': 'src\\context\\language\\python-parser.ts\n'
                                   'import { AbstractParser, EnclosingContext '
                                   '} from "../../constants";\n'
                                   'export class PythonParser implements '
                                   'AbstractParser {\n'
                                   '  findEnclosingContext(\n'
                                   '    file: string,\n'
                                   '    lineStart: number,\n'
                                   '    lineEnd: number\n'
                                   '  ): EnclosingContext {\n'
                                   '    // TODO: Implement this method for '
                                   'Python\n'
                                   '    return null;\n'
                    

In [30]:
contexts = [item['metadata']['text'] for item in top_matches['matches']]

In [31]:
contexts

['src\\context\\language\\python-parser.ts\nimport { AbstractParser, EnclosingContext } from "../../constants";\nexport class PythonParser implements AbstractParser {\n  findEnclosingContext(\n    file: string,\n    lineStart: number,\n    lineEnd: number\n  ): EnclosingContext {\n    // TODO: Implement this method for Python\n    return null;\n  }\n  dryRun(file: string): { valid: boolean; error: string } {\n    // TODO: Implement this method for Python\n    return { valid: false, error: "Not implemented yet" };\n  }\n}\n',
 'src/context/language/python-parser.ts\nimport { AbstractParser, EnclosingContext } from "../../constants";\nexport class PythonParser implements AbstractParser {\n  findEnclosingContext(\n    file: string,\n    lineStart: number,\n    lineEnd: number\n  ): EnclosingContext {\n    // TODO: Implement this method for Python\n    return null;\n  }\n  dryRun(file: string): { valid: boolean; error: string } {\n    // TODO: Implement this method for Python\n    return {

In [32]:
augmented_query = "<CONTEXT>\n" + "\n\n-------\n\n".join(contexts[ : 10]) + "\n-------\n</CONTEXT>\n\n\n\nMY QUESTION:\n" + query

In [33]:
print(augmented_query)

<CONTEXT>
src\context\language\python-parser.ts
import { AbstractParser, EnclosingContext } from "../../constants";
export class PythonParser implements AbstractParser {
  findEnclosingContext(
    file: string,
    lineStart: number,
    lineEnd: number
  ): EnclosingContext {
    // TODO: Implement this method for Python
    return null;
  }
  dryRun(file: string): { valid: boolean; error: string } {
    // TODO: Implement this method for Python
    return { valid: false, error: "Not implemented yet" };
  }
}


-------

src/context/language/python-parser.ts
import { AbstractParser, EnclosingContext } from "../../constants";
export class PythonParser implements AbstractParser {
  findEnclosingContext(
    file: string,
    lineStart: number,
    lineEnd: number
  ): EnclosingContext {
    // TODO: Implement this method for Python
    return null;
  }
  dryRun(file: string): { valid: boolean; error: string } {
    // TODO: Implement this method for Python
    return { valid: false, err

In [34]:
system_prompt = f"""You are a Senior Software Engineer, specializing in TypeScript.

Answer any questions I have about the codebase, based on the code provided. Always consider all of the context provided when forming a response.
"""

llm_response = client.chat.completions.create(
    model="llama-3.1-70b-versatile",
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": augmented_query}
    ]
)

response = llm_response.choices[0].message.content

In [35]:
print(response)

Based on the provided codebase, it appears that Python files are not currently being parsed. 

In the `src/context/language/python-parser.ts` and `src/context/language/python-parser.ts` files, there are two identical classes named `PythonParser` that implement the `AbstractParser` interface. However, the `findEnclosingContext` and `dryRun` methods in these classes have TODO comments and return null or a placeholder object, indicating that they are not yet implemented.

In contrast, the `src/context/language/javascript-parser.ts` file contains a fully implemented `JavascriptParser` class that uses the Babel parser and traverser to analyze JavaScript code and find the enclosing context.

To parse Python files, you would need to implement the `findEnclosingContext` and `dryRun` methods in the `PythonParser` class, possibly using a Python parsing library such as `pygment` or `ast`. 

Here's a basic example of how you might implement the `dryRun` method for Python using the `parse` function

# Putting it all together

In [36]:
def perform_rag(query):
    raw_query_embedding = get_huggingface_embeddings(query)

    top_matches = pinecone_index.query(vector=raw_query_embedding.tolist(), top_k=5, include_metadata=True, namespace="https://github.com/CoderAgent/SecureAgent")

    # Get the list of retrieved texts
    contexts = [item['metadata']['text'] for item in top_matches['matches']]

    augmented_query = "<CONTEXT>\n" + "\n\n-------\n\n".join(contexts[ : 10]) + "\n-------\n</CONTEXT>\n\n\n\nMY QUESTION:\n" + query

    # Modify the prompt below as need to improve the response quality
    system_prompt = f"""You are a Senior Software Engineer, specializing in TypeScript.

    Answer any questions I have about the codebase, based on the code provided. Always consider all of the context provided when forming a response.
    """

    llm_response = client.chat.completions.create(
        model="llama-3.1-8b-instant",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": augmented_query}
        ]
    )

    return llm_response.choices[0].message.content

In [37]:
response = perform_rag("How is the javascript parser used?")

print(response)

The JavascriptParser class is used in several places in the codebase to parse JavaScript files and find enclosing contexts.

Based on the provided code, here are a few ways the JavascriptParser class is used:

1. **In the diffContextPerHunk function**: This function uses the JavascriptParser class to find the enclosing context of a change in a given patch. It uses the parser to determine the scope of the function where the change belongs. Specifically, it calls the findEnclosingContext method of the JavascriptParser class to find the enclosing function for each change in the patch.

   ```javascript
const largestEnclosingFunction = parser.findEnclosingContext(
  updatedFile,
  lineStart,
  lineEnd
).enclosingContext;
```

2. **In the smarterContextPatchStrategy function**: This function first tries to use a "smarter" approach to patching, which involves using the functionContextPatchStrategy function. If this fails (for example, because the parser is unavailable), it falls back to the 