                                                      **CODEBASE RAG**

# Installing Necessary Libraries

In [None]:
! pip install pygithub langchain langchain-community openai tiktoken pinecone-client langchain_pinecone sentence-transformers

Collecting pygithub
  Downloading PyGithub-2.5.0-py3-none-any.whl.metadata (3.9 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.8-py3-none-any.whl.metadata (2.9 kB)
Collecting tiktoken
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting pinecone-client
  Downloading pinecone_client-5.0.1-py3-none-any.whl.metadata (19 kB)
Collecting langchain_pinecone
  Downloading langchain_pinecone-0.2.0-py3-none-any.whl.metadata (1.7 kB)
Collecting pynacl>=1.4.0 (from pygithub)
  Downloading PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl.metadata (8.6 kB)
Collecting SQLAlchemy<3,>=1.4 (from langchain)
  Downloading SQLAlchemy-2.0.35-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from langchain_pinecone import PineconeVectorStore
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain_community.embeddings import HuggingFaceEmbeddings
from google.colab import userdata
from pinecone import Pinecone
import os
import tempfile
from github import Github, Repository
from git import Repo
from openai import OpenAI
from pathlib import Path
from langchain.schema import Document
from pinecone import Pinecone

  from tqdm.autonotebook import tqdm, trange


# Cloning a Github Repository

Create a function that fetches the contents of a GitHub repository by cloning it locally.

In [None]:
def clone_repository(repo_url) :
  """Clone a GitHub repository under ``/content`` and return the local path.

  The directory name is the last path segment of ``repo_url``. Trailing
  slashes and a trailing ``.git`` suffix are ignored, so
  ``https://github.com/owner/repo``, ``.../repo/`` and ``.../repo.git`` all
  map to ``/content/repo``.

  If that directory already contains a ``.git`` entry it is reused instead of
  being cloned again, so the notebook cell can be re-run.

  Args :
    repo_url: The URL of the GitHub repository.

  Returns :
    The path to the cloned repository.

  Raises :
    ValueError: If no repository name can be derived from ``repo_url``.
  """
  repo_name = repo_url.rstrip("/").split("/")[-1]
  if repo_name.endswith(".git") :
    repo_name = repo_name[:-len(".git")]
  if not repo_name :
    raise ValueError(f"Cannot derive a repository name from URL: {repo_url!r}")

  repo_path = f"/content/{repo_name}"

  # git refuses to clone into an existing non-empty directory, so a previous
  # clone is reused rather than cloned over.
  if not os.path.exists(os.path.join(repo_path, ".git")) :
    Repo.clone_from(repo_url, repo_path)

  return repo_path

In [None]:
path = clone_repository("https://github.com/CoderAgent/SecureAgent")

In [None]:
print(path)

/content/SecureAgent


In [None]:
# File extensions that are treated as source code to read and index.
SUPPORTED_EXTENSIONS = {
    '.c', '.cpp', '.h',
    '.go', '.java', '.rs', '.swift',
    '.ipynb', '.py',
    '.js', '.jsx', '.ts', '.tsx', '.vue',
}

# Directory names to leave out when collecting files (dependencies, build
# output, VCS and editor metadata).
IGNORED_DIRS = {
    '.git', '.next', '.vscode', '__pycache__',
    'build', 'dist',
    'env', 'venv',
    'node_modules', 'vendor',
}

# Getting the content of the files

Create the functions that read the contents of the files we will work with later.

In [None]:
def get_file_content(file_path, repo_path) :
  """
  Read a single file as UTF-8 text and label it with its repo-relative path

  Args :
    file_path (str) : Path to the file
    repo_path (str) : Path to the repository root; the returned name is
      ``file_path`` made relative to it

  Returns :
    Optional[Dict[str,str]] : ``{"name": <relative path>, "content": <text>}``,
    or None (after printing the error) when the file cannot be read or
    decoded
  """

  try :
    # Hold the file open only for the read itself
    with open(file_path, 'r', encoding='utf-8') as file :
      content = file.read()

    # Get relative path from repo root
    rel_path = os.path.relpath(file_path, repo_path)
  except (OSError, ValueError) as e :
    # OSError covers missing/unreadable files; ValueError covers
    # UnicodeDecodeError (content is not UTF-8) and relpath failures
    print(f"Error reading file {file_path}: {str(e)}")
    return None

  return {
      "name": rel_path,
      "content": content
  }


def get_main_files_content(repo_path : str ) :
  """
  Get content of supported code files from the local repository

  Walks ``repo_path`` top-down, skips every directory whose name is in
  IGNORED_DIRS (without descending into it), and reads each file whose
  extension is in SUPPORTED_EXTENSIONS via get_file_content.

  Args :
    repo_path (str) : Path to the local repository

  Returns :
    List of dictionaries containing file names and contents
  """
  files_content = []
  current_dir = repo_path  # bound up front so the error message below is always valid

  try :

    for current_dir, dirs, files in os.walk(repo_path) :
      # Prune ignored directories in place so os.walk never enters them.
      # Comparing exact directory names (not substrings of the full path)
      # keeps folders such as "environment" or "builder" from being dropped
      # just because they contain "env" or "build".
      dirs[:] = [d for d in dirs if d not in IGNORED_DIRS]

      # Process each file in current directory
      for file in files :
        if os.path.splitext(file)[1] in SUPPORTED_EXTENSIONS :
          file_content = get_file_content(os.path.join(current_dir, file), repo_path)
          if file_content :
            files_content.append(file_content)
  except Exception as e :
    # Best effort: report the failure and return what was collected so far
    print(f"Error processing directory {current_dir}: {str(e)}")

  return files_content

In [None]:
file_content = get_main_files_content(path)

In [None]:
file_content

[{'name': 'src/prompts.ts',
  'content': 'import { encode, encodeChat } from "gpt-tokenizer";\nimport type { ChatCompletionMessageParam } from "groq-sdk/resources/chat/completions";\nimport type { PRFile } from "./constants";\nimport {\n  rawPatchStrategy,\n  smarterContextPatchStrategy,\n} from "./context/review";\nimport { GROQ_MODEL, type GroqChatModel } from "./llms/groq";\n\nconst ModelsToTokenLimits: Record<GroqChatModel, number> = {\n  "mixtral-8x7b-32768": 32768,\n  "gemma-7b-it": 32768,\n  "llama3-70b-8192": 8192,\n  "llama3-8b-8192": 8192,\n};\n\nexport const REVIEW_DIFF_PROMPT = `You are PR-Reviewer, a language model designed to review git pull requests.\nYour task is to provide constructive and concise feedback for the PR, and also provide meaningful code suggestions.\n\nExample PR Diff input:\n\'\n## src/file1.py\n\n@@ -12,5 +12,5 @@ def func1():\ncode line that already existed in the file...\ncode line that already existed in the file....\n-code line that was removed in t

# Embeddings

Create the embedding function using a Hugging Face sentence-transformers model.

In [None]:
# SentenceTransformer instances keyed by model name, so each model is
# constructed once per session instead of on every embedding call.
_EMBEDDING_MODELS = {}


def get_huggingface_embeddings(text, model_name="sentence-transformers/all-mpnet-base-v2") :
  """Embed ``text`` with a sentence-transformers model.

  Args :
    text : Input handed to ``SentenceTransformer.encode``
      (presumably a string or a list of strings; confirm against callers).
    model_name (str) : Name of the sentence-transformers model to use.

  Returns :
    The result of ``SentenceTransformer.encode(text)``.
  """
  model = _EMBEDDING_MODELS.get(model_name)
  if model is None :
    model = SentenceTransformer(model_name)
    _EMBEDDING_MODELS[model_name] = model
  return model.encode(text)

Testing my embedding function

In [None]:
text = "My name is sheick"

embedding = get_huggingface_embeddings(text)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
embedding

array([ 6.55414835e-02,  5.06030731e-02,  1.46450056e-02, -3.81328911e-02,
        4.38303947e-02,  1.68378581e-03,  1.87954046e-02,  9.01181158e-03,
        3.00738346e-02, -1.37723668e-03, -1.39317950e-02, -2.32384671e-02,
        1.67793427e-02, -1.04574403e-02,  8.50111246e-03, -6.32634014e-02,
       -3.44188116e-03, -3.49146128e-02,  2.58255396e-02, -6.18998939e-03,
       -2.76029985e-02,  9.71040863e-04,  6.43676985e-03,  3.91971506e-02,
        3.24107036e-02, -2.57421229e-02,  5.26524559e-02, -2.21876823e-03,
        5.37676178e-03,  5.76634966e-02, -2.09077783e-02, -2.62748562e-02,
       -4.20101359e-02,  8.16939250e-02,  1.91147410e-06, -1.34888981e-02,
        1.57199483e-02, -1.08866645e-02, -9.99653637e-02, -3.81573034e-03,
       -5.00430353e-02, -6.01682924e-02, -4.49774079e-02, -4.37611230e-02,
       -3.14336061e-03,  1.99228954e-02,  7.06105381e-02,  1.71465743e-02,
       -1.29203610e-02, -3.41897504e-03,  1.80938356e-02, -2.51713265e-02,
       -1.31447911e-02, -

# Setting up Pinecone

We set up Pinecone because we will use it to store our embeddings.

In [None]:
# Read the Pinecone API key once from the Colab secrets and export it as an
# environment variable (the PineconeVectorStore calls in this notebook are
# given no explicit key, so presumably they pick it up from the environment).
pinecone_api_key = userdata.get("PINECONE_API_KEY")
os.environ["PINECONE_API_KEY"] = pinecone_api_key

# Initializing Pinecone with the key fetched above
pc = Pinecone(api_key=pinecone_api_key)

# Connecting to our Pinecone index
pinecone_index = pc.Index("codebase-rag")

In [None]:
# Vector store bound to the "codebase-rag" index, embedding with the
# HuggingFaceEmbeddings defaults.
vectorstore = PineconeVectorStore(
    embedding=HuggingFaceEmbeddings(),
    index_name='codebase-rag',
)

  vectorstore = PineconeVectorStore(index_name='codebase-rag', embedding=HuggingFaceEmbeddings())
  vectorstore = PineconeVectorStore(index_name='codebase-rag', embedding=HuggingFaceEmbeddings())


Preparing Documents for the Vectorstore in Pinecone

In [None]:
# One Document per collected file: the relative path is prepended to the file
# text and also kept as the "source" metadata.
documents = [
    Document(
        page_content=f"{entry['name']}\n{entry['content']}",
        metadata={"source": entry['name']},
    )
    for entry in file_content
]


# Embed the documents and store them in the Pinecone index, using the
# repository URL as the namespace.
vectorstore = PineconeVectorStore.from_documents(
    documents=documents,
    embedding=HuggingFaceEmbeddings(),
    index_name="codebase-rag",
    namespace="https://github.com/CoderAgent/SecureAgent",
)

  embedding=HuggingFaceEmbeddings(),


# Performing RAG

In [None]:
# The OpenAI client is pointed at Groq's endpoint and authenticated with the
# GROQ_API_KEY Colab secret.
client = OpenAI(
    api_key=userdata.get("GROQ_API_KEY"),
    base_url="https://api.groq.com/openai/v1",
)


In [None]:
query = "How are python files parsed ?"

In [None]:
raw_query_embedding = get_huggingface_embeddings(query)

raw_query_embedding

array([ 5.29357232e-02, -6.24647178e-02, -2.87437718e-02,  1.83179416e-02,
       -4.33840672e-04,  4.03239094e-02, -7.76652806e-03, -2.74394872e-03,
        2.53445264e-02, -8.10819939e-02, -8.44583288e-03, -6.59269514e-03,
        4.16187495e-02,  3.98627296e-02,  2.82911733e-02,  2.84344628e-02,
        2.65303329e-02, -2.60126498e-02,  4.16299142e-02,  3.92820686e-02,
       -5.15580364e-02,  5.83349541e-02,  5.88829117e-03,  3.46064568e-02,
       -2.46872660e-03,  2.72809248e-02,  1.07212560e-02,  4.55760621e-02,
       -1.69188846e-02, -4.85301390e-02, -3.02424375e-02, -3.29698175e-02,
        2.46010050e-02,  3.23601812e-02,  1.16030503e-06,  9.71379410e-03,
       -3.70800160e-02,  1.84200946e-02, -1.39834182e-02,  4.25723344e-02,
        6.78140819e-02, -6.66246563e-02,  2.11651716e-02, -1.11712900e-03,
       -1.80115458e-02, -7.90139660e-02,  5.93152717e-02, -5.23733571e-02,
        5.63013554e-02,  4.31280173e-02,  7.77091645e-03, -2.30586994e-02,
       -2.94572674e-02,  

Perform a query against the Pinecone index to find the documents most relevant to the given query.

In [None]:
# Nearest-neighbour lookup in the repository's namespace.
# Feel free to change the "top_k" parameter to be a higher or lower number.
top_matches = pinecone_index.query(
    vector=raw_query_embedding.tolist(),
    top_k=5,
    include_metadata=True,
    namespace="https://github.com/CoderAgent/SecureAgent",
)

In [None]:
top_matches

{'matches': [{'id': 'a064e527-61da-42d9-89ce-6dec22f54548',
              'metadata': {'source': 'src/context/language/python-parser.ts',
                           'text': 'src/context/language/python-parser.ts\n'
                                   'import { AbstractParser, EnclosingContext '
                                   '} from "../../constants";\n'
                                   'export class PythonParser implements '
                                   'AbstractParser {\n'
                                   '  findEnclosingContext(\n'
                                   '    file: string,\n'
                                   '    lineStart: number,\n'
                                   '    lineEnd: number\n'
                                   '  ): EnclosingContext {\n'
                                   '    // TODO: Implement this method for '
                                   'Python\n'
                                   '    return null;\n'
                          

In [None]:
contexts = [item['metadata']['text'] for item in top_matches['matches']]

In [None]:
contexts

['src/context/language/python-parser.ts\nimport { AbstractParser, EnclosingContext } from "../../constants";\nexport class PythonParser implements AbstractParser {\n  findEnclosingContext(\n    file: string,\n    lineStart: number,\n    lineEnd: number\n  ): EnclosingContext {\n    // TODO: Implement this method for Python\n    return null;\n  }\n  dryRun(file: string): { valid: boolean; error: string } {\n    // TODO: Implement this method for Python\n    return { valid: false, error: "Not implemented yet" };\n  }\n}\n',
 'src/context/language/javascript-parser.ts\nimport { AbstractParser, EnclosingContext } from "../../constants";\nimport * as parser from "@babel/parser";\nimport traverse, { NodePath, Node } from "@babel/traverse";\n\nconst processNode = (\n  path: NodePath<Node>,\n  lineStart: number,\n  lineEnd: number,\n  largestSize: number,\n  largestEnclosingContext: Node | null\n) => {\n  const { start, end } = path.node.loc;\n  if (start.line <= lineStart && lineEnd <= end.li

Setting up and Sending a Prompt to the Large Language Model (LLM)

In [None]:
# Wrap (at most ten of) the retrieved snippets in a <CONTEXT> block, separated
# by dashed rules, and append the user's question after it.
augmented_query = (
    "<CONTEXT>\n"
    + "\n\n-------\n\n".join(contexts[:10])
    + "\n-------\n</CONTEXT>\n\n\n\nMY QUESTION:\n"
    + query
)

In [None]:
print(augmented_query)

<CONTEXT>
src/context/language/python-parser.ts
import { AbstractParser, EnclosingContext } from "../../constants";
export class PythonParser implements AbstractParser {
  findEnclosingContext(
    file: string,
    lineStart: number,
    lineEnd: number
  ): EnclosingContext {
    // TODO: Implement this method for Python
    return null;
  }
  dryRun(file: string): { valid: boolean; error: string } {
    // TODO: Implement this method for Python
    return { valid: false, error: "Not implemented yet" };
  }
}


-------

src/context/language/javascript-parser.ts
import { AbstractParser, EnclosingContext } from "../../constants";
import * as parser from "@babel/parser";
import traverse, { NodePath, Node } from "@babel/traverse";

const processNode = (
  path: NodePath<Node>,
  lineStart: number,
  lineEnd: number,
  largestSize: number,
  largestEnclosingContext: Node | null
) => {
  const { start, end } = path.node.loc;
  if (start.line <= lineStart && lineEnd <= end.line) {
    const

In [None]:
# System message: persona and ground rules for the model. Nothing is
# interpolated, so a plain string literal is enough.
system_prompt = """You are a Senior Software Engineer, specializing in TypeScript.

Answer any questions I have about the codebase, based on the code provided. Always consider all of the context provided when forming a response.
"""

# Send the system prompt plus the context-augmented question to the chat model.
llm_response = client.chat.completions.create(
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": augmented_query},
    ],
    model="llama-3.1-70b-versatile",
)

# Keep only the text of the first returned choice.
response = llm_response.choices[0].message.content

In [None]:
response

"Python files are currently not properly parsed in this codebase. \n\nThe `src/context/language/python-parser.ts` file is intended for Python parsing, but it's currently not implemented. The `findEnclosingContext` method is supposed to find the enclosing context for a given line in a Python file, but it returns `null`. The `dryRun` method is also not implemented and simply returns a hardcoded error message.\n\nTo properly parse Python files, you would need to use a Python parser library like `ast` or `pdb` in Python, or a library like `python-parser` in TypeScript. However, in this codebase, the implementation for Python parsing is missing.\n\nIn contrast, JavaScript files are properly parsed using the `@babel/parser` and `@babel/traverse` libraries."

# Putting it all together

In [None]:
def perform_rag(query, *, top_k=5, namespace="https://github.com/CoderAgent/SecureAgent", model="llama-3.1-8b-instant"):
    """Answer a question about the indexed codebase using retrieval-augmented generation.

    The query is embedded with ``get_huggingface_embeddings``, the ``top_k``
    closest vectors are fetched from ``pinecone_index`` in ``namespace``, their
    stored ``text`` metadata is wrapped in a <CONTEXT> block, and that block
    plus the question is sent to the chat model through ``client``.

    Args:
        query (str): The question to answer.
        top_k (int): Number of matches to retrieve and include as context.
        namespace (str): Pinecone namespace to search.
        model (str): Chat model name passed to ``client.chat.completions.create``.

    Returns:
        The message content of the first choice in the chat completion.
    """
    raw_query_embedding = get_huggingface_embeddings(query)

    top_matches = pinecone_index.query(
        vector=raw_query_embedding.tolist(),
        top_k=top_k,
        include_metadata=True,
        namespace=namespace,
    )

    # Get the list of retrieved texts
    contexts = [item['metadata']['text'] for item in top_matches['matches']]

    augmented_query = (
        "<CONTEXT>\n"
        + "\n\n-------\n\n".join(contexts)
        + "\n-------\n</CONTEXT>\n\n\n\nMY QUESTION:\n"
        + query
    )

    # Modify the prompt below as needed to improve the response quality.
    # NOTE: the continuation lines of this literal carry the function's
    # indentation into the prompt text; it is kept unchanged on purpose.
    system_prompt = """You are a Senior Software Engineer, specializing in TypeScript.

    Answer any questions I have about the codebase, based on the code provided. Always consider all of the context provided when forming a response.
    """

    llm_response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": augmented_query}
        ]
    )

    return llm_response.choices[0].message.content

In [None]:
response = perform_rag("How is the javascript parser used?")

print(response)

The `JavascriptParser` is used in the following places in the codebase:

1. `src/context/review.ts`: This is the main entry point of the review process. The `JavascriptParser` is used to parse the file contents of a Pull Request (PR). When the parser gets called, it is passed the file contents, a line start number, and a line end number. The parser uses these values to find the enclosing JavaScript function in the file between the given line numbers. The `diffContextPerHunk` function, which is used to compute the context for the hunks of a patch, also uses the `findEnclosingContext` method of the `JavascriptParser` to get the enclosing context for each hunk. If the parser is successful in finding an enclosing function, the context of the function is used to determine how to format the patch. 

2. `src/context/review.ts` (again): In the `smarterContextPatchStrategy` function, if the file has a known extension (e.g., .js, .ts, etc.), the `getParserForExtension` function is used to retrie