In [None]:
!pip install gitpython==3.1.41
!pip install langchain-core==0.2.1
!pip install langchain-openai==0.1.7
!pip install openai==1.30.3
# !pip install faiss-cpu==1.7.3
!pip install faiss-gpu==1.7.2
!pip install aiofiles==23.1.0
!pip install python-dotenv==1.0.0

In [1]:
import os
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.storage import LocalFileStore
from langchain.embeddings import CacheBackedEmbeddings

from dotenv import load_dotenv

load_dotenv(".env")

# Set the storage path
storage_path = os.environ.get("GIT_STORAGE_PATH", "/tmp")
faiss_index_path = storage_path + "/faiss_index"
repo_path = os.environ.get('GIT_REPO_PATH')
FAISS_NO_AVX2 = os.environ.get('FAISS_NO_AVX2', 'true')
print('repo_path:', repo_path)
print('faiss_index_path:', faiss_index_path)
print('FAISS_NO_AVX2:', FAISS_NO_AVX2)

store = LocalFileStore("./cache/")
#embeddings_model = OpenAIEmbeddings()
model_name = "sentence-transformers/all-mpnet-base-v2"
underlying_embeddings_model = HuggingFaceEmbeddings(model_name=model_name)

embeddings = CacheBackedEmbeddings.from_bytes_store(
    underlying_embeddings_model, store, namespace="embeddings"
)


repo_path: /home/hchen/src/github.com/OpenDevin
faiss_index_path: /home/hchen/src/faiss/faiss_index
FAISS_NO_AVX2: true


In [2]:
import git
from typing import Iterator
from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS

class GitRepoCommitsLoader(BaseLoader):
    """
    A custom loader for git repository commits that extracts commit messages and diffs as documents.

    Attributes:
        repo_path (str): The path to the git repository.
    """

    def __init__(self, repo_path: str) -> None:
        """
        Initializes the GitRepoCommitsLoader with the specified repository path.

        Args:
            repo_path (str): The path to the git repository.
        """
        self.repo_path = repo_path

    def lazy_load(self) -> Iterator[Document]:
        """
        Lazily loads the commit messages and diffs from the git repository.

        Yields:
            Iterator[Document]: An iterator of Document objects containing commit messages and diffs.
        """
        repo = git.Repo(self.repo_path)
        all_commits = list(repo.iter_commits())
        print('commits:', len(all_commits))
        # use a subset
        commits = all_commits[10:300]
        for commit in commits:
            commit_message = commit.message
            commit_diff = commit.diff(create_patch=True)

            diff_texts = []
            for diff in commit_diff:
                diff_texts.append(diff.diff.decode('utf-8'))

            commit_diff_text = '\n'.join(diff_texts)
            content = f"Commit Message:\n{commit_message}\n\nCommit Diff:\n{commit_diff_text}"

            yield Document(page_content=content,
                           metadata={
                               "commit_hash": commit.hexsha,
                               "author": commit.author.name,
                               "date": commit.committed_datetime.isoformat()
                           })


loader = GitRepoCommitsLoader(repo_path=repo_path)
documents = list(loader.lazy_load())


In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def load_and_index_git_repository(documents):
    print(f'load and index {len(documents)} documents into vectorstore {faiss_index_path}')
    docs = [] 
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=4000, chunk_overlap=200)

    print(f"Loaded {len(documents)} documents")
    for d in documents:
        # reset the metadata
        ds = []
        ds.append(d)
        splitted_doc = text_splitter.split_documents(ds)
        pages = 0
        for s in splitted_doc:
            s.metadata['page'] = pages
            pages += 1
            s.metadata['commit_hash'] = d.metadata['commit_hash']
            s.metadata['author'] = d.metadata['author']
            s.metadata['date'] = d.metadata['date']          
            docs.append(s)
    vectorstore = FAISS.from_documents(docs, embeddings)
    vectorstore.save_local(faiss_index_path)

    return vectorstore, documents, docs

vectordb, documents, docs = load_and_index_git_repository(documents)

load and index 290 documents into vectorstore /home/hchen/src/faiss/faiss_index
Loaded 290 documents


In [5]:
def similarity_search(vectorstore, query, score=0.8):
    results = vectorstore.similarity_search_with_relevance_scores(query)
    ret = []
    for i in results:
        doc = i[0]
        relevance_score = i[1]
        # print('relevance_score:', relevance_score)
        if relevance_score > score:
            ret.append(doc)
    return ret

query = "custom git commit"
results = similarity_search(vectordb, query, score=0.2)

for result in results:
    print(result.metadata)


{'commit_hash': '8d79c3edbc3ef0c8a41e1bab823279c335037e47', 'author': 'Yizhe Zhang', 'date': '2024-06-02T06:38:09+00:00', 'page': 0}
{'commit_hash': '1bdf8752e6459987d0ed3554a8b1b4007303c760', 'author': 'Yufan Song', 'date': '2024-06-08T19:04:43+00:00', 'page': 227}
{'commit_hash': 'a97d0767e90934d0dcc82e281e8da76af220a937', 'author': 'tobitege', 'date': '2024-06-08T21:02:27+00:00', 'page': 227}
{'commit_hash': '68d9ad61cf59724c761c4f700d4b7be25690f320', 'author': 'yueqis', 'date': '2024-06-08T16:54:54+00:00', 'page': 228}


In [6]:
orig_doc = documents
orig_docs = docs
print('orig_doc:', len(orig_doc))
print('orig_docs:', len(orig_docs))

orig_doc: 290
orig_docs: 116410


In [8]:
docs = orig_docs
counter = 0
for doc in docs:
    counter += 1
    
    query = doc.page_content
    results = vectordb.similarity_search_with_score(query=query, k=10)
    
    similar_results = []
    for result in results:
        if type(result) is tuple: 
            doc = result[0]
            score = result[1]
            if score > 0.7:
                similar_results.append(doc)
    if len(similar_results) > 0:
        print(f'{counter}/{len(docs)}: {len(similar_results)} similar documents')


1/116410: 4 similar documents
518/116410: 8 similar documents
520/116410: 5 similar documents
524/116410: 7 similar documents
9982/116410: 7 similar documents
10257/116410: 7 similar documents
10533/116410: 7 similar documents
13548/116410: 9 similar documents
13550/116410: 5 similar documents
13554/116410: 7 similar documents
18677/116410: 8 similar documents
18679/116410: 5 similar documents
18683/116410: 7 similar documents


KeyboardInterrupt: 