<a href="https://colab.research.google.com/github/ravikumar-intelsat/AI-Langchain/blob/main/chunksofrepo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import sys

# Authenticate only when running inside Google Colab. The check looks for the
# "google.colab" module in sys.modules, so outside Colab this cell is a no-op
# and the Colab-only import below is never attempted.
if "google.colab" in sys.modules:
    from google.colab import auth

    # NOTE(review): presumably this triggers the interactive Google sign-in that
    # the Vertex AI calls later in the notebook rely on -- confirm.
    auth.authenticate_user()

In [None]:
!pip install -qU langchain-google-vertexai
!pip install google-cloud-aiplatform langchain
!pip install faiss-cpu
!pip install  langchain-community

In [None]:
!pip install langchain-community

In [None]:
import os
import shutil
import tempfile

from git import Repo
from google.cloud import aiplatform
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_google_vertexai import VertexAIEmbeddings


def clone_repo_to_memory(repo_url):
    """
    Clone a Git repository to a temporary directory.

    Despite the name, the clone is written to disk in a fresh directory
    created with ``tempfile.mkdtemp()``; nothing is held in memory.

    Args:
        repo_url: URL (or local path) passed to ``Repo.clone_from``.

    Returns:
        Path of the temporary directory containing the clone. The caller
        owns this directory and is responsible for deleting it.

    Raises:
        Whatever ``Repo.clone_from`` raises; the temporary directory is
        removed before the exception propagates.
    """
    temp_dir = tempfile.mkdtemp()
    try:
        Repo.clone_from(repo_url, temp_dir)
    except BaseException:
        # mkdtemp() never cleans up after itself, and on failure the caller
        # never learns the path. BaseException also covers an interrupted
        # (Ctrl-C / notebook "stop") clone, which would otherwise leak a
        # partial checkout.
        shutil.rmtree(temp_dir, ignore_errors=True)
        raise
    return temp_dir

def get_files_recursively(folder_path, file_extensions=None):
    """
    Retrieve all files from a folder recursively, filtering by extensions.

    Args:
        folder_path: Root directory to walk with ``os.walk``.
        file_extensions: Optional suffix, or iterable of suffixes, that a
            file name must end with (e.g. ``[".md", ".ts"]``). A single
            string is treated as one suffix rather than as a sequence of
            characters. ``None`` or an empty collection disables filtering.

    Returns:
        A list of file paths (each directory yielded by ``os.walk`` joined
        with the file name), in the order ``os.walk`` produces them.
    """
    # Normalise once, outside the walk: str.endswith needs a tuple, a bare
    # string must not be exploded into single characters, and a one-shot
    # iterable (generator) can only be consumed a single time.
    if isinstance(file_extensions, str):
        suffixes = (file_extensions,)
    else:
        suffixes = tuple(file_extensions or ())

    file_list = []
    for root, _, names in os.walk(folder_path):
        file_list.extend(
            os.path.join(root, name)
            for name in names
            if not suffixes or name.endswith(suffixes)
        )
    return file_list

def split_file_content(file_path, chunk_size=500, chunk_overlap=50):
    """
    Split file content using RecursiveCharacterTextSplitter.

    Args:
        file_path: Path of the text file to read.
        chunk_size: ``chunk_size`` forwarded to the splitter.
        chunk_overlap: ``chunk_overlap`` forwarded to the splitter.

    Returns:
        The documents produced by ``splitter.create_documents`` for the
        file's content.
    """
    # errors='replace': a cloned repository routinely contains files that are
    # not valid UTF-8 (legacy encodings, binary blobs with a text extension).
    # Strict decoding would raise UnicodeDecodeError and abort the whole run;
    # replacing the undecodable bytes keeps the rest of the file usable.
    with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
        content = f.read()

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", " "]  # Logical splitting
    )
    return splitter.create_documents([content])

def process_repo(repo_url, file_extensions=None):
    """
    Clone a repo, split file contents recursively.

    The repository is cloned into a temporary directory, every file matching
    ``file_extensions`` is split into chunks, and the temporary clone is
    deleted again before returning.

    Args:
        repo_url: Repository URL handed to ``clone_repo_to_memory``.
        file_extensions: Optional extension filter handed to
            ``get_files_recursively``; ``None`` means every file.

    Returns:
        A flat list with the chunks of all processed files.
    """
    repo_path = clone_repo_to_memory(repo_url)
    try:
        # Git's own metadata directory is not repository content; skipping it
        # matters mostly when no extension filter is given.
        git_dir_prefix = os.path.join(repo_path, ".git") + os.sep

        all_chunks = []
        for file in get_files_recursively(repo_path, file_extensions):
            if file.startswith(git_dir_prefix):
                continue
            all_chunks.extend(split_file_content(file))
        return all_chunks
    finally:
        # The clone lives in a mkdtemp() directory that nothing else removes;
        # without this every call leaves a full checkout behind on disk.
        shutil.rmtree(repo_path, ignore_errors=True)

# Example Usage
# Initialize Google Vertex AI
# aiplatform.init() only accepts keyword arguments, so project and location
# must be passed by name; passing them positionally raises TypeError.
# NOTE(review): these environment variable names contain hyphens, which most
# shells refuse to export -- they presumably have to be set from Python or the
# notebook itself. Confirm how they are provided.
aiplatform.init(
    project=os.getenv('VERTEXAI-PROJECT-ID'),
    location=os.getenv('VERTEXAI-LOCATION'),
)
embedding = VertexAIEmbeddings(model_name="text-embedding-004")


repository_url = "https://github.com/chakra-ui/chakra-ui"
file_extensions = [".ipynb", ".js", ".ts", ".tsx", ".md"]  # Modify extensions based on your repo's content
documents = process_repo(repository_url, file_extensions)
# Store in Vector Database (e.g., FAISS)
vector_db = FAISS.from_documents(documents, embedding)
# k=1 asks the vector store for the single closest chunk only.
query_result = vector_db.similarity_search("How to update to Chakra UI V3? elobrate in 1000 words", k=1)
print("Search Result:", query_result)

