<a href="https://colab.research.google.com/github/ravikumar-intelsat/AI-Langchain/blob/main/Github%20clone%20with%20python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install langchain-community

In [None]:
import os
import tempfile
from git import Repo
from langchain.text_splitter import RecursiveCharacterTextSplitter

def clone_repo_to_memory(repo_url):
    """
    Clone a Git repository into a fresh temporary directory.

    Despite the function name, the clone is written to disk: a new directory
    is created with ``tempfile.mkdtemp()`` and the repository is cloned into
    it with ``Repo.clone_from``.

    Args:
        repo_url: Repository URL handed to ``Repo.clone_from``.

    Returns:
        Path of the temporary directory that holds the clone. The directory
        is not removed by this function; the caller is responsible for
        deleting it when done.

    Raises:
        Any exception raised by ``Repo.clone_from``. The temporary directory
        is removed before the exception propagates.
    """
    import shutil  # local import: only needed on the failure path

    temp_dir = tempfile.mkdtemp()
    try:
        Repo.clone_from(repo_url, temp_dir)
    except BaseException:
        # Do not leave an orphaned (possibly half-populated) directory behind
        # when the clone fails or is interrupted; nobody would hold its path.
        shutil.rmtree(temp_dir, ignore_errors=True)
        raise
    return temp_dir

def get_files_recursively(folder_path, file_extensions=None):
    """
    Retrieve all files from a folder recursively, filtering by extensions.

    Args:
        folder_path: Directory to traverse with ``os.walk``.
        file_extensions: Filename suffixes to keep, e.g. ``[".py", ".js"]``.
            A single string such as ``".py"`` is treated as one suffix.
            ``None`` or an empty collection disables filtering, so every
            file is returned.

    Returns:
        List of file paths (each ``root`` joined with the file name), in the
        order ``os.walk`` yields them.
    """
    if isinstance(file_extensions, str):
        # tuple(".py") would be ('.', 'p', 'y') and match unrelated names,
        # so wrap a lone string instead of iterating over its characters.
        file_extensions = (file_extensions,)
    # Build the suffix tuple once rather than once per visited file; this
    # also keeps one-shot iterables from being exhausted after the first file.
    suffixes = tuple(file_extensions) if file_extensions else ()

    file_list = []
    for root, _, files in os.walk(folder_path):
        file_list.extend(
            os.path.join(root, name)
            for name in files
            if not suffixes or name.endswith(suffixes)
        )
    return file_list

def split_file_content(file_path, chunk_size=500, chunk_overlap=50):
    """
    Read a UTF-8 text file and split its text into chunks.

    Args:
        file_path: Path of the file to read; it is decoded as UTF-8.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_text`` returns for
        the file's full text.
    """
    with open(file_path, encoding='utf-8') as source:
        text = source.read()

    # Separators are listed from coarsest to finest: blank line, newline,
    # then a single space.
    return RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", " "],
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).split_text(text)

def process_repo(repo_url, file_extensions=None):
    """
    Clone a repo, split each matching file into chunks, and print them.

    The repository is cloned with ``clone_repo_to_memory``, its files are
    listed with ``get_files_recursively`` and each file is chunked with
    ``split_file_content`` using that function's default chunk settings.
    Every file path and every chunk is printed to stdout.

    Args:
        repo_url: Repository URL passed to ``clone_repo_to_memory``.
        file_extensions: Optional filename suffixes passed through to
            ``get_files_recursively`` to restrict which files are processed.

    Returns:
        None. The temporary clone is deleted before returning, including
        when processing a file raises.
    """
    import shutil  # local import: needed only for the cleanup below

    repo_path = clone_repo_to_memory(repo_url)
    try:
        files = get_files_recursively(repo_path, file_extensions)
        for file in files:
            print(f"Processing: {file}")
            chunks = split_file_content(file)
            for i, chunk in enumerate(chunks, start=1):
                print(f"Chunk {i}:\n{chunk}\n{'-'*20}")
    finally:
        # The clone path is never handed back to the caller, so this is the
        # only place it can be removed. Deletion is best-effort so a cleanup
        # failure cannot mask an exception raised while processing.
        shutil.rmtree(repo_path, ignore_errors=True)

# Example usage: clone the repository below and print the chunks of its files.
repository_url = "https://github.com/ravikumar-intelsat/AI-Langchain.git"
# Only files whose names end with one of these suffixes are processed;
# adjust the list to match the content of the repository being cloned.
file_extensions = [".ipynb", ".js"]
process_repo(repo_url=repository_url, file_extensions=file_extensions)
