In [1]:
# %pip (rather than !pip) installs into the environment of the running kernel.
# httpx and python-dotenv are imported by cells in this notebook, so they are
# installed explicitly instead of relying on transitive dependencies.
%pip -q install \
  notebook \
  aiohttp \
  httpx \
  python-dotenv \
  faiss-cpu \
  "torch>=2.2,<3.0" \
  sentence-transformers \
  tree_sitter_python \
  tree_sitter \
  google.genai


In [2]:
import httpx
import asyncio
from pathlib import Path


async def fetch_github_repo_content(url: str) -> dict:
    """Request a GitHub REST API URL and return the decoded JSON body.

    The request asks for the ``application/vnd.github.object`` media type and
    pins the API version through the ``X-GitHub-Api-Version`` header.

    Args:
        url: Full GitHub REST API URL to fetch.

    Returns:
        The parsed JSON response.

    Raises:
        httpx.HTTPStatusError: If the server answers with an error status.
    """
    headers = {
        "Accept": "application/vnd.github.object",
        "X-GitHub-Api-Version": "2022-11-28"
    }

    async with httpx.AsyncClient() as client:
        response = await client.get(url, headers=headers)
        # Surface HTTP errors (rate limiting, 404, ...) instead of trying to
        # parse an error payload as a directory listing.
        response.raise_for_status()
        return response.json()

async def fetch_file_content_from_download_url(download_url: str, client: httpx.AsyncClient, semaphore: asyncio.Semaphore) -> str:
    """Download one file and return its body as text.

    Args:
        download_url: URL to fetch.
        client: Shared HTTP client used to issue the request.
        semaphore: Held for the duration of the request so the caller can
            bound how many downloads run concurrently.

    Returns:
        The response body decoded as text.

    Raises:
        httpx.HTTPStatusError: If the server answers with an error status.
    """
    async with semaphore:
        response = await client.get(download_url)
        response.raise_for_status()
        return response.text

async def fetch_list_of_file_name_content_tuples(github_repo_content: dict, max_concurrent: int = 3):
    """Download every ``.py`` file listed in a GitHub directory listing.

    Args:
        github_repo_content: Directory object whose ``'entries'`` list holds
            dicts with a ``'name'`` and a ``'download_url'``.
        max_concurrent: Maximum number of downloads in flight at once.

    Returns:
        A list of ``(file_name, file_text)`` tuples in listing order.
    """
    # Entries without a usable download URL cannot be fetched; skip them
    # rather than handing ``None`` to the HTTP client.
    python_files = [
        item for item in github_repo_content['entries']
        if item['name'].endswith('.py') and item.get('download_url')
    ]
    semaphore = asyncio.Semaphore(max_concurrent)

    async with httpx.AsyncClient() as client:
        # gather() preserves argument order, so results line up with
        # python_files even though downloads complete out of order.
        contents = await asyncio.gather(*(
            fetch_file_content_from_download_url(pf['download_url'], client, semaphore)
            for pf in python_files
        ))

    return [(pf['name'], text) for pf, text in zip(python_files, contents)]


def write_python_files(relative_path: str, name_content_tuples: list) -> None:
    """Write each ``(name, content)`` pair to a file under ``relative_path``.

    The target directory is resolved against the current working directory
    and created (including parents) when it does not exist yet. Files are
    written as UTF-8 and existing files with the same name are overwritten.

    Args:
        relative_path: Directory to write into, relative to the CWD.
        name_content_tuples: Iterable of ``(file_name, file_text)`` pairs.
    """
    target_dir = Path.cwd().joinpath(relative_path)
    target_dir.mkdir(parents=True, exist_ok=True)
    for file_name, source_text in name_content_tuples:
        target_dir.joinpath(file_name).write_text(source_text, encoding='utf-8')


async def fetch_urls_and_save_in_dirs():
    """Download the Python files of each configured GitHub directory.

    Every source is processed in turn: its directory listing is fetched, the
    listed ``.py`` files are downloaded, and the results are written to a
    local directory named after the source.
    """
    sources = [
        ('neetcode', "https://api.github.com/repos/neetcode-gh/leetcode/contents/python?ref=main"),
        ('sorts', "https://api.github.com/repos/TheAlgorithms/Python/contents/sorts?ref=master"),
    ]

    for dir_name, url in sources:
        listing = await fetch_github_repo_content(url)
        downloaded = await fetch_list_of_file_name_content_tuples(listing)
        write_python_files(dir_name, downloaded)

# Top-level ``await`` is a syntax error in a plain Python script; this line
# relies on the notebook running the cell inside an event loop.
await fetch_urls_and_save_in_dirs()


In [3]:
import tree_sitter_python as tspython
from tree_sitter import Language, Parser
from pathlib import Path
# Wrap the grammar handle exposed by tree_sitter_python in a Language object.
PY_LANGUAGE = Language(tspython.language())
# Module-level parser shared by chunk_python_code_in_functions_and_classes.
parser = Parser(PY_LANGUAGE)


def chunk_python_code_in_functions_and_classes(code:str):
    """Split Python source into its top-level function and class definitions.

    Only direct children of the module node are considered, so nested
    functions and methods stay inside their enclosing chunk. Decorated
    definitions are included together with their decorators.

    Args:
        code: Python source text.

    Returns:
        A list with the source of each matching top-level node, in source
        order. Each item is the node's ``text``, i.e. UTF-8 encoded ``bytes``
        rather than ``str``.
    """
    tree = parser.parse(code.encode("utf8"))
    # In the tree-sitter Python grammar a function or class carrying
    # decorators is wrapped in a `decorated_definition` node; without it such
    # definitions would be silently dropped.
    wanted_types = ('function_definition', 'class_definition', 'decorated_definition')
    return [node.text for node in tree.root_node.children if node.type in wanted_types]


def get_chunks_of_fuctions_and_classes_from_dir(dir_path):
    """Collect function/class chunks from every ``.py`` file under a directory.

    The directory is walked recursively. Each file is read as UTF-8 with
    undecodable bytes ignored, then split by
    ``chunk_python_code_in_functions_and_classes``.

    Args:
        dir_path: Root directory to scan.

    Returns:
        A flat list of chunks from all files, in traversal order.
    """
    collected = []
    python_files = (
        entry for entry in Path(dir_path).rglob("*")
        if entry.is_file() and entry.suffix == '.py'
    )
    for source_file in python_files:
        source = source_file.read_text(encoding="utf-8", errors="ignore")
        collected += chunk_python_code_in_functions_and_classes(source)
    return collected


In [4]:

# The Gemini SDK is needed in every environment, so import it once outside
# the Colab-vs-local fallback below.
from google import genai
from google.genai import types

try:
    # Running in Google Colab: read the key from the Colab secrets store.
    from google.colab import userdata
    google_key = userdata.get('GEMINI_API_KEY')
except Exception:
    # Not in Colab, or the secret could not be read: fall back to the
    # process environment, optionally populated from a .env file.
    # `Exception` (not a bare except) keeps KeyboardInterrupt/SystemExit
    # from being swallowed.
    import os
    from dotenv import load_dotenv
    load_dotenv()
    google_key = os.getenv('GEMINI_API_KEY')

# Shared Gemini client used by generate_embeddings.
client = genai.Client(api_key=google_key)

In [5]:
def generate_embeddings(chunks:list[str], batch_size: int = 100, task_type: str = "RETRIEVAL_DOCUMENT"):
    """Embed text chunks with ``gemini-embedding-001``.

    Args:
        chunks: Texts to embed. ``bytes`` items (as produced by the
            tree-sitter chunker) are decoded as UTF-8 first. A single
            ``str``/``bytes`` value is treated as a one-item list.
        batch_size: Maximum number of texts sent per API request.
            NOTE(review): the default of 100 assumes the API caps inputs per
            embedding request at that value — confirm against current docs.
        task_type: Embedding task type passed to ``EmbedContentConfig``.

    Returns:
        A ``float32`` NumPy array with one row per chunk, in input order.

    Raises:
        ValueError: If ``chunks`` is empty or ``batch_size`` is not positive.
    """
    # Imported locally: the notebook only imports numpy in a later cell, so a
    # module-level `np` is not guaranteed to exist when this is first called.
    import numpy as np

    if isinstance(chunks, (str, bytes)):
        chunks = [chunks]
    if len(chunks) == 0:
        raise ValueError("chunks must contain at least one item to embed")
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    texts = [c.decode("utf-8") if isinstance(c, bytes) else c for c in chunks]

    vectors = []
    for start in range(0, len(texts), batch_size):
        result = client.models.embed_content(
            model="gemini-embedding-001",
            contents=texts[start:start + batch_size],
            config=types.EmbedContentConfig(task_type=task_type),
        )
        vectors.extend(embedding.values for embedding in result.embeddings)

    # FAISS operates on float32 matrices.
    return np.array(vectors, dtype='float32')


In [None]:
# Build the corpus from the directories written by fetch_urls_and_save_in_dirs
# and embed it. `chunks` is a notebook-level name: later cells pickle it and
# index into it with the ids returned by FAISS.
# NOTE(review): which directories belong in the corpus is an assumption (both
# downloaded directories are used here) — confirm.
chunks = []
for source_dir in ('neetcode', 'sorts'):
    chunks.extend(get_chunks_of_fuctions_and_classes_from_dir(source_dir))
embeddings = generate_embeddings(chunks)


In [6]:
import numpy as np, faiss
# Build an HNSW index over the chunk embeddings, scored by inner product.
dim = embeddings.shape[1]
# Normalizes the rows of `embeddings` in place; with unit-length vectors the
# inner product equals cosine similarity.
faiss.normalize_L2(embeddings)
metric = faiss.METRIC_INNER_PRODUCT
M = 32
base = faiss.IndexHNSWFlat(dim, M, metric)
base.hnsw.efConstruction = 200
base.hnsw.efSearch = 64
# The ID map lets search results carry explicit ids; here they are simply the
# row positions, which double as indices into `chunks`.
index = faiss.IndexIDMap2(base)
# Explicit int64: FAISS ids are 64-bit, while NumPy's default integer type
# depends on the platform.
ids = np.arange(embeddings.shape[0], dtype=np.int64)
index.add_with_ids(embeddings, ids)

NameError: name 'embeddings' is not defined

In [None]:
# Persist the embeddings, the FAISS index and the chunks so a later session
# can reload them instead of recomputing.
import pickle

# Output directory (created on first use).
save_dir = Path('saved_data')
save_dir.mkdir(exist_ok=True)

# Embedding matrix, stored in NumPy's .npy format.
np.save(save_dir / 'embeddings.npy', embeddings)
print(f"Saved embeddings to {save_dir / 'embeddings.npy'}")

# FAISS index; the path is converted to str before being handed to FAISS.
faiss.write_index(index, str(save_dir / 'faiss_index.index'))
print(f"Saved FAISS index to {save_dir / 'faiss_index.index'}")

# Code snippets, pickled so search hits can be mapped back to source.
with (save_dir / 'chunks.pkl').open('wb') as f:
    f.write(pickle.dumps(chunks))
print(f"Saved chunks to {save_dir / 'chunks.pkl'}")

print(f"\nAll data saved to '{save_dir}' directory")


In [None]:
# Load saved embeddings, FAISS index, and chunks from disk.
# pickle is imported here so this cell does not depend on the save cell
# having been run in the same session.
import pickle

load_dir = Path('saved_data')

# 1. Load embeddings
loaded_embeddings = np.load(load_dir / 'embeddings.npy')
print(f"Loaded embeddings shape: {loaded_embeddings.shape}")

# 2. Load FAISS index
loaded_index = faiss.read_index(str(load_dir / 'faiss_index.index'))
print(f"Loaded FAISS index with {loaded_index.ntotal} vectors")

# 3. Load chunks
# SECURITY: pickle.load can execute arbitrary code while deserializing; only
# load a chunks.pkl that this notebook produced itself.
with open(load_dir / 'chunks.pkl', 'rb') as f:
    loaded_chunks = pickle.load(f)
print(f"Loaded {len(loaded_chunks)} chunks")

print("\nAll data loaded successfully!")


In [None]:
# Query text: a partial copy of a bucket_sort definition, used to check that
# the index retrieves the matching chunk.
code = """

def bucket_sort(my_list: list, bucket_count: int = 10) -> list:
    ""
    >>> data = [-1, 2, -5, 0]
    >>> bucket_sort(data) == sorted(data)
    True
    >>> data = [9, 8, 7, 6, -12]
    >>> bucket_sort(data) == sorted(data)
    True
    >>> data = [.4, 1.2, .1, .2, -.9]
    >>> bucket_sort(data) == sorted(data)
    True
    >>> bucket_sort([]) == sorted([])
    True
   2, 2, 1, 1, 3]
    >>> bucket_sort(data) == sorted(data)
    True
    >>> data = [5, 5, 5, 5, 5]
"""
# generate_embeddings takes a list, so wrap the single query.
search_chunks = [code]
# NOTE(review): the query is embedded through the same function as the corpus,
# i.e. with the RETRIEVAL_DOCUMENT task type — confirm this is intended.
search_embed = generate_embeddings(search_chunks)
# Normalize the query in place, matching the normalization applied to the
# indexed embeddings.
faiss.normalize_L2(search_embed)
# D holds the scores and I the ids of the 5 best matches.
D, I = index.search(search_embed, k=5)

print(D,I)


[[0.9952494  0.81578714 0.808552   0.8019581  0.792458  ]] [[77 67  0  2 33]]


In [None]:
# 77 is the first id in the recorded search output. Chunks hold the
# tree-sitter node text as bytes, hence the b'...' repr when printed.
print(chunks[77])

b'def bucket_sort(my_list: list, bucket_count: int = 10) -> list:\n    """\n    >>> data = [-1, 2, -5, 0]\n    >>> bucket_sort(data) == sorted(data)\n    True\n    >>> data = [9, 8, 7, 6, -12]\n    >>> bucket_sort(data) == sorted(data)\n    True\n    >>> data = [.4, 1.2, .1, .2, -.9]\n    >>> bucket_sort(data) == sorted(data)\n    True\n    >>> bucket_sort([]) == sorted([])\n    True\n    >>> data = [-1e10, 1e10]\n    >>> bucket_sort(data) == sorted(data)\n    True\n    >>> import random\n    >>> collection = random.sample(range(-50, 50), 50)\n    >>> bucket_sort(collection) == sorted(collection)\n    True\n    >>> data = [1, 2, 2, 1, 1, 3]\n    >>> bucket_sort(data) == sorted(data)\n    True\n    >>> data = [5, 5, 5, 5, 5]\n    >>> bucket_sort(data) == sorted(data)\n    True\n    >>> data = [1000, -1000, 500, -500, 0]\n    >>> bucket_sort(data) == sorted(data)\n    True\n    >>> data = [5.5, 2.2, -1.1, 3.3, 0.0]\n    >>> bucket_sort(data) == sorted(data)\n    True\n    >>> bucket_sort

In [None]:
# Cross-check the HNSW result against a flat inner-product index built from
# the same embedding matrix.
ip_index = faiss.IndexFlatIP(embeddings.shape[1])
ip_index.add(embeddings.copy())  # copy to avoid any accidental mutation
D0, I0 = ip_index.search(search_embed, k=5)
# NOTE(review): the recorded output shows one identical score for all five
# hits and ids that differ from the HNSW search output — it may come from a
# different `embeddings` state; re-run on a fresh kernel to confirm.
print("FlatIP:", D0, I0)


FlatIP: [[0.4405725 0.4405725 0.4405725 0.4405725 0.4405725]] [[14 13 12 11 10]]
