<a href="https://colab.research.google.com/github/sindegl/sindegl/blob/main/generative_ai_repo_tree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Creating a list of GitHub URLs and titles for the notebooks in the generative-ai repo

In [None]:
# Local directory that receives the clone; later cells search it for notebooks
# and strip it from each path when building the GitHub URLs.
LOCAL_DIR = "repotree"
# Create the target directory, then clone the generative-ai repository into it.
!mkdir $LOCAL_DIR
!git clone https://github.com/GoogleCloudPlatform/generative-ai.git $LOCAL_DIR

Cloning into 'repotree'...
remote: Enumerating objects: 1827, done.[K
remote: Counting objects: 100% (1081/1081), done.[K
remote: Compressing objects: 100% (514/514), done.[K
remote: Total 1827 (delta 755), reused 752 (delta 555), pack-reused 746[K
Receiving objects: 100% (1827/1827), 40.48 MiB | 35.80 MiB/s, done.
Resolving deltas: 100% (1047/1047), done.


In [None]:
# Print the directory layout of the clone. This shells out to the external
# `tree` utility, so the cell only produces a listing when `tree` is installed.
!tree $LOCAL_DIR

/bin/bash: line 1: tree: command not found


In [None]:
import os
import nbformat

def get_notebook_paths(folder_path: str) -> list[str]:
    """Gets the paths to all Jupyter notebooks in the specified folder.

    The folder is searched recursively. A file counts as a notebook when its
    name ends with ".ipynb".

    Args:
        folder_path (str): The path to the folder to search.

    Returns:
        list[str]: A list of the paths to all Jupyter notebooks found under the
            folder, each one prefixed with ``folder_path``. The list is empty
            when the folder does not exist (a message is printed in that case)
            or contains no notebooks.
    """
    # os.walk() does not raise for a missing folder: with the default
    # onerror=None it simply yields nothing. The missing-folder case therefore
    # has to be detected explicitly, otherwise the message is never shown.
    if not os.path.isdir(folder_path):
        print(f"The folder '{folder_path}' does not exist.")
        return []

    all_notebook_paths = []

    # The sub-directory names are not needed; os.walk() descends on its own.
    for root, _directories, files in os.walk(folder_path):
        all_notebook_paths.extend(
            os.path.join(root, f) for f in files if f.endswith(".ipynb")
        )

    return all_notebook_paths

In [None]:
# Collect the path of every notebook found under the cloned repository;
# the bare expression on the last line displays the list as the cell output.
notebooks_to_check = get_notebook_paths(LOCAL_DIR)
notebooks_to_check

['repotree/speech/getting-started/speech_recognition.ipynb',
 'repotree/vision/getting-started/visual_question_answering.ipynb',
 'repotree/vision/getting-started/visual_captioning.ipynb',
 'repotree/embeddings/vector-search-quickstart.ipynb',
 'repotree/embeddings/intro-textemb-vectorsearch.ipynb',
 'repotree/embeddings/embedding-similarity-visualization.ipynb',
 'repotree/vector-search/vector-search-quickstart.ipynb',
 'repotree/vector-search/intro-textemb-vectorsearch.ipynb',
 'repotree/vector-search/embedding-similarity-visualization.ipynb',
 'repotree/conversation/data-store-status-checker/data_store_checker.ipynb',
 'repotree/search/search_filters_metadata.ipynb',
 'repotree/search/bulk-question-answering/bulk_question_answering.ipynb',
 'repotree/search/retrieval-augmented-generation/examples/contract_analysis.ipynb',
 'repotree/search/retrieval-augmented-generation/examples/rag_google_documentation.ipynb',
 'repotree/search/retrieval-augmented-generation/examples/question_answe

In [None]:
def get_nb_title(notebook_obj, n_cells: int = 3) -> str:
    """Extracts a title from the first few cells of a notebook.

    The first ``n_cells`` cells are scanned for a cell whose source starts with
    an H1 header ("# "). Cells starting with "# Copyright" are skipped so that
    a license header is not mistaken for the title. If no H1 header is found,
    the same cells are scanned for an H2 header ("## ").

    Args:
        notebook_obj: A notebook object exposing ``cells``, a sequence of
            objects that each have a string ``source`` attribute.
        n_cells (int): How many leading cells to inspect. Defaults to 3.

    Returns:
        str: The text of the first matching header line without its "# " or
            "## " prefix, or "Error" if neither an H1 nor an H2 header starts
            one of the inspected cells.
    """
    # Slicing instead of indexing 0..n_cells-1 means notebooks with fewer than
    # n_cells cells are handled instead of raising IndexError.
    leading_sources = [cell.source for cell in notebook_obj.cells[: max(n_cells, 0)]]

    # First pass: H1 headers, ignoring the copyright/license comment.
    for source in leading_sources:
        if source.startswith("# Copyright"):
            continue
        if source.startswith("# "):
            return source.split("\n")[0][2:]

    # Second pass: no H1 header was found, so fall back to H2 headers.
    for source in leading_sources:
        if source.startswith("## "):
            return source.split("\n")[0][3:]

    # Neither an H1 nor an H2 header was found.
    return "Error"

In [None]:
# Base URL under which the repository's files are browsable on GitHub
# (no trailing slash; exactly one is added when the URL is assembled).
GITHUB_BLOB_BASE_URL = "https://github.com/GoogleCloudPlatform/generative-ai/blob/main"

# One [GitHub URL, title] pair per notebook found in the clone.
result = []

for notebook_path in notebooks_to_check:
    # Turn the local path into a repo-relative one. Replacing LOCAL_DIR via
    # str.replace() kept the path's leading slash (yielding ".../blob/main//...")
    # and would also rewrite any later occurrence of LOCAL_DIR inside the path.
    relative_path = os.path.relpath(notebook_path, LOCAL_DIR).replace(os.sep, "/")
    gh_url = f"{GITHUB_BLOB_BASE_URL}/{relative_path}"

    # Read the notebook, version 4 is the latest working version at the moment
    notebook_obj = nbformat.read(notebook_path, as_version=4)

    title = get_nb_title(notebook_obj)

    result.append([gh_url, title])

In [None]:
import pandas as pd
# Tabulate the collected [URL, title] pairs, one row per notebook;
# the bare `df` on the last line renders the table as the cell output.
df = pd.DataFrame(result, columns = ["Github URL", "Title"])
df

Unnamed: 0,Github URL,Title
0,https://github.com/GoogleCloudPlatform/generat...,Get started with Chirp on Google Cloud
1,https://github.com/GoogleCloudPlatform/generat...,Visual Question Answering (VQA) with Imagen on...
2,https://github.com/GoogleCloudPlatform/generat...,Visual captioning with Imagen on Vertex AI
3,https://github.com/GoogleCloudPlatform/generat...,Vertex AI Vector Search Quickstart
4,https://github.com/GoogleCloudPlatform/generat...,Getting Started with Text Embeddings + Vertex ...
5,https://github.com/GoogleCloudPlatform/generat...,Visualizing embedding similarity from text doc...
6,https://github.com/GoogleCloudPlatform/generat...,Vertex AI Vector Search Quickstart
7,https://github.com/GoogleCloudPlatform/generat...,Getting Started with Text Embeddings + Vertex ...
8,https://github.com/GoogleCloudPlatform/generat...,Visualizing embedding similarity from text doc...
9,https://github.com/GoogleCloudPlatform/generat...,Vertex AI Search and Conversation Data Store S...


In [None]:
# Save the table as a CSV file in the current working directory;
# index=False leaves the DataFrame's row index out of the file.
df.to_csv("generative-ai_notebooks.csv", index=False)