In [1]:
import os
import requests
import json

def fetch_repo_content(repo_owner, repo_name, path='', timeout=30):
    """
    Lists one directory of a GitHub repository through the REST "contents" endpoint.

    :param repo_owner: Owner (user or organisation) of the repository.
    :param repo_name: Name of the repository.
    :param path: Path inside the repository; '' requests the repository root.
    :param timeout: Seconds to wait for the server before the request is abandoned.
    :return: The decoded JSON response, or None if the request failed
             (the failure is printed, not raised).
    """
    api_url = f"https://api.github.com/repos/{repo_owner}/{repo_name}/contents/{path}"
    headers = {'Accept': 'application/vnd.github.v3+json'}
    try:
        # requests never times out by default; without this a stalled
        # connection would block the notebook cell indefinitely.
        response = requests.get(api_url, headers=headers, timeout=timeout)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch repo content: {e}")
        return None

def download_file(file_url, local_file_path, timeout=30):
    """
    Downloads a URL and writes the response body to a local file.

    Missing parent directories of the target are created. Request failures
    are printed rather than raised; filesystem errors still propagate.

    :param file_url: URL to fetch.
    :param local_file_path: Destination path for the downloaded bytes.
    :param timeout: Seconds to wait for the server before the request is abandoned.
    """
    try:
        # requests never times out by default; bound the wait explicitly.
        response = requests.get(file_url, timeout=timeout)
        response.raise_for_status()

        # A bare filename has an empty dirname, and os.makedirs('') raises,
        # so only create the parent when there is one.
        parent_dir = os.path.dirname(local_file_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        with open(local_file_path, 'wb') as file:
            file.write(response.content)
        print(f"Downloaded: {file_url} to {local_file_path}")
    except requests.exceptions.RequestException as e:
        print(f"Failed to download file {file_url}: {e}")

def process_github_repo(repo_owner, repo_name, metadata_output_path,
                        branch='main', download_dir='downloaded_files'):
    """
    Downloads every .ipynb file of a GitHub repository and records its URLs.

    The repository tree is walked directory by directory through
    fetch_repo_content. Each notebook is saved under ``download_dir`` at its
    repository-relative path, and a JSON file mapping local path ->
    {"raw_url", "github_url"} is written to ``metadata_output_path``.

    :param repo_owner: Owner (user or organisation) of the repository.
    :param repo_name: Name of the repository.
    :param metadata_output_path: Where the JSON metadata file is written.
    :param branch: Branch used to build the raw and blob URLs.
    :param download_dir: Local directory that receives the notebooks.
    :return: None. Nothing is written if the root listing cannot be fetched.
    """
    # Imported locally so the function does not depend on another cell's imports.
    from urllib.parse import quote

    contents = fetch_repo_content(repo_owner, repo_name)
    if contents is None:
        return

    base_raw_url = f"https://raw.githubusercontent.com/{repo_owner}/{repo_name}/{branch}"
    base_main_github_url = f"https://github.com/{repo_owner}/{repo_name}/blob/{branch}"
    metadata = {}

    # Iterative traversal: each stack entry is the listing of one directory.
    pending_listings = [contents]
    while pending_listings:
        for item in pending_listings.pop():
            if item['type'] == 'file' and item['name'].endswith('.ipynb'):
                file_path = item['path']
                # Percent-encode the whole path (keeping '/'), not just spaces:
                # characters such as '#', '?' or '%' in a file name would
                # otherwise produce a URL that points somewhere else.
                encoded_path = quote(file_path)
                raw_file_url = f"{base_raw_url}/{encoded_path}"
                github_url = f"{base_main_github_url}/{encoded_path}"
                local_file_path = os.path.join(download_dir, file_path)

                # Download the notebook
                download_file(raw_file_url, local_file_path)

                # Store both raw and main GitHub URLs in metadata
                metadata[local_file_path] = {
                    "raw_url": raw_file_url,
                    "github_url": github_url
                }

            elif item['type'] == 'dir':
                # Fetch contents of the directory; failed or empty listings are skipped
                sub_dir_contents = fetch_repo_content(repo_owner, repo_name, item['path'])
                if sub_dir_contents:
                    pending_listings.append(sub_dir_contents)

    # Save metadata to a JSON file
    with open(metadata_output_path, 'w', encoding='utf-8') as metadata_file:
        json.dump(metadata, metadata_file, ensure_ascii=False, indent=4)

    print(f"Metadata saved to {metadata_output_path}")

# Usage: download the notebooks of one repository and write their URL metadata
repo_owner, repo_name = 'pytopia', 'Python-Programming'
metadata_output_file = 'notebook_metadata.json'
process_github_repo(
    repo_owner=repo_owner,
    repo_name=repo_name,
    metadata_output_path=metadata_output_file,
)


Downloaded: https://raw.githubusercontent.com/pytopia/Python-Programming/main/Lectures/12%20Capstone%20Project/01%20Project%20Proposal.ipynb to downloaded_files/Lectures/12 Capstone Project/01 Project Proposal.ipynb
Downloaded: https://raw.githubusercontent.com/pytopia/Python-Programming/main/Lectures/12%20Capstone%20Project/02%20Requirements%20Gathering.ipynb to downloaded_files/Lectures/12 Capstone Project/02 Requirements Gathering.ipynb
Downloaded: https://raw.githubusercontent.com/pytopia/Python-Programming/main/Lectures/12%20Capstone%20Project/03%20Design%20and%20Planning.ipynb to downloaded_files/Lectures/12 Capstone Project/03 Design and Planning.ipynb
Downloaded: https://raw.githubusercontent.com/pytopia/Python-Programming/main/Lectures/12%20Capstone%20Project/04%20Setting%20Up%20the%20Development%20Environment.ipynb to downloaded_files/Lectures/12 Capstone Project/04 Setting Up the Development Environment.ipynb
Downloaded: https://raw.githubusercontent.com/pytopia/Python-Progr

In [5]:
import json

def read_json_file(file_path):
    """
    Loads a UTF-8 encoded JSON file.

    Failures are reported on stdout instead of being raised.

    :param file_path: Path to the JSON file.
    :return: The parsed JSON value (dictionary or list), or None when the file
             is missing, is not valid JSON, or any other error occurs.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            return json.load(handle)
    except FileNotFoundError:
        problem = f"Error: The file {file_path} was not found."
    except json.JSONDecodeError:
        problem = f"Error: The file {file_path} contains invalid JSON."
    except Exception as e:
        problem = f"An unexpected error occurred: {e}"

    # Only reached when one of the handlers above ran.
    print(problem)
    return None




In [6]:

# Load the URL metadata file written by process_github_repo.
# read_json_file returns None (after printing an error) if the file is missing or invalid.
notebook_metadata = read_json_file('notebook_metadata.json')

In [9]:
# Display the loaded metadata: local notebook path -> {'raw_url', 'github_url'}.
notebook_metadata

{'downloaded_files/Lectures/12 Capstone Project/01 Project Proposal.ipynb': {'raw_url': 'https://raw.githubusercontent.com/pytopia/Python-Programming/main/Lectures/12%20Capstone%20Project/01%20Project%20Proposal.ipynb',
  'github_url': 'https://github.com/pytopia/Python-Programming/blob/main/Lectures/12%20Capstone%20Project/01%20Project%20Proposal.ipynb'},
 'downloaded_files/Lectures/12 Capstone Project/02 Requirements Gathering.ipynb': {'raw_url': 'https://raw.githubusercontent.com/pytopia/Python-Programming/main/Lectures/12%20Capstone%20Project/02%20Requirements%20Gathering.ipynb',
  'github_url': 'https://github.com/pytopia/Python-Programming/blob/main/Lectures/12%20Capstone%20Project/02%20Requirements%20Gathering.ipynb'},
 'downloaded_files/Lectures/12 Capstone Project/03 Design and Planning.ipynb': {'raw_url': 'https://raw.githubusercontent.com/pytopia/Python-Programming/main/Lectures/12%20Capstone%20Project/03%20Design%20and%20Planning.ipynb',
  'github_url': 'https://github.com/

In [11]:

import os
import json
import nbformat
import re

def clean_toc(content):
    """
    Pulls the link titles out of a markdown "Table of Contents" cell.

    Only lines that consist of a single bulleted link (``- [Title](target)``
    or ``* [Title](target)``, optionally indented) contribute an entry; every
    other line is ignored.
    
    Parameters:
    - content (str): The raw markdown content of the TOC cell.
    
    Returns:
    - list of str: The link titles, in the order they appear.
    """
    # A whole line made of: optional indent, bullet, [title](target)
    entry_pattern = re.compile(r'^\s*[-*]\s*\[(.*?)\]\(.*?\)\s*$')

    candidates = (entry_pattern.match(row) for row in content.splitlines())
    return [hit.group(1) for hit in candidates if hit]

def clean_exercise(content):
    """
    Cleans the "Practice Exercise" content by removing markdown links and list
    markers and collapsing whitespace.
    
    Parameters:
    - content (str): The raw content of the exercise cell.
    
    Returns:
    - str: Cleaned content as a single line of text.
    """
    # Remove markdown links entirely (both the label and the target)
    cleaned_content = re.sub(r'\[.*?\]\(.*?\)', '', content)
    # Remove list markers. A marker must be followed by whitespace; without
    # that requirement the pattern also ate the first '*' of a line starting
    # with "**bold**" and the sign of a line starting with e.g. "-5".
    cleaned_content = re.sub(r'^\s*[-*]\s+', '', cleaned_content, flags=re.MULTILINE)
    # Collapse every run of whitespace (including newlines) to one space
    cleaned_content = re.sub(r'\s+', ' ', cleaned_content).strip()
    return cleaned_content

def _collect_toc_and_exercises(cells):
    """Scans notebook cells and returns (toc_entries, exercise_texts)."""
    toc_content = []  # stays a list even when no TOC cell exists
    practice_exercises = []
    capturing_exercise = False

    for cell in cells:
        if cell.cell_type == 'markdown':
            cell_text = cell.source.lower()

            if "table of contents" in cell_text:
                toc_content = clean_toc(cell.source)
            elif "practice exercise" in cell_text or "exercise:" in cell_text:
                # cell_text is lowercased, so the marker must be lowercase too.
                capturing_exercise = True
            elif capturing_exercise and re.match(r'^\s*#', cell.source):
                # A markdown cell starting with a heading ends the exercise section.
                capturing_exercise = False

        # Single append point: the cell that starts an exercise section and
        # every following cell (markdown or code) are recorded exactly once.
        if capturing_exercise:
            practice_exercises.append(clean_exercise(cell.source))

    return toc_content, practice_exercises


def extract_cells(directory, json_output_path, repo_url, metadata=None):
    """
    Extracts "Table of Contents" and "Practice Exercise" cells from all Jupyter notebooks in a directory,
    cleans them, and saves the information to a JSON file.

    Notebooks that cannot be parsed as JSON are reported on stdout and skipped.
    Directories and files are visited in sorted order so the output list is
    reproducible between runs.
    
    Parameters:
    - directory (str): The directory path to search for Jupyter notebooks.
    - json_output_path (str): The path to save the output JSON file.
    - repo_url (str): The repository URL; used as "github_link" for notebooks
      that have no entry in the metadata.
    - metadata (dict, optional): Mapping of local notebook path to a dict with a
      'github_url' key. When omitted, the module-level ``notebook_metadata``
      is used if it exists.
    """
    if metadata is None:
        # Backward compatible with callers relying on the notebook-level global.
        metadata = globals().get('notebook_metadata')
    # Normalise keys so lookups do not depend on separator style or "./" prefixes.
    links_by_path = {
        os.path.normpath(path): info for path, info in (metadata or {}).items()
    }

    notebook_data = []

    for root, dirs, files in os.walk(directory):
        dirs.sort()  # in-place sort makes os.walk descend in a stable order
        for file in sorted(files):
            if not file.endswith(".ipynb"):
                continue
            notebook_path = os.path.join(root, file)

            try:
                with open(notebook_path, 'r', encoding='utf-8') as nb_file:
                    nb = nbformat.read(nb_file, as_version=4)
            except (nbformat.reader.NotJSONError, json.JSONDecodeError) as e:
                print(f"Error reading {notebook_path}: {e}")
                continue

            toc_content, practice_exercises = _collect_toc_and_exercises(nb.cells)
            link_info = links_by_path.get(os.path.normpath(notebook_path)) or {}

            notebook_data.append({
                "local_path": notebook_path,
                "name": os.path.splitext(file)[0],
                "table_of_content": toc_content,
                "practice_exercises": practice_exercises,
                # A missing metadata entry no longer aborts the whole run.
                "github_link": link_info.get('github_url', repo_url)
            })

    with open(json_output_path, 'w', encoding='utf-8') as json_file:
        json.dump(notebook_data, json_file, ensure_ascii=False, indent=4)
    
    print(f"Notebook data saved to {json_output_path}")

if __name__ == "__main__":
    # Inputs for the summary run; adjust them for a different download
    # directory, output file or repository.
    repository_url = "https://github.com/pytopia/Python-Programming"
    output_json_file = "notebooks_summary.json"
    directory_to_search = "downloaded_files"

    extract_cells(
        directory=directory_to_search,
        json_output_path=output_json_file,
        repo_url=repository_url,
    )


Error reading downloaded_files/Lectures/11 Advanced Topics/-- Advanced Composite Data Types.ipynb: Notebook does not appear to be JSON: ''
Error reading downloaded_files/Lectures/11 Advanced Topics/-- Bitwise Operators.ipynb: Notebook does not appear to be JSON: ''
Error reading downloaded_files/Lectures/11 Advanced Topics/01 List Memory Management.ipynb: Notebook does not appear to be JSON: ''
Error reading downloaded_files/Lectures/11 Advanced Topics/02 Dictionary and Set Hash Table.ipynb: Notebook does not appear to be JSON: ''
Error reading downloaded_files/Lectures/11 Advanced Topics/03 Variable-length Arguments (*args and **kwargs).ipynb: Notebook does not appear to be JSON: ''
Error reading downloaded_files/Lectures/11 Advanced Topics/04 Namespaces in Python.ipynb: Notebook does not appear to be JSON: ''
Error reading downloaded_files/Lectures/11 Advanced Topics/05 Variable Scope.ipynb: Notebook does not appear to be JSON: ''
Error reading downloaded_files/Lectures/11 Advanced T

In [12]:
# Load the per-notebook summary list written by extract_cells.
# read_json_file returns None (after printing an error) if the file is missing or invalid.
notebooks_summary = read_json_file('notebooks_summary.json')

In [27]:
# Inspect a single summary entry. Which notebook sits at index 41 depends on
# the order in which extract_cells visited the files, so it may differ between runs.
notebooks_summary[41]

{'local_path': 'downloaded_files/Lectures/04 Data Types/22 Frozensets.ipynb',
 'name': '22 Frozensets',
 'table_of_content': ['Creating Frozen Sets in Python',
  'Syntax for Creating a `frozenset`',
  'Converting Other Iterables into Frozen Sets',
  'Unique Elements in Frozen Sets',
  'Immutable Sets vs. Mutable Sets',
  'Conclusion',
  'Accessing Frozen Set Elements',
  'Membership Testing',
  'Unordered Nature',
  'Conversion to Ordered Sequence',
  'Conclusion',
  'Operations on Frozen Sets in Python',
  'Immutable Nature and Implications',
  'Set Operations with `frozenset`',
  'Using Operators with `frozenset`',
  'Combining `frozenset` with Other Sets',
  'Immutable Results',
  'Conclusion',
  'Practical Applications of Frozen Sets',
  'Frozen Sets as Dictionary Keys',
  'Frozen Sets in Other Data Structures',
  'Frozen Sets for Constant Set Definitions',
  'Frozen Sets for Safe Data Sharing',
  'Conclusion',
  'Performance Considerations for Frozen Sets',
  'Conclusion',
  'Conc