In [1]:
import os
import nbformat
import json

In [2]:
def get_cells_by_keyword(notebook_path, keywords):
    """
    Finds, for each keyword, the markdown cell of a notebook that mentions it.

    Parameters:
    - notebook_path (str): Path of the notebook file to read.
    - keywords (iterable of str): Case-sensitive substrings to look for.

    Returns:
    - dict: Maps every keyword to the source of the last markdown cell that
      contains it, with newlines replaced by spaces, or to "" when no
      markdown cell contains it.
    """
    with open(notebook_path, 'r', encoding='utf-8') as handle:
        parsed = nbformat.read(handle, as_version=4)

    matches = dict.fromkeys(keywords, "")

    # Only markdown cells are searched; code cells are ignored entirely.
    markdown_cells = (entry for entry in parsed.cells if entry.cell_type == 'markdown')
    for entry in markdown_cells:
        for term in keywords:
            if term not in entry.source:
                continue
            # A later matching cell overwrites an earlier one.
            matches[term] = entry.source.replace("\n", " ")
    return matches


In [3]:
def create_json_structure(base_directory):
    """
    Builds one summary record per notebook found under a directory tree.

    Parameters:
    - base_directory (str): Directory that is walked recursively for
      files ending in '.ipynb'.

    Returns:
    - list of dict: One dict per notebook with the keys "local_path",
      "github_link" (always ""), "name" (file name without extension),
      "table_of_content" and "exercises" (both taken from
      get_cells_by_keyword).
    """
    keywords = ["Table of Content", "Exercises"]
    toc_key, exercises_key = keywords

    # Lazily enumerate (full path, file name) for every notebook in the tree.
    notebooks = (
        (os.path.join(folder, filename), filename)
        for folder, _subdirs, filenames in os.walk(base_directory)
        for filename in filenames
        if filename.endswith('.ipynb')
    )

    summaries = []
    for path, filename in notebooks:
        found = get_cells_by_keyword(path, keywords)
        summaries.append({
            "local_path": path,
            "github_link": "",  # Add logic to generate GitHub links if needed
            "name": os.path.splitext(filename)[0],
            "table_of_content": found[toc_key],
            "exercises": found[exercises_key],
        })

    return summaries

In [4]:

def save_json_to_file(data, output_filename):
    """
    Writes data to a file as UTF-8 JSON indented by four spaces.

    Parameters:
    - data: Any JSON-serialisable object.
    - output_filename (str): Destination path; an existing file is overwritten.
    """
    # ensure_ascii=False keeps non-ASCII characters readable in the output file.
    dump_options = {"ensure_ascii": False, "indent": 4}
    with open(output_filename, mode='w', encoding='utf-8') as sink:
        json.dump(data, sink, **dump_options)

In [5]:

# Script entry point: summarise the notebooks below `base_directory` and write
# the resulting list of records to `output_filename`.
if __name__ == "__main__":
    # NOTE(review): this is an absolute, machine-specific path; the cell will not
    # find any notebooks on another machine unless it is edited first.
    base_directory = '/mnt/c/Users/user/OneDrive/Desktop/github-projects/advanced-github-search/notebooks/08 File Handling'  # Change this to the directory containing notebooks
    # Relative path, so the file is created in the current working directory.
    output_filename = 'notebooks_summary.json'
    
    notebooks_json = create_json_structure(base_directory)
    save_json_to_file(notebooks_json, output_filename)

In [6]:
import os
import json
import nbformat
import re

def clean_content(content, cell_type):
    """
    Reduces a markdown cell to a list of text lines, depending on its role.

    Parameters:
    - content (str): The raw markdown content of the cell.
    - cell_type (str): "toc" to pull link titles out of a bulleted list of
      markdown links, "exercises" to keep every non-blank line unchanged.

    Returns:
    - list of str: The extracted lines; an empty list for any other cell_type.
    """
    rows = content.splitlines()

    if cell_type == "toc":
        # A TOC entry is a '-' or '*' bullet whose whole content is one [title](target) link.
        entry_pattern = re.compile(r'^\s*[-*]\s*\[(.*?)\]\(.+\)\s*$')
        candidates = (entry_pattern.match(row) for row in rows)
        return [found.group(1) for found in candidates if found]

    if cell_type == "exercises":
        # Lines are kept verbatim (not stripped); only blank ones are dropped.
        return list(filter(lambda row: row.strip(), rows))

    return []

def extract_cells(directory, json_output_path, repo_url):
    """
    Extracts "Table of Contents" and "Exercises" cells from all Jupyter notebooks in a directory,
    cleans them, and saves the information to a JSON file.

    Markdown cells are matched case-insensitively: a cell containing
    "table of contents" is cleaned with clean_content(..., "toc"), otherwise a
    cell containing "practice exercise" is cleaned with
    clean_content(..., "exercises"). When several cells match, the last one wins.
    Notebooks that cannot be parsed as JSON are reported on stdout and skipped.

    Parameters:
    - directory (str): The directory path to search (recursively) for Jupyter notebooks.
    - json_output_path (str): The path to save the output JSON file.
    - repo_url (str): The repository URL used as the prefix of each "github_link".

    Output records:
    - "local_path", "name" (file name without extension), "table_of_content",
      "exercises" and "github_link". "table_of_content" and "exercises" remain
      an empty string when the notebook has no matching cell.
    """
    notebook_data = []
    # Normalise once so the link never contains a doubled slash.
    repo_base = repo_url.rstrip("/")

    for root, _, files in os.walk(directory):
        for file in files:
            if not file.endswith(".ipynb"):
                continue
            notebook_path = os.path.join(root, file)

            # Only the read can raise the errors handled here, so keep the try narrow.
            try:
                with open(notebook_path, 'r', encoding='utf-8') as nb_file:
                    nb = nbformat.read(nb_file, as_version=4)
            except (nbformat.reader.NotJSONError, json.JSONDecodeError) as e:
                print(f"Error reading {notebook_path}: {e}")
                continue

            toc_content = ""
            exercises_content = ""

            for cell in nb.cells:
                if cell.cell_type != 'markdown':
                    continue
                cell_text = cell.source.lower()

                if "table of contents" in cell_text:
                    toc_content = clean_content(cell.source, "toc")
                elif "practice exercise" in cell_text:
                    exercises_content = clean_content(cell.source, "exercises")

            # URLs always use forward slashes; os.path.join would insert
            # backslashes on Windows, so the link is assembled explicitly.
            relative_path = os.path.relpath(notebook_path, directory).replace(os.sep, "/")
            github_link = "/".join(part for part in (repo_base, relative_path) if part)

            notebook_data.append({
                "local_path": notebook_path,
                "name": os.path.splitext(file)[0],
                "table_of_content": toc_content,
                "exercises": exercises_content,
                "github_link": github_link
            })

    with open(json_output_path, 'w', encoding='utf-8') as json_file:
        json.dump(notebook_data, json_file, ensure_ascii=False, indent=4)

    print(f"Notebook data saved to {json_output_path}")

# Script entry point: scan `directory_to_search` for notebooks and write their
# summaries to `output_json_file`.
if __name__ == "__main__":
    # Both paths are relative, so they resolve against the current working directory.
    directory_to_search = "downloaded_files"  # Directory with notebooks
    output_json_file = "notebooks_summary.json"  # Output JSON file
    repository_url = "https://github.com/pytopia/Python-Programming"  # Base repo URL
    
    extract_cells(directory_to_search, output_json_file, repository_url)


Notebook data saved to notebooks_summary.json


In [8]:
import os
import json
import nbformat

def clean_content(content):
    """
    Trims a notebook cell's text.

    Parameters:
    - content (str): The raw content of a notebook cell.

    Returns:
    - str: The content without leading and trailing whitespace; inner
      whitespace and line breaks are left untouched.
    """
    trimmed = content.strip()
    return trimmed

def extract_cells(directory, json_output_path, repo_url):
    """
    Extracts "Table of Contents" and "Practice Exercises" cells and their details from Jupyter notebooks,
    and saves the information to a JSON file.

    Matching is case-insensitive and only looks at markdown cells. A markdown
    cell containing "practice exercise" is recorded and switches capturing on;
    while capturing, every non-markdown cell is recorded as well. A markdown
    cell containing "##" switches capturing off. Notebooks that cannot be
    parsed as JSON are reported on stdout and skipped.

    Parameters:
    - directory (str): Directory path to search (recursively) for Jupyter notebooks.
    - json_output_path (str): Path to save the output JSON file.
    - repo_url (str): Base repository URL for GitHub links.

    Output records:
    - "local_path", "name" (file name without extension), "table_of_content"
      (cleaned text of the last matching cell, "" if none),
      "practice_exercises" (list of cleaned cell texts) and "github_link".
    """
    notebook_data = []
    # Normalise once so the link never contains a doubled slash.
    repo_base = repo_url.rstrip("/")

    for root, _, files in os.walk(directory):
        for file in files:
            if not file.endswith(".ipynb"):
                continue
            notebook_path = os.path.join(root, file)

            # Only the read can raise the errors handled here, so keep the try narrow.
            try:
                with open(notebook_path, 'r', encoding='utf-8') as nb_file:
                    nb = nbformat.read(nb_file, as_version=4)
            except (nbformat.reader.NotJSONError, json.JSONDecodeError) as e:
                print(f"Error reading {notebook_path}: {e}")
                continue

            toc_content = ""
            practice_exercises = []
            capturing_practice_exercise = False

            for cell in nb.cells:
                if cell.cell_type == 'markdown':
                    cell_text = cell.source.lower()

                    if "table of contents" in cell_text:
                        toc_content = clean_content(cell.source)
                    elif "practice exercise" in cell_text:
                        # Start capturing from here
                        capturing_practice_exercise = True
                        practice_exercises.append(clean_content(cell.source))
                    elif capturing_practice_exercise and "##" in cell_text:
                        # Stop capturing if a new section starts
                        capturing_practice_exercise = False
                    # NOTE(review): any other markdown cell met while capturing is
                    # neither recorded nor ends the capture; confirm this is intended.

                # Non-markdown cells are recorded for as long as the flag is set
                elif capturing_practice_exercise:
                    practice_exercises.append(clean_content(cell.source))

            # URLs always use forward slashes; os.path.join would insert
            # backslashes on Windows, so the link is assembled explicitly.
            relative_path = os.path.relpath(notebook_path, directory).replace(os.sep, "/")
            github_link = "/".join(part for part in (repo_base, relative_path) if part)

            notebook_data.append({
                "local_path": notebook_path,
                "name": os.path.splitext(file)[0],
                "table_of_content": toc_content,
                "practice_exercises": practice_exercises,
                "github_link": github_link
            })

    with open(json_output_path, 'w', encoding='utf-8') as json_file:
        json.dump(notebook_data, json_file, ensure_ascii=False, indent=4)

    print(f"Notebook data saved to {json_output_path}")

# Script entry point: scan `directory_to_search` for notebooks and write their
# summaries to `output_json_file`.
if __name__ == "__main__":
    # Both paths are relative, so they resolve against the current working directory.
    directory_to_search = "downloaded_files"  # Change this to your directory
    output_json_file = "notebooks_summary.json"
    repository_url = "https://github.com/pytopia/Python-Programming"  # Replace with your repository URL

    extract_cells(directory_to_search, output_json_file, repository_url)


Notebook data saved to notebooks_summary.json


In [9]:
import os
import json
import nbformat
import re

def clean_content(content):
    """
    Strips the whitespace surrounding a notebook cell's text.

    Parameters:
    - content (str): The raw content of a notebook cell.

    Returns:
    - str: The same text with leading and trailing whitespace removed;
      everything in between is returned unchanged.
    """
    without_padding = content.strip()
    return without_padding

def extract_cells(directory, json_output_path, repo_url):
    """
    Extracts "Table of Content" and "Practice Exercise" cells from all Jupyter notebooks in a directory,
    cleans them, and saves the information to a JSON file.

    Matching is case-insensitive and only looks at markdown cells. A markdown
    cell containing "practice exercise" switches capturing on; a later markdown
    cell that starts with '#' (and matches neither phrase) switches it off.
    While capturing is on, every cell (markdown or code) is recorded exactly
    once. Notebooks that cannot be parsed as JSON are reported on stdout and
    skipped.

    Parameters:
    - directory (str): The directory path to search (recursively) for Jupyter notebooks.
    - json_output_path (str): The path to save the output JSON file.
    - repo_url (str): The repository URL to include in the JSON output.

    Output records:
    - "local_path", "name" (file name without extension), "table_of_content"
      (cleaned text of the last matching cell, "" if none),
      "practice_exercises" (list of cleaned cell texts) and "github_link".
    """
    notebook_data = []
    # Normalise once so the link never contains a doubled slash.
    repo_base = repo_url.rstrip("/")

    for root, _, files in os.walk(directory):
        for file in files:
            if not file.endswith(".ipynb"):
                continue
            notebook_path = os.path.join(root, file)

            # Only the read can raise the errors handled here, so keep the try narrow.
            try:
                with open(notebook_path, 'r', encoding='utf-8') as nb_file:
                    nb = nbformat.read(nb_file, as_version=4)
            except (nbformat.reader.NotJSONError, json.JSONDecodeError) as e:
                print(f"Error reading {notebook_path}: {e}")
                continue

            toc_content = ""
            practice_exercises = []
            capturing_practice_exercise = False

            for cell in nb.cells:
                if cell.cell_type == 'markdown':
                    cell_text = cell.source.lower()

                    if "table of contents" in cell_text:
                        toc_content = clean_content(cell.source)
                    elif "practice exercise" in cell_text:
                        # Only flip the flag here: the shared append below records
                        # this cell, so appending here too would store it twice.
                        capturing_practice_exercise = True
                    elif capturing_practice_exercise and re.match(r'^\s*#', cell.source):
                        # Stop capturing if we hit a new distinct markdown section header
                        capturing_practice_exercise = False

                if capturing_practice_exercise:
                    # Record the current cell, whether markdown or code
                    practice_exercises.append(clean_content(cell.source))

            # URLs always use forward slashes; os.path.join would insert
            # backslashes on Windows, so the link is assembled explicitly.
            relative_path = os.path.relpath(notebook_path, directory).replace(os.sep, "/")
            github_link = "/".join(part for part in (repo_base, relative_path) if part)

            notebook_data.append({
                "local_path": notebook_path,
                "name": os.path.splitext(file)[0],
                "table_of_content": toc_content,
                "practice_exercises": practice_exercises,
                "github_link": github_link
            })

    with open(json_output_path, 'w', encoding='utf-8') as json_file:
        json.dump(notebook_data, json_file, ensure_ascii=False, indent=4)

    print(f"Notebook data saved to {json_output_path}")

# Script entry point: scan `directory_to_search` for notebooks and write their
# summaries to `output_json_file`.
if __name__ == "__main__":
    # Both paths are relative, so they resolve against the current working directory.
    directory_to_search = "downloaded_files"  # Change this to your directory
    output_json_file = "notebooks_summary.json"
    repository_url = "https://github.com/pytopia/Python-Programming"  # Replace with your repository URL

    extract_cells(directory_to_search, output_json_file, repository_url)


Notebook data saved to notebooks_summary.json


In [10]:
import os
import json
import nbformat
import re

def clean_content(content):
    """
    Cleans and returns the text of notebook cells by removing markdown-specific elements and excessive whitespace.

    Markdown links are removed entirely (title and target), a leading '-' or
    '*' list marker is removed from every line, and every run of whitespace,
    including line breaks, is then collapsed to a single space.

    Parameters:
    - content (str): The raw content of a notebook cell.

    Returns:
    - str: Cleaned content as a single line of plain text.
    """
    # Remove markdown links first so the gaps they leave are collapsed below
    cleaned_content = re.sub(r'\[.*?\]\(.*?\)', '', content)
    # Remove list markers while the text still has its line breaks: the
    # MULTILINE '^' can only find per-line markers before lines are merged
    cleaned_content = re.sub(r'^\s*[-*]\s*', '', cleaned_content, flags=re.MULTILINE)
    # Collapse whitespace last, once nothing else depends on line structure
    cleaned_content = re.sub(r'\s+', ' ', cleaned_content).strip()
    return cleaned_content

def extract_cells(directory, json_output_path, repo_url):
    """
    Extracts "Table of Contents" and "Practice Exercise" cells from all Jupyter notebooks in a directory,
    cleans them, and saves the information to a JSON file.

    Matching is case-insensitive and only looks at markdown cells. A markdown
    cell containing "practice exercise" switches capturing on; a later markdown
    cell that starts with '#' (and matches neither phrase) switches it off.
    While capturing is on, every cell (markdown or code) is cleaned and
    recorded exactly once. Notebooks that cannot be parsed as JSON are
    reported on stdout and skipped.

    Parameters:
    - directory (str): The directory path to search (recursively) for Jupyter notebooks.
    - json_output_path (str): The path to save the output JSON file.
    - repo_url (str): The repository URL to include in the JSON output.

    Output records:
    - "local_path", "name" (file name without extension), "table_of_content"
      (cleaned text of the last matching cell, "" if none),
      "practice_exercises" (list of cleaned cell texts) and "github_link".
    """
    notebook_data = []
    # Normalise once so the link never contains a doubled slash.
    repo_base = repo_url.rstrip("/")

    for root, _, files in os.walk(directory):
        for file in files:
            if not file.endswith(".ipynb"):
                continue
            notebook_path = os.path.join(root, file)

            # Only the read can raise the errors handled here, so keep the try narrow.
            try:
                with open(notebook_path, 'r', encoding='utf-8') as nb_file:
                    nb = nbformat.read(nb_file, as_version=4)
            except (nbformat.reader.NotJSONError, json.JSONDecodeError) as e:
                print(f"Error reading {notebook_path}: {e}")
                continue

            toc_content = ""
            practice_exercises = []
            capturing_practice_exercise = False

            for cell in nb.cells:
                if cell.cell_type == 'markdown':
                    cell_text = cell.source.lower()

                    if "table of contents" in cell_text:
                        toc_content = clean_content(cell.source)
                    elif "practice exercise" in cell_text:
                        # Only flip the flag here: the shared append below records
                        # this cell, so appending here too would store it twice.
                        capturing_practice_exercise = True
                    elif capturing_practice_exercise and re.match(r'^\s*#', cell.source):
                        # Stop capturing if we hit a new distinct markdown section header
                        capturing_practice_exercise = False

                if capturing_practice_exercise:
                    # Record the current cell, whether markdown or code, cleaned
                    practice_exercises.append(clean_content(cell.source))

            # URLs always use forward slashes; os.path.join would insert
            # backslashes on Windows, so the link is assembled explicitly.
            relative_path = os.path.relpath(notebook_path, directory).replace(os.sep, "/")
            github_link = "/".join(part for part in (repo_base, relative_path) if part)

            notebook_data.append({
                "local_path": notebook_path,
                "name": os.path.splitext(file)[0],
                "table_of_content": toc_content,
                "practice_exercises": practice_exercises,
                "github_link": github_link
            })

    with open(json_output_path, 'w', encoding='utf-8') as json_file:
        json.dump(notebook_data, json_file, ensure_ascii=False, indent=4)

    print(f"Notebook data saved to {json_output_path}")

# Script entry point: scan `directory_to_search` for notebooks and write their
# summaries to `output_json_file`.
if __name__ == "__main__":
    # Both paths are relative, so they resolve against the current working directory.
    directory_to_search = "downloaded_files"  # Change this to your directory
    output_json_file = "notebooks_summary.json"
    repository_url = "https://github.com/pytopia/Python-Programming"  # Replace with your repository URL

    extract_cells(directory_to_search, output_json_file, repository_url)


Notebook data saved to notebooks_summary.json


In [2]:
import os
import json
import nbformat
import re

def clean_toc(content):
    """
    Pulls the entry titles out of a markdown "Table of Contents" cell.

    Parameters:
    - content (str): The raw markdown content of the TOC cell.

    Returns:
    - list of str: The link text of every line that consists of a '-' or '*'
      bullet followed by exactly one [title](target) link, in order of
      appearance. All other lines are ignored.
    """
    # One bullet, one link, nothing else on the line (surrounding whitespace allowed).
    entry_pattern = re.compile(r'^\s*[-*]\s*\[(.*?)\]\(.*?\)\s*$')

    candidates = map(entry_pattern.match, content.splitlines())
    return [found.group(1) for found in candidates if found]

def clean_exercise(content):
    """
    Flattens a "Practice Exercise" cell into a single line of text.

    Parameters:
    - content (str): The raw content of the exercise cell.

    Returns:
    - str: The content with markdown links removed (title and target), a
      leading '-' or '*' list marker removed from each line, and every run
      of whitespace collapsed to one space.
    """
    # Applied in order: the list-marker rule needs the line breaks that the
    # final whitespace rule removes.
    substitutions = (
        (re.compile(r'\[.*?\]\(.*?\)'), ''),                # markdown links
        (re.compile(r'^\s*[-*]\s*', re.MULTILINE), ''),     # list markers
        (re.compile(r'\s+'), ' '),                          # whitespace runs
    )

    text = content
    for pattern, replacement in substitutions:
        text = pattern.sub(replacement, text)
    return text.strip()

def extract_cells(directory, json_output_path, repo_url):
    """
    Extracts "Table of Contents" and "Practice Exercise" cells from all Jupyter notebooks in a directory,
    cleans them, and saves the information to a JSON file.

    Matching is case-insensitive and only looks at markdown cells. A markdown
    cell containing "practice exercise" or "exercise:" switches capturing on; a
    later markdown cell that starts with '#' (and matches none of the phrases)
    switches it off. While capturing is on, every cell (markdown or code) is
    cleaned with clean_exercise and recorded exactly once. Notebooks that
    cannot be parsed as JSON are reported on stdout and skipped.

    Parameters:
    - directory (str): The directory path to search (recursively) for Jupyter notebooks.
    - json_output_path (str): The path to save the output JSON file.
    - repo_url (str): The repository URL to include in the JSON output.

    Output records:
    - "local_path", "name" (file name without extension), "table_of_content"
      (result of clean_toc for the last matching cell, "" if none),
      "practice_exercises" (list of cleaned cell texts) and "github_link".
    """
    notebook_data = []
    # Normalise once so the link never contains a doubled slash.
    repo_base = repo_url.rstrip("/")

    for root, _, files in os.walk(directory):
        for file in files:
            if not file.endswith(".ipynb"):
                continue
            notebook_path = os.path.join(root, file)

            # Only the read can raise the errors handled here, so keep the try narrow.
            try:
                with open(notebook_path, 'r', encoding='utf-8') as nb_file:
                    nb = nbformat.read(nb_file, as_version=4)
            except (nbformat.reader.NotJSONError, json.JSONDecodeError) as e:
                print(f"Error reading {notebook_path}: {e}")
                continue

            toc_content = ""
            practice_exercises = []
            capturing_exercise = False

            for cell in nb.cells:
                if cell.cell_type == 'markdown':
                    cell_text = cell.source.lower()

                    if "table of contents" in cell_text:
                        toc_content = clean_toc(cell.source)
                    # cell_text is lower-cased, so the needles must be lower-case too.
                    elif "practice exercise" in cell_text or "exercise:" in cell_text:
                        # Only flip the flag here: the shared append below records
                        # this cell, so appending here too would store it twice.
                        capturing_exercise = True
                    elif capturing_exercise and re.match(r'^\s*#', cell.source):
                        # A new markdown heading ends the exercise section.
                        capturing_exercise = False

                if capturing_exercise:
                    practice_exercises.append(clean_exercise(cell.source))

            # URLs always use forward slashes; os.path.join would insert
            # backslashes on Windows, so the link is assembled explicitly.
            relative_path = os.path.relpath(notebook_path, directory).replace(os.sep, "/")
            github_link = "/".join(part for part in (repo_base, relative_path) if part)

            notebook_data.append({
                "local_path": notebook_path,
                "name": os.path.splitext(file)[0],
                "table_of_content": toc_content,
                "practice_exercises": practice_exercises,
                "github_link": github_link
            })

    with open(json_output_path, 'w', encoding='utf-8') as json_file:
        json.dump(notebook_data, json_file, ensure_ascii=False, indent=4)

    print(f"Notebook data saved to {json_output_path}")

# Script entry point: scan `directory_to_search` for notebooks and write their
# summaries to `output_json_file`.
if __name__ == "__main__":
    # Both paths are relative, so they resolve against the current working directory.
    directory_to_search = "downloaded_files"  # Change this to your directory
    output_json_file = "notebooks_summary.json"
    repository_url = "https://github.com/pytopia/Python-Programming"  # Replace with your repository URL

    extract_cells(directory_to_search, output_json_file, repository_url)


Error reading downloaded_files/Lectures/11 Advanced Topics/-- Advanced Composite Data Types.ipynb: Notebook does not appear to be JSON: ''
Error reading downloaded_files/Lectures/11 Advanced Topics/-- Bitwise Operators.ipynb: Notebook does not appear to be JSON: ''
Error reading downloaded_files/Lectures/11 Advanced Topics/01 List Memory Management.ipynb: Notebook does not appear to be JSON: ''
Error reading downloaded_files/Lectures/11 Advanced Topics/02 Dictionary and Set Hash Table.ipynb: Notebook does not appear to be JSON: ''
Error reading downloaded_files/Lectures/11 Advanced Topics/03 Variable-length Arguments (*args and **kwargs).ipynb: Notebook does not appear to be JSON: ''
Error reading downloaded_files/Lectures/11 Advanced Topics/04 Namespaces in Python.ipynb: Notebook does not appear to be JSON: ''
Error reading downloaded_files/Lectures/11 Advanced Topics/05 Variable Scope.ipynb: Notebook does not appear to be JSON: ''
Error reading downloaded_files/Lectures/11 Advanced T