# Main Workflow

In [1]:
from time import time

init = time()

import re
import os
import sys
import json
import yaml
from functools import reduce
from collections import ChainMap
import subprocess

import pandas as pd
from glob import glob
import nbformat

import jax

## Load old colab notebook names

In [2]:
old_nb_files = glob("../../pml-book/pml1/figure_notebooks/*")
old_nb_files[:2]

['../../pml-book/pml1/figure_notebooks/chapter10_logistic_regression_figures.ipynb',
 '../../pml-book/pml1/figure_notebooks/chapter4_statistics_figures.ipynb']

## Parse script names from colab notebooks

In [3]:
new_nb_path = "../notebooks/book1/"
scripts_path = "../scripts/"

In [4]:
def get_fig_wise_scripts(cells):
    prev_cell, cell = cells
    scripts = re.findall("\[(\S*?\.py)\]\(http", cell["source"])

    if scripts:
        fig_num = re.findall("## Figure (.*?):", prev_cell["source"])[0]
        fig_num = ".".join([fig_num.split(".")[0].zfill(2), fig_num.split(".")[1].zfill(2)])
        return {fig_num: scripts}


def process_notebook(file_name):
    chap_num, chap_name = file_name.split("/")[-1].split(".")[0].split("_", 1)
    chap_num = chap_num.replace("chapter", "").zfill(2)
    chap_name = chap_name.replace("_figures", "")
    nb = nbformat.read(file_name, as_version=4)

    scripts = map(get_fig_wise_scripts, zip(nb["cells"], nb["cells"][1:]))
    scripts = filter(None, scripts)
    # https://stackoverflow.com/a/15714097
    scripts = reduce(lambda x, y: x.update(y) or x, scripts, {})
    return {f"{chap_num}_{chap_name}": scripts}


master_metadata = map(process_notebook, old_nb_files)
master_metadata = reduce(lambda x, y: x.update(y) or x, master_metadata, {})

scripts = list(set(jax.tree_leaves(master_metadata)))
print(f"Found {len(set(scripts))} unique scripts")

# Check appendix to see full output mapping

Found 164 unique scripts


## Process the code

Ways to import modules in python
* `import foo`
* `import foo as bar`
* `import foo.bar`
* `import foo.bar as bar`
* `from foo import bar`
* `from foo import *`
* `from foo.bar import baz`
* `from foo.bar import baz as qux`

In [5]:
def get_module(line):
    line = line.rstrip()
    import_kw = None

    if line.lstrip().startswith("import "):
        import_kw = "import "
    elif line.lstrip().startswith("from "):
        import_kw = "from "

    if import_kw:
        module = line.lstrip()[len(import_kw) :].split(" ")[0].split(".")[0]
        return module, import_kw
    return (None, None)


def get_modules_from_script(file_name):
    try:
        with open(os.path.join(scripts_path, file_name)) as f:
            code = f.read()
        codelines = code.split("\n")
        modules = set(filter(None, map(lambda x: get_module(x)[0], codelines)))
        return modules
    except FileNotFoundError:
        print(f"{file_name} not found")

In [6]:
INBUILT_MODULES = [
    "__future__",
    "collections",
    "functools",
    "io",
    "itertools",
    "math",
    "os",
    "pathlib",
    "pprint",
    "random",
    "sys",
    "time",
    "timeit",
    "warnings",
    "mpl_toolkits",
]
REMOVE_MODULES = ["superimport"]
SCRIPT_MODULES = [
    "rvm_regressor",
    "gmm_lib",
    "rvm_classifier",
    "gauss_utils",
    "prefit_voting_classifier",
    "mix_bernoulli_lib",
    "fisher_lda_fit",
]
TRANSFORM_MODULES = {"PIL": "pillow", "tensorflow_probability": "tensorflow-probability", "sklearn": "scikit-learn"}
with open("../requirements.txt") as f:
    REQ_MODULES = f.read().strip().split("\n")
# TODO: Replace import pyprobml_utils with probml_utils

#### Corrected scripts:
* Figure 2.5: typo_fix: changed anscobmes_quartet.py to anscombes_quartet.py
* Figure 3.13: name_change: changed mix_ber_em_mnist.py to mix_bernoulli_em_mnist.py
* Figure 4.17: missing: gaussInferParamsMean2d.py is not present in scripts folder (changed to gauss_infer_2d.py)
* Figure 9.5: name_change: changed fisher_vowel_demo.py to fisher_discrim_vowel.py

In [7]:
all_modules = reduce(
    lambda x, y: x.union(y) or x, filter(None, map(get_modules_from_script, jax.tree_leaves(master_metadata)))
)

In [8]:
check_modules = all_modules - set(INBUILT_MODULES) - set(SCRIPT_MODULES) - set(REQ_MODULES) - set(REMOVE_MODULES)

In [9]:
for module in check_modules:
    try:
        if module in TRANSFORM_MODULES:
            module_install = TRANSFORM_MODULES[module]
        else:
            module_install = module
        exec(f"import {module}")
    except Exception as e:
        print(e)
        print(module, "failed")

No module named 'pyprobml_utils'
pyprobml_utils failed


In [10]:
def get_white_space(line):
    space = 0
    while line[0] == " ":
        line = line[1:]
        space += 1
    return space * " "


def convert_py_to_ipynb(file_name, chapter, fig_num, prev=""):
    chap_num, _ = chapter.split("_", 1)
    current_modules = set()
    new_lines = []
    notebook = nbformat.v4.new_notebook()

    with open(os.path.join(scripts_path, file_name)) as f:
        code = f.read().strip()
    codelines = code.split("\n")
    for line in codelines:
        # Ignore superimport
        if line.strip().startswith("import superimport"):
            continue

        # consistently use savefig only
        line = line.replace("save_fig", "savefig")

        # change folder path
        line = line.replace("../figures", "figures")

        # Change pyprobml_utils to probml_utils
        if "pyprobml_utils" in line:
            line = line.replace("pyprobml_utils", "probml_utils")
            current_modules.add("probml_utils")

        # Check if the line is an import command
        module, import_kw = get_module(line)
        if module:
            if module in SCRIPT_MODULES:
                if import_kw == "import ":
                    if " as " in line:
                        line = line.replace(f"{module}", f"probml_utils.{module}", 1)
                    else:
                        line = line.replace(f"{module}", f"probml_utils.{module} as {module}", 1)
                elif import_kw == "from ":
                    line = line.replace(f"{module}", f"probml_utils.{module}", 1)
                else:
                    raise NameError()
            elif module not in INBUILT_MODULES + REQ_MODULES + list(current_modules):
                current_modules.add(module)
                module_install = TRANSFORM_MODULES[module] if module in TRANSFORM_MODULES else module
                space = get_white_space(line)
                line = f"{space}try:\n    {space}{line}\n{space}except ModuleNotFoundError:\n    {space}%pip install {module_install}\n    {space}{line}"

        new_lines.append(line)
    new_code = "\n".join(new_lines) + "\n"
    if len(prev) == 0:
        notebook["cells"].append(nbformat.v4.new_code_cell(new_code))
    else:
        notebook["cells"].append(nbformat.v4.new_markdown_cell(prev))

    save_path = f"../notebooks/book1/{chap_num}"
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    nbformat.write(notebook, os.path.join(save_path, f"{file_name.replace('.py', '.ipynb')}"))
    print(f"{file_name} saved")

## Convert

In [11]:
global_store = []
global_chap = []
repo_path = "https://github.com/probml/pyprobml/tree/master/notebooks/book1"
for chapter in sorted(master_metadata):
    local_store = []
    for fig_num, script_names in master_metadata[chapter].items():
        for script_name in script_names:
            print(f"Processing: chapter {chapter}, figure {fig_num}, script_name {script_name}")
            prev = ""
            if script_name in global_store:
                if script_name not in local_store:
                    idx = global_store.index(script_name)
                    chap_num = global_chap[idx]
                    prev = f"Source of this notebook is here:  {repo_path}/{chap_num}/{script_name.replace('.py', '.ipynb')}"
                    print("##### PREV triggered. duplicate of", chap_num, script_name)
            else:
                global_store.append(script_name)
                global_chap.append(chapter.split("_", 1)[0])
                local_store.append(script_name)

            convert_py_to_ipynb(script_name, chapter, fig_num, prev)

Processing: chapter 01_introduction, figure 01.03, script_name iris_plot.py
iris_plot.py saved
Processing: chapter 01_introduction, figure 01.05, script_name linreg_residuals_plot.py
linreg_residuals_plot.py saved
Processing: chapter 01_introduction, figure 01.06, script_name linreg_2d_surface_demo.py
linreg_2d_surface_demo.py saved
Processing: chapter 01_introduction, figure 01.07, script_name linreg_poly_vs_degree.py
linreg_poly_vs_degree.py saved
Processing: chapter 01_introduction, figure 01.08, script_name iris_kmeans.py
iris_kmeans.py saved
Processing: chapter 01_introduction, figure 01.09, script_name iris_pca.py
iris_pca.py saved
Processing: chapter 01_introduction, figure 01.12, script_name mnist_viz_tf.py
mnist_viz_tf.py saved
Processing: chapter 01_introduction, figure 01.12, script_name emnist_viz_pytorch.py
emnist_viz_pytorch.py saved
Processing: chapter 01_introduction, figure 01.13, script_name fashion_viz_tf.py
fashion_viz_tf.py saved
Processing: chapter 01_introduction

discrim_analysis_dboundaries_plot2.py saved
Processing: chapter 09_linear_discriminant_analysis, figure 09.04, script_name fisher_lda_demo.py
fisher_lda_demo.py saved
Processing: chapter 09_linear_discriminant_analysis, figure 09.05, script_name fisher_discrim_vowel.py
fisher_discrim_vowel.py saved
Processing: chapter 09_linear_discriminant_analysis, figure 09.08, script_name generativeVsDiscrim.py
generativeVsDiscrim.py saved
Processing: chapter 10_logistic_regression, figure 10.01, script_name iris_logreg.py
##### PREV triggered. duplicate of 02 iris_logreg.py
iris_logreg.py saved
Processing: chapter 10_logistic_regression, figure 10.02, script_name sigmoid_2d_plot.py
sigmoid_2d_plot.py saved
Processing: chapter 10_logistic_regression, figure 10.04, script_name logreg_poly_demo.py
logreg_poly_demo.py saved
Processing: chapter 10_logistic_regression, figure 10.05, script_name iris_logreg_loss_surface.py
iris_logreg_loss_surface.py saved
Processing: chapter 10_logistic_regression, figu

In [12]:
print("Total notebooks:", len(glob("../notebooks/book1/*/*.ipynb")))

Total notebooks: 174


### Save metadata

In [13]:
pd.to_pickle(master_metadata, "metadata_book1.pkl")

In [14]:
print("Everything is done in", time() - init, "seconds")

Everything is done in 8.67478609085083 seconds


# Appendix

## Chapter wise figure number map with scripts

In [15]:
def print_names(key):
    print(f"Chapter_{key}")
    print(yaml.dump(master_metadata[key]))


list(map(print_names, sorted(master_metadata)));

Chapter_01_introduction
'01.03':
- iris_plot.py
'01.05':
- linreg_residuals_plot.py
'01.06':
- linreg_2d_surface_demo.py
'01.07':
- linreg_poly_vs_degree.py
'01.08':
- iris_kmeans.py
'01.09':
- iris_pca.py
'01.12':
- mnist_viz_tf.py
- emnist_viz_pytorch.py
'01.13':
- fashion_viz_tf.py
- cifar_viz_tf.py

Chapter_02_probability_univariate_models
'02.01':
- discrete_prob_dist_plot.py
'02.02':
- gauss_plot.py
- quantile_plot.py
'02.04':
- bimodal_dist_plot.py
'02.05':
- anscombes_quartet.py
'02.06':
- datasaurus_dozen.py
'02.09':
- binom_dist_plot.py
'02.10':
- activation_fun_plot.py
'02.11':
- iris_logreg.py
'02.12':
- softmax_plot.py
'02.13':
- iris_logreg.py
'02.14':
- linreg_1d_hetero_tfp.py
'02.15':
- student_laplace_pdf_plot.py
'02.16':
- robust_pdf_plot.py
'02.17':
- beta_dist_plot.py
- gamma_dist_plot.py
'02.23':
- centralLimitDemo.py
'02.24':
- change_of_vars_demo1d.py

Chapter_03_probability_multivariate_models
'03.05':
- gauss_plot_2d.py
'03.06':
- gauss_plot_2d.py
'03.07':
- ga