This notebook performs the following steps:
1. Take "pml1.lof" as input and create a mapping from fig_no to urls.
2. Create a dummy notebook for each figure that has multiple notebooks. Add the (preprocessed) caption of such a figure to its dummy notebook; the caption contains pointers to the original notebooks of the subfigures.
3. Create the mapping from fig_no to url again, but this time every fig_no has only one url.
4. Check for dead urls.
5. Firestore experiments: raw code to upload these urls to Firestore.

In [1]:
from TexSoup import TexSoup
import regex as re
import os
import nbformat as nbf
import pandas as pd

try:
    from probml_utils.url_utils import (
        extract_scripts_name_from_caption,
        check_dead_urls,
        is_dead_url,
        github_url_to_colab_url,
        make_url_from_fig_no_and_script_name,
        figure_url_mapping_from_lof,
    )
except ModuleNotFoundError:
    %pip install git+https://github.com/probml/probml-utils.git
    from probml_utils.url_utils import (
        extract_scripts_name_from_caption,
        check_dead_urls,
        is_dead_url,
        github_url_to_colab_url,
        make_url_from_fig_no_and_script_name,
        figure_url_mapping_from_lof,
    )

## Mapping

In [2]:
# Build the fig_no -> notebook-url mapping from the LaTeX list-of-figures file "pml1.lof".
# NOTE(review): the second argument ("") is presumably the location where the helper saves
# the mapping (the cell output ends with "is saved in" followed by nothing) — confirm
# against probml_utils.url_utils.figure_url_mapping_from_lof.
fig_no_urls_mapping = figure_url_mapping_from_lof("pml1.lof", "")

Mapping of 219 urls is saved in 


In [3]:
# Display the mapping: keys are figure numbers (e.g. '1.3'), values are notebook urls.
fig_no_urls_mapping

{'1.3': 'https://colab.research.google.com/github/probml/pyprobml/blob/master/notebooks/book1/01/iris_plot.ipynb',
 '1.4': 'https://colab.research.google.com/github/probml/pyprobml/blob/master/notebooks/book1/01/iris_dtree.ipynb',
 '1.5': 'https://colab.research.google.com/github/probml/pyprobml/blob/master/notebooks/book1/01/linreg_residuals_plot.ipynb',
 '1.6': 'https://colab.research.google.com/github/probml/pyprobml/blob/master/notebooks/book1/01/linreg_2d_surface_demo.ipynb',
 '1.7': 'https://colab.research.google.com/github/probml/pyprobml/blob/master/notebooks/book1/01/linreg_poly_vs_degree.ipynb',
 '1.8': 'https://colab.research.google.com/github/probml/pyprobml/blob/master/notebooks/book1/01/iris_kmeans.ipynb',
 '1.9': 'https://colab.research.google.com/github/probml/pyprobml/blob/master/notebooks/book1/01/iris_pca.ipynb',
 '1.12': 'https://colab.research.google.com/github/probml/pyprobml/blob/master/notebooks/book1/01/fig_1_12.ipynb',
 '1.13': 'https://colab.research.google.c

### Multinotebooks

In [6]:
# Load the raw list-of-figures file; its captions are written in LaTeX.
with open("pml1.lof") as lof_file:
    LoF_File_Contents = lof_file.read()
print(LoF_File_Contents)  # read() already returns a str, so it is printed as-is

\boolfalse {citerequest}\boolfalse {citetracker}\boolfalse {pagetracker}\boolfalse {backtracker}\relax 
\defcounter {refsection}{0}\relax 
\addvspace {10\p@ }
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {1.1}{\ignorespaces Three types of Iris flowers: Setosa, Versicolor and Virginica. Used with kind permission of Dennis Kramb and SIGNA. \relax }}{2}{figure.caption.8}%
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {1.2}{\ignorespaces Illustration of the image classification problem. From \url {https://cs231n.github.io/}. Used with kind permission of Andrej Karpathy. \relax }}{3}{figure.caption.9}%
\defcounter {refsection}{0}\relax 
\contentsline {figure}{\numberline {1.3}{\ignorespaces Visualization of the Iris data as a pairwise scatter plot. On the diagonal we plot the marginal distribution of each feature for each class. The off-diagonals contain scatterplots of all possible pairs of features. Generated by \href {https://colab.researc

In [7]:
# Parse the LaTeX source into a TexSoup tree; later cells query it with soup.find_all(...).
soup = TexSoup(LoF_File_Contents)

In [8]:
# Keep only the figures whose url contains "fig_" (the dummy-notebook naming used
# for figures that are generated by more than one notebook).
multi_notebooks = {
    fig_no: url for fig_no, url in fig_no_urls_mapping.items() if "fig_" in url
}
for url in multi_notebooks.values():
    print(url)
more_than_one = len(multi_notebooks)
more_than_one

https://colab.research.google.com/github/probml/pyprobml/blob/master/notebooks/book1/01/fig_1_12.ipynb
https://colab.research.google.com/github/probml/pyprobml/blob/master/notebooks/book1/01/fig_1_13.ipynb
https://colab.research.google.com/github/probml/pyprobml/blob/master/notebooks/book1/02/fig_2_2.ipynb
https://colab.research.google.com/github/probml/pyprobml/blob/master/notebooks/book1/02/fig_2_17.ipynb
https://colab.research.google.com/github/probml/pyprobml/blob/master/notebooks/book1/04/fig_4_14.ipynb
https://colab.research.google.com/github/probml/pyprobml/blob/master/notebooks/book1/04/fig_4_20.ipynb
https://colab.research.google.com/github/probml/pyprobml/blob/master/notebooks/book1/05/fig_5_2.ipynb
https://colab.research.google.com/github/probml/pyprobml/blob/master/notebooks/book1/05/fig_5_10.ipynb
https://colab.research.google.com/github/probml/pyprobml/blob/master/notebooks/book1/08/fig_8_1.ipynb
https://colab.research.google.com/github/probml/pyprobml/blob/master/noteboo

25

### Caption reading and cleaning to put in notebook

In [9]:
def convert_to_ipynb(file):
    """
    Return the notebook (.ipynb) name that corresponds to a script name.

    Parameters
    ----------
    file : str
        Script or notebook file name, e.g. "foo.py" or "foo.ipynb".

    Returns
    -------
    str
        `file` with a trailing ".py" replaced by ".ipynb"; any other name
        is returned unchanged.
    """
    # Test the suffix, not a substring: a name that merely contains ".py"
    # (e.g. "a.pyx.ipynb") must not lose its last three characters.
    if file.endswith(".py"):
        return file[: -len(".py")] + ".ipynb"
    return file

In [10]:
# Build a mapping fig_no -> cleaned caption (markdown text) for every figure listed in
# multi_notebooks. Script references inside a caption are rewritten as markdown hyperlinks.
py_pattern = r"\{\S+?\.py\}"
ipynb_pattern = r"\{\S+?\.ipynb\}"  # find {foo.ipynb} from caption

whole_link_ipynb = r"\{\S+\.ipynb\}"  # find {https://<path/to/>foo.ipynb}{foo.ipynb} from caption
whole_link_py = r"\{\S+\.py\}"

# LaTeX fragments removed from the raw caption text, applied in this order.
latex_tokens_to_strip = (
    r"\ignorespaces",
    r" \relax",
    r"\href",
    r"\url",
    r'\cc@accent {"705E}',
    r"\numberline",
    r"\bm",
    r"\DOTSB",
    r"\slimits",
    r"\oset",
)

# Extract the caption from "{4.13}{caption}". The braces and the dot are escaped so that
# they are matched literally instead of being read as regex metacharacters.
caption_pattern = r"\{\d+\.\d+\}\{(.*)\}"

fig_cnt = 0
cleaned_caption = {}
for numberline in soup.find_all("numberline"):
    fig_no = str(numberline.contents[0])

    # skip figures that are not in multi_notebooks
    if fig_no not in multi_notebooks:
        continue

    # work on a separate string so the loop variable keeps pointing at the parsed node
    caption = str(numberline)
    for token in latex_tokens_to_strip:
        caption = caption.replace(token, "")

    links = re.findall(whole_link_ipynb, caption) + re.findall(whole_link_py, caption)
    for link in links:
        script = extract_scripts_name_from_caption(link)[0]
        script_ipynb = convert_to_ipynb(script)
        notebook_url = make_url_from_fig_no_and_script_name(fig_no, script_ipynb)
        # replace the raw LaTeX link with a markdown hyperlink
        caption = caption.replace(link, f"[{script_ipynb}]({notebook_url})")

    cleaned_caption[fig_no] = re.findall(caption_pattern, caption)[0].strip()

cleaned_caption

{'1.12': '(a) Visualization of the MNIST dataset. Each image is $28 \\times 28$. There are 60k training examples and 10k test examples. We show the first 25 images from the training set. Generated by [mnist_viz_tf.ipynb](https://colab.research.google.com/github/probml/pyprobml/blob/master/notebooks/book1/01/mnist_viz_tf.ipynb) . (b) Visualization of the EMNIST dataset. There are 697,932 training examples, and 116,323 test examples, each of size $28 \\times 28$. There are 62 classes (a-z, A-Z, 0-9). We show the first 25 images from the training set. Generated by [emnist_viz_jax.ipynb](https://colab.research.google.com/github/probml/pyprobml/blob/master/notebooks/book1/01/emnist_viz_jax.ipynb) .',
 '1.13': '(a) Visualization of the Fashion-MNIST dataset \\citep{fashion}. The dataset has the same size as MNIST, but is harder to classify. There are 10 classes: T-shirt/top, Trouser, Pullover, Dress, Coat, Sandal, Shirt, Sneaker, Bag, Ankle-boot. We show the first 25 images from the training

### Write caption to dummy ipynb file

In [11]:
def make_dummy_notebook_name(fig_no):
    """
    Build the dummy notebook file name for a figure number,
    e.g. "1.11" -> "fig_1_11.ipynb"
    """
    underscored = "_".join(fig_no.split("."))
    return "fig_" + underscored + ".ipynb"

In [12]:
# Create one dummy notebook per entry of cleaned_caption; each notebook holds the
# cleaned caption as a single markdown cell.
# https://stackoverflow.com/questions/38193878/how-to-create-modify-a-jupyter-notebook-from-code-python
book_no = 1
relative_path = "../notebooks/"
cnt = 0
for fig_no in cleaned_caption:

    # make relative path for new dummy notebook:
    # ../notebooks/book<book_no>/<two-digit chapter>/fig_<chapter>_<figure>.ipynb
    chapter_no = int(fig_no.split(".")[0])
    dummy_notebook = make_dummy_notebook_name(fig_no)
    fig_path = os.path.join(relative_path, f"book{book_no}/{chapter_no:02d}", dummy_notebook)
    # Print the file name directly: splitting fig_path on "/" gives the wrong result
    # wherever os.path.join inserts a different separator.
    print(dummy_notebook, end="\n")

    nb = nbf.v4.new_notebook()
    nb["cells"] = [nbf.v4.new_markdown_cell(cleaned_caption[fig_no])]
    # Write as UTF-8 explicitly so caption text does not depend on the platform's
    # default encoding.
    with open(fig_path, "w", encoding="utf-8") as f:
        nbf.write(nb, f)
    cnt += 1
print(cnt)

fig_1_12.ipynb
fig_1_13.ipynb
fig_2_2.ipynb
fig_2_17.ipynb
fig_4_14.ipynb
fig_4_20.ipynb
fig_5_2.ipynb
fig_5_10.ipynb
fig_8_1.ipynb
fig_8_14.ipynb
fig_8_26.ipynb
fig_11_10.ipynb
fig_11_19.ipynb
fig_18_4.ipynb
fig_20_24.ipynb
fig_20_25.ipynb
fig_20_26.ipynb
fig_20_30.ipynb
fig_20_31.ipynb
fig_20_33.ipynb
fig_20_36.ipynb
fig_20_37.ipynb
fig_20_38.ipynb
fig_20_41.ipynb
fig_21_11.ipynb
25


### Check dead_urls if any

In [26]:
# Run the probml_utils dead-url check over the whole mapping and keep its result.
# NOTE(review): print_dead_url=True presumably makes the helper print each dead url
# (the list shown in the cell output) — confirm against probml_utils.url_utils.
dead_url_status = check_dead_urls(fig_no_urls_mapping, print_dead_url=True)

https://github.com/probml/pyprobml/blob/master/notebooks/book1/01/fig_1_12.ipynb
https://github.com/probml/pyprobml/blob/master/notebooks/book1/01/fig_1_13.ipynb
https://github.com/probml/pyprobml/blob/master/notebooks/book1/11/fig_11_10.ipynb
https://github.com/probml/pyprobml/blob/master/notebooks/book1/11/fig_11_19.ipynb
https://github.com/probml/pyprobml/blob/master/notebooks/book1/18/fig_18_4.ipynb
https://github.com/probml/pyprobml/blob/master/notebooks/book1/19/hbayes_maml.ipynb
https://github.com/probml/pyprobml/blob/master/notebooks/book1/02/fig_2_17.ipynb
https://github.com/probml/pyprobml/blob/master/notebooks/book1/02/fig_2_2.ipynb
https://github.com/probml/pyprobml/blob/master/notebooks/book1/20/fig_20_24.ipynb
https://github.com/probml/pyprobml/blob/master/notebooks/book1/20/fig_20_25.ipynb
https://github.com/probml/pyprobml/blob/master/notebooks/book1/20/fig_20_26.ipynb
https://github.com/probml/pyprobml/blob/master/notebooks/book1/20/fig_20_30.ipynb
https://github.com/p

### Convert github url to colab

In [30]:
# Same keys as fig_no_urls_mapping, with every url passed through github_url_to_colab_url.
fig_no_urls_mapping_colab = {
    fig_no: github_url_to_colab_url(url) for fig_no, url in fig_no_urls_mapping.items()
}
fig_no_urls_mapping_colab

{'1.3': 'https://colab.research.google.com/github/probml/pyprobml/blob/master/notebooks/book1/01/iris_plot.ipynb',
 '1.4': 'https://colab.research.google.com/github/probml/pyprobml/blob/master/notebooks/book1/01/iris_dtree.ipynb',
 '1.5': 'https://colab.research.google.com/github/probml/pyprobml/blob/master/notebooks/book1/01/linreg_residuals_plot.ipynb',
 '1.6': 'https://colab.research.google.com/github/probml/pyprobml/blob/master/notebooks/book1/01/linreg_2d_surface_demo.ipynb',
 '1.7': 'https://colab.research.google.com/github/probml/pyprobml/blob/master/notebooks/book1/01/linreg_poly_vs_degree.ipynb',
 '1.8': 'https://colab.research.google.com/github/probml/pyprobml/blob/master/notebooks/book1/01/iris_kmeans.ipynb',
 '1.9': 'https://colab.research.google.com/github/probml/pyprobml/blob/master/notebooks/book1/01/iris_pca.ipynb',
 '1.12': 'https://colab.research.google.com/github/probml/pyprobml/blob/master/notebooks/book1/01/fig_1_12.ipynb',
 '1.13': 'https://colab.research.google.c

### Firestore experiments

In [1]:
try:
    from firebase_admin import credentials, firestore, initialize_app
except ModuleNotFoundError:
    %pip install firebase_admin
    from firebase_admin import credentials, firestore, initialize_app
from IPython.display import clear_output

import json
import os

In [2]:
from firebase_admin import get_app

# Service-account key used to authenticate against the Firebase project.
key_path = "../../key_probml_gcp.json"
cred = credentials.Certificate(key_path)
try:
    default_app = initialize_app(cred)
except ValueError:
    # initialize_app raises ValueError when the default app already exists, which
    # happens whenever this cell is re-run in the same kernel; reuse that app.
    default_app = get_app()
db = firestore.client()
db

<google.cloud.firestore_v1.client.Client at 0x7fcef41cac40>

In [33]:
# Spot-check: read the "link" field stored for figure 1.3 of book1.
book1_figures = db.collection("figures").document("book1").collection("figures_new")
book1_figures.document("1.3").get().to_dict()["link"]

'https://github.com/probml/pyprobml/blob/master/notebooks/book1/01/iris_plot.ipynb'

In [36]:
level1_collection = "figures"
level2_document = "book1"
level3_collection = "figures_new"

# Resolve the target sub-collection once, using the names declared above instead of
# repeating the string literals on every iteration.
figures_collection = db.collection(level1_collection).document(level2_document).collection(level3_collection)

# Iterate over the colab mapping itself, so each uploaded link comes from the same
# dict that supplies the key.
for fig_no, colab_url in fig_no_urls_mapping_colab.items():
    clear_output(wait=True)  # keep only the most recently uploaded figure number on screen
    # merge=True writes the "link" field without replacing the rest of an existing document
    figures_collection.document(fig_no).set({"link": colab_url}, merge=True)
    print(fig_no)

22.4


In [42]:
# Put the colab and github urls side by side, indexed by figure number, and save a CSV copy.
df_colab = pd.DataFrame(fig_no_urls_mapping_colab.items(), columns=["fig_no", "url_colab"])
df_github = pd.DataFrame(fig_no_urls_mapping.items(), columns=["fig_no", "url_github"])
df = df_colab.merge(right=df_github, on="fig_no").set_index(keys=["fig_no"], drop=True)
df.to_csv("old_url_mapping_book1.csv")
df

Unnamed: 0_level_0,url_colab,url_github
fig_no,Unnamed: 1_level_1,Unnamed: 2_level_1
1.3,https://colab.research.google.com/github/probm...,https://github.com/probml/pyprobml/blob/master...
1.4,https://colab.research.google.com/github/probm...,https://github.com/probml/pyprobml/blob/master...
1.5,https://colab.research.google.com/github/probm...,https://github.com/probml/pyprobml/blob/master...
1.6,https://colab.research.google.com/github/probm...,https://github.com/probml/pyprobml/blob/master...
1.7,https://colab.research.google.com/github/probm...,https://github.com/probml/pyprobml/blob/master...
...,...,...
21.17,https://colab.research.google.com/github/probm...,https://github.com/probml/pyprobml/blob/master...
21.18,https://colab.research.google.com/github/probm...,https://github.com/probml/pyprobml/blob/master...
21.19,https://colab.research.google.com/github/probm...,https://github.com/probml/pyprobml/blob/master...
22.3,https://colab.research.google.com/github/probm...,https://github.com/probml/pyprobml/blob/master...
