In [134]:
import re
from glob import glob
import requests
import pandas as pd
import os
from probml_utils.url_utils import is_dead_url,make_url_from_chapter_no_and_script_name, extract_scripts_name_from_caption
from TexSoup import TexSoup

## Get chapter names

In [130]:
chap_names = {}
for chap_no in range(1,24):
    suppl = f"../../pml-book/pml1/supplements/chap{chap_no}.md"
    with open(suppl, "r") as fp:
        text = fp.read()
    names = re.findall(r"Chapter.+?[(](.+)[)]",text)
    chap_names[chap_no] = names[0]
    print(chap_no, names)

1 ['Introduction']
2 ['Probability: univariate models']
3 ['Probability: multivariate models']
4 ['Statistics']
5 ['Decision theory']
6 ['Information theory']
7 ['Linear algebra']
8 ['Optimization']
9 ['Linear discriminant analysis']
10 ['Logistic regression']
11 ['Linear regression']
12 ['Generalized linear models']
13 ['Neural networks for unstructured data']
14 ['Neural networks for images']
15 ['Neural networks for sequences']
16 ['Exemplar-based methods']
17 ['Kernel methods']
18 ['Trees']
19 ['Learning with fewer labeled examples']
20 ['Dimensionality reduction']
21 ['Clustering']
22 ['Recommender systems']
23 ['Graph embeddings']


In [131]:
df = pd.DataFrame(chap_names.items(), columns=["chap_no","chap_name"])
df

Unnamed: 0,chap_no,chap_name
0,1,Introduction
1,2,Probability: univariate models
2,3,Probability: multivariate models
3,4,Statistics
4,5,Decision theory
5,6,Information theory
6,7,Linear algebra
7,8,Optimization
8,9,Linear discriminant analysis
9,10,Logistic regression


In [132]:
df.to_csv("chapter_no_to_name_mapping.csv", index=None)

## Create a Readme.md

In [133]:
content = '''
# "Probabilistic Machine Learning: An Introduction"

## Chapters
|Chapter|Name| Notebooks|
|-|-|-|
'''

for chap_no in range(1,24):
    chap_url = f"https://github.com/probml/pyprobml/tree/master/notebooks/book1/{chap_no:02d}"
    content+=f"| {chap_no} | {chap_names[chap_no]} | [{chap_no:02d}/]({chap_no:02d}/) |\n"
content

'\n# "Probabilistic Machine Learning: An Introduction"\n\n## Chapters\n|Chapter|Name| Notebooks|\n|-|-|-|\n| 1 | Introduction | [01/](01/) |\n| 2 | Probability: univariate models | [02/](02/) |\n| 3 | Probability: multivariate models | [03/](03/) |\n| 4 | Statistics | [04/](04/) |\n| 5 | Decision theory | [05/](05/) |\n| 6 | Information theory | [06/](06/) |\n| 7 | Linear algebra | [07/](07/) |\n| 8 | Optimization | [08/](08/) |\n| 9 | Linear discriminant analysis | [09/](09/) |\n| 10 | Logistic regression | [10/](10/) |\n| 11 | Linear regression | [11/](11/) |\n| 12 | Generalized linear models | [12/](12/) |\n| 13 | Neural networks for unstructured data | [13/](13/) |\n| 14 | Neural networks for images | [14/](14/) |\n| 15 | Neural networks for sequences | [15/](15/) |\n| 16 | Exemplar-based methods | [16/](16/) |\n| 17 | Kernel methods | [17/](17/) |\n| 18 | Trees | [18/](18/) |\n| 19 | Learning with fewer labeled examples | [19/](19/) |\n| 20 | Dimensionality reduction | [20/](20/) 

In [6]:
readme_file = "../notebooks/book1/README.md"
with open(readme_file,"w") as fp:
    fp.write(content)

## Chapterwise README.md

In [135]:
with open("pml1.lof") as fp:
    LoF_File_Contents = fp.read()
    soup = TexSoup(LoF_File_Contents)
    
    # create mapping of fig_no to list of script_name

    url_mapping = {}
    for caption in soup.find_all("numberline"):
        fig_no = str(caption.contents[0])
        extracted_scripts = extract_scripts_name_from_caption(str(caption))
        if len(extracted_scripts) == 1:
            url_mapping[fig_no] = extracted_scripts[0]+""
        elif len(extracted_scripts) > 1:
            url_mapping[fig_no] = "fig_"+fig_no.replace(".","_")+".ipynb"
        else:
            url_mapping[fig_no] = ""

In [136]:
url_mapping

{'1.1': '',
 '1.2': '',
 '1.3': 'iris_plot.ipynb',
 '1.4': 'iris_dtree.ipynb',
 '1.5': 'linreg_residuals_plot.ipynb',
 '1.6': 'linreg_2d_surface_demo.ipynb',
 '1.7': 'linreg_poly_vs_degree.ipynb',
 '1.8': 'iris_kmeans.ipynb',
 '1.9': 'iris_pca.ipynb',
 '1.10': '',
 '1.11': '',
 '1.12': 'fig_1_12.ipynb',
 '1.13': 'fig_1_13.ipynb',
 '1.14': '',
 '1.15': '',
 '2.1': 'discrete_prob_dist_plot.ipynb',
 '2.2': 'fig_2_2.ipynb',
 '2.3': '',
 '2.4': 'bimodal_dist_plot.ipynb',
 '2.5': 'anscombes_quartet.ipynb',
 '2.6': 'datasaurus_dozen.ipynb',
 '2.7': '',
 '2.8': '',
 '2.9': 'binom_dist_plot.ipynb',
 '2.10': 'activation_fun_plot.ipynb',
 '2.11': 'iris_logreg.ipynb',
 '2.12': 'softmax_plot.ipynb',
 '2.13': 'iris_logreg.ipynb',
 '2.14': 'linreg_1d_hetero_tfp.ipynb',
 '2.15': 'student_laplace_pdf_plot.ipynb',
 '2.16': 'robust_pdf_plot.ipynb',
 '2.17': 'fig_2_17.ipynb',
 '2.18': '',
 '2.19': '',
 '2.20': '',
 '2.21': '',
 '2.22': '',
 '2.23': 'centralLimitDemo.ipynb',
 '2.24': 'change_of_vars_demo1d

In [137]:
chapter_wise_mappping = {}
for fig_no in url_mapping:
    chap_no = int(fig_no.split(".")[0])
    if chap_no not in chapter_wise_mappping:
        chapter_wise_mappping[chap_no] = {}
    chapter_wise_mappping[chap_no][fig_no] = url_mapping[fig_no]
chapter_wise_mappping

{1: {'1.1': '',
  '1.2': '',
  '1.3': 'iris_plot.ipynb',
  '1.4': 'iris_dtree.ipynb',
  '1.5': 'linreg_residuals_plot.ipynb',
  '1.6': 'linreg_2d_surface_demo.ipynb',
  '1.7': 'linreg_poly_vs_degree.ipynb',
  '1.8': 'iris_kmeans.ipynb',
  '1.9': 'iris_pca.ipynb',
  '1.10': '',
  '1.11': '',
  '1.12': 'fig_1_12.ipynb',
  '1.13': 'fig_1_13.ipynb',
  '1.14': '',
  '1.15': ''},
 2: {'2.1': 'discrete_prob_dist_plot.ipynb',
  '2.2': 'fig_2_2.ipynb',
  '2.3': '',
  '2.4': 'bimodal_dist_plot.ipynb',
  '2.5': 'anscombes_quartet.ipynb',
  '2.6': 'datasaurus_dozen.ipynb',
  '2.7': '',
  '2.8': '',
  '2.9': 'binom_dist_plot.ipynb',
  '2.10': 'activation_fun_plot.ipynb',
  '2.11': 'iris_logreg.ipynb',
  '2.12': 'softmax_plot.ipynb',
  '2.13': 'iris_logreg.ipynb',
  '2.14': 'linreg_1d_hetero_tfp.ipynb',
  '2.15': 'student_laplace_pdf_plot.ipynb',
  '2.16': 'robust_pdf_plot.ipynb',
  '2.17': 'fig_2_17.ipynb',
  '2.18': '',
  '2.19': '',
  '2.20': '',
  '2.21': '',
  '2.22': '',
  '2.23': 'centralLimi

In [138]:
book1_figures = os.listdir("../../pml-book/book1-figures/")
image_mapping = {}
for each in book1_figures:
    fig_no = re.findall(r"\d+\.\d+", each)[0]
    try:
        image_mapping[fig_no].append(each)
    except:
        image_mapping[fig_no] = [each]
image_mapping

{'14.27': ['Figure_14.27_A.jpg', 'Figure_14.27_B.png'],
 '21.18': ['Figure_21.18.png'],
 '15.22': ['Figure_15.22_A.png', 'Figure_15.22_B.png'],
 '8.5': ['Figure_8.5_A.png', 'Figure_8.5_B.png'],
 '4.6': ['Figure_4.6.png'],
 '19.8': ['Figure_19.8_B.png', 'Figure_19.8_A.png'],
 '11.9': ['Figure_11.9_B.png', 'Figure_11.9_A.png'],
 '7.8': ['Figure_7.8_B.png', 'Figure_7.8_A.png'],
 '8.17': ['Figure_8.17.png'],
 '19.13': ['Figure_19.13.png'],
 '14.13': ['Figure_14.13.png'],
 '16.5': ['Figure_16.5_A.png', 'Figure_16.5_B.png'],
 '23.3': ['Figure_23.3.png'],
 '20.26': ['Figure_20.26_B.png', 'Figure_20.26_A.png'],
 '13.13': ['Figure_13.13.png'],
 '13.15': ['Figure_13.15_A.png', 'Figure_13.15_B.png'],
 '14.16': ['Figure_14.16_B.png', 'Figure_14.16_A.png'],
 '15.6': ['Figure_15.6.png'],
 '7.2': ['Figure_7.2_A.png', 'Figure_7.2_B.png'],
 '15.3': ['Figure_15.3.png'],
 '21.21': ['Figure_21.21_B.png', 'Figure_21.21_A.png'],
 '14.17': ['Figure_14.17_B.png', 'Figure_14.17_A.png'],
 '13.6': ['Figure_13.6_

In [139]:
def get_figure_text(fig_no):
    if fig_no not in image_mapping:
        return "-"

    url = "https://github.com/probml/pml-book/blob/main/book1-figures/"
    text = ""
    for fig in image_mapping[fig_no]:
        text += f"[{fig}]({os.path.join(url,fig)})<br/>"
    return text

In [140]:
def extract_url(line):
    links = re.findall(r"(https.+)?\)" ,line)
    if links:
        return links
    return None

In [146]:
dead = []
for chap_no in chapter_wise_mappping:
    if chap_no == 23:
        continue #not present in pyprobml
    content = f'''
# Chapter {chap_no}: {chap_names[chap_no]}

## Figures

|Figure No. | Notebook | Figure |
|--|--|--|
'''
    for fig_no in chapter_wise_mappping[chap_no]:
        notebook_link = f"[{chapter_wise_mappping[chap_no][fig_no]}]({chapter_wise_mappping[chap_no][fig_no]})" if chapter_wise_mappping[chap_no][fig_no] != "" else "-"
        content += f"| {fig_no} | {notebook_link} "
        content+= f"| {get_figure_text(fig_no)} |\n"
        
    # append supplementary 
    
    
    
    suppl = f"../../pml-book/pml1/supplements/chap{chap_no}.md"
    with open(suppl, "r") as fp:
        text = fp.read()
    print(chap_no,len(text.split("\n")))
    if len(text.split("\n")) > 3:
        content += "## Supplementary material\n"
        text = "\n".join(text.split("\n")[1:])
        #change tutorial location from probml_notebooks to pyprobml
        text = text.replace("https://github.com/probml/probml-notebooks/blob/main/markdown/","https://github.com/probml/pyprobml/tree/master/tutorials/")
        content+=text
    
    #print(content)
    
    # save this as README.md
    readme_file = f"../notebooks/book1/{chap_no:02d}/README.md"
    with open(readme_file,"w") as fp:
        fp.write(content)


1 12
2 4
3 4
4 5
5 5
6 3
7 4
8 9
9 3
10 6
11 9
12 3
13 13
14 17
15 20
16 4
17 3
18 5
19 6
20 10
21 3
22 5


In [None]:
        
#     for line in lines:
#         links = extract_url(line)
#         if links:
#             for link in links:
#                 if "http" in link and is_dead_url(link):
#                     print(link)
#                     dead.append(link)
#     text = "\n".join(lines)
#     content+=text