In [40]:
import jax
import requests
from typing import Any
from TexSoup import TexSoup
import regex as re
import os
import pandas as pd
from glob import glob
import multiprocessing as mp
from probml_utils.url_utils import extract_scripts_name_from_caption
from IPython.display import clear_output

In [11]:
# Glob patterns for the LaTeX sources of each book: every .tex file located
# exactly one subdirectory below the book's root folder.
path1 = "../bookv2/book1/*/*.tex"
path2 = "../bookv2/book2/*/*.tex"

In [12]:
# glob() returns matches in arbitrary, filesystem-dependent order. Sort each
# book's matches so that every run processes the files in the same sequence
# (book 2 files first, then book 1 files).
tex_files = sorted(glob(path2)) + sorted(glob(path1))
len(tex_files)

607

In [13]:
def make_soup(tex_file_path):
    """Check whether a LaTeX file can be parsed by TexSoup.

    The parsed tree is discarded; only the outcome of the parse is reported.

    Args:
        tex_file_path: Path of the .tex file to read.

    Returns:
        None when parsing succeeds, otherwise ``tex_file_path`` itself (a
        failure message is printed first).

    Errors raised while opening or reading the file are not caught and
    propagate to the caller.
    """
    with open(tex_file_path, "r") as fp:
        contents = fp.read()
    try:
        TexSoup(contents)
    except Exception:
        # Any parser error marks the file as defective. KeyboardInterrupt and
        # SystemExit are deliberately not caught so the run can be aborted.
        print(f"{tex_file_path} failed to read!")
        return tex_file_path
    return None

In [14]:
# Parse-check every file in parallel. Leave two cores free, but never ask for
# fewer than one worker (mp.Pool raises ValueError for a count below 1).
n_workers = max(1, mp.cpu_count() - 2)
# The context manager shuts the worker processes down once map() has returned.
with mp.Pool(n_workers) as pool:
    soups_list = pool.map(make_soup, tex_files)

../bookv2/book2/pf/pf-other.tex failed to read!
../bookv2/book2/opt/bayesopt.tex failed to read!
../bookv2/book2/shift/adversarial.tex failed to read!
../bookv2/book2/lfm/ica.tex failed to read!
../bookv2/book2/vi/recursive-vi.tex failed to read!
../bookv2/book2/pred/testbed.tex failed to read!
../bookv2/book2/mcmc/hmc.tex failed to read!
../bookv2/book2/mcmc/sgmcmc.tex failed to read!
../bookv2/book2/comp/comp-methods.tex failed to read!
../bookv2/book2/ssm/hmm-short.tex failed to read!
../bookv2/book2/vi/intro-vi.tex failed to read!
../bookv2/book2/pf/proposals.tex failed to read!
../bookv2/book2/vi/vb.tex failed to read!
../bookv2/book2/mcmc/bigdata.tex failed to read!
../bookv2/book2/stats/bayes-solns.tex failed to read!
../bookv2/book2/nfm/vae.tex failed to read!
../bookv2/book2/pf/old.tex failed to read!
../bookv2/book2/prob/expfamLong.tex failed to read!
../bookv2/book2/flows/flows.tex failed to read!
../bookv2/book2/stats/josh.tex failed to read!
../bookv2/book2/lfm/topic-inf.t

In [55]:
# Collect the entries of soups_list that are not None (make_soup returns the
# file path when parsing failed) and save them to a plain-text file, one path
# per line.
defective_tex = [path for path in soups_list if path is not None]
with open("tex_defective.txt", "w") as fp:
    for path in defective_tex:
        print(path, file=fp)

In [41]:
# Total number of .tex files found vs. number recorded as defective.
len(tex_files), len(defective_tex)

(607, 48)

In [30]:
# Parse every file that is not listed as defective; results are keyed by path.
soups = {}
for idx, tex_path in enumerate(tex_files):
    # Replace the previous counter so only the latest index stays on screen.
    clear_output(wait=True)
    print(idx)
    if tex_path in defective_tex:
        continue
    with open(tex_path, "r") as fh:
        soups[tex_path] = TexSoup(fh.read())

606


In [42]:
# Build, per book, a mapping from figure name to the height given in its
# \includegraphics command.
fig_name_to_height_book1 = {}
fig_name_to_height_book2 = {}
# [fig_name, fig_height] pairs for figures seen again with a different height
# within the same book; `c` counts them.
repeated_figures = []
c = 0
for tex_file, soup in soups.items():
    # Files whose path contains "book1" feed the book 1 mapping; all others
    # feed the book 2 mapping.
    if "book1" in tex_file:
        book_heights = fig_name_to_height_book1
    else:
        book_heights = fig_name_to_height_book2

    for fig in soup.find_all("figure"):
        for graphic in fig.find_all("includegraphics"):
            # Substitute the macros with literal text so the regexes below can
            # match: a fixed height, a "*6in" factor, and a single directory macro.
            line = (
                str(graphic)
                .replace("\\twofigheight", "1.85in")
                .replace("\\textwidth", "*6in")
                .replace("\\dldir", "\\figdir")
            )

            heights = re.findall(r"height=(.+?in)", line)
            if not heights:
                # Only report commands that mention a height the regex could
                # not extract; commands without any height are skipped silently.
                if "height" in line:
                    print(f"No height: {line}")
                continue

            names = re.findall(r"figdir/(.+)?}", line)
            if not names:
                print(f"No fig_name: {line}")
                continue

            fig_height = heights[0]
            fig_name = names[0]

            # Compare against the mapping of the current book. The original
            # code read an undefined name here and failed on a fresh kernel.
            if fig_name in book_heights and book_heights[fig_name] != fig_height:
                repeated_figures.append([fig_name, fig_height])
                c += 1

            # The most recently seen height wins.
            book_heights[fig_name] = fig_height

No height: \includegraphics[height = 0.6\linewidth, width=0.9\linewidth]{\figdir/model_view.png}


35

In [56]:
# Number of distinct figure names recorded for book 1.
len(fig_name_to_height_book1)

630

In [57]:
# Number of distinct figure names recorded for book 2.
len(fig_name_to_height_book2)

839

In [52]:
# Two-column table (figure name, height) for book 1, ordered by figure name.
df1 = pd.DataFrame(
    list(fig_name_to_height_book1.items()),
    columns=["fig_name", "fig_height"],
).sort_values(by="fig_name")
df1

Unnamed: 0,fig_name,fig_height
419,AAAquad,1.7in
415,AAAsolnExists,2in
394,AAAvectorNorms,1.5in
593,BERT-fig,1.5in
416,ConvexSetsHalfplanes,1.5in
...,...,...
528,xor-heaviside.pdf,1.5in
544,xorTable,1.5in
23,yeastHeatMap,2in
27,yeastKmeans16,2in


In [53]:
# Two-column table (figure name, height) for book 2, ordered by figure name.
df2 = pd.DataFrame(
    list(fig_name_to_height_book2.items()),
    columns=["fig_name", "fig_height"],
).sort_values(by="fig_name")
df2

Unnamed: 0,fig_name,fig_height
714,ADF2,2in
110,AIR-gen,2.5in
111,AIR-inf,2.5in
548,Agg_example1,1.5in
549,Agg_example2,1.5in
...,...,...
559,world-model-overview,1.5in
727,yaroslav-chain,0.6in
728,yaroslav-resnet,1in
729,yaroslav-skip,1in


In [54]:
# Distinct height strings that occur in the book 2 table.
df2["fig_height"].unique()

array(['2in', '2.5in', '1.5in', '1in', '4.5in', '0.75in', '3in', '0.6in',
       '4in', '1.1in', '1.75in', '1.2in', '1.85in', '1.8in', '1.25in',
       '1.65in', '0.2in', '0.8in', '2.7in', '0.5in', '1.525in', '2.25in',
       '1.3in', '0.23*6in', '3.5in', '.5in', '0.195*6in', '.8in', '2.0in',
       '1.4in', '1.6in', '1.57in', '0.181*6in', '0.6*6in', '2.75in',
       '2.2in'], dtype=object)

In [60]:
# Base directory (relative path) that the output file paths are joined onto.
root_path = "../pyprobml/"

In [61]:
# Render the book 1 table as markdown below a title line and write it to disk.
out_path = os.path.join(root_path, "internal/fig_height/fig_height_book1.md")
md = "# Book1 figures to height mapping\n" + df1.to_markdown(index=False)
with open(out_path, "w") as fp:
    fp.write(md)

In [62]:
# Render the book 2 table as markdown below a title line and write it to disk.
out_path = os.path.join(root_path, "internal/fig_height/fig_height_book2.md")
md = "# Book2 figures to height mapping\n" + df2.to_markdown(index=False)
with open(out_path, "w") as fp:
    fp.write(md)