In [1]:
from typing import Union, List, Tuple
import subprocess
import os.path
import re

from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np

In [2]:
def is_valid_latex_math(expr: str) -> bool:
    """Check whether a given string is a valid LaTeX math expression.

        N.B. Notice 'amsmath' is included as it is also used
        when saving .png pictures w. matplotlib
    Args:
        expr: A string representing a LaTeX math expression.

    Returns:
        A boolean indicating whether the expression is valid.

    Raises:
        None.

    Example:
        >>> expr = r'\frac{1}{2} + \sqrt{3}'
        >>> is_valid_latex_math(expr)
        True
    """
    try:
        # Create a LaTeX document with the expression inside a math environment
        input_str = f'\\documentclass{{article}}\\usepackage{{amsmath}}\\begin{{document}}$ {expr} $\\end{{document}}'
        # Compile the LaTeX document using pdflatex with batchmode and halt-on-error options
        subprocess.check_output(['pdflatex', '-halt-on-error', '-interaction=batchmode'], input=input_str.encode(), stderr=subprocess.STDOUT)
        # If there is no error, return True
        return True
    except subprocess.CalledProcessError as e:
        # If there is an error, return False
        return False

In [3]:
def filter_formulas(in_filename: str, nr_equations: int, seed: int = 1) -> List[str]:
    """
    Reads a file containing LaTeX formulas from 'in_filename' and removes all
    occurrences of the '\label{...}' substring from each formula. Each formula is
    separated by two newline characters.

    Parameters:
        in_filename : str
            The name of the input file containing LaTeX formulas.
        nr_equations : int
            The number of equations to choose from the input file.

    Returns:
        List[str] or None:
            The function returns a list of the filtered formulas if successful.
            Returns None if there is an error.

    Raises:
        FileNotFoundError:
            If the input file does not exist.
        IOError:
            If the input file cannot be opened for reading.
        Exception:
            If the specified number of equations to choose is greater than the number of available equations in the input file.
    """

    # Setting seed for numpy RNG
    np.random.seed(seed)

    # Check if the input file exists
    if not os.path.isfile(in_filename):
        raise FileNotFoundError(f"File not found: {in_filename}")

    # Importing initial formulas
    with open(in_filename, 'r', encoding='ISO-8859-1') as f:
        formulas = [line.strip() for line in f.readlines()]
    print(formulas[100])

    # Check if the number of equations to choose is greater than the number of available equations
    if nr_equations > len(formulas):
        raise Exception(f'{nr_equations} equations requested, but only {len(formulas)} available in file: {in_filename}')
    else:
        # Choose a random sample of equations without replacement
        formulas = np.random.permutation(formulas).tolist()

    # Define a regular expression pattern to match \label{...}, % and $ substrings
    pattern = re.compile(r'(\\label\{.*?\}|%|\$)')

    # Remove the \label{...}, % and $ substring from each string in the 'formulas' list
    first_filtering = [pattern.sub('', s) for s in formulas]

    # Remove strings that local latex engine doesn't recognize as genuine latex math
    final_filtering = []
    for idx in tqdm(range(int(len(first_filtering[:nr_equations])*1.1))):
        if len(final_filtering) < nr_equations:
            if is_valid_latex_math(first_filtering[idx]):
                final_filtering.append(first_filtering[idx])
        else:
            break

    # Return the final filtered list of formulas
    return final_filtering


In [4]:
def save_filter_formulas(filtered_formulas: List[str], out_filename: str) -> None:
    """
    Saves the filtered formulas to a file.

    Args:
        filtered_formulas: A list of filtered formulas to be saved.
        out_filename: The name of the output file.

    Returns:
        None
    """
    # Open the output file and write the filtered formulas to it
    with open(out_filename, 'w') as f:
        for line in filtered_formulas:
            # Write each formula to a new line and add a blank line after it
            f.write(line+'\n')


In [5]:
def to_inline_expr(math_expr: str) -> str:
    """
    Convert a LaTeX math expression to an inline math expression.

    Parameters
    ----------
    math_expr : str
        The LaTeX math expression to convert.

    Returns
    -------
    str
        The converted inline math expression.
    """
    return '$' + math_expr + '$'

In [6]:
def latex_to_image(math_expr: str, file_name: str, file_format: str = 'png',
                   picture_dims: Union[str,Tuple[float,float]] = 'A4',
                   resolution: Union[float, int] = 300, transparent: bool = True) -> None:
    """
    Converts a LaTeX math expression to an image in the specified file format and saves it with the specified file name.

    Args:
        math_expr (str): The LaTeX math expression to be converted to an image.
        file_name (str): The name of the file to be saved.
        file_format (str, optional): The format of the output file. Defaults to 'png'.
        picture_dims (Union[str,Tuple[float,float]], optional): The dimensions of the output picture. Can be either a
                                                               string specifying one of the standard dimensions (Letter,
                                                               Legal, A4, A5) or a tuple of the form (width, height).
                                                               Defaults to 'A4'.
        resolution (Union[float, int], optional): The resolution of the output image in DPI (dots per inch). Defaults to 300.
        transparent (bool, optional): Whether to save the image with a transparent background. Defaults to True.

    Raises:
        ValueError: If picture_dims is not one of the standard dimensions (Letter, Legal, A4, A5).
    """

    # Standard square dimensions of output picture
    std_dims = {"Letter":(8.5,14.0), "Legal":(8.5,14.0),
                "A4":(8.3,11.7), "A5":(5.8, 8.3)}

    if picture_dims in list(std_dims.keys()):
        DIMS = std_dims[picture_dims]
    else:
        raise ValueError(f"{picture_dims} not known - should be in: {list(std_dims.keys())}.")

    # Set the LaTeX font
    plt.rcParams['text.usetex'] = True
    plt.rcParams['text.latex.preamble'] = r'\usepackage{amsmath}'

    # Define the math expression as a string
    expr = to_inline_expr(math_expr=math_expr)

    # Add $ symbols to format the string as an inline math expression
    inline_expr = to_inline_expr(math_expr=math_expr)

    # Create a plot with the expression
    fig, ax = plt.subplots(figsize=DIMS)
    ax.text(0.5, 0.5, inline_expr, size=20, ha='center')

    # Remove the plot axes
    ax.set_axis_off()

    # Save the plot as a PNG with a transparent background
    plt.savefig(fname=file_name+"."+file_format, format=file_format, transparent=transparent, bbox_inches='tight', pad_inches=0.0, dpi=resolution)
    plt.close(fig)


In [202]:
# Filtering equations for unwanted tokens and illegitimate math expressions
original_filename = 'Data/im2latex_formulas.lst'
new_filename = 'Data/im2latex_formulas.txt'
filtered_formulas = filter_formulas(in_filename=original_filename,
                                    nr_equations=500)
print("Now has: ", len(filtered_formulas), " formulas")

 93%|█████████▎| 510/550 [01:58<00:09,  4.29it/s]

Now has:  500  formulas





In [203]:
# Rendering equations to A4 sized transparent .png files and saving locally
for line_nr in tqdm(range(len(filtered_formulas))):
    location = 'Data/im2latex_formulas_pictures/'
    filename = f'eq_{line_nr}'
    latex_to_image(math_expr=filtered_formulas[line_nr],file_name=location+filename)

100%|██████████| 500/500 [07:55<00:00,  1.05it/s]


In [206]:
# Writing the chosen equations to a file locally (same order as pictures)
save_filter_formulas(filtered_formulas=filtered_formulas,out_filename=new_filename)

In [229]:
# Filtering equations for unwanted tokens and illegitimate math expressions
original_filename = 'equations.lst'
new_filename = 'filtered_equations.txt'
filtered_formulas = filter_formulas(in_filename=original_filename,
                                    nr_equations=100)
print("Now has: ", len(filtered_formulas), " formulas")

& + \left. \frac{1}{T_m} \sumTs \tilde{f}_{m, s} \eta_{m, st} \iota_{mt} +


100%|██████████| 110/110 [00:14<00:00,  7.63it/s]

Now has:  27  formulas





In [225]:
filtered_formulas

['',
 '\\\\',
 '',
 '\\\\',
 '\\Omega_Z.I \\leq',
 'd \\hat{\\langle P\\rangle}/dR_2 \\\\',
 'm_{kj}^{z_l} = \\vec{\\Phi}(\\alpha_{kj}) \\bigg[m_{kj}^{a_l} + \\sqrt{v_{kj}^{a_l}}\\gamma_{kj}\\bigg]',
 '',
 '',
 'h_{t}(y_t \\mid z_{t})',
 'Z = \\sum_{c =1}^K \\left(\\frac{n_{-i,c} + \\alpha/K}{n - 1 + \\alpha} \\times f_{\\bf x}({\\bf x}_i | \\phi_c)\\right).',
 '',
 '         \\boldsymbol{w}_{t+1}-\\boldsymbol{w}_t=-\\eta \\left(\\nabla L(\\boldsymbol{w}_t)-\\frac{\\boldsymbol{z}_t^{(1)} \\boldsymbol{V_{k}^{pub}}+\\boldsymbol{z}_t^{(2)}}{n}\\right)',
 '\\frac{1}{T_2} \\sum_{t = j + T_1 + 1}^{T}',
 '     ',
 '',
 'EC_i=e_i,',
 '',
 '',
 '',
 '',
 '',
 '',
 '           = 0.',
 '',
 "+\\widetilde{\\Theta}_{k'}^{-}",
 '   :\\: g,h \\in A(\\mu, \\nu; \\varrho) \\Bigg\\},']

In [214]:
# Rendering equations to A4 sized transparent .png files and saving locally
for line_nr in tqdm(range(len(filtered_formulas))):
    location = 'Pictures/'
    filename = f'eq_{line_nr}'
    latex_to_image(math_expr=filtered_formulas[line_nr],file_name=location+filename)

  0%|          | 0/27 [00:00<?, ?it/s]


RuntimeError: latex was not able to process the following string:
b'$$'

Here is the full command invocation and its output:

latex -interaction=nonstopmode --halt-on-error ../1bcf7fb91a305bc2532e50e653b39cab.tex

This is pdfTeX, Version 3.141592653-2.6-1.40.24 (TeX Live 2022/Homebrew) (preloaded format=latex)
 restricted \write18 enabled.
entering extended mode
(../1bcf7fb91a305bc2532e50e653b39cab.tex
LaTeX2e <2021-11-15> patch level 1
L3 programming layer <2022-02-24>
(/opt/homebrew/Cellar/texlive/20220321_3/share/texmf-dist/tex/latex/base/articl
e.cls
Document Class: article 2021/10/04 v1.4n Standard LaTeX document class

(/opt/homebrew/Cellar/texlive/20220321_3/share/texmf-dist/tex/latex/base/size10
.clo))
(/opt/homebrew/Cellar/texlive/20220321_3/share/texmf-dist/tex/latex/type1cm/typ
e1cm.sty)
(/opt/homebrew/Cellar/texlive/20220321_3/share/texmf-dist/tex/latex/cm-super/ty
pe1ec.sty
(/opt/homebrew/Cellar/texlive/20220321_3/share/texmf-dist/tex/latex/base/t1cmr.
fd))
(/opt/homebrew/Cellar/texlive/20220321_3/share/texmf-dist/tex/latex/base/inpute
nc.sty)
(/opt/homebrew/Cellar/texlive/20220321_3/share/texmf-dist/tex/latex/geometry/ge
ometry.sty
(/opt/homebrew/Cellar/texlive/20220321_3/share/texmf-dist/tex/latex/graphics/ke
yval.sty)
(/opt/homebrew/Cellar/texlive/20220321_3/share/texmf-dist/tex/generic/iftex/ifv
tex.sty
(/opt/homebrew/Cellar/texlive/20220321_3/share/texmf-dist/tex/generic/iftex/ift
ex.sty)))
(/opt/homebrew/Cellar/texlive/20220321_3/share/texmf-dist/tex/latex/amsmath/ams
math.sty
For additional information on amsmath, use the `?' option.

(/opt/homebrew/Cellar/texlive/20220321_3/share/texmf-dist/tex/latex/amsmath/ams
text.sty
(/opt/homebrew/Cellar/texlive/20220321_3/share/texmf-dist/tex/latex/amsmath/ams
gen.sty))
(/opt/homebrew/Cellar/texlive/20220321_3/share/texmf-dist/tex/latex/amsmath/ams
bsy.sty)
(/opt/homebrew/Cellar/texlive/20220321_3/share/texmf-dist/tex/latex/amsmath/ams
opn.sty))
(/opt/homebrew/Cellar/texlive/20220321_3/share/texmf-dist/tex/latex/underscore/
underscore.sty)
(/opt/homebrew/Cellar/texlive/20220321_3/share/texmf-dist/tex/latex/base/textco
mp.sty)
(/opt/homebrew/Cellar/texlive/20220321_3/share/texmf-dist/tex/latex/l3backend/l
3backend-dvips.def)
No file 1bcf7fb91a305bc2532e50e653b39cab.aux.
*geometry* driver: auto-detecting
*geometry* detected driver: dvips

LaTeX Font Warning: Font shape `OMX/cmex/m/n' in size <20> not available
(Font)              size <20.74> substituted on input line 29.


LaTeX Font Warning: Font shape `OMX/cmex/m/n' in size <13.99994> not available
(Font)              size <14.4> substituted on input line 29.

! Extra }, or forgotten $.
l.29 {\sffamily $$}
                   %
No pages of output.
Transcript written on 1bcf7fb91a305bc2532e50e653b39cab.log.




Error in callback <function _draw_all_if_interactive at 0x14f3db010> (for post_execute):


RuntimeError: latex was not able to process the following string:
b'$$'

Here is the full command invocation and its output:

latex -interaction=nonstopmode --halt-on-error ../1bcf7fb91a305bc2532e50e653b39cab.tex

This is pdfTeX, Version 3.141592653-2.6-1.40.24 (TeX Live 2022/Homebrew) (preloaded format=latex)
 restricted \write18 enabled.
entering extended mode
(../1bcf7fb91a305bc2532e50e653b39cab.tex
LaTeX2e <2021-11-15> patch level 1
L3 programming layer <2022-02-24>
(/opt/homebrew/Cellar/texlive/20220321_3/share/texmf-dist/tex/latex/base/articl
e.cls
Document Class: article 2021/10/04 v1.4n Standard LaTeX document class

(/opt/homebrew/Cellar/texlive/20220321_3/share/texmf-dist/tex/latex/base/size10
.clo))
(/opt/homebrew/Cellar/texlive/20220321_3/share/texmf-dist/tex/latex/type1cm/typ
e1cm.sty)
(/opt/homebrew/Cellar/texlive/20220321_3/share/texmf-dist/tex/latex/cm-super/ty
pe1ec.sty
(/opt/homebrew/Cellar/texlive/20220321_3/share/texmf-dist/tex/latex/base/t1cmr.
fd))
(/opt/homebrew/Cellar/texlive/20220321_3/share/texmf-dist/tex/latex/base/inpute
nc.sty)
(/opt/homebrew/Cellar/texlive/20220321_3/share/texmf-dist/tex/latex/geometry/ge
ometry.sty
(/opt/homebrew/Cellar/texlive/20220321_3/share/texmf-dist/tex/latex/graphics/ke
yval.sty)
(/opt/homebrew/Cellar/texlive/20220321_3/share/texmf-dist/tex/generic/iftex/ifv
tex.sty
(/opt/homebrew/Cellar/texlive/20220321_3/share/texmf-dist/tex/generic/iftex/ift
ex.sty)))
(/opt/homebrew/Cellar/texlive/20220321_3/share/texmf-dist/tex/latex/amsmath/ams
math.sty
For additional information on amsmath, use the `?' option.

(/opt/homebrew/Cellar/texlive/20220321_3/share/texmf-dist/tex/latex/amsmath/ams
text.sty
(/opt/homebrew/Cellar/texlive/20220321_3/share/texmf-dist/tex/latex/amsmath/ams
gen.sty))
(/opt/homebrew/Cellar/texlive/20220321_3/share/texmf-dist/tex/latex/amsmath/ams
bsy.sty)
(/opt/homebrew/Cellar/texlive/20220321_3/share/texmf-dist/tex/latex/amsmath/ams
opn.sty))
(/opt/homebrew/Cellar/texlive/20220321_3/share/texmf-dist/tex/latex/underscore/
underscore.sty)
(/opt/homebrew/Cellar/texlive/20220321_3/share/texmf-dist/tex/latex/base/textco
mp.sty)
(/opt/homebrew/Cellar/texlive/20220321_3/share/texmf-dist/tex/latex/l3backend/l
3backend-dvips.def)
No file 1bcf7fb91a305bc2532e50e653b39cab.aux.
*geometry* driver: auto-detecting
*geometry* detected driver: dvips

LaTeX Font Warning: Font shape `OMX/cmex/m/n' in size <20> not available
(Font)              size <20.74> substituted on input line 29.


LaTeX Font Warning: Font shape `OMX/cmex/m/n' in size <13.99994> not available
(Font)              size <14.4> substituted on input line 29.

! Extra }, or forgotten $.
l.29 {\sffamily $$}
                   %
No pages of output.
Transcript written on 1bcf7fb91a305bc2532e50e653b39cab.log.




RuntimeError: latex was not able to process the following string:
b'$$'

Here is the full command invocation and its output:

latex -interaction=nonstopmode --halt-on-error ../1bcf7fb91a305bc2532e50e653b39cab.tex

This is pdfTeX, Version 3.141592653-2.6-1.40.24 (TeX Live 2022/Homebrew) (preloaded format=latex)
 restricted \write18 enabled.
entering extended mode
(../1bcf7fb91a305bc2532e50e653b39cab.tex
LaTeX2e <2021-11-15> patch level 1
L3 programming layer <2022-02-24>
(/opt/homebrew/Cellar/texlive/20220321_3/share/texmf-dist/tex/latex/base/articl
e.cls
Document Class: article 2021/10/04 v1.4n Standard LaTeX document class

(/opt/homebrew/Cellar/texlive/20220321_3/share/texmf-dist/tex/latex/base/size10
.clo))
(/opt/homebrew/Cellar/texlive/20220321_3/share/texmf-dist/tex/latex/type1cm/typ
e1cm.sty)
(/opt/homebrew/Cellar/texlive/20220321_3/share/texmf-dist/tex/latex/cm-super/ty
pe1ec.sty
(/opt/homebrew/Cellar/texlive/20220321_3/share/texmf-dist/tex/latex/base/t1cmr.
fd))
(/opt/homebrew/Cellar/texlive/20220321_3/share/texmf-dist/tex/latex/base/inpute
nc.sty)
(/opt/homebrew/Cellar/texlive/20220321_3/share/texmf-dist/tex/latex/geometry/ge
ometry.sty
(/opt/homebrew/Cellar/texlive/20220321_3/share/texmf-dist/tex/latex/graphics/ke
yval.sty)
(/opt/homebrew/Cellar/texlive/20220321_3/share/texmf-dist/tex/generic/iftex/ifv
tex.sty
(/opt/homebrew/Cellar/texlive/20220321_3/share/texmf-dist/tex/generic/iftex/ift
ex.sty)))
(/opt/homebrew/Cellar/texlive/20220321_3/share/texmf-dist/tex/latex/amsmath/ams
math.sty
For additional information on amsmath, use the `?' option.

(/opt/homebrew/Cellar/texlive/20220321_3/share/texmf-dist/tex/latex/amsmath/ams
text.sty
(/opt/homebrew/Cellar/texlive/20220321_3/share/texmf-dist/tex/latex/amsmath/ams
gen.sty))
(/opt/homebrew/Cellar/texlive/20220321_3/share/texmf-dist/tex/latex/amsmath/ams
bsy.sty)
(/opt/homebrew/Cellar/texlive/20220321_3/share/texmf-dist/tex/latex/amsmath/ams
opn.sty))
(/opt/homebrew/Cellar/texlive/20220321_3/share/texmf-dist/tex/latex/underscore/
underscore.sty)
(/opt/homebrew/Cellar/texlive/20220321_3/share/texmf-dist/tex/latex/base/textco
mp.sty)
(/opt/homebrew/Cellar/texlive/20220321_3/share/texmf-dist/tex/latex/l3backend/l
3backend-dvips.def)
No file 1bcf7fb91a305bc2532e50e653b39cab.aux.
*geometry* driver: auto-detecting
*geometry* detected driver: dvips

LaTeX Font Warning: Font shape `OMX/cmex/m/n' in size <20> not available
(Font)              size <20.74> substituted on input line 29.


LaTeX Font Warning: Font shape `OMX/cmex/m/n' in size <13.99994> not available
(Font)              size <14.4> substituted on input line 29.

! Extra }, or forgotten $.
l.29 {\sffamily $$}
                   %
No pages of output.
Transcript written on 1bcf7fb91a305bc2532e50e653b39cab.log.




<Figure size 830x1170 with 1 Axes>