# dataloader test runs

## Pandoc

In [None]:
# One-time environment setup (left commented out): adds pypandoc to the pixi project.
#!pixi add pypandoc

In [3]:
import pandas as pd
import glob

# Glob pattern for the training parquet shards.
# NOTE(review): absolute, machine-specific path — consider deriving it from a
# configurable data directory so the notebook runs on other machines.
TRAIN_PARQUET_GLOB = "/home/rc/version-cv/data/googleresearch/mathwriting/data/*train*.parquet"

# glob.glob returns matches in arbitrary (filesystem-dependent) order; sorting
# makes the concatenated row order, and therefore the preview below, reproducible.
file_list = sorted(glob.glob(TRAIN_PARQUET_GLOB))

# Fail early with a clear message instead of letting pd.concat raise on an empty list.
if not file_list:
    raise FileNotFoundError("No parquet files found!")

# Read every shard and stack them into one frame with a fresh 0..n-1 index.
df_list = [pd.read_parquet(f) for f in file_list]
df = pd.concat(df_list, ignore_index=True)

print(df.head(5))

                                               image  \
0  {'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...   
1  {'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...   
2  {'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...   
3  {'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...   
4  {'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...   

                                               latex         sample_id  \
0                  (\begin{matrix}p\\ k\end{matrix})  8b7e03d7a0718f6c   
1               \frac{\frac{1}{176}}{\frac{469}{10}}  a9bcb7658c333bcf   
2  n_{rel}=\frac{n_{0}}{\sqrt{8\cdot\frac{v^{9}}{...  07c4d09473770d39   
3                                     \prod expxf(x)  0ccdf3b3965e14dd   
4                                      \overline{A}D  ff1b6cbcb064c602   

  split_tag data_type  
0     train     human  
1     train     human  
2     train     human  
3     train     human  
4     train     human  


In [8]:
from dataloader010 import load_datasets_pipeline
import io
from PIL import Image
import subprocess
import pypandoc
from transformers import AutoTokenizer
import matplotlib.pyplot as plt

# ------------------------
# Helpers
# ------------------------
def build_latex_document(formula, extra_packages=()):
    """Wrap ``formula`` in a minimal standalone LaTeX article as display math.

    Args:
        formula: LaTeX math source, inserted verbatim between ``$$ ... $$``.
        extra_packages: Names of packages to load after ``amsmath``.

    Returns:
        The complete document source as a string.
    """
    package_lines = "\n".join(
        f"\\usepackage{{{name}}}" for name in ("amsmath", *extra_packages)
    )
    return (
        "\\documentclass{article}\n"
        f"{package_lines}\n"
        "\\pagestyle{empty}\n"
        "\\begin{document}\n"
        f"$${formula}$$\n"
        "\\end{document}\n"
    )


def check_latex_compiles(engine, document, tex_path):
    """Write ``document`` to ``tex_path``, compile it with ``engine``, print the verdict.

    Prints "<engine> compilation successful" or "<engine> compilation failed";
    on failure the engine's stdout (the TeX log) is printed as well.

    Args:
        engine: Name of the TeX executable to run (e.g. "pdflatex", "xelatex").
        document: Full LaTeX source to compile.
        tex_path: Path of the .tex file to (over)write; the engine is run from
            the current working directory, so its output files land there.

    Returns:
        True if the engine exited with return code 0, False otherwise
        (including when the executable cannot be started).
    """
    # Explicit UTF-8 so non-ASCII characters in the formula do not depend on
    # the platform's default locale encoding.
    with open(tex_path, "w", encoding="utf-8") as tex_file:
        tex_file.write(document)

    try:
        result = subprocess.run(
            [engine, "-interaction=nonstopmode", tex_path],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            encoding="utf-8",
            # TeX logs may contain bytes that are not valid UTF-8; never let
            # decoding the log abort the check.
            errors="replace",
        )
    except FileNotFoundError:
        # The engine binary is not installed / not on PATH: report it as a
        # failed check instead of aborting the whole sample loop.
        print(f"{engine} compilation failed")
        print(f"{engine} executable not found on PATH")
        return False

    if result.returncode == 0:
        print(f"{engine} compilation successful")
        return True

    print(f"{engine} compilation failed")
    print(result.stdout)
    return False


# ------------------------
# Load dataset
# ------------------------
tf_train, tf_val, tf_test = load_datasets_pipeline()
ds = tf_train.dataset
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# Reuse EOS as the padding token.
# NOTE(review): presumably this lets padded label ids be dropped by
# skip_special_tokens=True when decoding — confirm against the dataloader.
tokenizer.pad_token = tokenizer.eos_token

# ------------------------
# Test a small number of samples
# ------------------------
max_samples = 5
sample_count = 0  # number of samples actually inspected

for batch in ds.take(1):  # Only one batch
    features, labels = batch
    images = features["image"].numpy()
    latex_ids = features["latex_ids"].numpy()
    batch_size = images.shape[0]

    # The range bound already caps the work at max_samples, so no extra
    # counter check or break is needed.
    for i in range(min(max_samples, batch_size)):
        decoded_latex = tokenizer.decode(latex_ids[i], skip_special_tokens=True)

        # ------------------------
        # pypandoc quick preview
        # ------------------------
        md_content = f"$$\n{decoded_latex}\n$$"
        html_output = pypandoc.convert_text(md_content, 'html', format='md')
        print("=== pypandoc HTML preview ===")
        print(html_output[:300])
        print("-" * 80)

        # ------------------------
        # pdflatex check
        # ------------------------
        check_latex_compiles(
            "pdflatex",
            build_latex_document(decoded_latex),
            "temp_formula_pdflatex.tex",
        )

        # ------------------------
        # xelatex check
        # ------------------------
        check_latex_compiles(
            "xelatex",
            build_latex_document(decoded_latex, extra_packages=("fontspec",)),
            "temp_formula_xelatex.tex",
        )

        # ------------------------
        # Visualize image
        # ------------------------
        fig, ax = plt.subplots(figsize=(4, 4))
        # NOTE(review): the uint8 cast assumes pixel values are already in the
        # 0-255 range — confirm against the dataloader's preprocessing.
        ax.imshow(images[i].astype("uint8"))
        ax.axis("off")
        ax.set_title("Sample Image")
        plt.show()
        plt.close(fig)  # release the figure so repeated runs do not accumulate them

        sample_count += 1

ModuleNotFoundError: No module named 'dataloader010'