In [1]:
import pandas as pd
import cv2
import os
import pickle
import io
from PIL import Image

## Merge all to one parquet

In [2]:
def img_to_bytes(img_dir=None, given_img=None):
    # read img
    img = cv2.cvtColor(cv2.imread(img_dir), cv2.COLOR_BGR2RGB) if img_dir and not given_img else given_img
    # Convert the OpenCV image (numpy array) to a PIL Image
    pil_img = Image.fromarray(img)
    # Create a BytesIO object
    img_stream = io.BytesIO()
    # Save the image to the BytesIO object in your desired format (e.g., PNG)
    pil_img.save(img_stream, format='PNG')
    # Retrieve the byte data
    img_bytes = img_stream.getvalue()
    return img_bytes

In [3]:
def list_to_parquet(lis, save_dir):
    # Convert the list into a DataFrame
    df = pd.DataFrame(lis, columns=['formula', 'filename', 'image'])

    # Save the DataFrame to Parquet format using pandas
    # parquet_path = "m2e/m2e_train.parquet"
    df.to_parquet(save_dir, engine='fastparquet')
    return df

#### im2latex_100k

In [4]:
im2latex_parquet_dir = 'im2latex_100k/train.parquet'
im2latex_parquet = pd.read_parquet(im2latex_parquet_dir)
im2latex_parquet.head(3)

Unnamed: 0,formula,filename,image
0,\widetilde \gamma _ { \mathrm { h o p f } } \s...,66667cee5b.png,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...
1,"( { \cal L } _ { a } g ) _ { i j } = 0 , \ \ \...",1cbb05a562.png,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...
2,S _ { s t a t } = 2 \pi \sqrt { N _ { 5 } ^ { ...,ed164cc822.png,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...


#### m2e

In [5]:
# len('images/')
# 'm2e/images/5388.jpg'[11:]
'm2e/images/5388.jpg'[11:-3] + 'png'

'5388.png'

In [6]:
m2e_dir_csv = 'm2e/train.csv'

img_dir_cap_maps = []
with open(m2e_dir_csv, 'r', encoding='utf-8') as f:
    for line in f:
        # print(line)
        img_dir, cap = line.split(',', 1)
        # print(img_dir, cap)
        img_dir = 'm2e/' + img_dir[4:]
        if os.path.exists(img_dir):
            img_bytes = img_to_bytes(img_dir=img_dir)
            img_name = img_dir[11:-3] + 'png'  # saved as png
            img_dir_cap_maps.append([cap, img_name, img_bytes])
        else:
            print(f'Image not found from {img_dir}')

img_dir_cap_maps[:3]

Image not found from m2e/e Path


[['\\frac { 5 } { 6 } + \\frac { 5 } { 3 } \\times \\frac { 4 } { 5 } \\n = \\frac { 5 } { 6 } + \\frac { 4 } { 3 } \\n = \\frac { 5 } { 6 } + \\frac { 8 } { 6 } \\n = \\frac { 1 3 } { 6 } \\n = 2 \\frac { 1 } { 6 }\n',
  '34105.png',
  b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\xba\x00\x00\x01\x1c\x08\x02\x00\x00\x00\xff^9\xe1\x00\x00\xba\xecIDATx\x9c\xc4\xfdg\x93\x1cI\xb2 \x08\xaa\x9a9\x0bN\x93\x13d\x82\x14\x80"\xdd\xfd\xba_\x8f\x0c\xd9\x1d\x91\xfb\xa3\xfb\xf9\xe4\xbe\xde\xa7\x93\x91\x91\xbd\x93\xdb\x19\xd9\xd9\x91y\xdd]]]U(\x14\x80Dr\x1e<\xc2\x99\x99\xde\x07\r\xb7\xb4p\x8fH\xa0\xaa{\xde\x99\x94\xa0<=\xcc\x8d\xa8\xa9)3U5\xfc\x7f\xfd\xdf\xff7x\xb4 \xa2\xf9\x97\x0b\x11i\xad\x11\x91_\xf2\x9fD$\xa5\x94R\xf2\x9fZk!\x84\x10"V\x9a\xeb\xd8M!b\x92$\x98\x15\xbb;\xad\xb5\xa9\xc3\x7fr\x83R\xca\xa5\xc3\xe3\x96\xcd\'B\x08~PJ\xf1\x83\xce\n\x00\x08!\x00\xd1\x0cF\x08\xc1]h\xad\xe38\xe6\xf1\xf3K\xd3\xa0\x99\x11e\x85\x7fu\x1cg)\x1c\x10\x85\xf9\x93\x0b\xc3\x81\xc7oZ0\x0f\x02P\x08\xa1\xb5N\xd3\x94

In [7]:
df = list_to_parquet(img_dir_cap_maps, 'all_parques/m2e_training.parquet')
all(pd.read_parquet('all_parques/m2e_training.parquet') == df)

True

#### icdar

In [8]:
# len('icdar/train_set/train_images/')
# 'icdar/train_set/train_images/train_0.jpg'[29:]
'icdar/train_set/train_images/train_0.jpg'[29:-3] + 'png'

'train_0.png'

In [9]:
icdar_img_dir = 'icdar/train_set/train_images/'
icdar_map = 'icdar/icdar_img_latx_map.csv'

icdar_img_dir_cap_map = []
with open(icdar_map, 'r', encoding='utf-8') as f:
    for line in f:
        img_dir, cap = line.split(',', 1)
        img_dir = 'icdar/train_set/' + img_dir
        if os.path.exists(img_dir):
            # print(img_dir, cap)
            img_bytes = img_to_bytes(img_dir=img_dir)
            img_name = img_dir[29:-3] + 'png'
            icdar_img_dir_cap_map.append([cap, img_name, img_bytes])
        else:
            print(f'Image not found from {img_dir}')

icdar_img_dir_cap_map[:3]

Image not found from icdar/train_set/Image Path


[['( x - 2 ) ( x + 1 ) = 0\n',
  'train_0.png',
  b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00n\x00\x00\x00\x12\x08\x02\x00\x00\x005\xd9|\x16\x00\x00\r\xfbIDATx\x9cMXIW\x1cW\xb2\xbesfV\xd6\x905@\rP\x02d\x90@\x83\x8d\x87\xd7v\xdb\xe7\xf8\x0f\xf4\x8f|\xfb^\xbc\xcd[\xbc\x85\xcf\xb1\x17\xef\xd8Z\xc8-\x84\r\xc2\x16\xa6\x10\x12C1U\x155\xe4p\xc7^D\x91\xad<,\x8a\xcc;D|\x11\xf1\xc5\x80\xff\xef\x7f\xff\x89\x10\xb2\xd6:\xe70v\xe8\xfe\xb1\xd6*\xa5\x08a\x84\x10\x84\x901\x863Ok\x8d\xac\xc5\x18#d1\xc6\xd6Zk-\xe7T)\x85\t"\x84PJ\x95R\xce"\x8c\xb1\xd6\x9aR\n{\t!\x18c\xe7\x1c\\D\x08\xb1\xd6"\x84(\xa5Y\x96a\x8c9\xe7\xc6\x18\x84\x90\xc5\x08cL\x116\xc6 \xec\x18cp\x0e|u\x161\xc6\x94R\x18cB\x88\xd6\x1a\xc4`\x8cYk)\xa5Zk\x84\x10\xc6\x18a\x07\x8f\xb5V\x08a\x8c\xd1\xf7W#\x84\xac\xd6\x94R\x90\x019\x0c{a\xbdR\x8as\x0fdF\x089g\xe0@c\x0clq\xceQJ\xe1^\x84\x90ss\xd0\x18\xec\x07=\xe1\x15\x08\x8d\x10\x82\xb30\xc6\x94R\xc6\x98\xb3\xd89G\t\xb1\xd6j\xad\x10B\x8c1\xb8\x1bcL)1\xc6\xc0^\x8c\t\xa0\x8c1\x86s@%J)!\xe4~\r6\x

In [10]:
df = list_to_parquet(icdar_img_dir_cap_map, 'all_parques/icdar_train.parquet')
all(pd.read_parquet('all_parques/icdar_train.parquet') == df)

True

#### hmer

In [11]:
len('extracted_images/')
'extracted_images/train_320.jpg'[17:]

'train_320.jpg'

In [12]:
hmer_img_dir = 'hmer/train/images.pkl'
hmer_map = 'hmer/hmer_train_mapping.csv'

# Load the pickle file containing image arrays
with open(hmer_img_dir, 'rb') as pkl_file:
    image_data = pickle.load(pkl_file)

hmer_img_dir_cap_map = []
with open(hmer_map, 'r') as f:
    for line in f:
        img_dir, caption = line.split(',', 1)
        img_name = img_dir[17:-3] + 'png'  # save to png
        img_dir = img_dir[17:]  # endswith jpg
        # Get image array from the loaded pickle data
        if img_dir in image_data:
            img_array = image_data[img_dir]  # Retrieve the image array
            # Convert image array to bytes
            img_bytes = img_to_bytes(given_img=img_array)
            hmer_img_dir_cap_map.append([caption, img_name, img_bytes])
        else:
            print(f'Image: {img_name}/.jpg not found from {hmer_img_dir}')

hmer_img_dir_cap_map[:3]

Image: png/.jpg not found from hmer/train/images.pkl


[['( 2 ) 2 N a O H + C u S O _ { 4 } = N a _ { 2 } S O _ { 4 } + C u ( O H ) _ { 2 } \\downarrow\n',
  'train_0.png',
  b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x04\xf0\x00\x00\x00x\x08\x00\x00\x00\x00Y\x04Sy\x00\x00\xe0vIDATx\x9c\xcc\xfd\xd9\x92%\xc7\x96%\x06\xae\xb5U\xcd\xce\xe4C\x0c\x08\x0cw\xcc\xcc\xaa\xea,V\xb3H!_(\xfd\x19\xfd\x91\xfd\x1d-\xc2\x87\x16i\xa1\x90R"\xacf\x153+3\xef\x04 \x80\x88\xf0\xf0\xe9\x0cf\xa6\xbaW?\xa8\x9a\x1d;>\x04\x80{\x91\xc9R\x01\xc2\xdd\xcf\xb1A\x87\xad{\xaf=*\xff_\x90\x00\x00$0\xfb\xdd\x81\xf2;\x00\xcd~7\xfc\x0b4\xf1\xf1\'\xc1z\xcf\xc1(y \x04\x80\x98\xf5\xeb\x07\x1ex\xbc\x8e\x00\xebm\x9c\x7f~z\xcd\xa3\x0e\x1c\xef\x1e\xe7CVgI`\x99\xbd\xe99*O\xe0i\xf7\xf4\xf0Q*#%\xe0\x04!B`}\xe0_\xd2\x08y\xe9\xa8\xc4\xb1\xd3<\x0e@\x1c;\'`\xea\xc3\'\xfa\xa92\xdf$!\x91>\xbba\xfa\xb6\xfe9\x9b:Ib\x9d\x04>s\xfd\xf8F\xd1=\x96)\x90(\x07i(\xbd\x87\x91\x90\xe7\xd3\xe7<\xfc\xfd\xc7\xce\xcd\xc31\x8a$\x90\xe5 \x9d$\x04\tF\x01\x04\x8cu\x0eM\x1a\xe4\xd9\x08\xb3\xd6\xbd#\x03\x04\x89\x

In [13]:
df = list_to_parquet(hmer_img_dir_cap_map, 'all_parques/hmer_train.parquet')
all(pd.read_parquet('all_parques/hmer_train.parquet') == df)

True

#### crohme

In [14]:
len('dataset/crohme/train/extracted_img/')
'dataset/crohme/train/extracted_img/formulaire039-equation040.png'[35:-4]

'formulaire039-equation040'

In [15]:
crohme_img_dir = 'crohme/train/images.pkl'
crohme_map = 'crohme/crohme_labels.csv'

# Load the pickle file containing image arrays
with open(crohme_img_dir, 'rb') as pkl_file:
    image_data_crohme = pickle.load(pkl_file)
print(list(image_data_crohme.items())[0])

('110_edwin', array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8))


In [16]:
crohme_img_dir_cap_map = []
with open(crohme_map, 'r') as f:
    for line in f:
        line = line.strip()
        img_dir, caption = line.split(',', 1)
        img_name = img_dir[35:]  # endswith png
        img_dir = img_dir[35:-4]  # remove suffix
        # Get image array from the loaded pickle data
        if img_dir in image_data_crohme:
            img_array = image_data_crohme[img_dir]  # Retrieve the image array
            # Convert image array to bytes
            img_bytes = img_to_bytes(given_img=img_array)
            crohme_img_dir_cap_map.append([caption, img_name, img_bytes])
        else:
            print(f'Image: {img_dir} not found from {crohme_img_dir}')

len(crohme_img_dir_cap_map)

Image:  not found from crohme/train/images.pkl


8834

In [17]:
crohme_img_dir_cap_map[0]

['1',
 '200924-1331-216.png',
 b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x1a\x00\x00\x006\x08\x00\x00\x00\x00\xfcO\xd6_\x00\x00\x00 IDATx\x9cc`\x18\x05\x94\x02Fd\xce\x7f\x14>\x13n]\xa3R\xa3R\xa3R\xa3\x80T\x00\x00\x13\x1a\x01>\x11\xba\x0eT\x00\x00\x00\x00IEND\xaeB`\x82']

In [18]:
df = list_to_parquet(crohme_img_dir_cap_map, 'all_parques/crohme_train.parquet')
all(pd.read_parquet('all_parques/crohme_train.parquet') == df)

True

### merge to one parquet

In [20]:
crohme_pq = 'all_parques/crohme_train.parquet'
hmer_pq = 'all_parques/hmer_train.parquet'
icdar_pq = 'all_parques/icdar_train.parquet'
im2latex_pq = 'all_parques/im2latex_train.parquet'  # {'bytes': b'\x89PNG\r\n\x1a\n\x00
m2e_pq = 'all_parques/m2e_training.parquet'  

pqs = [crohme_pq, hmer_pq, icdar_pq, im2latex_pq, m2e_pq]
total_imgs = 0

for pq in pqs:
    df = pd.read_parquet(pq)
    total_imgs += len(df)
    # print(df)
print(total_imgs)

248348


In [21]:
# Load all parquet files into a list of DataFrames
dfs = [pd.read_parquet(pq_file) for pq_file in pqs]

# Fix the 'im2latex_pq' DataFrame to extract only the bytes data from the dict in the 'image' column
dfs[3]['image'] = dfs[3]['image'].apply(lambda x: x['bytes'] if isinstance(x, dict) and 'bytes' in x else x)

# Combine all DataFrames into a single DataFrame
combined_df = pd.concat(dfs, ignore_index=True)

# Save the combined DataFrame to a single parquet file
output_parquet = 'training_data.parquet'
combined_df.to_parquet(output_parquet, index=False)

In [22]:
final = pd.read_parquet('training_data.parquet')
len(final)

248348

In [23]:
final

Unnamed: 0,formula,filename,image
0,1,200924-1331-216.png,b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\...
1,1,200923-131-185.png,"b""\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\..."
2,1,200923-1553-117.png,b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\...
3,l,200923-1251-17.png,"b""\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\..."
4,l,200923-1556-256.png,"b""\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\..."
...,...,...,...
248343,2 5 \div \left( 1 + \frac { 1 } { 4 } \right) ...,32732-ex.png,b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\...
248344,\frac { 1 } { 3 } \div \frac { 3 } { 5 } - \fr...,69290-ex.png,b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\...
248345,v 周 ： 1 \div 1 5 = \frac { 1 } { 1 5 } \n v 等 ...,43291-ex.png,b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\...
248346,⑦ 0 . 1 5 : 0 . 4 5 \n = 0 . 1 5 \div 0 . 4 5 ...,90592-ex.png,b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\...


## Read from parquet

```python
data = pd.read_parquet('training_data.parquet')

captions, img_names, img_bytes = data['formula'], data['filename'], data['image']
# len(captions) == len(img_name) == len(img_bytes)  # check


for img in img_bytes:
    # Convert byte data to a BytesIO object
    image_stream = io.BytesIO(img)
    # Open the image using PIL
    image = Image.open(image_stream)
    # image.show()  # Show the image (optional, will open a window with the image)
    # Save the image to a file (optional)
    image.save(save_path)
```