In [1]:
import pandas as pd
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import os
from dataset import Latex_Dataset, Vocabulary, collate_fn, PadToMaxSize
from torchvision.transforms import functional as F
from torchvision.transforms import Compose, ToTensor, Normalize
import matplotlib.pyplot as plt
import numpy as np
from  model import Im2LatexModel
import torch.optim as optim

In [2]:
def get_all_tokens(csv_file):
    """
    Extracts all unique tokens from the LaTeX formulas in the dataset.

    Formulas are read from the first CSV column and split on whitespace.
    The column is loaded as text so that empty cells, or formulas that
    happen to look like numbers or NA markers, are tokenized instead of
    raising ``AttributeError`` on a non-string value.

    Args:
        csv_file (str): The path to the CSV file containing the dataset.

    Returns:
        set: A set of all unique tokens in the dataset.
    """
    # dtype=str + keep_default_na=False: every cell stays a string, and an
    # empty cell becomes '' (which splits to no tokens) rather than float NaN.
    data = pd.read_csv(csv_file, dtype=str, keep_default_na=False)
    formulas = data.iloc[:, 0]  # Assuming formulas are in the first column
    return {token for formula in formulas for token in formula.split()}


In [3]:
def find_max_dimensions(root_dir):
    """
    Finds the largest width and the largest height among the images in a directory.

    Only regular files directly inside ``root_dir`` are inspected;
    sub-directories are skipped instead of being handed to ``Image.open``.
    The two maxima are tracked independently, so they may come from
    different images.

    Args:
        root_dir (str): Directory that holds the image files.

    Returns:
        tuple: ``(max_height, max_width)`` — note the height-first order.
        ``(0, 0)`` when the directory contains no files.
    """
    max_width = 0
    max_height = 0

    # Assuming the root_dir contains all your images directly
    with os.scandir(root_dir) as entries:
        for entry in entries:
            if not entry.is_file():
                continue  # e.g. a nested folder; not an image
            with Image.open(entry.path) as img:
                width, height = img.size
            max_width = max(max_width, width)
            max_height = max(max_height, height)

    return max_height, max_width

# Measure the largest image height and width in the image directory.
# These module-level values are not just an example: `max_width` and
# `max_height` are passed to PadToMaxSize when the transform is built in a
# later cell. find_max_dimensions returns (height, width), in that order.
root_dir = 'data/formula_images_processed/formula_images_processed'
max_height, max_width = find_max_dimensions(root_dir)
# Printed as height x width.
print(f"Maximum dimensions are {max_height}x{max_width}")

Maximum dimensions are 800x800


In [4]:
vocab = Vocabulary()
# Register every token that occurs in the training formulas.
# get_all_tokens returns a set, and the iteration order of a set of strings
# changes from one interpreter start to the next (hash randomization), so the
# tokens are sorted to make the registration order reproducible across runs.
# NOTE(review): presumably add_token assigns ids in insertion order, which is
# why a stable order matters for saved checkpoints — confirm in dataset.py.
for token in sorted(get_all_tokens('data/im2latex_train.csv')):
    vocab.add_token(token)

In [5]:
# Transformations
# NOTE(review): find_max_dimensions returns (height, width) while PadToMaxSize
# is called with (max_width, max_height) — confirm this matches the parameter
# order PadToMaxSize expects.
transform = Compose([
    PadToMaxSize(max_width, max_height),
    ToTensor(),
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),  # Normalize with ImageNet mean and std
])

# All three splits read images from the directory the max dimensions were
# measured on (`root_dir`), so the padding size and the data cannot drift apart.
dataset_kwargs = dict(root_dir=root_dir, vocab=vocab, transform=transform)
train_dataset = Latex_Dataset(csv_file='data/im2latex_train.csv', **dataset_kwargs)
test_dataset = Latex_Dataset(csv_file='data/im2latex_test.csv', **dataset_kwargs)
validate_dataset = Latex_Dataset(csv_file='data/im2latex_validate.csv', **dataset_kwargs)

# Create data loaders; only the training split is shuffled.
loader_kwargs = dict(batch_size=64, num_workers=2, collate_fn=collate_fn)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)
validate_loader = DataLoader(validate_dataset, shuffle=False, **loader_kwargs)

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize the model
model = Im2LatexModel().to(device)

# Choose an optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10  # Number of epochs to train for

# NOTE(review): the recorded output of this cell ends with
# "AssertionError: was expecting embedding dimension of 512, but got 131".
# That looks like a shape mismatch inside model(...) rather than in this
# loop — confirm against Im2LatexModel in model.py.

# Training loop
model.train()  # Set the model to training mode
for epoch in range(num_epochs):
    total_loss = 0
    for batch_idx, data in enumerate(train_loader):
        images = data['image'].to(device)  # Move images to the device (CPU or GPU)
        latex_encodings = data['indices'].to(device)  # Move target sequences to the device

        optimizer.zero_grad()  # Clear previous gradients

        # Forward pass: compute predicted outputs by passing inputs to the model
        # NOTE(review): the same token tensor is used as the model input and as
        # the loss target, with no one-step shift. If the decoder is meant to
        # predict the next token, feed tokens[:-1] and score against tokens[1:]
        # — confirm the expected layout with Im2LatexModel / collate_fn.
        outputs = model(images, latex_encodings)

        # Calculate the loss.
        # The notebook-level alias `F` is torchvision.transforms.functional,
        # which has no cross_entropy; the loss lives in torch.nn.functional.
        # reshape (not view) so a non-contiguous model output is accepted too.
        # NOTE(review): if collate_fn pads sequences, padded positions are
        # currently scored as well — consider ignore_index once the pad id is known.
        loss = torch.nn.functional.cross_entropy(
            outputs.reshape(-1, outputs.shape[-1]),
            latex_encodings.reshape(-1),
        )

        loss.backward()  # Backward pass: compute gradient of the loss with respect to model parameters
        optimizer.step()  # Perform a single optimization step (parameter update)

        total_loss += loss.item()  # Update total loss

    # Mean loss per batch for this epoch
    print(f'Epoch {epoch+1}, Loss: {total_loss/len(train_loader)}')



{'image': tensor([[[[2.2489, 2.2489, 2.2489,  ..., 2.2489, 2.2489, 2.2489],
          [2.2489, 2.2489, 2.2489,  ..., 2.2489, 2.2489, 2.2489],
          [2.2489, 2.2489, 2.2489,  ..., 2.2489, 2.2489, 2.2489],
          ...,
          [2.2489, 2.2489, 2.2489,  ..., 2.2489, 2.2489, 2.2489],
          [2.2489, 2.2489, 2.2489,  ..., 2.2489, 2.2489, 2.2489],
          [2.2489, 2.2489, 2.2489,  ..., 2.2489, 2.2489, 2.2489]],

         [[2.4286, 2.4286, 2.4286,  ..., 2.4286, 2.4286, 2.4286],
          [2.4286, 2.4286, 2.4286,  ..., 2.4286, 2.4286, 2.4286],
          [2.4286, 2.4286, 2.4286,  ..., 2.4286, 2.4286, 2.4286],
          ...,
          [2.4286, 2.4286, 2.4286,  ..., 2.4286, 2.4286, 2.4286],
          [2.4286, 2.4286, 2.4286,  ..., 2.4286, 2.4286, 2.4286],
          [2.4286, 2.4286, 2.4286,  ..., 2.4286, 2.4286, 2.4286]],

         [[2.6400, 2.6400, 2.6400,  ..., 2.6400, 2.6400, 2.6400],
          [2.6400, 2.6400, 2.6400,  ..., 2.6400, 2.6400, 2.6400],
          [2.6400, 2.6400, 2.640

AssertionError: was expecting embedding dimension of 512, but got 131