In [1]:
import torch
import torch.nn as nn

class EncoderCNN(nn.Module):
    '''Encoder that applies a configurable stack of torch.nn layers in order.'''

    # Lookup from the layer-type string used in the config to the nn class it builds.
    _LAYER_TYPES = {
        'conv1d': nn.Conv1d,
        'conv2d': nn.Conv2d,
        'maxpool1d': nn.MaxPool1d,
        'maxpool2d': nn.MaxPool2d,
        'avgpool1d': nn.AvgPool1d,
        'avgpool2d': nn.AvgPool2d,
        'linear': nn.Linear,
        'dropout': nn.Dropout,
    }

    def __init__(self, layers, hparams):
        '''
        Args:
            layers: Ordered description of the encoder: [(layer_type, {layer_params})]
                - layer_type - one of 'conv1d', 'conv2d', 'maxpool1d', 'maxpool2d',
                  'avgpool1d', 'avgpool2d', 'linear', 'dropout'
                - layer_params - keyword arguments forwarded to the layer constructor

            hparams: Hyperparameters for the model (stored as self.hp, not used here)

        Raises:
            ValueError: if a layer_type is not one of the supported names
        '''
        super(EncoderCNN, self).__init__()
        self.hp = hparams
        self.layers = nn.ModuleList()

        for kind, kwargs in layers:
            try:
                layer_cls = self._LAYER_TYPES[kind]
            except (KeyError, TypeError):
                # TypeError covers unhashable keys so every bad type reports the same way.
                raise ValueError(f'Invalid layer type: {kind}') from None
            # Constructed outside the try so constructor errors propagate unchanged.
            self.layers.append(layer_cls(**kwargs))

    def forward(self, input):
        '''Feed `input` through every layer in the order they were configured.'''
        out = input
        for layer in self.layers:
            out = layer(out)
        return out
    
class DecoderRNN(nn.Module):
    '''
    Decoder building blocks: token embedding, LSTM and output projection.

    The forward pass is not implemented yet.
    '''
    def __init__(self, vocabulary_size, embedding_size, input_size, vocab=None, vocab_dict=None):
        '''
        Args:
            vocabulary_size: Size of the vocabulary (rows of the embedding table and
                width of the output projection)
            embedding_size: Size of the embedding vector; also used as the LSTM hidden size
            input_size: Width of the extra features fed to the LSTM alongside each token
                embedding (the LSTM input width is input_size + embedding_size)
                - presumably the encoder feature size; TODO confirm once forward() exists
            vocab: Optional list of tokens, stored as self.vocab
            vocab_dict: Optional token -> index mapping, stored as self.vocab_dict
        '''
        super(DecoderRNN, self).__init__()
        self.vocabulary_size = vocabulary_size
        self.embedding_size = embedding_size

        # Taken as explicit arguments instead of being read from undefined globals.
        self.vocab = vocab
        self.vocab_dict = vocab_dict

        # Built once, sized by vocabulary_size so it agrees with the output layer.
        self.embedding = nn.Embedding(vocabulary_size, embedding_size)
        self.lstm = nn.LSTM(input_size + embedding_size, embedding_size)
        self.output = nn.Linear(embedding_size, vocabulary_size)

    def forward(self, input, hidden):
        # TODO: decoding step not implemented; currently returns None.
        pass

In [2]:
# Load dataset
import torch.utils.data as data
from torchvision import transforms
import pandas as pd
from PIL import Image

def load_img(path, size = (224, 224)):
    '''
    Load an image file and return it as a resized tensor.

    Args:
        path: Path of the image file, passed to PIL's Image.open
        size: Target size passed to transforms.Resize

    Returns:
        The tensor produced by ToTensor followed by Resize.
    '''
    # ToTensor already rescales 8-bit image data to [0, 1], so no Normalize(0, 255)
    # follows it: that would divide by 255 a second time and squash the pixel
    # values into [0, 1/255].
    transform = transforms.Compose([transforms.ToTensor(), transforms.Resize(size, antialias=True)])

    # Image.open is lazy and keeps the file open; converting inside the `with`
    # block reads the pixels and then releases the file handle.
    # NOTE(review): the image mode is not converted, so the channel count follows
    # the file (e.g. 1 for grayscale) - confirm it matches what the model expects.
    with Image.open(path) as img:
        return transform(img)

class Img2LatexDataset(data.Dataset):
    '''
    Dataset of (image tensor, formula tokens) pairs described by a CSV file.

    The CSV is read with pandas and must provide an "image" column (file name,
    appended to img_dir) and a "formula" column (whitespace-separated tokens).
    '''
    def __init__(self, img_dir, formula_path, img_size = (224, 224)):
        '''
        Args:
            img_dir: Prefix concatenated directly with each image file name, so it
                must end with a path separator
            formula_path: CSV path (or file-like object) passed to pd.read_csv
            img_size: Size forwarded to load_img for every image
        '''
        self.data_frame = pd.read_csv(formula_path)
        self.img_dir = img_dir
        self.img_size = img_size

        # Tokens in order of first appearance, plus the inverse token -> index map.
        self.tokens, self.token_to_idx = self._build_vocab(self.data_frame["formula"])

    @staticmethod
    def _build_vocab(formulas):
        '''Return (tokens, token_to_idx) for an iterable of whitespace-tokenised formulas.'''
        token_to_idx = {}
        tokens = []

        for formula in formulas:
            for token in formula.split():
                # Dict membership is O(1); testing against the list would be O(len(vocab)).
                if token not in token_to_idx:
                    token_to_idx[token] = len(tokens)
                    tokens.append(token)

        return tokens, token_to_idx

    def __getitem__(self, index):
        '''
        Return (image tensor, list of formula tokens) for the row at position `index`.

        Raises:
            IndexError: if `index` is out of range
        '''
        # Positional lookup: independent of the frame's index labels, and raises
        # IndexError past the end so plain iteration over the dataset terminates.
        img_name = self.data_frame["image"].iloc[index]
        formula = self.data_frame["formula"].iloc[index]

        img = load_img(self.img_dir + img_name, self.img_size)
        return img, formula.split()

    def __len__(self):
        '''Number of rows in the CSV.'''
        return len(self.data_frame)

    def get_vocab(self):
        '''
        Rebuild the vocabulary from the current contents of the "formula" column.

        Returns:
            (vocab, vocab_dict): list of unique tokens in order of first appearance,
            and a dict mapping each token to its position in that list. Both are
            fresh objects on every call.
        '''
        return self._build_vocab(self.data_frame["formula"])

# Image prefix (keeps its trailing "/" because the dataset concatenates it with the
# file name) and the CSV file read by the dataset. Despite its name, `formula_dir`
# points at a file, not a directory.
img_dir = "../data/SyntheticData/images/"
formula_dir = "../data/SyntheticData/train.csv"

dataset = Img2LatexDataset(img_dir=img_dir, formula_path=formula_dir)
loader = data.DataLoader(dataset=dataset, batch_size=1, shuffle=True)


In [3]:
hparams = {
    "lr" : 0.001,
    "batch_size" : 32,
    "epochs" : 10
}

# Channel widths of the conv stack: block i maps channel_seq[i] -> channel_seq[i+1].
channel_seq = [3, 32, 64, 128, 256, 512]
num_conv_pool = 5

enc_layers = []

# Each block is a 5x5 convolution followed by a 2x2 max-pool.
for i in range(num_conv_pool):
    enc_layers.append(('conv2d', {'in_channels': channel_seq[i], 'out_channels': channel_seq[i+1], 'kernel_size': 5}))
    enc_layers.append(('maxpool2d', {'kernel_size': 2}))

enc_layers.append(('avgpool2d', {'kernel_size': (3,3)}))

enc = EncoderCNN(enc_layers, hparams)

# DecoderRNN takes an integer vocabulary size and a required input_size, so the
# (vocab, vocab_dict) tuple from get_vocab() is unpacked instead of being passed as-is.
vocab, vocab_dict = dataset.get_vocab()

# input_size is set to the encoder's final channel count.
# NOTE(review): assumes the encoder output is flattened to that width before it
# reaches the decoder - confirm once DecoderRNN.forward is written.
dec = DecoderRNN(vocabulary_size=len(vocab), embedding_size=512, input_size=channel_seq[-1])

In [7]:
# Toy training loop for Embedding

# Random targets: 10 vectors of width 512 (unseeded, so they differ on every run).
target_embeddings = torch.randn(10, 512)

# Mean-squared-error objective; only the decoder's embedding table is optimised.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(params=dec.embedding.parameters(), lr=0.01)




$ \gamma _ { \Omega R , 5 } ^ { T } = - \gamma _ { \Omega R , 5 } ~ . $
