In [1]:
import numpy as np
import os
import pandas as pd
import torch
from torch import nn

pd.set_option('display.max_colwidth', None)

In [2]:
base_name = "2021_Hashimoto_Neural_ODE_and_holographic_QCD_PUB"
project_folder = "diygenomics-projects"
sub_category = "math"
work_bucket = "AdS-CFT"
input_file = 'extracted_annotated_math.csv'
index_col = 'uuid'

In [3]:
data_path = os.getenv('DATA_PATH')
file_path = lambda *args: os.path.join(data_path, project_folder, sub_category, work_bucket,
                                       base_name, 'mathpix', *args)

In [4]:
df = pd.read_csv(file_path(input_file), index_col=index_col)

In [7]:
latex_corpus = df['math'].astype(str).str.cat(sep=' ')

In [8]:
chars = sorted(list(set(latex_corpus)))
char_to_int = {ch: i for i, ch in enumerate(chars)}
int_to_char = {i: ch for i, ch in enumerate(chars)}

seq_length = 100
dataX, dataY = [], []
for i in range(0, len(latex_corpus) - seq_length, 1):
    seq_in = latex_corpus[i:i + seq_length]
    seq_out = latex_corpus[i + seq_length]
    dataX.append([char_to_int[char] for char in seq_in])
    dataY.append(char_to_int[seq_out])

X = np.reshape(dataX, (len(dataX), seq_length, 1))
X = torch.from_numpy(X).float()
Y = torch.tensor(dataY).long()

# Define the model
class CharPredictor(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(CharPredictor, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])
        return out

model = CharPredictor(input_size=1, hidden_size=256, output_size=len(chars))

# Training
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(50):
    optimizer.zero_grad()
    output = model(X)
    loss = criterion(output, Y)
    loss.backward()
    optimizer.step()
    print(f'Epoch: {epoch+1}, Loss: {loss.item()}')

Epoch: 1, Loss: 4.255826473236084
Epoch: 2, Loss: 4.135919094085693
Epoch: 3, Loss: 4.026917457580566
Epoch: 4, Loss: 3.9249322414398193
Epoch: 5, Loss: 3.827730178833008
Epoch: 6, Loss: 3.734557628631592
Epoch: 7, Loss: 3.6460764408111572
Epoch: 8, Loss: 3.564779043197632
Epoch: 9, Loss: 3.4948737621307373
Epoch: 10, Loss: 3.4407641887664795
Epoch: 11, Loss: 3.403796672821045
Epoch: 12, Loss: 3.3796377182006836
Epoch: 13, Loss: 3.3617446422576904
Epoch: 14, Loss: 3.3465425968170166
Epoch: 15, Loss: 3.333123207092285
Epoch: 16, Loss: 3.3210601806640625
Epoch: 17, Loss: 3.309788703918457
Epoch: 18, Loss: 3.2988991737365723
Epoch: 19, Loss: 3.288224220275879
Epoch: 20, Loss: 3.277860403060913
Epoch: 21, Loss: 3.2680044174194336
Epoch: 22, Loss: 3.258833885192871
Epoch: 23, Loss: 3.2505416870117188
Epoch: 24, Loss: 3.2431299686431885
Epoch: 25, Loss: 3.236358880996704
Epoch: 26, Loss: 3.2299036979675293
Epoch: 27, Loss: 3.2234911918640137
Epoch: 28, Loss: 3.216942548751831
Epoch: 29, Loss

In [12]:
# Prediction
start = np.random.randint(0, len(dataX)-1)
pattern = dataX[start]
print("Seed:")
print('----')
print(''.join([int_to_char[value] for value in pattern]))
print('----')

for i in range(4):
    x = np.reshape(pattern, (1, len(pattern), 1))
    x = torch.from_numpy(x).float()
    prediction = model(x)
    _, index = torch.max(prediction, 1)
    result = int_to_char[index.item()]
    print(result, end="")
    pattern.append(index.item())
    pattern = pattern[1:len(pattern)]

Seed:
----
ac{\mathrm{d} z(t)}{\mathrm{d} t}=f_{\theta}(z(t), t)
\] \[
\mathcal{L}\left(z_{1}\right)=\mathcal{L
----
}a{a