In [None]:
import numpy as np
import os
import pandas as pd
import torch
from torch import nn

# Show full cell contents when displaying frames: long LaTeX strings would
# otherwise be truncated with an ellipsis.
pd.options.display.max_colwidth = None

In [None]:
# --- Location of the extracted data, listed from outermost folder inwards ---
project_folder = "diygenomics-projects"
sub_category = "math"
work_bucket = "AdS-CFT"
base_name = "2021_Hashimoto_Neural_ODE_and_holographic_QCD_PUB"

# --- Input table: file name and the column used as the DataFrame index ---
input_file = "extracted_annotated_math.csv"
index_col = "uuid"

In [None]:
# Root directory of the data tree, taken from the DATA_PATH environment
# variable. os.getenv returns None when the variable is not set.
data_path = os.getenv('DATA_PATH')


def file_path(*args):
    """Build a path inside this paper's 'mathpix' folder.

    The result is
    ``data_path/project_folder/sub_category/work_bucket/base_name/mathpix/*args``.

    Args:
        *args: Extra path components (sub-folders and/or a file name) appended
            after the 'mathpix' folder.

    Returns:
        str: The joined path.

    Raises:
        RuntimeError: If ``data_path`` is None, i.e. the DATA_PATH environment
            variable was not set when this cell ran.
    """
    # Fail with an actionable message instead of the opaque TypeError that
    # os.path.join raises when its first component is None.
    if data_path is None:
        raise RuntimeError(
            "The DATA_PATH environment variable is not set; "
            "it must point to the root of the data directory."
        )
    return os.path.join(data_path, project_folder, sub_category, work_bucket,
                        base_name, 'mathpix', *args)

In [None]:
# Read the annotated-math table, using the column named by `index_col`
# as the row index.
df = pd.read_csv(
    filepath_or_buffer=file_path(input_file),
    index_col=index_col,
)

In [None]:
# Concatenate every entry of the 'math' column into one space-separated
# training corpus.
# Missing entries are dropped first: astype(str) would otherwise turn NaN into
# the literal text "nan" and inject that noise into the corpus.
latex_corpus = df['math'].dropna().astype(str).str.cat(sep=' ')

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

In [None]:
# Vocabulary: every distinct character of the corpus, in sorted order, with
# lookup tables in both directions.
chars = sorted(set(latex_corpus))
char_to_int = {symbol: code for code, symbol in enumerate(chars)}
int_to_char = dict(enumerate(chars))

# Sliding windows over the corpus: each sample is `seq_length` consecutive
# characters (as integer codes) and its target is the character that follows.
seq_length = 100
encoded_corpus = [char_to_int[symbol] for symbol in latex_corpus]
window_starts = range(len(latex_corpus) - seq_length)  # empty if corpus is too short
dataX = [encoded_corpus[begin:begin + seq_length] for begin in window_starts]
dataY = [encoded_corpus[begin + seq_length] for begin in window_starts]

# Inputs become a float tensor of shape (num_samples, seq_length, 1);
# targets become a 1-D tensor of int64 class indices.
X = torch.from_numpy(
    np.asarray(dataX).reshape(len(dataX), seq_length, 1)
).float()
Y = torch.tensor(dataY).long()

# Define the model
class CharPredictor(nn.Module):
    """Next-character classifier: a single LSTM followed by a linear layer.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the LSTM hidden state.
        output_size: Number of output scores produced per sequence.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # batch_first=True: inputs are laid out as (batch, seq_len, input_size).
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return one score vector per sequence.

        Args:
            x: Tensor of shape (batch, seq_len, input_size).

        Returns:
            Tensor of shape (batch, output_size), computed from the LSTM
            output at the final time step only.
        """
        hidden_states, _ = self.lstm(x)
        final_step = hidden_states[:, -1, :]
        return self.fc(final_step)

model = CharPredictor(input_size=1, hidden_size=256, output_size=len(chars))
model = model.to(device)

# The model's parameters live on `device`, so the training tensors must be
# there too; otherwise the forward pass fails with a device mismatch whenever
# CUDA is available.
inputs = X.to(device)
targets = Y.to(device)

# Training
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

model.train()
# Each epoch is a single full-batch gradient step over the whole dataset.
for epoch in range(50):
    optimizer.zero_grad()
    output = model(inputs)
    loss = criterion(output, targets)
    loss.backward()
    optimizer.step()
    print(f'Epoch: {epoch+1}, Loss: {loss.item()}')

In [None]:
# Prediction
# Greedy generation: seed with a random training window, then repeatedly
# predict the most likely next character and slide the window forward by one.
model.eval()

# randint's upper bound is exclusive, so len(dataX) makes every window
# (including the last one) eligible and also works when there is only one.
start = np.random.randint(0, len(dataX))
# Copy the seed so the append below never mutates the row stored in dataX.
pattern = list(dataX[start])
print("Seed:")
print(''.join([int_to_char[value] for value in pattern]))

# No gradients are needed for inference; skipping them avoids building an
# autograd graph on each of the 1000 steps.
with torch.no_grad():
    for _ in range(1000):
        x = np.reshape(pattern, (1, len(pattern), 1))
        # The input must be on the same device as the model's parameters.
        x = torch.from_numpy(x).float().to(device)
        prediction = model(x)
        _, index = torch.max(prediction, 1)
        next_code = index.item()
        print(int_to_char[next_code], end="")
        # Slide the window: append the prediction, drop the oldest character.
        pattern.append(next_code)
        pattern = pattern[1:]