In [1]:
import pandas as pd

In [2]:
examples = [
    ('integral from a to b of x squared', '\\int_{a}^{b} x^2 \,dx'),
    ('integral from negative 1 to 1 of x squared', '\\int_{-1}^{1} x^2 \,dx'),
    ('integral from negative 1 to infinity of x cubed', '\\int_{-1}^{\\inf} x^3 \,dx'),
    ('integral from 0 to infinity of x squared', '\\int_{0}^{\\inf} x^2 \,dx'),
    ('integral from 0 to infinity of y squared', '\\int_{0}^{\\inf} y^2 \,dy'),
    ('f of x equals x squared', 'f(x) = x^2'),
    ('h of x equals x squared', 'h(x) = x^2'),
    ('g of x equals x squared', 'g(x) = x^2'),
    ('g of x equals x to the eigth power', 'g(x) = x^8'),
    ('f of x equals x cubed', 'f(x) = x^3'),
    ('f of x equals x', 'f(x) = x'),
    ('f of x equals x over n', 'f(x) = \\frac{x}{n}'),
    ('f of x equals sum from 1 to 5 of x squared', 'f(x) = \\sum_{1}^{5} x^2'),
    ('x squared', 'x^2'),
    ('x cubed', 'x^3'),
    ('pi squared', '\\pi^2'),
    ('z squared', 'z^2'),
    ('z over x squared', '\\frac{z}{x^2}'),
    ('f of x equals x squared', 'f(x) = x^2'),
    ('1 over 6', '\\frac{1}{6}'),
    ('2 pi', '2 * \\pi'),
    ('s cubed', 's^3'),
    ('s to the sixth power', 's^6'),
    ('2 pi r', '2 * \\pi * r'),
    ('pi over n', '\\frac{\\pi}{n}'),
    ('f of n equals pi over n', 'f(n) = \\frac{\\pi}{n}'),
    ('pi times x', '\\pi*x'),
    ('pi to the fourth power', '\\pi^4'),
    ('pi to the fifth power', '\\pi^5'),
    ('1 over n', '\\frac{1}{n}'),
    ('x squared over n', '\\frac{x^2}{n}'),
    ('y squared over x^2', '\\frac{y^2}{x^2}'),
    ('1 over 7 to the 7th power', '(\\frac{1}{7})^7'),
    ('sum from i to n of X i', '\\sum_{i}^{n} X_i'),
    ('sum from 0 to n of 77 n', '\\sum_{0}^{n} 77 * n'),
    ('sum from 0 to 5 of x', '\\sum_{0}^{5} x'),
    ('sum from 1 to x of x', '\\sum_{1}^{x} x'),
    ('sum from 1 to x of x squared', '\\sum_{1}^{x} x^2'),
    ('sum from 1 to 10 of pi squared', '\\sum_{1}^{10} \\pi^2')
]

In [3]:
data = pd.DataFrame(examples, columns=['English', 'LaTeX'])

data.to_csv('../data/english_to_latex.csv', index=False)

data = pd.read_csv('../data/english_to_latex.csv')

data.shape

(39, 2)

In [4]:
from transformers import GPT2Tokenizer

MODEL = 'distilgpt2'

tokenizer = GPT2Tokenizer.from_pretrained(MODEL)

tokenizer.pad_token = tokenizer.eos_token

#add two prompts, one for each task
CONVERSION_PROMPT = 'LCT\n'  # LaTeX conversion task

CONVERSION_TOKEN = 'LaTeX:'


INFO:tensorflow:Enabling eager execution
INFO:tensorflow:Enabling v2 tensorshape
INFO:tensorflow:Enabling resource variables
INFO:tensorflow:Enabling tensor equality
INFO:tensorflow:Enabling control flow v2


In [5]:
training_examples = f'{CONVERSION_PROMPT}English: ' + data['English'] + '\n' + CONVERSION_TOKEN + ' ' + data['LaTeX'].astype(str)

print(training_examples[0])


LCT
English: integral from a to b of x squared
LaTeX: \int_{a}^{b} x^2 \,dx


In [6]:
task_df = pd.DataFrame({'text': training_examples})

task_df.shape

(39, 1)

In [7]:
from datasets import Dataset
data = Dataset.from_pandas(task_df)
data

Dataset({
    features: ['text'],
    num_rows: 39
})

In [8]:
MAX_TOKENS = task_df['text'].apply(lambda x: len(tokenizer(x)['input_ids'])).max() + 5

MAX_TOKENS

42

In [9]:
# texts to numeric vectors of MAX_TOKENS
def tokenize_function(examples):
    # tokenizer created input_ids and attention_mask as output
    output = tokenizer(
        examples['text'],
        add_special_tokens=True,
        max_length=MAX_TOKENS,
        truncation=True,
        padding='max_length',
    )
    
    output['labels'] = output["input_ids"]
    # -100 is a reserved value to ignore these tokens when calculating the loss
    output["labels"] = [[-100 if x == tokenizer.pad_token_id else x for x in y] for y in output["labels"]]
    return output

data = data.map(
    tokenize_function,
    batched=True,
)
print(data)


  0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['attention_mask', 'input_ids', 'labels', 'text'],
    num_rows: 39
})


In [10]:
data.set_format(type="python", columns=["input_ids", "attention_mask", "labels"])

data = data.train_test_split(test_size=0.20, shuffle=True, seed=0)
print(data)

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'text'],
        num_rows: 31
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'text'],
        num_rows: 8
    })
})


In [11]:
tokenizer.decode(data['train'][0]['input_ids'])

'LCT\nEnglish: sum from 1 to x of x squared\nLaTeX: \\sum_{1}^{x} x^2<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>'

In [12]:
tokenizer.decode([c for c  in data['train'][0]['labels'] if c != -100])

'LCT\nEnglish: sum from 1 to x of x squared\nLaTeX: \\sum_{1}^{x} x^2'

In [13]:
from transformers import Trainer, TrainingArguments
from transformers import GPT2LMHeadModel

model = GPT2LMHeadModel.from_pretrained(MODEL)

In [14]:
training_args = TrainingArguments(
    output_dir="./english_to_latex", #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=10, # number of training epochs
    per_device_train_batch_size=4, # batch size for training
    per_device_eval_batch_size=4,  # batch size for evaluation
    load_best_model_at_end=True,
    warmup_steps=len(data['train']) // 5,  # number of warmup steps for learning rate scheduler,
    weight_decay = 0.05,
    logging_steps=1,
    log_level='info',
    evaluation_strategy='epoch',
    save_strategy='epoch'
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data['train'],
    eval_dataset=data['test'],
)

In [15]:
trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 8
  Batch size = 4


{'eval_loss': 5.294892311096191,
 'eval_runtime': 0.6999,
 'eval_samples_per_second': 11.429,
 'eval_steps_per_second': 2.857}

In [16]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text.
***** Running training *****
  Num examples = 31
  Num Epochs = 10
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 80


Epoch,Training Loss,Validation Loss
1,3.3655,3.565032
2,2.413,2.395256
3,1.2343,1.830489
4,1.0924,1.555002
5,0.7113,1.440239
6,1.0988,1.346469
7,0.835,1.327629
8,1.0524,1.305114
9,0.7545,1.267641
10,0.7192,1.255913


The following columns in the evaluation set  don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 8
  Batch size = 4
Saving model checkpoint to ./english_to_latex/checkpoint-8
Configuration saved in ./english_to_latex/checkpoint-8/config.json
Model weights saved in ./english_to_latex/checkpoint-8/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 8
  Batch size = 4
Saving model checkpoint to ./english_to_latex/checkpoint-16
Configuration saved in ./english_to_latex/checkpoint-16/config.json
Model weights saved in ./english_to_latex/checkpoint-16/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 

TrainOutput(global_step=80, training_loss=1.550399024412036, metrics={'train_runtime': 131.5715, 'train_samples_per_second': 2.356, 'train_steps_per_second': 0.608, 'total_flos': 3322347356160.0, 'train_loss': 1.550399024412036, 'epoch': 10.0})

In [17]:
trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 8
  Batch size = 4


{'eval_loss': 1.255913496017456,
 'eval_runtime': 0.6829,
 'eval_samples_per_second': 11.715,
 'eval_steps_per_second': 2.929,
 'epoch': 10.0}

In [18]:
trainer.save_model()

Saving model checkpoint to ./english_to_latex
Configuration saved in ./english_to_latex/config.json
Model weights saved in ./english_to_latex/pytorch_model.bin


In [19]:
from transformers import pipeline

loaded_model = GPT2LMHeadModel.from_pretrained('./english_to_latex')

loading configuration file ./english_to_latex/config.json
Model config GPT2Config {
  "_name_or_path": "distilgpt2",
  "_num_labels": 1,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 6,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": 

In [20]:
# text_sample = '1 over x'
text_sample = 'g of x equals sum from 1 to n of x'
text_sample = 'sum from 10 to 100 of x'

conversion_text_sample = f'{CONVERSION_PROMPT}English: {text_sample}\n{CONVERSION_TOKEN}'

print(conversion_text_sample)

LCT
English: sum from 10 to 100 of x
LaTeX:


In [21]:
encoded_input = tokenizer(conversion_text_sample, return_tensors='pt')

print(
    tokenizer.decode(loaded_model.generate(
        input_ids=encoded_input['input_ids'],
        num_beams=3,
        max_length=len(encoded_input['input_ids'][0]) + MAX_TOKENS,
        temperature=1,
        top_k=20,
        repetition_penalty=2.,
        early_stopping=True
    )[0]))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


LCT
English: sum from 10 to 100 of x
LaTeX: \sum_{10}^{10} x^2 \,dx^2 \,dx^2 \,dx^2 \,dx^2 \,dx^2 \,dx^2 \
