In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('../data/english_to_latex.csv')

data.shape

(50, 2)

In [3]:
data.head()

Unnamed: 0,English,LaTeX
0,integral from a to b of x squared,"\int_{a}^{b} x^2 \,dx"
1,integral from negative 1 to 1 of x squared,"\int_{-1}^{1} x^2 \,dx"
2,integral from negative 1 to infinity of x cubed,"\int_{-1}^{\inf} x^3 \,dx"
3,integral from 0 to infinity of x squared,"\int_{0}^{\inf} x^2 \,dx"
4,integral from 0 to infinity of y squared,"\int_{0}^{\inf} y^2 \,dy"


In [4]:
# Prompt Engineering
from transformers import GPT2Tokenizer

MODEL = 'distilgpt2'

tokenizer = GPT2Tokenizer.from_pretrained(MODEL)

tokenizer.pad_token = tokenizer.eos_token

#add two prompts, one for each task
CONVERSION_PROMPT = 'LCT\n'  # LaTeX conversion task

CONVERSION_TOKEN = 'LaTeX:'


INFO:tensorflow:Enabling eager execution
INFO:tensorflow:Enabling v2 tensorshape
INFO:tensorflow:Enabling resource variables
INFO:tensorflow:Enabling tensor equality
INFO:tensorflow:Enabling control flow v2


In [6]:
# This is our "training prompt" that we want GPT2 to recognize and learn
training_examples = f'{CONVERSION_PROMPT}English: ' + data['English'] + '\n' + CONVERSION_TOKEN + ' ' + data['LaTeX'].astype(str)

print(training_examples[0])


LCT
English: integral from a to b of x squared
LaTeX: \int_{a}^{b} x^2 \,dx


In [7]:
task_df = pd.DataFrame({'text': training_examples})

task_df.shape

(50, 1)

In [8]:
from datasets import Dataset
data = Dataset.from_pandas(task_df)
data

Dataset({
    features: ['text'],
    num_rows: 50
})

In [9]:
MAX_TOKENS = task_df['text'].apply(lambda x: len(tokenizer(x)['input_ids'])).max() + 5

MAX_TOKENS

50

In [10]:
# tokenizer created input_ids and attention_mask as output
def tokenize_function(examples):
    output = tokenizer(
        examples['text'],
        add_special_tokens=True,
        max_length=MAX_TOKENS,
        truncation=True,
        padding='max_length',
    )
    
    output['labels'] = output["input_ids"]
    # -100 is a reserved value to ignore these tokens when calculating the loss
    output["labels"] = [[-100 if x == tokenizer.pad_token_id else x for x in y] for y in output["labels"]]
    return output

data = data.map(
    tokenize_function,
    batched=True,
)
print(data)


  0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['attention_mask', 'input_ids', 'labels', 'text'],
    num_rows: 50
})


In [11]:
data.set_format(type="python", columns=["input_ids", "attention_mask", "labels"])

data = data.train_test_split(test_size=0.10, shuffle=True, seed=0)
print(data)

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'text'],
        num_rows: 45
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'text'],
        num_rows: 5
    })
})


In [12]:
tokenizer.decode(data['train'][0]['input_ids'])

'LCT\nEnglish: integral from 0 to infinity of x squared\nLaTeX: \\int_{0}^{\\inf} x^2 \\,dx<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>'

In [13]:
tokenizer.decode([c for c  in data['train'][0]['labels'] if c != -100])

'LCT\nEnglish: integral from 0 to infinity of x squared\nLaTeX: \\int_{0}^{\\inf} x^2 \\,dx'

In [14]:
from transformers import Trainer, TrainingArguments
from transformers import GPT2LMHeadModel

model = GPT2LMHeadModel.from_pretrained(MODEL)

In [13]:
# Note the batch size of 4 to make sure we have multiple steps per epoch. This generally speeds up training
training_args = TrainingArguments(
    output_dir="./english_to_latex", #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=15, # number of training epochs
    per_device_train_batch_size=4, # batch size for training
    per_device_eval_batch_size=4,  # batch size for evaluation
    load_best_model_at_end=True,
    warmup_steps=len(data['train']) // 5,  # number of warmup steps for learning rate scheduler,
    weight_decay = 0.05,
    logging_steps=1,
    log_level='info',
    evaluation_strategy='epoch',
    save_strategy='epoch'
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data['train'],
    eval_dataset=data['test'],
)

In [14]:
trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 5
  Batch size = 4


{'eval_loss': 5.044375896453857,
 'eval_runtime': 0.6355,
 'eval_samples_per_second': 7.867,
 'eval_steps_per_second': 3.147}

In [15]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text.
***** Running training *****
  Num examples = 45
  Num Epochs = 15
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 180


Epoch,Training Loss,Validation Loss
1,2.4989,2.972403
2,1.4366,1.703806
3,2.0213,1.349075
4,0.6283,1.088507
5,0.7219,1.061508
6,0.9939,0.888385
7,0.3486,0.887767
8,0.5303,0.817387
9,0.37,0.768076
10,0.2086,0.755947


The following columns in the evaluation set  don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 5
  Batch size = 4
Saving model checkpoint to ./english_to_latex/checkpoint-12
Configuration saved in ./english_to_latex/checkpoint-12/config.json
Model weights saved in ./english_to_latex/checkpoint-12/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 5
  Batch size = 4
Saving model checkpoint to ./english_to_latex/checkpoint-24
Configuration saved in ./english_to_latex/checkpoint-24/config.json
Model weights saved in ./english_to_latex/checkpoint-24/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples

TrainOutput(global_step=180, training_loss=0.9498728894525104, metrics={'train_runtime': 311.5343, 'train_samples_per_second': 2.167, 'train_steps_per_second': 0.578, 'total_flos': 8612075520000.0, 'train_loss': 0.9498728894525104, 'epoch': 15.0})

In [16]:
trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 5
  Batch size = 4


{'eval_loss': 0.7040641903877258,
 'eval_runtime': 0.5663,
 'eval_samples_per_second': 8.829,
 'eval_steps_per_second': 3.532,
 'epoch': 15.0}

In [17]:
trainer.save_model()

Saving model checkpoint to ./english_to_latex
Configuration saved in ./english_to_latex/config.json
Model weights saved in ./english_to_latex/pytorch_model.bin


In [15]:
# Load our finetuned model
loaded_model = GPT2LMHeadModel.from_pretrained('./english_to_latex')

In [16]:
text_sample = 'f of x equals integral from 0 to 1 of x'

conversion_text_sample = f'{CONVERSION_PROMPT}English: {text_sample}\n{CONVERSION_TOKEN}'

print(conversion_text_sample)

LCT
English: f of x equals integral from 0 to 1 of x
LaTeX:


In [17]:
encoded_input = tokenizer(conversion_text_sample, return_tensors='pt')

print(
    tokenizer.decode(loaded_model.generate(
        input_ids=encoded_input['input_ids'],
        num_beams=3,
        max_length=MAX_TOKENS,
        temperature=1,
        top_k=10,
        early_stopping=True
    )[0]))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


LCT
English: f of x equals integral from 0 to 1 of x
LaTeX: f(x) = \int_{0}^{1} x \,dx \,dx \,dx \,dx \,dx \
