In [16]:
from transformers import GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, TrainingArguments, Trainer, GPT2LMHeadModel, pipeline

from datasets import Dataset
import pandas as pd

In [3]:
data = pd.read_csv("BERT_LLM/BERT_LLM/data/data/english_to_latex.csv")

print(data.shape)

data.head(2)

(50, 2)


Unnamed: 0,English,LaTeX
0,integral from a to b of x squared,"\int_{a}^{b} x^2 \,dx"
1,integral from negative 1 to 1 of x squared,"\int_{-1}^{1} x^2 \,dx"


In [4]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

tokenizer.pad_token = tokenizer.eos_token

# Add our singular prompt
CONVERSION_PROMPT = 'LCT\n' # LaTex conversion task

CONVERSION_TOKEN = 'LaTex:'

In [6]:
# This is our "training prompt" that we want GPT2 to recognize and learn
training_examples = f'{CONVERSION_PROMPT}English: ' + data['English'] + '\n' + CONVERSION_TOKEN + ' ' + data['LaTeX'].astype(str)

print(training_examples[0])

LCT
English: integral from a to b of x squared
LaTex: \int_{a}^{b} x^2 \,dx


In [7]:
task_df = pd.DataFrame({'text': training_examples})

task_df.head(2)

Unnamed: 0,text
0,LCT\nEnglish: integral from a to b of x square...
1,LCT\nEnglish: integral from negative 1 to 1 of...


In [8]:
latex_data = Dataset.from_pandas(task_df) # Turn a pandas DataFrame into a Dataset

def preprocess(examples): # tokenize our text but don't pad because our collator will pad for us dynamically
    return tokenizer(examples['text'], truncation = True)

latex_data = latex_data.map(preprocess, batched = True)

latex_data = latex_data.train_test_split(train_size = .8)

Map: 100%|██████████| 50/50 [00:00<00:00, 2039.55 examples/s]


In [9]:
data_collator = DataCollatorForLanguageModeling(tokenizer = tokenizer, mlm = False)

In [11]:
latex_gpt2 = GPT2LMHeadModel.from_pretrained('gpt2')

In [12]:
training_args = TrainingArguments(
    output_dir = "./english_to_latex", # The output directory
    overwrite_output_dir = True, # Overwrite the content of the output directory
    num_train_epochs = 10, # Number of training epochs
    per_device_train_batch_size = 2, # Batch size for training
    per_device_eval_batch_size = 20, # Batch size for evaluation
    logging_steps = 5,
    load_best_model_at_end = True,
    log_level = 'info',
    evaluation_strategy = 'epoch',
    save_strategy = 'epoch'
)

trainer = Trainer(
    model = latex_gpt2,
    args = training_args,
    data_collator = data_collator,
    train_dataset = latex_data["train"],
    eval_dataset = latex_data["test"]
)

trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10
  Batch size = 20


{'eval_loss': 5.226635456085205,
 'eval_runtime': 3.2432,
 'eval_samples_per_second': 3.083,
 'eval_steps_per_second': 0.308}

In [13]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 40
  Num Epochs = 10
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 200
  Number of trainable parameters = 124,439,808


Epoch,Training Loss,Validation Loss
1,1.6347,1.035496
2,0.8406,0.650813
3,0.6569,0.588198
4,0.5495,0.555084
5,0.5217,0.597736
6,0.5328,0.602794
7,0.4358,0.64669
8,0.4299,0.639935
9,0.3549,0.60142
10,0.4075,0.609543


The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10
  Batch size = 20
Saving model checkpoint to ./english_to_latex/tmp-checkpoint-20
Configuration saved in ./english_to_latex/tmp-checkpoint-20/config.json
Configuration saved in ./english_to_latex/tmp-checkpoint-20/generation_config.json
Model weights saved in ./english_to_latex/tmp-checkpoint-20/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10
  Batch size = 20
Saving model checkpoint to ./english_to_latex/tmp-checkpoint-40
Configuration saved in ./english_to_latex

TrainOutput(global_step=200, training_loss=0.7961916118860245, metrics={'train_runtime': 95.0186, 'train_samples_per_second': 4.21, 'train_steps_per_second': 2.105, 'total_flos': 6014820096000.0, 'train_loss': 0.7961916118860245, 'epoch': 10.0})

In [14]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10
  Batch size = 20


{'eval_loss': 0.5550838708877563,
 'eval_runtime': 0.0362,
 'eval_samples_per_second': 275.968,
 'eval_steps_per_second': 27.597,
 'epoch': 10.0}

In [None]:
# Let's try fine tunning it again but first let's let the model read a calculus book

In [17]:
# Calculus Made Easy by Silvanus P. Thompson = https://gutenberg.org/ebooks/33283

calculus_data = TextDataset(
    tokenizer = tokenizer,
    file_path = "BERT_LLM/BERT_LLM/data/data/calculus made easy.txt", # Principles of Data Science - Sinan Ozdemir
    block_size = 32
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer = tokenizer, mlm = False)

latex_gpt2 = GPT2LMHeadModel.from_pretrained('gpt2')

training_args = TrainingArguments(
    output_dir = "./calculus", # The output directory
    overwrite_output_dir = True, # Overwrite the content of the output directory
    num_train_epochs = 1, # Number of training epochs
    per_device_train_batch_size = 32, # Batch size for training
    per_device_eval_batch_size = 32, # Batch size for evaluation
    logging_steps = 50,
    eval_steps = 50,
    load_best_model_at_end = True,
    evaluation_strategy = 'steps',
    save_strategy = 'steps'
)

trainer = Trainer(
    model = latex_gpt2,
    args = training_args,
    data_collator = data_collator,
    train_dataset = calculus_data.examples[:int(len(calculus_data.examples)*.8)],
    eval_dataset = calculus_data.examples[int(len(calculus_data.examples)*.8):]
)

trainer.evaluate()

Loading features from cached file BERT_LLM/BERT_LLM/data/data/cached_lm_GPT2Tokenizer_32_calculus made easy.txt [took 0.008 s]


loading configuration file config.json from cache at /home/randi_eka/.cache/huggingface/hub/models--gpt2/snapshots/11c5a3d5811f50298f278a704980280950aedb10/config.json
Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.36.2",
 

{'eval_loss': 2.5129199028015137,
 'eval_runtime': 3.4067,
 'eval_samples_per_second': 476.702,
 'eval_steps_per_second': 14.97}

In [18]:
trainer.train()

***** Running training *****
  Num examples = 6,494
  Num Epochs = 1
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 203
  Number of trainable parameters = 124,439,808


Step,Training Loss,Validation Loss
50,1.8051,1.647683
100,1.5672,1.591995
150,1.5423,1.567939
200,1.4854,1.56013


***** Running Evaluation *****
  Num examples = 1624
  Batch size = 32
***** Running Evaluation *****
  Num examples = 1624
  Batch size = 32
***** Running Evaluation *****
  Num examples = 1624
  Batch size = 32
***** Running Evaluation *****
  Num examples = 1624
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=203, training_loss=1.5984177824311656, metrics={'train_runtime': 61.648, 'train_samples_per_second': 105.34, 'train_steps_per_second': 3.293, 'total_flos': 106051903488000.0, 'train_loss': 1.5984177824311656, 'epoch': 1.0})

In [19]:
trainer.save_model()

Saving model checkpoint to ./calculus
Configuration saved in ./calculus/config.json
Configuration saved in ./calculus/generation_config.json
Model weights saved in ./calculus/pytorch_model.bin


In [20]:
calculus_latex_gpt2 = GPT2LMHeadModel.from_pretrained('./calculus') # load up our gpt pre-trained on calculus

training_args = TrainingArguments(
    output_dir = "./calculus_english_to_latex", # The output directory
    overwrite_output_dir = True, # Overwrite the content of the output directory
    num_train_epochs = 10, # Number of training epochs
    per_device_train_batch_size = 2, # Batch size for training
    per_device_eval_batch_size = 20, # Batch size for evaluation
    logging_steps = 5,
    load_best_model_at_end = True,
    log_level = 'info',
    evaluation_strategy = 'epoch',
    save_strategy = 'epoch'
)

trainer = Trainer(
    model = calculus_latex_gpt2,
    args = training_args,
    data_collator = data_collator,
    train_dataset = latex_data["train"],
    eval_dataset = latex_data["test"]
)

trainer.evaluate()

loading configuration file ./calculus/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.36.2",
  "use_cache": true,
  "vocab_size": 50257
}

loading weights fi

{'eval_loss': 4.970127105712891,
 'eval_runtime': 0.034,
 'eval_samples_per_second': 293.997,
 'eval_steps_per_second': 29.4}

In [21]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 40
  Num Epochs = 10
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 200
  Number of trainable parameters = 124,439,808


Epoch,Training Loss,Validation Loss
1,1.483,0.915031
2,0.7679,0.582977
3,0.6296,0.533364
4,0.5552,0.579894
5,0.5134,0.580286
6,0.547,0.560739
7,0.4812,0.605342
8,0.4033,0.584355
9,0.3721,0.575655
10,0.3842,0.579917


The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10
  Batch size = 20
Saving model checkpoint to ./calculus_english_to_latex/tmp-checkpoint-20
Configuration saved in ./calculus_english_to_latex/tmp-checkpoint-20/config.json
Configuration saved in ./calculus_english_to_latex/tmp-checkpoint-20/generation_config.json
Model weights saved in ./calculus_english_to_latex/tmp-checkpoint-20/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10
  Batch size = 20
Saving model checkpoint to ./calculus_english_to_latex/tmp-checkpoint

TrainOutput(global_step=200, training_loss=0.7445821130275726, metrics={'train_runtime': 119.6173, 'train_samples_per_second': 3.344, 'train_steps_per_second': 1.672, 'total_flos': 6014820096000.0, 'train_loss': 0.7445821130275726, 'epoch': 10.0})

In [22]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10
  Batch size = 20


{'eval_loss': 0.5333635210990906,
 'eval_runtime': 0.0402,
 'eval_samples_per_second': 248.907,
 'eval_steps_per_second': 24.891,
 'epoch': 10.0}

In [23]:
trainer.save_model()

Saving model checkpoint to ./calculus_english_to_latex
Configuration saved in ./calculus_english_to_latex/config.json
Configuration saved in ./calculus_english_to_latex/generation_config.json
Model weights saved in ./calculus_english_to_latex/pytorch_model.bin


In [24]:
loaded_model = GPT2LMHeadModel.from_pretrained('./calculus_english_to_latex')
latex_generator = pipeline('text-generation', model = loaded_model, tokenizer = tokenizer)

loading configuration file ./calculus_english_to_latex/config.json
Model config GPT2Config {
  "_name_or_path": "./calculus",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.36.2",
  "use_cache": true,
  "vocab_size": 5025

In [25]:
text_sample = 'f of x equals integral from 0 to pi of x to the fourth power'
conversion_text_sample = f'{CONVERSION_PROMPT}English: {text_sample}\n{CONVERSION_TOKEN}'

print(conversion_text_sample)

LCT
English: f of x equals integral from 0 to pi of x to the fourth power
LaTex:


In [34]:
print(latex_generator(
    conversion_text_sample, num_beams = 5, early_stopping = True, temperature = 0.7,
    max_length = len(tokenizer.encode(conversion_text_sample)) + 20
)[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


LCT
English: f of x equals integral from 0 to pi of x to the fourth power
LaTex: f(x) = \int_{0}^{pi} x^3 \,dx \


In [36]:
text_sample = 'f of x is sum from 0 to x of x squared'
conversion_text_sample = f'{CONVERSION_PROMPT}English: {text_sample}\n{CONVERSION_TOKEN}'

print(latex_generator(
    conversion_text_sample, num_beams = 5, early_stopping = True, temperature = 0.7,
    max_length = len(tokenizer.encode(conversion_text_sample)) + 20
)[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


LCT
English: f of x is sum from 0 to x of x squared
LaTex: f(x) = \sum_{0}^{x} x^2 \,dx^


In [37]:
# Sanity check that a non-finetuned model could not have done this

non_finetuned_latex_generator = pipeline(
    'text-generation',
    model = GPT2LMHeadModel.from_pretrained('gpt2'), #not fine-tuned!
    tokenizer = tokenizer
)

loading configuration file config.json from cache at /home/randi_eka/.cache/huggingface/hub/models--gpt2/snapshots/11c5a3d5811f50298f278a704980280950aedb10/config.json
Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.36.2",
 

In [39]:
few_shot_prompt = """LCT
English: f of x is sum from 0 to x of x squared
LaTex: f(x) = \sum_{0}^{x} x^2 \,dx \
###
LCT
English: f of x equals integral from 0 to pi of x to the fourth power
LaTex: f(x) = \int_{0}^{pi} x^4 \,dx \
###
LCT
English: x squared
LaTeX:"""

In [40]:
print(non_finetuned_latex_generator(
    few_shot_prompt, num_beams = 5, early_stopping = True, temperature = 0.7,
    max_length = len(tokenizer.encode(few_shot_prompt)) + 20
)[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


LCT
English: f of x is sum from 0 to x of x squared
LaTex: f(x) = \sum_{0}^{x} x^2 \,dx ###
LCT
English: f of x equals integral from 0 to pi of x to the fourth power
LaTex: f(x) = \int_{0}^{pi} x^4 \,dx ###
LCT
English: x squared
LaTeX: f(x) = \sum_{0}^{x} x^2 \,dx ###


In [38]:
print(non_finetuned_latex_generator(
    conversion_text_sample, num_beams = 5, early_stopping = True, temperature = 0.7,
    max_length = len(tokenizer.encode(conversion_text_sample)) + 20
)[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


LCT
English: f of x is sum from 0 to x of x squared
LaTex: f of x is sum from 0 to x of x squared
LCT

English: f
