## GPT for code dictation

In [1]:
import pandas as pd
import torch
from datasets import Dataset
from transformers import AutoTokenizer, TextDataset, DataCollatorForLanguageModeling, AutoModelForCausalLM, pipeline, \
                         Trainer, TrainingArguments


In [2]:
MODEL = 'gpt2'  # checkpoint name shared by the tokenizer and every base-model load in this notebook

tokenizer = AutoTokenizer.from_pretrained(MODEL)  # load the tokenizer that belongs to the gpt2 checkpoint

tokenizer.pad_token = tokenizer.eos_token  # reuse EOS as the pad token to avoid a warning


In [3]:
# Load the English -> LaTeX pairs; later cells read the 'English' and 'LaTeX' columns
data = pd.read_csv('../data/english_to_latex.csv')

print(data.shape)  # (rows, columns)

data.head(2)  # preview the first two pairs

(50, 2)


Unnamed: 0,English,LaTeX
0,integral from a to b of x squared,"\int_{a}^{b} x^2 \,dx"
1,integral from negative 1 to 1 of x squared,"\int_{-1}^{1} x^2 \,dx"


In [4]:
data.head(10)  # wider preview: the first ten English/LaTeX pairs

Unnamed: 0,English,LaTeX
0,integral from a to b of x squared,"\int_{a}^{b} x^2 \,dx"
1,integral from negative 1 to 1 of x squared,"\int_{-1}^{1} x^2 \,dx"
2,integral from negative 1 to infinity of x cubed,"\int_{-1}^{\inf} x^3 \,dx"
3,integral from 0 to infinity of x squared,"\int_{0}^{\inf} x^2 \,dx"
4,integral from 0 to infinity of y squared,"\int_{0}^{\inf} y^2 \,dy"
5,integral from 1 to 2 of x over 2,"\int_{1}^{2} \frac{x}{2} \,dx"
6,f of x equals x squared,f(x) = x^2
7,h of x equals x squared,h(x) = x^2
8,g of x equals x squared,g(x) = x^2
9,g of x equals x to the eighth power,g(x) = x^8


In [5]:
# Single task instruction placed at the top of every example
CONVERSION_PROMPT = 'Convert English to LaTeX\n'
# Marker after which the model is expected to write the LaTeX answer
CONVERSION_TOKEN = 'LaTeX:'

# Each training example has the layout
#   <CONVERSION_PROMPT>English: <sentence>
#   LaTeX: <formula>
# and is assembled column-wise, one string per row of `data`.
english_part = f'{CONVERSION_PROMPT}English: ' + data['English'] + '\n'
latex_part = CONVERSION_TOKEN + ' ' + data['LaTeX'].astype(str)
training_examples = english_part + latex_part

print(training_examples[0])


Convert English to LaTeX
English: integral from a to b of x squared
LaTeX: \int_{a}^{b} x^2 \,dx


In [6]:
# Wrap the prompt strings in a one-column frame; the 'text' column is what gets tokenized later
task_df = pd.DataFrame({'text': training_examples})

task_df.head(2)

Unnamed: 0,text
0,Convert English to LaTeX\nEnglish: integral fr...
1,Convert English to LaTeX\nEnglish: integral fr...


In [7]:
# adding the EOS token at the end so the model knows when to stop predicting
#
# The column is rebuilt from `training_examples` (the untouched prompt strings) rather than
# from task_df['text'] itself, so running this cell more than once never stacks extra EOS tokens.
task_df['text'] = training_examples.map(lambda example: f'{example}{tokenizer.eos_token}')

In [8]:
# Turn the pandas DataFrame into a Hugging Face Dataset
latex_dataset = Dataset.from_pandas(task_df)


def preprocess(examples):
    """Tokenize a batch of prompt strings.

    Args:
        examples: batch passed in by ``Dataset.map(batched=True)``; only its
            'text' entry is read.

    Returns:
        The tokenizer output for ``examples['text']``, truncated but not padded;
        padding is left to the data collator, which pads each batch dynamically.
    """
    return tokenizer(examples['text'], truncation=True)


latex_tokenized = latex_dataset.map(preprocess, batched=True)

# 80/20 train/test split. The seed is fixed so the split, and therefore every loss reported
# further down, is the same after a kernel restart.
latex_data = latex_tokenized.train_test_split(train_size=.8, seed=42)

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [9]:
latex_data['train'][0]  # inspect one tokenized training record (text, input_ids, attention_mask)

{'text': 'Convert English to LaTeX\nEnglish: 2 pi r\nLaTeX: 2 * \\pi * r<|endoftext|>',
 'input_ids': [3103,
  1851,
  3594,
  284,
  4689,
  49568,
  198,
  15823,
  25,
  362,
  31028,
  374,
  198,
  14772,
  49568,
  25,
  362,
  1635,
  3467,
  14415,
  1635,
  374,
  50256],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1]}

In [10]:
# standard data collator for auto-regressive language modelling
# (mlm=False turns off masked-language-modelling; this collator also pads each batch for us)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [11]:
# Base GPT-2 causal-LM weights; this is the model the first fine-tuning attempt trains
latex_gpt2 = AutoModelForCausalLM.from_pretrained(MODEL)

In [12]:
latex_data  # show the train/test splits and their row counts

DatasetDict({
    train: Dataset({
        features: ['text', 'input_ids', 'attention_mask'],
        num_rows: 40
    })
    test: Dataset({
        features: ['text', 'input_ids', 'attention_mask'],
        num_rows: 10
    })
})

# Attempt 1: fine-tuning GPT-2 on the English-to-LaTeX conversion task

In [13]:
# Configuration for the first fine-tuning attempt (base GPT-2 -> English-to-LaTeX).
training_args = TrainingArguments(
    output_dir="./english_to_latex",
    overwrite_output_dir=True,  # overwrite the content of the output directory
    num_train_epochs=5,  # number of training epochs
    per_device_train_batch_size=1,  # batch size for training
    per_device_eval_batch_size=20,  # batch size for evaluation
    load_best_model_at_end=True,  # needs matching evaluation/save strategies (both 'epoch' below)
    logging_steps=5,
    log_level='info',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    # Use Apple's MPS backend only when this PyTorch build actually offers it, so the
    # notebook also runs on machines without an Apple-Silicon GPU.
    use_mps_device=torch.backends.mps.is_available(),
)

trainer = Trainer(
    model=latex_gpt2,
    args=training_args,
    train_dataset=latex_data["train"],
    eval_dataset=latex_data["test"],
    data_collator=data_collator,
)

# Baseline: evaluation loss of the untouched model on the held-out split, before any training
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10
  Batch size = 20
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Currently logged in as: [33mprofoz[0m. Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


{'eval_loss': 2.7215285301208496,
 'eval_runtime': 0.5484,
 'eval_samples_per_second': 18.235,
 'eval_steps_per_second': 1.823}

In [14]:
# Run the fine-tuning; the model is evaluated and checkpointed to output_dir once per epoch
trainer.train()

The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 40
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 200
  Number of trainable parameters = 124,439,808


Epoch,Training Loss,Validation Loss
1,0.8865,0.568916
2,0.7957,0.498212
3,0.7571,0.415177
4,0.4939,0.376287
5,0.379,0.381766


The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10
  Batch size = 20
Saving model checkpoint to ./english_to_latex/checkpoint-40
Configuration saved in ./english_to_latex/checkpoint-40/config.json
Configuration saved in ./english_to_latex/checkpoint-40/generation_config.json
Model weights saved in ./english_to_latex/checkpoint-40/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10
  Batch size = 20
Saving model checkpoint to ./english_to_latex/checkpoint-80
Configuration saved in ./english_to_latex/checkpoint-80/confi

TrainOutput(global_step=200, training_loss=0.8274225550889969, metrics={'train_runtime': 85.8452, 'train_samples_per_second': 2.33, 'train_steps_per_second': 2.33, 'total_flos': 3018637440000.0, 'train_loss': 0.8274225550889969, 'epoch': 5.0})

In [60]:
# Intermediate step: continue pre-training plain GPT-2 on raw LaTeX reference text
# before fine-tuning on the conversion task.
book_data = TextDataset(
    tokenizer=tokenizer,
    file_path='../data/latex-guide-cos423.txt',  # a LaTeX cheat sheet used as unlabeled training text
    block_size=128
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False,  # MLM is Masked Language Modelling
)

# Start again from the base checkpoint (not from the model fine-tuned in attempt 1)
latex_gpt2 = AutoModelForCausalLM.from_pretrained(MODEL)

training_args = TrainingArguments(
    output_dir="./math_book",
    overwrite_output_dir=True,  # overwrite the content of the output directory
    num_train_epochs=10,  # number of training epochs
    per_device_train_batch_size=2,  # batch size for training
    per_device_eval_batch_size=32,  # batch size for evaluation
    load_best_model_at_end=True,  # needs matching evaluation/save strategies (both 'epoch' below)
    logging_steps=10,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    # Use Apple's MPS backend only when this PyTorch build actually offers it, so the
    # notebook also runs on machines without an Apple-Silicon GPU.
    use_mps_device=torch.backends.mps.is_available(),
)

# First 80% of the examples (in file order) train the model, the remaining 20% evaluate it.
# The cut point is computed once so both slices are guaranteed to meet at the same index.
book_split = int(len(book_data.examples) * .8)

trainer = Trainer(
    model=latex_gpt2,
    args=training_args,
    data_collator=data_collator,
    train_dataset=book_data.examples[:book_split],
    eval_dataset=book_data.examples[book_split:]
)

Loading features from cached file ../data/cached_lm_GPT2TokenizerFast_128_latex-guide-cos423.txt [took 0.011 s]
loading configuration file config.json from cache at /Users/sinanozdemir/.cache/huggingface/hub/models--gpt2/snapshots/11c5a3d5811f50298f278a704980280950aedb10/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "tas

In [61]:
trainer.evaluate()  # baseline loss on the held-out cheat-sheet examples, before any training

***** Running Evaluation *****
  Num examples = 12
  Batch size = 32


Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


{'eval_loss': 2.3705315589904785,
 'eval_runtime': 8.3843,
 'eval_samples_per_second': 1.431,
 'eval_steps_per_second': 0.119}

In [62]:
# Pre-train on the cheat sheet. In the recorded run the validation loss bottoms out at epoch 3
# and rises afterwards while the training loss keeps falling; load_best_model_at_end=True is
# set in the arguments above so the best checkpoint is the one kept.
trainer.train()

***** Running training *****
  Num examples = 47
  Num Epochs = 10
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 240
  Number of trainable parameters = 124,439,808


Epoch,Training Loss,Validation Loss
1,1.8849,1.815823
2,1.7588,1.791541
3,1.4249,1.782735
4,1.2827,1.818491
5,1.2687,1.846649
6,1.0916,1.912194
7,1.0146,1.939818
8,0.7565,1.977578
9,0.7478,2.006962
10,0.729,2.001646


***** Running Evaluation *****
  Num examples = 12
  Batch size = 32
Saving model checkpoint to ./math_book/checkpoint-24
Configuration saved in ./math_book/checkpoint-24/config.json
Configuration saved in ./math_book/checkpoint-24/generation_config.json
Model weights saved in ./math_book/checkpoint-24/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 12
  Batch size = 32
Saving model checkpoint to ./math_book/checkpoint-48
Configuration saved in ./math_book/checkpoint-48/config.json
Configuration saved in ./math_book/checkpoint-48/generation_config.json
Model weights saved in ./math_book/checkpoint-48/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 12
  Batch size = 32
Saving model checkpoint to ./math_book/checkpoint-72
Configuration saved in ./math_book/checkpoint-72/config.json
Configuration saved in ./math_book/checkpoint-72/generation_config.json
Model weights saved in ./math_book/checkpoint-72/pytorch_model.bin
***** Running Evaluation *****
  N

TrainOutput(global_step=240, training_loss=1.147268941005071, metrics={'train_runtime': 98.3177, 'train_samples_per_second': 4.78, 'train_steps_per_second': 2.441, 'total_flos': 30701813760000.0, 'train_loss': 1.147268941005071, 'epoch': 10.0})

In [63]:
# Write the model to the trainer's output_dir ('./math_book'); the next section reloads it from there
trainer.save_model()

Saving model checkpoint to ./math_book
Configuration saved in ./math_book/config.json
Configuration saved in ./math_book/generation_config.json
Model weights saved in ./math_book/pytorch_model.bin


# Restart training now with our own pre-trained "foundation" model

In [36]:
# load up our gpt pre-trained on latex cheat sheets
math_latex_gpt2 = AutoModelForCausalLM.from_pretrained('./math_book')

# Same hyper-parameters as the first attempt; only the starting weights and output_dir differ,
# which keeps the two runs comparable.
training_args = TrainingArguments(
    output_dir="./math_english_to_latex",
    overwrite_output_dir=True,  # overwrite the content of the output directory
    num_train_epochs=5,  # number of training epochs
    per_device_train_batch_size=1,  # batch size for training
    per_device_eval_batch_size=20,  # batch size for evaluation
    load_best_model_at_end=True,  # needs matching evaluation/save strategies (both 'epoch' below)
    logging_steps=5,
    log_level='info',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    # Use Apple's MPS backend only when this PyTorch build actually offers it, so the
    # notebook also runs on machines without an Apple-Silicon GPU.
    use_mps_device=torch.backends.mps.is_available(),
)

trainer = Trainer(
    model=math_latex_gpt2,
    args=training_args,
    train_dataset=latex_data["train"],
    eval_dataset=latex_data["test"],
    data_collator=data_collator,
)

trainer.evaluate()  # loss is starting slightly lower than before

loading configuration file ./math_book/config.json
Model config GPT2Config {
  "_name_or_path": "./math_book",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.30.2",
  "use_cache": true,
  "vocab_size": 50257
}

loading we

Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


{'eval_loss': 2.5888800621032715,
 'eval_runtime': 1.1215,
 'eval_samples_per_second': 8.917,
 'eval_steps_per_second': 0.892}

In [37]:
# Fine-tune the cheat-sheet-pre-trained model on the same English -> LaTeX split as attempt 1
trainer.train()

The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 40
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 200
  Number of trainable parameters = 124,439,808


Epoch,Training Loss,Validation Loss
1,0.8728,0.583309
2,0.7526,0.45441
3,0.7014,0.416536
4,0.4945,0.373505
5,0.3686,0.383265


The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10
  Batch size = 20
Saving model checkpoint to ./math_english_to_latex/checkpoint-40
Configuration saved in ./math_english_to_latex/checkpoint-40/config.json
Configuration saved in ./math_english_to_latex/checkpoint-40/generation_config.json
Model weights saved in ./math_english_to_latex/checkpoint-40/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10
  Batch size = 20
Saving model checkpoint to ./math_english_to_latex/checkpoint-80
Configuration saved in ./math_englis

TrainOutput(global_step=200, training_loss=0.8061065357923508, metrics={'train_runtime': 55.2758, 'train_samples_per_second': 3.618, 'train_steps_per_second': 3.618, 'total_flos': 3018637440000.0, 'train_loss': 0.8061065357923508, 'epoch': 5.0})

In [38]:
trainer.save_model()  # save this model to './math_english_to_latex' so it can be reloaded for generation

Saving model checkpoint to ./math_english_to_latex
Configuration saved in ./math_english_to_latex/config.json
Configuration saved in ./math_english_to_latex/generation_config.json
Model weights saved in ./math_english_to_latex/pytorch_model.bin


In [23]:
# Reload the fine-tuned weights from disk and wrap them in a text-generation pipeline
loaded_model = AutoModelForCausalLM.from_pretrained('./math_english_to_latex')
latex_generator = pipeline('text-generation', model=loaded_model, tokenizer=tokenizer)

# Build the prompt exactly like the training examples, stopping right after the
# conversion token so the model has to supply the LaTeX itself.
text_sample = 'g of x equals integral from 0 to 1 of x squared'
conversion_text_sample = CONVERSION_PROMPT + 'English: ' + text_sample + '\n' + CONVERSION_TOKEN

first_generations = latex_generator(
    conversion_text_sample,
    num_beams=2,
    early_stopping=True,
    temperature=0.7,
    max_new_tokens=24,
)
print(first_generations[0]['generated_text'])

loading configuration file ./math_english_to_latex/config.json
Model config GPT2Config {
  "_name_or_path": "./math_english_to_latex",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.30.2",
  "use_cache": true,
  "vocab_si

Convert English to LaTeX
English: g of x equals integral from 0 to 1 of x squared
LaTeX: g(x) = \int_{0}^{1} x^2 \,dx^2 \,dx


In [24]:
# Second prompt, this time a summation and a wider beam
text_sample = 'r of x is sum from 0 to x of x squared'
conversion_text_sample = CONVERSION_PROMPT + 'English: ' + text_sample + '\n' + CONVERSION_TOKEN

# Length budget: the prompt's token count plus 20
second_budget = len(tokenizer.encode(conversion_text_sample)) + 20
second_generations = latex_generator(
    conversion_text_sample,
    num_beams=5,
    early_stopping=True,
    temperature=0.7,
    max_length=second_budget,
)
print(second_generations[0]['generated_text'])

Generate config GenerationConfig {
  "_from_model_config": true,
  "bos_token_id": 50256,
  "do_sample": true,
  "eos_token_id": 50256,
  "max_length": 50,
  "transformers_version": "4.30.2"
}

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Convert English to LaTeX
English: r of x is sum from 0 to x of x squared
LaTeX: r(x) = \sum_{0}^{x} x^2 \,dx^


In [25]:
# Same sentence, but sent WITHOUT the task prompt / conversion token, to see how the
# fine-tuned model behaves outside the format it was trained on.
# NOTE(review): the length budget is still derived from `conversion_text_sample` (the prompted
# string), not from `text_sample`, so more than 20 new tokens can be generated here - confirm
# this is intended.
prompt_free_budget = len(tokenizer.encode(conversion_text_sample)) + 20
prompt_free_generations = latex_generator(
    text_sample,
    num_beams=5,
    early_stopping=True,
    temperature=0.7,
    max_length=prompt_free_budget,
)
print(prompt_free_generations[0]['generated_text'])

Generate config GenerationConfig {
  "_from_model_config": true,
  "bos_token_id": 50256,
  "do_sample": true,
  "eos_token_id": 50256,
  "max_length": 50,
  "transformers_version": "4.30.2"
}

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


r of x is sum from 0 to x of x squared
x^2 \,dx^2 \,dy^2 \,dy^2 \,dx^2 \,dy^2 \,dx^


In [78]:
# try a few shot with standard gpt2
#
# Baseline pipeline around the un-fine-tuned checkpoint. It is created here because no other
# cell in the notebook defines it, and without it this cell fails on a fresh kernel.
non_finetuned_latex_generator = pipeline(
    'text-generation',
    model=AutoModelForCausalLM.from_pretrained(MODEL),
    tokenizer=tokenizer,
)

# Raw string so the LaTeX backslashes (\sum, \int, \pi, \,) are taken literally instead of
# relying on Python passing unknown escape sequences through. The '###' separators sit on the
# same line as the LaTeX answers, which is exactly the text the earlier backslash-newline
# continuation produced, so the prompt sent to the model is unchanged.
few_shot_prompt = CONVERSION_PROMPT + r"""English: f of x is sum from 0 to x of x squared
LaTeX: f(x) = \sum_{0}^{x} x^2 \,dx ###
LCT
English: f of x equals integral from 0 to pi of x to the fourth power
LaTeX: f(x) = \int_{0}^{\pi} x^4 \,dx ###
LCT
English: pi to the 8th power
LaTeX:"""

print(non_finetuned_latex_generator(
    few_shot_prompt, num_beams=1, early_stopping=True, temperature=0.1,
    max_length=len(tokenizer.encode(few_shot_prompt)) + 20  # prompt tokens plus 20
)[0]['generated_text'])

Generate config GenerationConfig {
  "_from_model_config": true,
  "bos_token_id": 50256,
  "do_sample": true,
  "eos_token_id": 50256,
  "max_length": 50,
  "transformers_version": "4.30.2"
}

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Convert English to LaTeX
English: f of x is sum from 0 to x of x squared
LaTeX: f(x) = \sum_{0}^{x} x^2 \,dx ###
LCT
English: f of x equals integral from 0 to pi of x to the fourth power
LaTeX: f(x) = \int_{0}^{\pi} x^4 \,dx ###
LCT
English: pi to the 8th power
LaTeX: pi to the 8th power
LCT
English: f(x) = \sum_{


In [80]:
# Zero-shot baseline: send the plain conversion prompt to standard gpt2.
# NOTE(review): relies on `non_finetuned_latex_generator` and `conversion_text_sample` already
# being in the kernel. The saved output below was produced with a "pi to the 16th power"
# prompt, which is not the value `conversion_text_sample` is given anywhere in this notebook.
zero_shot_budget = len(tokenizer.encode(conversion_text_sample)) + 20
zero_shot_generations = non_finetuned_latex_generator(
    conversion_text_sample,
    num_beams=1,
    early_stopping=True,
    temperature=0.1,
    max_length=zero_shot_budget,
)
print(zero_shot_generations[0]['generated_text'])

Generate config GenerationConfig {
  "_from_model_config": true,
  "bos_token_id": 50256,
  "do_sample": true,
  "eos_token_id": 50256,
  "max_length": 50,
  "transformers_version": "4.30.2"
}

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Convert English to LaTeX
English: pi to the 16th power
LaTeX: pi to the 16th power
LaTeX: pi to the 16th power
LaTeX:
