## GPT for code dictation

In [None]:
!pip install evaluate transformers[torch] datasets



In [None]:
from transformers import AutoTokenizer, DataCollatorForLanguageModeling, TrainingArguments, Trainer, \
                         AutoModelForCausalLM, pipeline
from datasets import Dataset
import pandas as pd

In [None]:
# Load the English/LaTeX pairs; the 'English' and 'LaTeX' columns are used
# further down to build the training text.
data = pd.read_csv('english_to_latex.csv')

print(data.shape)

data.head(2)

(50, 2)


Unnamed: 0,English,LaTeX
0,integral from a to b of x squared,"\int_{a}^{b} x^2\,dx"
1,integral from negative 1 to 1 of x squared,"\int_{-1}^{1} x^2\,dx"


In [None]:
# Prompt scaffolding shared by the training examples and the inference prompts
CONVERSION_PROMPT = 'English to LaTeX\n\n'  # task header placed before every example
CONVERSION_TOKEN = 'LaTeX:'                 # marker after which the model writes its answer

# Base checkpoint and its tokenizer
MODEL = 'gpt2'
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Reuse the end-of-sequence token as the padding token
# (avoids a warning in our transformers code)
tokenizer.pad_token = tokenizer.eos_token


In [None]:
# Build the "training prompt" GPT-2 should learn, one string per row:
#   <task header>English: <english>\nLaTeX: <latex><eos>
english_part = f'{CONVERSION_PROMPT}English: ' + data['English']
latex_part = CONVERSION_TOKEN + ' ' + data['LaTeX'] + tokenizer.eos_token
training_examples = (english_part + '\n' + latex_part).astype(str)

print(training_examples[0])


English to LaTeX

English: integral from a to b of x squared
LaTeX: \int_{a}^{b} x^2\,dx<|endoftext|>


## From GPT-3.5-Turbo-Instruct
![](../images/gpt3latex.png)

In [None]:
# At inference time the model input stops right after the conversion token,
# leaving the LaTeX for the model to complete
print(CONVERSION_PROMPT + "English: integral from a to b of x squared\nLaTeX:")

English to LaTeX

English: integral from a to b of x squared
LaTeX:


In [None]:
# One-column frame ('text') holding the formatted training strings
task_df = training_examples.to_frame(name='text')

task_df.head(2)

Unnamed: 0,text
0,English to LaTeX\n\nEnglish: integral from a t...
1,English to LaTeX\n\nEnglish: integral from neg...


In [None]:
# Print the first formatted example in full (head() truncates long strings)
print(task_df.loc[0, 'text'])

English to LaTeX

English: integral from a to b of x squared
LaTeX: \int_{a}^{b} x^2\,dx<|endoftext|>


In [None]:
SPLIT_SEED = 42  # fixed so the train/test split (and the reported losses) are reproducible

latex_data = Dataset.from_pandas(task_df)  # turn a pandas DataFrame into a Dataset


def preprocess(examples):
    """Tokenize a batch of examples.

    Args:
        examples: a batch from the dataset; its 'text' entry holds the formatted strings.

    Returns:
        The tokenizer output for the batch. Sequences are truncated but NOT padded,
        because the data collator pads each batch dynamically.
    """
    return tokenizer(examples['text'], truncation=True)


latex_data = latex_data.map(preprocess, batched=True)

# 70% train / 30% test. Without a seed the split differs on every run, which makes
# the loss figures of the different training runs impossible to compare or reproduce.
latex_data = latex_data.train_test_split(train_size=.7, seed=SPLIT_SEED)


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [None]:
# Inspect one tokenized training example: the original text plus its input_ids and attention_mask
latex_data['train'][0]

{'text': 'English to LaTeX\n\nEnglish: pi over n\nLaTeX: \\frac{\\pi}{n}<|endoftext|>',
 'input_ids': [15823,
  284,
  4689,
  49568,
  198,
  198,
  15823,
  25,
  31028,
  625,
  299,
  198,
  14772,
  49568,
  25,
  3467,
  31944,
  31478,
  14415,
  18477,
  77,
  92,
  50256],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1]}

In [None]:
# Collator that pads each batch dynamically; mlm=False turns off masked-language-model
# masking, since we train a causal model
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
# We could also try other Causal models like GPT-J or GPT-Neo.
# trust_remote_code is deliberately left at its default (False): GPT-2 is a built-in
# transformers architecture (it loads as GPT2LMHeadModel), so there is no custom
# modelling code to run, and enabling the flag would allow code shipped in a Hub
# repository to execute locally.
latex_gpt = AutoModelForCausalLM.from_pretrained(MODEL)

In [None]:
# Display the module structure of the loaded model
latex_gpt

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [None]:
# Fine-tuning configuration for the English -> LaTeX task
training_args = TrainingArguments(
    # where checkpoints are written (existing contents may be overwritten)
    output_dir="./english_to_latex",
    overwrite_output_dir=True,

    # schedule and batch sizes
    num_train_epochs=10,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=20,

    # deep learning params
    learning_rate=2e-4,
    weight_decay=0.01,

    # evaluate and checkpoint once per epoch, then reload the best checkpoint at the end
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,

    # logging
    logging_steps=5,
    log_level='info',
)

trainer = Trainer(
    model=latex_gpt,
    args=training_args,
    data_collator=data_collator,
    train_dataset=latex_data["train"],
    eval_dataset=latex_data["test"],
)

# Baseline: evaluation loss before any fine-tuning
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 15
  Batch size = 20
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 4.3698530197143555,
 'eval_runtime': 0.8265,
 'eval_samples_per_second': 18.15,
 'eval_steps_per_second': 1.21}

In [None]:
# Fine-tune GPT-2 on the English -> LaTeX training split
trainer.train()

The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 35
  Num Epochs = 10
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 350
  Number of trainable parameters = 124,439,808


Epoch,Training Loss,Validation Loss
1,1.3329,1.079309
2,0.6696,0.820034
3,0.7265,0.822533
4,0.2934,0.888356
5,0.4184,0.929315
6,0.3021,0.794286
7,0.3303,0.843802
8,0.2372,0.901237
9,0.2267,0.941106
10,0.272,0.944768


The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 15
  Batch size = 20


Saving model checkpoint to ./english_to_latex/checkpoint-35
Configuration saved in ./english_to_latex/checkpoint-35/config.json
Configuration saved in ./english_to_latex/checkpoint-35/generation_config.json
Model weights saved in ./english_to_latex/checkpoint-35/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 15
  Batch size = 20
Saving model checkpoint to ./english_to_latex/checkpoint-70
Configuration saved in ./english_to_latex/checkpoint-70/config.json
Configuration saved in ./english_to_latex/checkpoint-70/generation_config.json
Model weights saved in ./english_to_latex/checkpoint-70/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. I

TrainOutput(global_step=350, training_loss=0.47907389572688514, metrics={'train_runtime': 143.9192, 'train_samples_per_second': 2.432, 'train_steps_per_second': 2.432, 'total_flos': 5195220480000.0, 'train_loss': 0.47907389572688514, 'epoch': 10.0})

In [None]:
# load_best_model_at_end=True restored the best checkpoint, so this reports the best
# validation loss (about 0.794 in the recorded run, matching epoch 6 of the table above)
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 15
  Batch size = 20


{'eval_loss': 0.7942858338356018,
 'eval_runtime': 0.0994,
 'eval_samples_per_second': 150.964,
 'eval_steps_per_second': 10.064,
 'epoch': 10.0}

In [None]:
# Let's try fine-tuning again, but first let the model read some LaTeX tutorials

In [None]:
from tqdm import tqdm

In [None]:
from urllib.parse import urljoin

from bs4 import BeautifulSoup
import requests

# Index page whose links are crawled
base_url = "https://www.overleaf.com/learn"

# Seconds to wait for each HTTP response; without a timeout a stalled
# connection would block the cell indefinitely
REQUEST_TIMEOUT = 30

latex_tutorials = ''  # concatenated plain text of every tutorial page
urls = []             # absolute URLs of the tutorial pages that get crawled

# Fetch the index page
response = requests.get(base_url, timeout=REQUEST_TIMEOUT)

if response.status_code == 200:
    # Parse the index page and collect every anchor that has an href
    soup = BeautifulSoup(response.content, 'html.parser')
    links = soup.find_all('a', href=True)

    # Keep site-relative links that mention "latex", make them absolute, and drop
    # duplicates (the same page is linked several times) while keeping page order
    urls = list(dict.fromkeys(
        urljoin(base_url, link['href'])
        for link in links
        if link['href'].startswith('/')
        and not link['href'].startswith('//')
        and 'latex' in link['href']
    ))

    # Download each tutorial page and keep only its visible text.
    # A page that fails to download is skipped instead of aborting the whole crawl
    # or adding an error page to the corpus.
    pages = []
    for u in tqdm(urls):
        try:
            page = requests.get(u, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as err:
            print(f"Skipping {u}: {err}")
            continue
        if page.status_code != 200:
            print(f"Skipping {u}: HTTP {page.status_code}")
            continue
        pages.append(BeautifulSoup(page.text, 'html.parser').get_text() + '\n\n')

    latex_tutorials = ''.join(pages)
else:
    print("Failed to retrieve the webpage.")


100%|██████████| 126/126 [00:33<00:00,  3.73it/s]


In [None]:
# Alternative (kept for reference): use a single tutorial page instead of the crawl
# latex_tutorial = BeautifulSoup(
#     requests.get('https://www.overleaf.com/learn/latex/Learn_LaTeX_in_30_minutes').text, 'html.parser'
# ).get_text()

# Persist the scraped text so it can be loaded from 'latex_tutorials.txt' for pre-training.
# The encoding is explicit because scraped pages can contain non-ASCII characters that the
# platform's default encoding may not be able to represent.
with open('latex_tutorials.txt', 'w', encoding='utf-8') as f:
    f.write(latex_tutorials)


In [None]:
from transformers import TextDataset

# Cut the scraped tutorial text into fixed-size blocks of token ids
pre_training_data = TextDataset(
    tokenizer=tokenizer,
    file_path='latex_tutorials.txt',
    block_size=128
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False,  # MLM is Masked Language Modelling
)

# Fresh copy of the base model, so this run is independent of the fine-tuned latex_gpt
latex_gpt2 = AutoModelForCausalLM.from_pretrained(MODEL)

training_args = TrainingArguments(
    output_dir="./pre_trained",
    overwrite_output_dir=True, # overwrite the content of the output directory
    num_train_epochs=5, # number of training epochs
    per_device_train_batch_size=32, # batch size for training
    per_device_eval_batch_size=32,  # batch size for evaluation
    load_best_model_at_end=True,
    logging_steps=1,
    # Evaluation and checkpointing both happen once per epoch. The former eval_steps=50
    # was dropped: it only applies when evaluation_strategy='steps', so it had no effect.
    evaluation_strategy='epoch',
    save_strategy='epoch',

    # deep learning params
    learning_rate=2e-4,
    weight_decay=0.01,
)

# First 80% of the blocks for training, remaining 20% for evaluation
split_at = int(len(pre_training_data.examples) * .8)

trainer = Trainer(
    model=latex_gpt2,
    args=training_args,
    data_collator=data_collator,
    train_dataset=pre_training_data.examples[:split_at],
    eval_dataset=pre_training_data.examples[split_at:]
)

Loading features from cached file cached_lm_GPT2TokenizerFast_128_latex_tutorials.txt [took 0.014 s]
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--gpt2/snapshots/11c5a3d5811f50298f278a704980280950aedb10/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
   

In [None]:
# Initial loss on the held-out tutorial blocks, before any pre-training
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 365
  Batch size = 32


{'eval_loss': 3.89501953125,
 'eval_runtime': 3.9252,
 'eval_samples_per_second': 92.99,
 'eval_steps_per_second': 3.057}

In [None]:
# Continue training GPT-2 on the scraped tutorial text
trainer.train()

***** Running training *****
  Num examples = 1,456
  Num Epochs = 5
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 230
  Number of trainable parameters = 124,439,808


Epoch,Training Loss,Validation Loss
1,1.8169,2.183348
2,1.5151,2.114973
3,1.208,2.11754
4,0.8294,2.146433
5,1.0541,2.16314


***** Running Evaluation *****
  Num examples = 365
  Batch size = 32


Saving model checkpoint to ./pre_trained/checkpoint-46
Configuration saved in ./pre_trained/checkpoint-46/config.json
Configuration saved in ./pre_trained/checkpoint-46/generation_config.json
Model weights saved in ./pre_trained/checkpoint-46/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 365
  Batch size = 32
Saving model checkpoint to ./pre_trained/checkpoint-92
Configuration saved in ./pre_trained/checkpoint-92/config.json
Configuration saved in ./pre_trained/checkpoint-92/generation_config.json
Model weights saved in ./pre_trained/checkpoint-92/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 365
  Batch size = 32
Saving model checkpoint to ./pre_trained/checkpoint-138
Configuration saved in ./pre_trained/checkpoint-138/config.json
Configuration saved in ./pre_trained/checkpoint-138/generation_config.json
Model weights saved in ./pre_trained/checkpoint-138/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 365
  Batch size = 32
Sav

TrainOutput(global_step=230, training_loss=1.403594895290292, metrics={'train_runtime': 335.303, 'train_samples_per_second': 21.712, 'train_steps_per_second': 0.686, 'total_flos': 475551498240000.0, 'train_loss': 1.403594895290292, 'epoch': 5.0})

In [None]:
# Save the model to the trainer's output_dir (./pre_trained).
# A decent drop in loss: validation loss went from about 3.90 before training
# to about 2.11 at the best epoch of the recorded run.
trainer.save_model()

Saving model checkpoint to ./pre_trained
Configuration saved in ./pre_trained/config.json
Configuration saved in ./pre_trained/generation_config.json
Model weights saved in ./pre_trained/pytorch_model.bin


In [None]:
# Start from our GPT-2 that was further pre-trained on the scraped LaTeX tutorials
pre_trained_latex_gpt2 = AutoModelForCausalLM.from_pretrained('./pre_trained')

# Same fine-tuning recipe as before, written to a separate output directory
training_args = TrainingArguments(
    # where checkpoints are written (existing contents may be overwritten)
    output_dir="./pre_trained_english_to_latex",
    overwrite_output_dir=True,

    # schedule and batch sizes
    num_train_epochs=10,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=20,

    # deep learning params
    learning_rate=2e-4,
    weight_decay=0.01,

    # evaluate and checkpoint once per epoch, then reload the best checkpoint at the end
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,

    # logging
    logging_steps=5,
    log_level='info',
)

trainer = Trainer(
    model=pre_trained_latex_gpt2,
    args=training_args,
    data_collator=data_collator,
    train_dataset=latex_data["train"],
    eval_dataset=latex_data["test"],
)

# loss is starting slightly lower than before
trainer.evaluate()

loading configuration file ./pre_trained/config.json
Model config GPT2Config {
  "_name_or_path": "./pre_trained",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.35.2",
  "use_cache": true,
  "vocab_size": 50257
}

loadin

{'eval_loss': 4.062228202819824,
 'eval_runtime': 0.1066,
 'eval_samples_per_second': 140.707,
 'eval_steps_per_second': 9.38}

In [None]:
# Fine-tune the tutorial-pre-trained model on the English -> LaTeX training split
trainer.train()

The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 35
  Num Epochs = 10
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 350
  Number of trainable parameters = 124,439,808


Epoch,Training Loss,Validation Loss
1,1.2157,0.951785
2,0.5345,0.865575
3,0.6183,0.763343
4,0.2933,0.886332
5,0.3947,0.831877
6,0.2215,0.803732
7,0.288,0.852185
8,0.2251,0.931376
9,0.1964,0.973966
10,0.3312,0.99792


The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 15
  Batch size = 20


Saving model checkpoint to ./pre_trained_english_to_latex/checkpoint-35
Configuration saved in ./pre_trained_english_to_latex/checkpoint-35/config.json
Configuration saved in ./pre_trained_english_to_latex/checkpoint-35/generation_config.json
Model weights saved in ./pre_trained_english_to_latex/checkpoint-35/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 15
  Batch size = 20
Saving model checkpoint to ./pre_trained_english_to_latex/checkpoint-70
Configuration saved in ./pre_trained_english_to_latex/checkpoint-70/config.json
Configuration saved in ./pre_trained_english_to_latex/checkpoint-70/generation_config.json
Model weights saved in ./pre_trained_english_to_latex/checkpoint-70/pytorch_model.bin
The following columns in the evaluation set

TrainOutput(global_step=350, training_loss=0.4374154887880598, metrics={'train_runtime': 182.3335, 'train_samples_per_second': 1.92, 'train_steps_per_second': 1.92, 'total_flos': 5195220480000.0, 'train_loss': 0.4374154887880598, 'epoch': 10.0})

In [None]:
# Pre-training on the tutorials first (num_train_epochs=5 in that run) led to a minor drop
# in the best validation loss: about 0.763 here vs about 0.794 without it (recorded run)
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 15
  Batch size = 20


{'eval_loss': 0.7633429169654846,
 'eval_runtime': 0.1001,
 'eval_samples_per_second': 149.864,
 'eval_steps_per_second': 9.991,
 'epoch': 10.0}

In [None]:
# Save this model to the trainer's output_dir (./pre_trained_english_to_latex)
trainer.save_model()

Saving model checkpoint to ./pre_trained_english_to_latex
Configuration saved in ./pre_trained_english_to_latex/config.json
Configuration saved in ./pre_trained_english_to_latex/generation_config.json
Model weights saved in ./pre_trained_english_to_latex/pytorch_model.bin


In [None]:
# Reload the saved fine-tuned model from disk and wrap it in a text-generation pipeline
loaded_model = AutoModelForCausalLM.from_pretrained('./pre_trained_english_to_latex')
latex_generator = pipeline(
    task='text-generation',
    model=loaded_model,
    tokenizer=tokenizer,
)

loading configuration file ./pre_trained_english_to_latex/config.json
Model config GPT2Config {
  "_name_or_path": "./pre_trained_english_to_latex",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.35.2",
  "use_cache": tru

In [None]:
# English input, wrapped in the same prompt format that was used for training
text_sample = 'g of x equals integral from 0 to pi of x'
conversion_text_sample = CONVERSION_PROMPT + 'English: ' + text_sample + '\n' + CONVERSION_TOKEN

print(conversion_text_sample)

English to LaTeX

English: g of x equals integral from 0 to pi of x
LaTeX:


In [None]:
# Complete the prompt with the fine-tuned model and print the full generated text
print(
    latex_generator(
        conversion_text_sample,
        num_beams=5,
        early_stopping=True,
        temperature=0.7,
        eos_token_id=tokenizer.eos_token_id,
        max_new_tokens=20,
    )[0]['generated_text']
)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


English to LaTeX

English: g of x equals integral from 0 to pi of x
LaTeX: g(x) = \int_{0}^{pi} x\,dx\,dx


In [None]:
# Same prompt and generation settings as the previous cell, run a second time
print(latex_generator(
    conversion_text_sample, num_beams=5, early_stopping=True, temperature=0.7, eos_token_id=tokenizer.eos_token_id,
    max_new_tokens=20
)[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


English to LaTeX

English: g of x equals integral from 0 to pi of x
LaTeX: g(x) = \int_{0}^{pi} x\,dx\,dx


In [None]:
# The bare English input, without any prompt formatting
print(text_sample)

g of x equals integral from 0 to pi of x


In [None]:
# The same input wrapped in the training prompt format
print(conversion_text_sample)

English to LaTeX

English: g of x equals integral from 0 to pi of x
LaTeX:


In [None]:
# Feed the bare English text (text_sample, without the CONVERSION_PROMPT / CONVERSION_TOKEN
# framing) to see what the fine-tuned model produces when the prompt format is missing.
# NOTE(review): the recorded output below starts with a different text_sample
# ('r of x is the sum ...') that is only assigned in a later cell, so this cell appears to
# have been executed out of order; re-run with Restart & Run All to refresh the output.
print(latex_generator(
    text_sample,
    num_beams=3, early_stopping=True,
    eos_token_id=tokenizer.eos_token_id, max_new_tokens=20
)[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


r of x is the sum from 0 to x of x squared

\sum_{0}^{x^3\,dx^3\,dx^


In [None]:
# Another example (note: this rebinds text_sample and conversion_text_sample)
text_sample = 'r of x is the sum from 0 to x of x squared'
conversion_text_sample = CONVERSION_PROMPT + 'English: ' + text_sample + '\n' + CONVERSION_TOKEN

print(
    latex_generator(
        conversion_text_sample,
        num_beams=3,
        early_stopping=True,
        temperature=0.7,
        eos_token_id=tokenizer.eos_token_id,
        max_new_tokens=20,
    )[0]['generated_text']
)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


English to LaTeX

English: r of x is the sum from 0 to x of x squared
LaTeX: r(x) = x^2\,dx^2\,dx^2\,


In [None]:
# Sanity check that a non-finetuned model could not have done this.
# The baseline is loaded through the MODEL constant so it is always the same base
# checkpoint that the fine-tuned models started from.
non_finetuned_latex_generator = pipeline(
    'text-generation',
    model=AutoModelForCausalLM.from_pretrained(MODEL),  # not fine-tuned!
    tokenizer=tokenizer
)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--gpt2/snapshots/11c5a3d5811f50298f278a704980280950aedb10/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_vers

In [None]:
# Few-shot prompt for the non-fine-tuned baseline: two solved examples, each ending in a
# "###" separator, followed by the query to complete.
# Written as a raw string so the LaTeX backslashes (\sum, \int, \pi, \,) are taken
# literally instead of relying on invalid escape sequences, which newer Python versions
# warn about. The "###" separators sit at the end of the LaTeX lines, which is exactly
# the text the previous backslash-newline continuations produced, so the prompt value
# is unchanged.
few_shot_prompt = r"""English to LaTeX

English: f of x is sum from 0 to x of x squared
LaTeX: f(x) = \sum_{0}^{x} x^2 \,dx ###
English: f of x equals integral from 0 to pi of x to the fourth power
LaTeX: f(x) = \int_{0}^{\pi} x^4 \,dx ###
English: x squared
LaTeX:"""

In [None]:
# Let the non-fine-tuned model complete the few-shot prompt; max_length is the
# prompt's token count plus 20
print(
    non_finetuned_latex_generator(
        few_shot_prompt,
        num_beams=5,
        early_stopping=True,
        temperature=0.7,
        max_length=len(tokenizer.encode(few_shot_prompt)) + 20,
    )[0]['generated_text']
)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


English to LaTeX

English: f of x is sum from 0 to x of x squared
LaTeX: f(x) = \sum_{0}^{x} x^2 \,dx ###
English: f of x equals integral from 0 to pi of x to the fourth power
LaTeX: f(x) = \int_{0}^{\pi} x^4 \,dx ###
English: x squared
LaTeX: f(x) = \sum_{0}^{x} x^2 \,dx ###


In [None]:
# Same baseline model on the plain training-style prompt (no solved examples);
# max_length is the prompt's token count plus 20
print(
    non_finetuned_latex_generator(
        conversion_text_sample,
        num_beams=5,
        early_stopping=True,
        temperature=0.7,
        max_length=len(tokenizer.encode(conversion_text_sample)) + 20,
    )[0]['generated_text']
)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


English to LaTeX

English: r of x is the sum from 0 to x of x squared
LaTeX: r of x is the sum from 0 to x of x squared

LaTeX: r of
