# Finetuning GPT on Chilean Spanish

This is an exercise in using Huggingface to finetune GPT.

In [2]:
from transformers import GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, GPT2LMHeadModel, pipeline, \
                         Trainer, TrainingArguments

In [None]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [None]:
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

In [6]:
chileno_data = TextDataset(
    tokenizer=tokenizer,
    file_path='clean_chilean_reddit.txt',  # Chilean reddit
    block_size=32  # length of each chunk of text to use as a datapoint
)

In [7]:
chileno_data[0], chileno_data[0].shape

(tensor([30562,   660,  7252,  8045,    11, 12940,   660,  7252,   283,  2049,
           257,   555, 49027, 21872, 12520,   236,   114,   198, 15681,   284,
          1326,   362, 19643,   320,  7063,   844,   295,   274,   269,  4763,
           807,  3076]),
 torch.Size([32]))

In [8]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False,  # MLM is Masked Language Modelling
)

In [9]:
collator_example = data_collator([tokenizer('Yo soy datos'), tokenizer('También soy datos')])

collator_example

{'input_ids': tensor([[38101, 17797,  4818,   418, 50257, 50257, 50257],
        [   51,  4131,    72, 35942, 17797,  4818,   418]]), 'attention_mask': tensor([[1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1]]), 'labels': tensor([[38101, 17797,  4818,   418,  -100,  -100,  -100],
        [   51,  4131,    72, 35942, 17797,  4818,   418]])}

In [None]:
model_checkpoint = 'gpt2'

model = GPT2LMHeadModel.from_pretrained(model_checkpoint)  # load up a GPT2 model

pretrained_generator = pipeline(
    'text-generation', model=model, tokenizer='gpt2',
    config={'max_length': 200, 'do_sample': True, 'top_p': 0.9, 'temperature': 0.7, 'top_k': 10}
)

In [None]:
training_args = TrainingArguments(
    output_dir="caffsean/chilenoGPT", #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=10, # number of training epochs
    per_device_train_batch_size=32, # batch size for training
    per_device_eval_batch_size=32,  # batch size for evaluation
    warmup_steps=len(chileno_data.examples) // 5, # number of warmup steps for learning rate scheduler,
    logging_steps=50,
    load_best_model_at_end=True,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    hub_token='',
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=chileno_data.examples[:int(len(chileno_data.examples)*.8)],
    eval_dataset=chileno_data.examples[int(len(chileno_data.examples)*.8):]
)

trainer.evaluate()

In [12]:
trainer.train()



Epoch,Training Loss,Validation Loss
1,4.4985,4.310601
2,4.1063,3.979791
3,3.8797,3.788628
4,3.7554,3.664512
5,3.616,3.579157
6,3.534,3.515174
7,3.4631,3.463155
8,3.3867,3.432958
9,3.2781,3.397545


Epoch,Training Loss,Validation Loss
1,4.4985,4.310601
2,4.1063,3.979791
3,3.8797,3.788628
4,3.7554,3.664512
5,3.616,3.579157
6,3.534,3.515174
7,3.4631,3.463155
8,3.3867,3.432958
9,3.2781,3.397545
10,3.2074,3.392071


TrainOutput(global_step=38020, training_loss=3.7507751234075637, metrics={'train_runtime': 12042.3252, 'train_samples_per_second': 101.025, 'train_steps_per_second': 3.157, 'total_flos': 1.986766626816e+16, 'train_loss': 3.7507751234075637, 'epoch': 10.0})

In [None]:
trainer.push_to_hub()