In [1]:
# Import packages

import pandas as pd
from transformers import TrainingArguments, Trainer, GPT2Tokenizer, GPT2LMHeadModel, DataCollatorForLanguageModeling
from datasets import load_dataset, load_from_disk

In [2]:
# Tokenizer for the 'gpt2-large' checkpoint. The end-of-sequence token is
# reused as the padding token so that padded batches can be built later on.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-large')
tokenizer.pad_token = tokenizer.eos_token

# Pre-trained weights for the same checkpoint. The model is configured to
# return hidden states and to skip the key/value cache.
model = GPT2LMHeadModel.from_pretrained(
    'gpt2-large',
    output_hidden_states=True,
    use_cache=False,
)

# Switch to training mode before handing the model to the trainer.
model.train()

print('Model successfully loaded!')

Model successfully loaded!


In [3]:
# Prepare/load finetuning dataset
#
# The tokenized dataset is cached under ./tokenized_dataset so that re-running
# this cell skips the CSV parse and the tokenization pass.

try:
    # See if tokenized dataset already exists
    tokenized_dataset = load_from_disk('./tokenized_dataset')
    print('Loading tokenized datset from disk...')

except FileNotFoundError:
    # No cached copy on disk yet: build it from the raw CSV.
    # Only the "cache missing" error is caught. A bare `except:` would also
    # swallow KeyboardInterrupt and unrelated failures (e.g. a corrupt cache)
    # and silently trigger a full re-tokenization instead of reporting them.
    print('Downloading dataset...')
    dataset = load_dataset('csv', data_files='./lm_data.csv', split='train')

    # Tokenize dataset: every 'text' entry is truncated/padded to max_length
    # tokens, then the result is written to disk for later runs.
    print('Tokenizing dataset...')
    max_length = 512
    tokenized_dataset = dataset.map(
        lambda examples: tokenizer(examples['text'],
                                   truncation=True,
                                   max_length=max_length,
                                   padding='max_length'),
        batched=True,
    )
    tokenized_dataset.save_to_disk('./tokenized_dataset')

# Store tokenized dataset
train_dataset = tokenized_dataset

print('Dataset successfully loaded!')

Loading tokenized datset from disk...
Dataset successfully loaded!


In [None]:
# Batch collation and trainer configuration.

# Collator that assembles language-modeling batches; mlm=False turns off the
# masked-language-modeling mode of the collator.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Checkpoints go to ./model_checkpoints; one sample per device per step,
# a single epoch, and gradient checkpointing enabled.
training_args = TrainingArguments(
    output_dir='./model_checkpoints',
    per_device_train_batch_size=1,
    save_total_limit=2,
    num_train_epochs=1,
    gradient_checkpointing=True,
)

print('Training parameters set!')

In [None]:
# Run the fine-tuning job.

# Wire the model, the training arguments, the tokenized training set and the
# collator together into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
)

# Train, then write the resulting model to ./finetuned_model.
trainer.train()

trainer.save_model(output_dir='./finetuned_model')

In [None]:
# Sanity check: reload the saved model from ./finetuned_model and print its
# architecture to confirm it can be read back.

new_model = GPT2LMHeadModel.from_pretrained(
    './finetuned_model',
    output_hidden_states=True,
)
print(new_model)