# Fine-tuning GPT-2
## Write an introduction given the title

## Train

In [1]:
# import libraries

!pip install transformers
import pandas as pd
import numpy as np
from transformers import pipeline
from transformers import AutoTokenizer
from transformers import TextDataset,DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments,AutoModelWithLMHead
import csv
import re


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
# instantiate the tokenizer
tokenizer = AutoTokenizer.from_pretrained('gpt2',sep_token = '<|sep|>',truncation=True, max_length=600000)
tokenizer.model_max_length

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


1024

In [3]:
def load_dataset(train_path,test_path,tokenizer):
    train_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=train_path,
          block_size=128)
     
    val_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=test_path,
          block_size=128)   
    
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset,val_dataset,data_collator

# data loading (data already preprocessed by adding special tokens)
train_dataset,val_dataset,data_collator = load_dataset('train_gpt2_introduction.txt','val_gpt2_introduction.txt', tokenizer)



In [4]:
# istantiate the model (gpt-2 small with 12-layer, 768-hidden, 12-heads, 117M parameters)
model = AutoModelWithLMHead.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))



Embedding(50258, 768)

In [5]:
training_args = TrainingArguments(
    output_dir="./model_parameters",  # define the output directory
    overwrite_output_dir=True,        # True if we want to overwrite the content of the output directory
    num_train_epochs=6,               # number of training epochs
    per_device_train_batch_size=16,   # batch size for training
    per_device_eval_batch_size=16,    # batch size for evaluation
    logging_strategy="steps",
    logging_steps=400,
    eval_steps=200,                   # number of update steps between two evaluations
    save_steps=600,                   # number of update steps before saving the update model 
    warmup_steps=200,                 # number of warmup steps for learning rate scheduler
    )


trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [6]:
import torch

# move the computation on the gpu, if available 
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
model = model.to(device)

cuda


In [7]:
trainer.train()

***** Running training *****
  Num examples = 3670
  Num Epochs = 6
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1380
  Number of trainable parameters = 124440576


Step,Training Loss
400,9.5273
800,3.5191
1200,3.3565


Saving model checkpoint to ./model_parameters/checkpoint-600
Configuration saved in ./model_parameters/checkpoint-600/config.json
Configuration saved in ./model_parameters/checkpoint-600/generation_config.json
Model weights saved in ./model_parameters/checkpoint-600/pytorch_model.bin
Saving model checkpoint to ./model_parameters/checkpoint-1200
Configuration saved in ./model_parameters/checkpoint-1200/config.json
Configuration saved in ./model_parameters/checkpoint-1200/generation_config.json
Model weights saved in ./model_parameters/checkpoint-1200/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1380, training_loss=5.184949968863225, metrics={'train_runtime': 788.203, 'train_samples_per_second': 27.937, 'train_steps_per_second': 1.751, 'total_flos': 1438412636160000.0, 'train_loss': 5.184949968863225, 'epoch': 6.0})

In [8]:
# Save
model_path = "./model"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

Configuration saved in ./model/config.json
Configuration saved in ./model/generation_config.json
Model weights saved in ./model/pytorch_model.bin
tokenizer config file saved in ./model/tokenizer_config.json
Special tokens file saved in ./model/special_tokens_map.json


('./model/tokenizer_config.json',
 './model/special_tokens_map.json',
 './model/vocab.json',
 './model/merges.txt',
 './model/added_tokens.json',
 './model/tokenizer.json')

## Generate

In [9]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
# Load
model_path = "./model"
model = AutoModelWithLMHead.from_pretrained(model_path)

loading configuration file ./model/config.json
Model config GPT2Config {
  "_name_or_path": "./model",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.26.1",
  "use_cache": true,
  "vocab_size": 50258
}

loading weights fi

In [None]:
# Generate ans store new data
import pandas as pd
generator = pipeline('text-generation', model=model_path, tokenizer='gpt2', device=0)

def generate_introduction(text):
    # generate the content
    t = generator(text, do_sample=True, max_length=512, top_k=50, top_p=0.50, num_return_sequences=1)[0]['generated_text']
    # remove the input string 
    t = t[len(text):]
    # search the first upper case letter and delete what was generate before it
    m = re.search("[A-Z]", t)
    if m is not None:
      m = m.start()
      t = t[m:]
    # remove \n and add . if necessary
    t = t.replace('\n', ' ').rsplit('. ', 1)[0] + '.'
    # remove additional parts generated
    if t.find('<|sep|>'):
      x = t.find('<|sep|>')
      t = t[:x].rsplit('. ', 1)[0] + '.'
    return t

titles = pd.read_csv('./gpt2.csv')['title']

# create the output file
with open("./gpt2_from_title_to_introduction.csv",'w') as csvfile:
  # write the header
  writer = csv.DictWriter(csvfile, fieldnames=['title', 'introduction'])
  writer.writeheader()
  # generate the content for each title and append it to the output file
  for elem in titles:
    writer.writerow({"title": elem,'introduction': generate_introduction(str(elem + '<|sep|>'))})

loading configuration file ./model/config.json
Model config GPT2Config {
  "_name_or_path": "./model",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.26.1",
  "use_cache": true,
  "vocab_size": 50258
}

loading configurat