# Fine-tuning GPT-2
## Write a conclusion given the title

## Train

In [1]:
# import libraries

!pip install transformers
from transformers import AutoTokenizer
from transformers import TextDataset,DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments,AutoModelWithLMHead
from transformers import pipeline
import csv
import re
import pandas as pd
import numpy as np

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m37.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m54.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.1-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.1 tokenizers-0.13.2 transformers-4.26.1


In [2]:
# instantiate the tokenizer
tokenizer = AutoTokenizer.from_pretrained('gpt2',sep_token = '<|sep|>',truncation=True, max_length=600000)
tokenizer.model_max_length

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


1024

In [4]:
def load_dataset(train_path,test_path,tokenizer):
    train_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=train_path,
          block_size=128)
     
    test_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=test_path,
          block_size=128)   
    
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset,test_dataset,data_collator

# data loading (data already preprocessed by adding special tokens)
train_dataset,test_dataset,data_collator = load_dataset('train_gpt2_conclusion.txt','val_gpt2_conclusion.txt', tokenizer)

Token indices sequence length is longer than the specified maximum sequence length for this model (1647075 > 1024). Running this sequence through the model will result in indexing errors


In [5]:
# istantiate the model (gpt-2 small with 12-layer, 768-hidden, 12-heads, 117M parameters)
model = AutoModelWithLMHead.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))



Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Embedding(50258, 768)

In [6]:
training_args = TrainingArguments(
    output_dir="./model_parameters",  # define the output directory
    overwrite_output_dir=True,        # True if we want to overwrite the content of the output directory
    num_train_epochs=6,               # number of training epochs
    per_device_train_batch_size=16,   # batch size for training
    per_device_eval_batch_size=16,    # batch size for evaluation
    logging_strategy="steps",
    logging_steps=400,
    eval_steps=200,                   # number of update steps between two evaluations
    save_steps=600,                   # number of update steps before saving the update model 
    warmup_steps=200,                 # number of warmup steps for learning rate scheduler
    )


trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [7]:
import torch

# move the computation on the gpu, if available 
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
model = model.to(device)

cuda


In [None]:
trainer.train()

***** Running training *****
  Num examples = 12867
  Num Epochs = 6
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 4830
  Number of trainable parameters = 124440576


Step,Training Loss
400,9.3446
800,3.7025
1200,3.556
1600,3.5096
2000,3.4055
2400,3.3964
2800,3.3145


Saving model checkpoint to ./model_parameters/checkpoint-600
Configuration saved in ./model_parameters/checkpoint-600/config.json
Configuration saved in ./model_parameters/checkpoint-600/generation_config.json
Model weights saved in ./model_parameters/checkpoint-600/pytorch_model.bin
Saving model checkpoint to ./model_parameters/checkpoint-1200
Configuration saved in ./model_parameters/checkpoint-1200/config.json
Configuration saved in ./model_parameters/checkpoint-1200/generation_config.json
Model weights saved in ./model_parameters/checkpoint-1200/pytorch_model.bin
Saving model checkpoint to ./model_parameters/checkpoint-1800
Configuration saved in ./model_parameters/checkpoint-1800/config.json
Configuration saved in ./model_parameters/checkpoint-1800/generation_config.json
Model weights saved in ./model_parameters/checkpoint-1800/pytorch_model.bin
Saving model checkpoint to ./model_parameters/checkpoint-2400
Configuration saved in ./model_parameters/checkpoint-2400/config.json
Confi

In [None]:
# Save
model_path = "./model"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

## Generate

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load
model_path = "./model"
model = AutoModelWithLMHead.from_pretrained(model_path)

In [None]:
# Generate ans store new data
import pandas as pd
generator = pipeline('text-generation', model=model_path, tokenizer='gpt2', device=0)

def generate_conclusion(text):
    # generate the content
    t = generator(text, do_sample=True, max_length=512, top_k=50, top_p=0.50, num_return_sequences=1)[0]['generated_text']
    # remove the input string 
    t = t[len(text):]
    # search the first upper case letter and delete what was generate before it
    m = re.search("[A-Z]", t)
    if m is not None:
      m = m.start()
      t = t[m:]
    # remove \n and add . if necessary
    t = t.replace('\n', ' ').rsplit('. ', 1)[0] + '.'
    # remove additional parts generated
    if t.find('<|sep|>'):
      x = t.find('<|sep|>')
      t = t[:x].rsplit('. ', 1)[0] + '.'
    return t

titles = pd.read_csv('./gpt2.csv')['title']

# create the output file
with open("./gpt2_from_title_to_conclusion.csv",'w') as csvfile:
  # write the header
  writer = csv.DictWriter(csvfile, fieldnames=['title', 'conclusion'])
  writer.writeheader()
  # generate the content for each title and append it to the output file
  for elem in titles:
    writer.writerow({"title": elem,'conclusion': generate_conclusion(str(elem + '<|sep|>'))})