<a href="https://colab.research.google.com/github/sudhang/css-nlp/blob/master/gptneo/GPT_Neo_Train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Source: adapted from the tutorial at https://medium.com/geekculture/fine-tune-eleutherai-gpt-neo-to-generate-netflix-movie-descriptions-in-only-47-lines-of-code-40c9b4c32475

In [None]:
!pip install transformers
!pip install datasets
!pip install accelerate



In [None]:
from transformers import GPTNeoForCausalLM, GPT2Tokenizer
from datasets import load_dataset
from transformers import Trainer, TrainingArguments
import torch
import pandas as pd
import numpy as np

from datasets import Dataset

# Download and prepare the GPT-Neo model

In [None]:
# Fix torch's RNG seed so that repeated runs give reproducible results
torch.manual_seed(42)

# Load the tokenizer of the pre-trained GPT-Neo checkpoint and declare the
# special tokens used in this notebook: sequence start, sequence end, padding
special_tokens = dict(
    bos_token='<|startoftext|>',
    eos_token='<|endoftext|>',
    pad_token='<|pad|>',
)
tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B", **special_tokens)

# Load the pre-trained GPT-Neo weights, then move the model onto the GPU
model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")
model = model.cuda()

# Resize the embedding matrix so it matches the tokenizer's (extended) vocabulary
model.resize_token_embeddings(len(tokenizer))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Embedding(50259, 2048)

# Data

In [None]:
import os

# Root folder for the notebook's input data. It defaults to the original
# Google Drive location but can be overridden with the GDRIVEPATH environment
# variable, so the notebook also runs where that exact folder does not exist.
# NOTE(review): the default presumably requires Google Drive to be mounted at
# /content/drive; no mount call is visible in this notebook - confirm.
GDRIVEPATH = os.environ.get("GDRIVEPATH", "/content/drive/MyDrive/TU/Sem 4/NLP")

In [None]:
# Read the training CSV and keep only its `content` column (still as a DataFrame)
csv_path = f"{GDRIVEPATH}/data/nyt_train.csv"
df = pd.read_csv(csv_path)[["content"]]
display(df)

Unnamed: 0,content
0,"WASHINGTON — Three years ago, President Barack..."
1,When Honda Motor Company said two months ago t...
2,WASHINGTON — Four months after a historic acco...
3,WASHINGTON — Within hours of opening an invest...
4,"WASHINGTON — The F.B.I. director, James B. Com..."
...,...
8580,Senator Marco Rubio of Florida won the Puerto ...
8581,"EXETER, N.H. — Eight years after aggressively ..."
8582,"PRINCETON, Ore. — The armed occupation of a wi..."
8583,"WASHINGTON — As North Korea’s reclusive ruler,..."


In [None]:
# Upper bound on tokens per example. Longer texts are truncated to reduce GPU
# memory usage (a blunt instrument).
TOKEN_CAP = 512

descriptions = df["content"]

# Same value as min(TOKEN_CAP, length of the longest tokenized text), but:
#  * each text is truncated to TOKEN_CAP while encoding, so no over-length
#    sequence is produced just to be measured, and
#  * the scan stops as soon as one text reaches the cap, instead of tokenizing
#    the entire corpus when the answer is already known.
max_length = 0
for description in descriptions:
    n_tokens = len(tokenizer.encode(description, truncation=True, max_length=TOKEN_CAP))
    max_length = max(max_length, n_tokens)
    if max_length >= TOKEN_CAP:
        break
max_length

Token indices sequence length is longer than the specified maximum sequence length for this model (3240 > 2048). Running this sequence through the model will result in indexing errors


512

In [None]:
# Sanity check: show the texts that will be tokenized, then the sequence
# length (as the cell's value) that every example is padded/truncated to.
display(descriptions)
max_length

0       WASHINGTON — Three years ago, President Barack...
1       When Honda Motor Company said two months ago t...
2       WASHINGTON — Four months after a historic acco...
3       WASHINGTON — Within hours of opening an invest...
4       WASHINGTON — The F.B.I. director, James B. Com...
                              ...                        
8580    Senator Marco Rubio of Florida won the Puerto ...
8581    EXETER, N.H. — Eight years after aggressively ...
8582    PRINCETON, Ore. — The armed occupation of a wi...
8583    WASHINGTON — As North Korea’s reclusive ruler,...
8584    The man entered the Red Robin restaurant insid...
Name: content, Length: 8585, dtype: object

512

In [None]:
class NetflixDataset(torch.utils.data.Dataset):
    """Map-style PyTorch dataset of tokenized texts.

    Each text is wrapped in the ``<|startoftext|>`` / ``<|endoftext|>`` markers
    and passed to ``tokenizer`` with truncation and ``max_length`` padding.
    The resulting ids and attention mask are stored as tensors.

    The base class is ``torch.utils.data.Dataset``: the class implements the
    ``__len__`` / ``__getitem__`` protocol and is consumed through
    ``torch.utils.data.random_split``. The ``Dataset`` name imported from the
    Hugging Face ``datasets`` package is a different class with its own
    constructor, which this class never calls.

    Args:
        txt_list: Iterable of strings to tokenize.
        tokenizer: Callable accepting ``(text, truncation=..., max_length=...,
            padding=...)`` and returning a mapping with ``"input_ids"`` and
            ``"attention_mask"`` entries.
        max_length: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, txt_list, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []
        # Never populated in this class; kept so existing attribute access still works.
        self.labels = []
        for txt in txt_list:
            # Encode the text, framed by the start/end markers
            encodings_dict = tokenizer(
                '<|startoftext|>' + txt + '<|endoftext|>',
                truncation=True,
                max_length=max_length,
                padding="max_length",
            )
            self.input_ids.append(torch.tensor(encodings_dict["input_ids"]))
            self.attn_masks.append(torch.tensor(encodings_dict["attention_mask"]))

    def __len__(self):
        """Return the number of encoded texts."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, attention_mask)`` tensors of example ``idx``."""
        return self.input_ids[idx], self.attn_masks[idx]

In [None]:
# Tokenize all texts once, then randomly split the examples into a training
# part (90%, rounded down) and a validation part (the remainder)
dataset = NetflixDataset(descriptions, tokenizer, max_length)
n_total = len(dataset)
train_size = int(0.9 * n_total)
val_size = n_total - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

In [None]:
# Number of examples in the full dataset (train and validation parts together)
len(dataset)

8585

# Set up the trainer

In [None]:
# Trainer configuration: where checkpoints and logs are written, the batch
# sizes for the training and validation steps, the learning-rate warmup and
# the training budget.
#
# Training length is controlled by max_steps alone. A num_train_epochs=3
# argument used to be passed here as well, but a positive max_steps overrides
# it (the recorded run stopped at global_step=300, epoch 0.16), so it only
# misled the reader and has been dropped.
training_args = TrainingArguments(
    output_dir=f"{GDRIVEPATH}/results",       # same folder as before, built from the shared root
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    warmup_steps=100,                         # gradually increase the learning rate
    max_steps=300,
    weight_decay=0.05,                        # try to avoid overfitting
    logging_dir=f"{GDRIVEPATH}/logs",
    logging_steps=10,
)

In [None]:
from torch.utils.data import DataLoader

# Batched loaders over the two splits, sharing the same settings.
# NOTE(review): no other cell visible in this notebook references these
# loaders; the Trainer cell is handed the datasets themselves. Confirm whether
# this cell is still needed.
loader_options = dict(batch_size=16, num_workers=4)
train_dataloader = DataLoader(train_dataset, **loader_options)
val_dataloader = DataLoader(val_dataset, **loader_options)

In [None]:
def collate_batch(batch):
    """Stack ``(input_ids, attention_mask)`` examples into one model-input dict.

    The labels are a copy of the input ids in which every position with
    ``attention_mask == 0`` (padding) is replaced by -100, the index the
    language-modeling loss ignores. Without this masking the model would also
    be trained to predict the padding token after every short text.

    Args:
        batch: Sequence of ``(input_ids, attention_mask)`` tensor pairs of
            equal length.

    Returns:
        Dict with stacked ``input_ids``, ``attention_mask`` and ``labels``.
    """
    input_ids = torch.stack([example[0] for example in batch])
    attention_mask = torch.stack([example[1] for example in batch])
    labels = input_ids.clone()
    labels[attention_mask == 0] = -100
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels,
    }


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=collate_batch,
)

# Start training process!
trainer.train()



Step,Training Loss
10,10.0221
20,7.9002
30,5.1398
40,3.1893
50,2.4177
60,2.3951
70,2.3699
80,2.4892
90,2.4258
100,2.484


TrainOutput(global_step=300, training_loss=3.0097874196370444, metrics={'train_runtime': 335.873, 'train_samples_per_second': 3.573, 'train_steps_per_second': 0.893, 'total_flos': 4454849131315200.0, 'train_loss': 3.0097874196370444, 'epoch': 0.16})

# Save the model

In [None]:
# Authenticate with the Hugging Face Hub before uploading. login() is
# interactive: in the recorded run it rendered a widget asking for credentials.
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
finetuned_model_id = "gptneo_cssnlp"
hub_repo_id = f"sudhangshankar/{finetuned_model_id}"

# Upload the tokenizer too: it was created with custom special tokens and the
# model's embedding matrix was resized to match it, so the uploaded weights
# are only usable together with this exact tokenizer.
tokenizer.push_to_hub(hub_repo_id, use_auth_token=True)

# Push the model to the Hugging Face Hub (kept last so its CommitInfo is the cell output)
model.push_to_hub(hub_repo_id, use_auth_token=True)

pytorch_model.bin:   0%|          | 0.00/5.26G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/sudhangshankar/gptneo_cssnlp/commit/217b63ddfc850e54bf26524eaa9a2902400b0eeb', commit_message='Upload GPTNeoForCausalLM', commit_description='', oid='217b63ddfc850e54bf26524eaa9a2902400b0eeb', pr_url=None, pr_revision=None, pr_num=None)