# Generating Song Lyrics

In [1]:
import pandas as pd
import numpy as np

from transformers import (
    CONFIG_MAPPING,
    MODEL_WITH_LM_HEAD_MAPPING,
    AutoConfig,
    AutoModelWithLMHead,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    LineByLineTextDataset,
    PreTrainedTokenizer,
    TextDataset,
    Trainer,
    TrainingArguments,
    set_seed,
    GPT2LMHeadModel,  GPT2Tokenizer, GPT2Config, GPT2LMHeadModel 
    
)

import torch
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler



In [2]:
# Fix the random seed so repeated runs start from the same state.
set_seed(42)

# Start from the published GPT-2 configuration; hidden states are not
# requested from forward passes.
configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False)

model = GPT2LMHeadModel.from_pretrained("gpt2", config=configuration)

# Use the GPU when one is present and fall back to the CPU otherwise, so the
# notebook does not crash on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# `.to()` returns the module, so the cell still displays the model summary.
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

In [3]:
# GPT-2 tokenizer with explicit start / end / padding markers.
tokenizer = GPT2Tokenizer.from_pretrained(
    'gpt2',
    bos_token='<|startoftext|>',
    eos_token='<|endoftext|>',
    pad_token='<|pad|>',
)

# Alternative (not used): register the special tokens on an AutoTokenizer.
#   tokenizer = AutoTokenizer.from_pretrained('gpt2')
#   special_tokens_dict = {
#       'bos_token': '<BOS>',
#       'eos_token': '<EOS>',
#       'pad_token': '<PAD>',
#       'unk_token': '<UNK>',
#       'mask_token': '<MASK>',
#   }
#   num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

# Resize the embedding matrix to the tokenizer's current vocabulary size.
# Kept as the last expression so the cell displays the new embedding layer.
model.resize_token_embeddings(len(tokenizer))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Embedding(50259, 768)

In [4]:
class GPT2Dataset(Dataset):
    """Pre-tokenized text samples for GPT-2 fine-tuning.

    Every text is wrapped in ``<|startoftext|>`` / ``<|endoftext|>`` markers,
    tokenized with truncation and padding to ``max_length``, and stored as
    tensors.

    Args:
        txt_list: Iterable of strings, one sample per element.
        tokenizer: Callable that accepts ``(text, truncation=, max_length=,
            padding=)`` and returns a mapping with ``input_ids`` and
            ``attention_mask`` entries.
        gpt2_type: Unused; kept so existing callers that pass it keep working.
        max_length: Length every sample is truncated / padded to.

    Attributes:
        input_ids: List of token-id tensors, one per sample.
        attn_masks: List of attention-mask tensors aligned with ``input_ids``.
    """

    def __init__(self, txt_list, tokenizer, gpt2_type="gpt2", max_length=768):
        self.tokenizer = tokenizer
        self.input_ids = []
        self.attn_masks = []

        for txt in txt_list:
            encodings_dict = tokenizer(
                '<|startoftext|>' + txt + '<|endoftext|>',
                truncation=True,
                max_length=max_length,
                padding="max_length",
            )
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            # Previously this list was created but never filled; keep the mask
            # next to the ids so padding positions can be identified later.
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the token-id tensor of sample ``idx``.

        Only the ids are returned (unchanged interface); the matching mask is
        available as ``self.attn_masks[idx]``.
        """
        return self.input_ids[idx]

In [5]:
# Load the line-level corpus and keep only the rows whose `is_text` flag is set.
df = pd.read_csv('data/nlpia_lines.csv').pipe(lambda frame: frame[frame['is_text']])

# Work on an independent copy of the text column.
lines = df['line_text'].copy()

In [6]:
# Tokenize every line up front, truncating / padding each sample to 768 tokens.
dataset = GPT2Dataset(lines, tokenizer, max_length=768)

# 90% of the samples go to training; whatever is left is held out for validation.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(
    dataset,
    lengths=[train_size, val_size],
)



In [7]:
training_args = TrainingArguments(
    output_dir='./model02_all_huggingface_results',
    per_device_train_batch_size=5,
    num_train_epochs=5,
    save_strategy='epoch',
    # Turn external experiment trackers (e.g. Weights & Biases) off here,
    # where reporting is configured. Setting WANDB_DISABLED after the Trainer
    # exists is too late: the training log still reported W&B as enabled.
    # This switches off every reporting integration; pass e.g.
    # ['tensorboard'] instead to keep a specific one.
    report_to='none',
)
# NOTE(review): no evaluation schedule is configured, so `eval_dataset` is
# presumably only used by an explicit `trainer.evaluate()` call — confirm.

# Causal language modelling (mlm=False).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)


In [8]:
# Sanity check: display the per-device training batch size configured above.
training_args.per_device_train_batch_size


5

In [9]:
import os

# NOTE(review): the training log below still reports W&B logging as enabled,
# so setting this variable after the Trainer was built appears to come too
# late; reporting is better configured where TrainingArguments is created.
os.environ["WANDB_DISABLED"] = "true"

# Fine-tune the model.
trainer.train()

# The tokenizer gained extra special tokens and the embedding matrix was
# resized to match, so store the tokenizer next to the weights; otherwise the
# saved model cannot be reloaded with a matching vocabulary.
tokenizer.save_pretrained(trainer.args.output_dir)

# Write the final model weights and config to the Trainer's output directory.
trainer.save_model()

***** Running training *****
  Num examples = 5632
  Num Epochs = 5
  Instantaneous batch size per device = 5
  Total train batch size (w. parallel, distributed & accumulation) = 5
  Gradient Accumulation steps = 1
  Total optimization steps = 5635
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss
500,4.5832
1000,3.4919
1500,3.1078
2000,3.0209
2500,2.8683
3000,2.7384
3500,2.6642
4000,2.5168
4500,2.5297
5000,2.3891


Saving model checkpoint to ./model02_all_huggingface_results/checkpoint-1127
Configuration saved in ./model02_all_huggingface_results/checkpoint-1127/config.json
Model weights saved in ./model02_all_huggingface_results/checkpoint-1127/pytorch_model.bin
Saving model checkpoint to ./model02_all_huggingface_results/checkpoint-2254
Configuration saved in ./model02_all_huggingface_results/checkpoint-2254/config.json
Model weights saved in ./model02_all_huggingface_results/checkpoint-2254/pytorch_model.bin
Saving model checkpoint to ./model02_all_huggingface_results/checkpoint-3381
Configuration saved in ./model02_all_huggingface_results/checkpoint-3381/config.json
Model weights saved in ./model02_all_huggingface_results/checkpoint-3381/pytorch_model.bin
Saving model checkpoint to ./model02_all_huggingface_results/checkpoint-4508
Configuration saved in ./model02_all_huggingface_results/checkpoint-4508/config.json
Model weights saved in ./model02_all_huggingface_results/checkpoint-4508/pytorc