In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.19.1-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 7.1 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.6.0-py3-none-any.whl (84 kB)
[K     |████████████████████████████████| 84 kB 4.3 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 66.5 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 58.7 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninstalled PyYAML-3.13
Successfully installed huggingface-hub-0.

In [None]:
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import numpy as np
import random
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import torch.nn.functional as F
import csv

# GPT2 with Fine Tuning

### Prepare data

In [None]:
# Read the lyrics CSV (path assumes Google Drive is mounted under /content/drive — TODO confirm)
lyrics = pd.read_csv('/content/drive/MyDrive/sandip_GPT2/sandip_GPT/data/lyrics-data.csv')

In [None]:
lyrics.head()

Unnamed: 0,ALink,SName,SLink,Lyric,language
0,/ivete-sangalo/,Arerê,/ivete-sangalo/arere.html,"Tudo o que eu quero nessa vida,\nToda vida, é\...",pt
1,/ivete-sangalo/,Se Eu Não Te Amasse Tanto Assim,/ivete-sangalo/se-eu-nao-te-amasse-tanto-assim...,Meu coração\nSem direção\nVoando só por voar\n...,pt
2,/ivete-sangalo/,Céu da Boca,/ivete-sangalo/chupa-toda.html,É de babaixá!\nÉ de balacubaca!\nÉ de babaixá!...,pt
3,/ivete-sangalo/,Quando A Chuva Passar,/ivete-sangalo/quando-a-chuva-passar.html,Quando a chuva passar\n\nPra quê falar\nSe voc...,pt
4,/ivete-sangalo/,Sorte Grande,/ivete-sangalo/sorte-grande.html,A minha sorte grande foi você cair do céu\nMin...,pt


In [None]:
# Keep only the rows whose `language` column equals 'en'
lyrics = lyrics[lyrics['language']=='en']

In [None]:
# Load the artist metadata; it is filtered further down to artists whose
# Genres value is exactly 'Rock' and whose Popularity is above 5.
artists = pd.read_csv('/content/drive/MyDrive/sandip_GPT2/sandip_GPT/data/artists-data.csv')


In [None]:
artists.head()

Unnamed: 0,Artist,Genres,Songs,Popularity,Link
0,Ivete Sangalo,Pop; Axé; Romântico,313.0,4.4,/ivete-sangalo/
1,Chiclete com Banana,Axé,268.0,3.8,/chiclete-com-banana/
2,Banda Eva,Axé; Romântico; Reggae,215.0,2.3,/banda-eva/
3,É O Tchan,Axé,129.0,1.6,/e-o-tchan/
4,Claudia Leitte,Pop; Axé; Romântico,167.0,1.5,/claudia-leitte/


In [None]:
# Retain artists whose Genres value is exactly 'Rock' and whose Popularity exceeds 5
artists = artists.loc[artists['Genres'].eq('Rock') & artists['Popularity'].gt(5)]

In [None]:
# Attach Artist and Genres to each song by matching the song's ALink to the artist's Link
df = pd.merge(
    lyrics,
    artists.loc[:, ['Artist', 'Genres', 'Link']],
    how='inner',
    left_on='ALink',
    right_on='Link',
)

In [None]:
# The link and language columns are no longer needed after the merge
df = df.drop(['ALink', 'SLink', 'language', 'Link'], axis='columns')

In [None]:
# Discard songs whose lyrics are too long: keep only those with fewer than
# 350 space-separated words (beyond 1024 tokens the model does not work).
df = df.loc[df['Lyric'].map(lambda lyric: len(lyric.split(' ')) < 350)]

In [None]:
len(df)

905

In [None]:
# Hold out a very small test set to compare generated text with the real lyrics.
# A fixed random_state makes the split (and therefore the training data)
# reproducible across kernel restarts; min() keeps the cell from raising
# when fewer than 50 songs are available.
test_set = df.sample(n=min(50, len(df)), random_state=42)
df = df.loc[~df.index.isin(test_set.index)]

# Reset the indexes (the previous index is kept as an 'index' column)
test_set = test_set.reset_index()
df = df.reset_index()

In [None]:
# Test set only: move the final 20 whitespace-separated words of every lyric
# into 'True_end_lyrics' and keep everything before them in 'Lyric'.
lyric_words = test_set['Lyric'].str.split()
test_set['True_end_lyrics'] = lyric_words.str[-20:].apply(' '.join)
test_set['Lyric'] = lyric_words.str[:-20].apply(' '.join)

In [None]:
test_set.head()

Unnamed: 0,index,SName,Lyric,Artist,Genres,True_end_lyrics
0,697,Look To The Land,"Well, I dreamed I was a gypsy riding the land ...",Bruce Springsteen,Rock,the lovers for they are the key keep an eye on...
1,1,Spaceman,Starry night bring me down Till I realize the ...,4 Non Blondes,Rock,looking for a brand new place I remember livin...
2,929,Whitetown (Version 1 - unreleased),It's midnight down in sector two Little girl w...,Bruce Springsteen,Rock,that they will turn a man Into a stranger in h...
3,216,Julie,"Julie Oh And I know Like a mirror, Sunday morn...",David Bowie,Rock,me All the days of your life Julie I'm lost ov...
4,123,Dirty Boys,Something like Tobacco Road Living on a lonely...,David Bowie,Rock,the die is cast When the die is cast and you h...


In [None]:
len(test_set)

50

### Prepare the dataset

In [None]:
class SongLyrics(Dataset):
    """Tokenized song lyrics exposed as a PyTorch ``Dataset``.

    Each lyric is cut to ``max_length`` characters, optionally prefixed with a
    ``<|control_code|>`` marker, terminated with ``<|endoftext|>`` and encoded
    into a 1-D tensor of token ids.

    Args:
        control_code: Either a string, inserted as ``<|control_code|>`` in
            front of every lyric, or a collection of lyric strings (for
            example ``df['Lyric']``). A collection is used as the data source
            and no prefix is added; formatting it into the f-string would
            embed the collection's repr in every sample.
        truncate: When true, keep at most the first 20000 lyrics.
        gpt2_type: Name passed to ``GPT2Tokenizer.from_pretrained`` when no
            ``tokenizer`` is supplied.
        max_length: Maximum number of characters (not tokens) kept from each
            lyric before encoding.
        lyrics: Optional iterable of lyric strings. When omitted, the
            collection given as ``control_code`` is used, or the notebook-level
            ``df['Lyric']`` if ``control_code`` is a string.
        tokenizer: Optional object with an ``encode(text)`` method returning
            token ids; lets callers reuse an already loaded tokenizer.
    """

    def __init__(self, control_code, truncate=False, gpt2_type="gpt2", max_length=1024,
                 lyrics=None, tokenizer=None):
        if tokenizer is None:
            tokenizer = GPT2Tokenizer.from_pretrained(gpt2_type)
        self.tokenizer = tokenizer

        if isinstance(control_code, str):
            prefix = f"<|{control_code}|>"
        else:
            # A non-string first argument is the lyric collection itself.
            prefix = ""
            if lyrics is None:
                lyrics = control_code
        if lyrics is None:
            # Fall back to the frame built earlier in the notebook.
            lyrics = df['Lyric']

        rows = list(lyrics)
        if truncate:
            # Cap the dataset size before tokenizing so no work is wasted.
            rows = rows[:20000]

        self.lyrics = [
            torch.tensor(
                self.tokenizer.encode(f"{prefix}{row[:max_length]}<|endoftext|>")
            )
            for row in rows
        ]
        self.lyrics_count = len(self.lyrics)

    def __len__(self):
        """Return the number of encoded lyrics."""
        return self.lyrics_count

    def __getitem__(self, item):
        """Return the token-id tensor stored at position ``item``."""
        return self.lyrics[item]

In [None]:
# Build the training dataset from the training lyrics.
# NOTE(review): the Lyric column is passed as SongLyrics' first positional
# parameter, which is named `control_code` — confirm the class handles it as intended.
dataset = SongLyrics(df['Lyric'], truncate=True, gpt2_type="gpt2")

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

### Prepare training

In [None]:
# Load the tokenizer and the LM-head model from the 'gpt2' checkpoint name
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

Downloading:   0%|          | 0.00/523M [00:00<?, ?B/s]

In [None]:
#Accumulated batch size (since GPT2 is so big)
def pack_tensor(new_tensor, packed_tensor, max_seq_len):
    """Try to merge ``new_tensor`` into the sequence accumulated so far.

    Returns a ``(tensor, carry_on, remainder)`` triple:

    * nothing accumulated yet -> ``(new_tensor, True, None)``
    * the two widths (dim 1) add up to more than ``max_seq_len`` ->
      ``(packed_tensor, False, new_tensor)``, i.e. the accumulated tensor is
      handed back unchanged together with the tensor that did not fit
    * otherwise -> ``(merged, True, None)`` where ``merged`` is ``new_tensor``
      followed by ``packed_tensor`` without its first column
    """
    # First tensor of a pack: it simply becomes the accumulator.
    if packed_tensor is None:
        return new_tensor, True, None

    combined_width = new_tensor.size()[1] + packed_tensor.size()[1]
    if combined_width > max_seq_len:
        # No room left: signal the caller to stop packing.
        return packed_tensor, False, new_tensor

    # Prepend the new tensor; the accumulator's leading column is dropped.
    merged = torch.cat((new_tensor, packed_tensor[:, 1:]), dim=1)
    return merged, True, None

In [None]:
def train(
    dataset, model, tokenizer,
    batch_size=16, epochs=20, lr=2e-5,
    max_seq_len=400, warmup_steps=200,
    gpt2_type="gpt2", output_dir=".", output_prefix="wreckgar",
    test_mode=False,save_model_on_epoch=False,
):
    """Fine-tune ``model`` on ``dataset`` with sequence packing and gradient accumulation.

    Samples are drawn one at a time in shuffled order and merged by
    ``pack_tensor`` until the next one would no longer fit in ``max_seq_len``.
    Each packed sequence gets one forward/backward pass with itself as labels,
    and the optimizer and scheduler step once every ``batch_size`` such passes.

    Args:
        dataset: Dataset yielding 1-D token-id tensors.
        model: Model called as ``model(input_ids, labels=input_ids)`` whose
            first output is the loss.
        tokenizer: Unused; kept for interface compatibility.
        batch_size: Number of backward passes accumulated per optimizer step.
        epochs: Number of passes over ``dataset``.
        lr: Peak learning rate for AdamW.
        max_seq_len: Packing limit handed to ``pack_tensor``.
        warmup_steps: Number of scheduler warm-up steps.
        gpt2_type: Unused; kept for interface compatibility.
        output_dir: Directory for per-epoch checkpoints.
        output_prefix: File-name prefix for per-epoch checkpoints.
        test_mode: Unused; kept for interface compatibility.
        save_model_on_epoch: When true, write the model's ``state_dict`` to
            ``{output_dir}/{output_prefix}-{epoch}.pt`` after every epoch.

    Returns:
        The trained model.
    """
    import os  # used for checkpoint paths; the notebook's import cell does not import it

    # Use the GPU when one is available, otherwise fall back to the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.train()

    train_dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
    samples_per_epoch = len(train_dataloader)

    # The schedule needs a positive step budget: with a negative value its
    # multiplier is clamped to 0 right after warm-up and training stalls.
    # Packing can only lower the real number of steps, so this is an upper bound.
    steps_per_epoch = -(-samples_per_epoch // batch_size)  # ceiling division
    total_steps = max(steps_per_epoch * epochs, warmup_steps + 1)

    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps
    )

    loss=0
    accumulating_batch_count = 0
    input_tensor = None
    optimizer.zero_grad()

    for epoch in range(epochs):

        print(f"Training epoch {epoch}")
        print(loss)
        for idx, entry in enumerate(tqdm(train_dataloader)):
            (input_tensor, carry_on, remainder) = pack_tensor(entry, input_tensor, max_seq_len)

            is_last = idx == samples_per_epoch - 1
            if carry_on and not is_last:
                continue

            # Train on the full pack; on the final sample also flush the one
            # that did not fit so it is not carried into the next epoch.
            ready = [input_tensor]
            if is_last and remainder is not None:
                ready.append(remainder)
                remainder = None

            for batch in ready:
                batch = batch.to(device)
                outputs = model(batch, labels=batch)
                loss = outputs[0]
                loss.backward()

                # Count first, then test, so every optimizer step covers
                # exactly `batch_size` backward passes.
                accumulating_batch_count += 1
                if (accumulating_batch_count % batch_size) == 0:
                    optimizer.step()
                    scheduler.step()
                    optimizer.zero_grad()

            # The sample that overflowed the pack starts the next one.
            input_tensor = remainder
        if save_model_on_epoch:
            os.makedirs(output_dir, exist_ok=True)
            torch.save(
                model.state_dict(),
                os.path.join(output_dir, f"{output_prefix}-{epoch}.pt"),
            )
    return model

### Actual Training

In [None]:
# Fine-tune the model on the lyrics dataset using train()'s default
# hyperparameters; `model` is rebound to the value train() returns.
model = train(dataset, model, tokenizer)



Training epoch 0
0


855it [01:13, 11.58it/s]


Training epoch 1
tensor(3.0334, device='cuda:0', grad_fn=<NllLossBackward0>)


855it [01:13, 11.65it/s]


Training epoch 2
tensor(2.6516, device='cuda:0', grad_fn=<NllLossBackward0>)


855it [01:13, 11.60it/s]


Training epoch 3
tensor(1.9720, device='cuda:0', grad_fn=<NllLossBackward0>)


855it [01:13, 11.63it/s]


Training epoch 4
tensor(1.3229, device='cuda:0', grad_fn=<NllLossBackward0>)


855it [01:13, 11.65it/s]


Training epoch 5
tensor(1.1505, device='cuda:0', grad_fn=<NllLossBackward0>)


855it [01:13, 11.66it/s]


Training epoch 6
tensor(0.9090, device='cuda:0', grad_fn=<NllLossBackward0>)


855it [01:13, 11.70it/s]


Training epoch 7
tensor(1.7401, device='cuda:0', grad_fn=<NllLossBackward0>)


855it [01:13, 11.64it/s]


Training epoch 8
tensor(1.3987, device='cuda:0', grad_fn=<NllLossBackward0>)


855it [01:13, 11.60it/s]


Training epoch 9
tensor(0.5757, device='cuda:0', grad_fn=<NllLossBackward0>)


855it [01:13, 11.67it/s]


Training epoch 10
tensor(1.8748, device='cuda:0', grad_fn=<NllLossBackward0>)


855it [01:13, 11.59it/s]


Training epoch 11
tensor(0.7496, device='cuda:0', grad_fn=<NllLossBackward0>)


855it [01:13, 11.64it/s]


Training epoch 12
tensor(1.5520, device='cuda:0', grad_fn=<NllLossBackward0>)


855it [01:12, 11.73it/s]


Training epoch 13
tensor(2.1920, device='cuda:0', grad_fn=<NllLossBackward0>)


855it [01:13, 11.56it/s]


Training epoch 14
tensor(1.6413, device='cuda:0', grad_fn=<NllLossBackward0>)


855it [01:13, 11.66it/s]


Training epoch 15
tensor(1.4457, device='cuda:0', grad_fn=<NllLossBackward0>)


855it [01:13, 11.59it/s]


Training epoch 16
tensor(1.5487, device='cuda:0', grad_fn=<NllLossBackward0>)


855it [01:13, 11.61it/s]


Training epoch 17
tensor(1.6838, device='cuda:0', grad_fn=<NllLossBackward0>)


855it [01:13, 11.67it/s]


Training epoch 18
tensor(1.2469, device='cuda:0', grad_fn=<NllLossBackward0>)


855it [01:13, 11.61it/s]


Training epoch 19
tensor(1.2225, device='cuda:0', grad_fn=<NllLossBackward0>)


855it [01:14, 11.53it/s]


In [None]:
# Save the fine-tuned model so it can be reused later on.
# torch.save is given the model object itself here, not model.state_dict().
torch.save(model, '/content/drive/MyDrive/sandip_GPT2/sandip_GPT/output/model.pt')