In [17]:
!pip install transformers



In [18]:
import numpy as np
import pandas as pd
import random,os
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import csv

In [19]:
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [20]:
path = "/content/drive/My Drive/"

In [21]:
# Imported the excel data file from the drive

data = pd.read_excel("/content/Backstreet_Boys_Lyrics_score.xlsx")

# Stored the title and the lyrics of the song

data = data[['Title','Lyrics']]

#print(data.shape)




In [22]:
class Generating_data(Dataset):  
    def __init__(self, cntrl_code, type_of_gpt2="gpt2", maximum_length=1024):

        self.tokenizer = GPT2Tokenizer.from_pretrained(type_of_gpt2)
        #declaring the array for the lyrics
        self.song_lyrics = []

        #Here converting the each lyric into the tensor
        for row in data['Lyrics']:
          self.song_lyrics.append(torch.tensor(
                self.tokenizer.encode(f"<|{cntrl_code}|>{row[:maximum_length]}<|endoftext|>")
            ))               
      
      #storing the length of the song_lyrics in song_lyrics_count
        self.song_lyrics_count = len(self.song_lyrics)
        
    def __len__(self):
        return self.song_lyrics_count

    def __getitem__(self, item):
        return self.song_lyrics[item]
    
dataset = Generating_data(data['Lyrics'], type_of_gpt2="gpt2")  

In [23]:
#Importing the GPT-2 Model
gpt_model = GPT2LMHeadModel.from_pretrained('gpt2')

#Importing the GPT-2 tokenizer
gpt_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')


def pack_tensor(new_tensor, packed_tensor, maximum_seq_length):
    if packed_tensor is None:
        return new_tensor, True, None

    if new_tensor.size()[1] + packed_tensor.size()[1] > maximum_seq_length:
        return packed_tensor, False, new_tensor
        
    else:
        packed_tensor = torch.cat([new_tensor, packed_tensor[:, 1:]], dim=1)
        return packed_tensor, True, None

In [24]:
#training function 
def train(dataset, gpt_model, gpt_tokenizer, batch_size=36, no_of_epochs=15, learning_rate=1e-4, maximum_seq_length=400, warmup_steps=250, type_of_gpt2="gpt2", output_directory=".", output_prefix="wreckgar", test_mode=False):
    acc_steps = 100
    device = torch.device("cuda")
    gpt_model = gpt_model.cuda()
    gpt_model.train()

    #AdamW optimizer is used
    optimizer = AdamW(gpt_model.parameters(), lr=learning_rate)


    #schedules
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=-1)

    #Loading the training data and storing it into the train_dataloader
    train_dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
    #Initial values for loss, acc_batch_count are 0 & input_tensor set None
    loss=0
    acc_batch_count = 0
    input_tensor = None

    for epoch in range(no_of_epochs):

        #printing the training epoch
        print(f"Training epoch {epoch}")
        #printing the loss value
        print(loss)
        for idx, entry in tqdm(enumerate(train_dataloader)):
            (input_tensor, carry_on, remainder) = pack_tensor(entry, input_tensor, 768)

            if carry_on and idx != len(train_dataloader) - 1:
                continue

            input_tensor = input_tensor.to(device)
            outputs = gpt_model(input_tensor, labels=input_tensor)
            loss = outputs[0]
            loss.backward()

            if (acc_batch_count % batch_size) == 0:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                gpt_model.zero_grad()

            acc_batch_count =acc_batch_count + 1
            input_tensor = None
        
    return gpt_model

In [25]:
gpt_model = train(dataset, gpt_model, gpt_tokenizer)



Training epoch 0
0


109it [00:09, 11.84it/s]


Training epoch 1
tensor(3.0787, device='cuda:0', grad_fn=<NllLossBackward0>)


109it [00:09, 11.68it/s]


Training epoch 2
tensor(3.1025, device='cuda:0', grad_fn=<NllLossBackward0>)


109it [00:09, 11.65it/s]


Training epoch 3
tensor(2.8402, device='cuda:0', grad_fn=<NllLossBackward0>)


109it [00:09, 11.41it/s]


Training epoch 4
tensor(3.1287, device='cuda:0', grad_fn=<NllLossBackward0>)


109it [00:09, 11.29it/s]


Training epoch 5
tensor(3.1741, device='cuda:0', grad_fn=<NllLossBackward0>)


109it [00:09, 11.43it/s]


Training epoch 6
tensor(2.6942, device='cuda:0', grad_fn=<NllLossBackward0>)


109it [00:09, 11.64it/s]


Training epoch 7
tensor(2.4608, device='cuda:0', grad_fn=<NllLossBackward0>)


109it [00:09, 11.66it/s]


Training epoch 8
tensor(3.0630, device='cuda:0', grad_fn=<NllLossBackward0>)


109it [00:09, 11.72it/s]


Training epoch 9
tensor(2.6722, device='cuda:0', grad_fn=<NllLossBackward0>)


109it [00:09, 11.70it/s]


Training epoch 10
tensor(2.8003, device='cuda:0', grad_fn=<NllLossBackward0>)


109it [00:09, 11.75it/s]


Training epoch 11
tensor(2.9942, device='cuda:0', grad_fn=<NllLossBackward0>)


109it [00:09, 11.60it/s]


Training epoch 12
tensor(2.4116, device='cuda:0', grad_fn=<NllLossBackward0>)


109it [00:09, 11.67it/s]


Training epoch 13
tensor(2.7856, device='cuda:0', grad_fn=<NllLossBackward0>)


109it [00:09, 11.56it/s]


Training epoch 14
tensor(2.4670, device='cuda:0', grad_fn=<NllLossBackward0>)


109it [00:09, 11.61it/s]


In [26]:
#Here is the function which generates the lyrics based on the given input
#generate function will multiple input arguments like: gpt_model, gpt_tokenizer, prompt...and so on

def generate(gpt_model, gpt_tokenizer, prompt, entry_count=10, entry_length=20, top_p=0.8, temperature=1):
    gpt_model.eval()
    #initial values
    genertd_num = 0
    genertd_list = []

    filter_value = -float("Inf")

    with torch.no_grad():

        for entry_idx in trange(entry_count):

            entry_finished = False
            genertd = torch.tensor(gpt_tokenizer.encode(prompt)).unsqueeze(0)

            for i in range(entry_length):
                outputs = gpt_model(genertd, labels=genertd)
                loss, logits = outputs[:2]
                logits = logits[:, -1, :] / (temperature if temperature > 0 else 1.0)

                sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

                sorted_indices_to_remove = cumulative_probs > top_p
                sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
                sorted_indices_to_remove[..., 0] = 0

                indices_to_remove = sorted_indices[sorted_indices_to_remove]
                logits[:, indices_to_remove] = filter_value

                next_token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
                genertd= torch.cat((genertd, next_token), dim=1)

                if next_token in gpt_tokenizer.encode("<|endoftext|>"):
                    entry_finished = True

                if entry_finished:
                  #incrementing the generated_num by 1
                    genertd_num = genertd_num + 1
                  #append output_text into the generated_list
                    output_list = list(genertd.squeeze().numpy())
                    output_text = gpt_tokenizer.decode(output_list)
                    genertd_list.append(output_text)
                    break
            
            if not entry_finished:
              output_list = list(genertd.squeeze().numpy())
              output_text = gpt_tokenizer.decode(output_list)
              genertd_list.append(output_text)
                
    return genertd_list

song_lyrics = []

#Here running the function multiple times to generate the multiple lyrics for the same given input 

for i in range(10):
  song_lyrics.append(generate(gpt_model.to('cpu'), gpt_tokenizer, "I love deep learning", entry_count=1))


100%|██████████| 1/1 [00:02<00:00,  2.44s/it]
100%|██████████| 1/1 [00:02<00:00,  2.46s/it]
100%|██████████| 1/1 [00:02<00:00,  2.44s/it]
100%|██████████| 1/1 [00:02<00:00,  2.51s/it]
100%|██████████| 1/1 [00:02<00:00,  2.43s/it]
100%|██████████| 1/1 [00:02<00:00,  2.46s/it]
100%|██████████| 1/1 [00:02<00:00,  2.45s/it]
100%|██████████| 1/1 [00:02<00:00,  2.45s/it]
100%|██████████| 1/1 [00:02<00:00,  2.45s/it]
100%|██████████| 1/1 [00:02<00:00,  2.41s/it]


In [27]:
# Here printing the  generated lyrics

for lyric in song_lyrics:
    print(lyric)

['I love deep learning for both scientific and social science. I work for @Googlego for a couple years and live in']
["I love deep learning. It's my whole body, I always love it. I always love it. My mind goes"]
["I love deep learning, so I think it's important to understand it, but not what you do.\n\nFirst"]
['I love deep learning.\n\nQ: Why are you saying this to you?\n\nA: The magic number']
['I love deep learning and are used to working with 3rd party tools such as Caffe, SQL, etc. It']
["I love deep learning, but I know that will be a challenge. I think it's time to do something about it"]
["I love deep learning and Machine Learning so much!\n\nAnd lastly, I'd like to bring a few words"]
['I love deep learning and want to help you to make better decisions for yourself and your family.\n\nBest Practices for']
['I love deep learning, so I had to start by asking you about the one and only approach for training deep learning that']
["I love deep learning in general. I've been working

In [28]:
#Here stores the generated lyrics in the file.

f = open("generated_lyrics_out.txt",'w')
#saving the generated lyrics in the generated_lyrics_out.txt file

for lyric in song_lyrics:
    f.write(lyric[0])
    f.write("\n\n")