<a href="https://colab.research.google.com/github/prathamesh0902/NLP/blob/main/Project/Lyrics_Generator_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Implementation of an LSTM in PyTorch for text generation

## Import Data

In [None]:
import pandas as pd
import numpy as np

# importing the data sets
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
source_folder = '/content/drive/MyDrive/Data_Files/Lyrics_Data/'
# One extension-less lyrics file per numeric id, 10 through 74 inclusive.
filename_array = [source_folder + str(file_id) for file_id in range(10, 75)]

In [None]:
import re

# Load and preprocess the lyrics data

# Concatenate every per-song file into one corpus file on Drive, then read the
# combined text back as `file_raw`.
# The encoding is given explicitly because the lyrics contain non-ASCII
# characters (e.g. the typographic apostrophe), and open()'s default encoding
# depends on the platform locale.
filenames = filename_array
combined_path = '/content/drive/MyDrive/Data_Files/lyrics007.txt'

with open(combined_path, 'w', encoding='utf-8') as outfile:
    for fname in filenames:
        with open(fname, encoding='utf-8') as infile:
            outfile.write(infile.read())

with open(combined_path, 'r', encoding='utf-8') as f:
    file_raw = f.read()

# Raw strings keep the backslashes for the regex engine; in a normal string
# literal "\[" and "\]" are invalid escape sequences (SyntaxWarning on 3.12+).
# Symbols that are deleted outright.
BAD_SYMBOLS_RE = re.compile(r'[/(){}\[\]|@,;_]')
# Anything that is not a digit, lowercase letter, space, '#' or '+'.
REPLACE_BY_SPACE_RE = re.compile(r'[^0-9a-z #+]')

def text_prepare(text):
    """Normalise raw lyrics text for character-level training.

    Steps, in order: lowercase the text, delete every character matched by
    BAD_SYMBOLS_RE, then replace every remaining character matched by
    REPLACE_BY_SPACE_RE (including newlines) with a single space.

    Args:
        text: raw text (str).

    Returns:
        The cleaned string, containing only characters from ``[0-9a-z #+]``.
    """
    text = text.lower()
    text = BAD_SYMBOLS_RE.sub('', text)
    return REPLACE_BY_SPACE_RE.sub(' ', text)

# Cleaned, lower-cased corpus; Generator.get_random_batch samples its training chunks from this string.
lyrics = text_prepare(file_raw)

## BLEU implementation

In [None]:
# This pattern has its own name on purpose: rebinding the global BAD_SYMBOLS_RE
# here would silently change what text_prepare (defined earlier) deletes on any
# later call. Raw strings avoid invalid "\[" / "\]" escapes in the literal.
# Symbols deleted outright; unlike the training pattern this also drops the
# typographic apostrophe and '.'.
BLEU_BAD_SYMBOLS_RE = re.compile(r'[/(){}\[\]|@,;_’.]')
# Anything that is not a digit, lowercase letter, space, '#' or '+'.
REPLACE_BY_DOT_RE = re.compile(r'[^0-9a-z #+]')

def text_prepare2(text):
    """Normalise text for BLEU scoring, marking separators with '.'.

    Steps, in order: lowercase the text, delete every character matched by
    BLEU_BAD_SYMBOLS_RE (this removes any '.' already present), then replace
    every remaining character outside ``[0-9a-z #+]`` - newlines included -
    with '.'.

    Args:
        text: raw text (str).

    Returns:
        The cleaned string, in which '.' only appears where a disallowed
        character used to be.
    """
    text = text.lower()
    text = BLEU_BAD_SYMBOLS_RE.sub('', text)
    return REPLACE_BY_DOT_RE.sub('.', text)

# Same raw corpus, but with line breaks and other disallowed characters turned into '.',
# so it can be split on '.' into separate reference lines for BLEU.
lyrics2 = text_prepare2(file_raw)

In [None]:
# Split the corpus on '.' and drop duplicate pieces; set() leaves them in arbitrary order.
lyrics_array = list(set(lyrics2.split(".")))

In [None]:
from tqdm import tqdm
# Tokenise every reference line into a list of words.
# A new list is built, and entries that are already token lists are passed
# through, so re-running this cell is harmless. The original in-place loop
# raised AttributeError on a second run because the entries were no longer strings.
lyrics_array = [
    line.split(" ") if isinstance(line, str) else line
    for line in tqdm(lyrics_array)
]

100%|██████████| 1986/1986 [00:00<00:00, 327922.52it/s]


In [None]:
class SmoothingFunction:
    """Smoothing for BLEU modified precisions.

    Only ``method1`` is implemented; ``alpha`` and ``k`` are stored but not
    read by it.
    """

    def __init__(self, epsilon=0.1, alpha=5, k=5):
        self.epsilon, self.alpha, self.k = epsilon, alpha, k

    def method1(self, p_n, *args, **kwargs):
        """
        Smoothing method 1: Add *epsilon* counts to precision with 0 counts.

        ``p_n`` is an iterable of objects exposing ``numerator`` and
        ``denominator``. Entries with a non-zero numerator are returned
        untouched; the others become ``epsilon / denominator``. Extra
        positional and keyword arguments are accepted and ignored.
        """
        smoothed = []
        for precision in p_n:
            if precision.numerator == 0:
                smoothed.append(
                    (precision.numerator + self.epsilon) / precision.denominator
                )
            else:
                smoothed.append(precision)
        return smoothed

# Smoothing object whose method1 is passed as smoothing_function to sentence_bleu in Generator.generate.
chencherry = SmoothingFunction()
# BLEU reference corpus: the tokenised lyric lines built above.
reference= lyrics_array

## Model training and generation

In [None]:
# Load the TensorBoard notebook extension
%load_ext tensorboard
import tensorflow as tf
import datetime
# Clear any logs from previous runs
!rm -rf ./logs/ 

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [None]:
import torch
import torch.nn as nn
from collections import Counter
from nltk.tokenize import word_tokenize
import string
import random
import sys
from torch.utils.tensorboard import SummaryWriter
import pickle
from nltk.translate.bleu_score import sentence_bleu
# writer = SummaryWriter(log_dir='logs/Tensorboard_logs'+ datetime.datetime.now().strftime("%m%d"))

# Device configuration: use the GPU when CUDA is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("=> Using",device, "device")

# Character vocabulary: every character of string.printable. A character's
# position in this string is its integer id (see Generator.char_tensor), and
# n_characters is used as both the model's input and output size.
all_characters = string.printable
n_characters = len(all_characters)


# Define the LSTM model
# Define the LSTM model
class LSTMModel(nn.Module):
    """Embedding -> multi-layer LSTM -> linear head, fed one time step per call.

    Args:
        input_size: number of distinct input ids (embedding rows).
        hidden_size: embedding width and LSTM hidden width.
        output_size: number of output logits per step.
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embed = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers, batch_first = True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden, cell):
        """Advance the network by one time step.

        Args:
            x: 1-D tensor of ids, one per batch element.
            hidden, cell: LSTM state, e.g. as returned by ``init_hidden`` or
                by a previous call.

        Returns:
            ``(logits, (hidden, cell))`` where ``logits`` has one row per
            batch element and ``output_size`` columns.
        """
        out = self.embed(x)
        # unsqueeze(1) adds the length-1 sequence axis expected with batch_first=True.
        out, (hidden, cell) = self.lstm(out.unsqueeze(1), (hidden, cell))
        # Fold the length-1 sequence axis away before the linear head.
        out = self.fc(out.reshape(out.shape[0],-1))
        return out, (hidden, cell)

    def init_hidden(self, batch_size):
        """Return zero-filled ``(hidden, cell)`` tensors for ``batch_size`` sequences.

        The state is created on the same device (and with the same dtype) as
        the model's own parameters, so it does not rely on a module-level
        ``device`` variable and stays correct if the model is moved.
        """
        reference_param = next(self.parameters())
        state_shape = (self.num_layers, batch_size, self.hidden_size)
        hidden = reference_param.new_zeros(state_shape)
        cell = reference_param.new_zeros(state_shape)
        return hidden, cell

class Generator():
    """Character-level lyric generator built around ``LSTMModel``.

    Training text comes from the module-level ``lyrics`` string, character ids
    from ``all_characters``, and tensors are placed on the module-level
    ``device``.

    Args:
        initial_str: seed text every generated sample starts with (non-empty).
        prediction_len: number of characters to sample after the seed.
    """

    def __init__(self, initial_str, prediction_len):
        self.chunk_len = 250        # characters per training sequence
        self.num_epochs = 1000      # number of training iterations (one random batch each)
        self.batch_size = 1
        self.print_every = 50       # checkpoint + sample every N iterations
        self.hidden_size = 128
        self.num_layers = 2
        self.learning_rate = 0.003
        self.initial_str = initial_str
        self.prediction_len = prediction_len
        self.model = LSTMModel(n_characters, self.hidden_size, n_characters, self.num_layers).to(device)

    def char_tensor(self, string):
        """Encode ``string`` as a 1-D long tensor of positions in ``all_characters``.

        Raises:
            ValueError: if a character is not present in ``all_characters``.
        """
        return torch.tensor(
            [all_characters.index(ch) for ch in string], dtype=torch.long
        )

    def get_random_batch(self):
        """Sample ``batch_size`` random training windows from ``lyrics``.

        Each window is ``chunk_len + 1`` characters long; the input is the
        window without its last character and the target is the window without
        its first, i.e. the input shifted by one.

        Returns:
            ``(text_input, text_target)``, both long tensors of shape
            ``(batch_size, chunk_len)``.

        Raises:
            ValueError: if ``lyrics`` is shorter than ``chunk_len + 1``.
        """
        # random.randint is inclusive on both ends, and a window needs
        # chunk_len + 1 characters, so the last valid start is len - chunk_len - 1.
        max_start = len(lyrics) - self.chunk_len - 1
        if max_start < 0:
            raise ValueError(
                "lyrics must contain at least chunk_len + 1 characters "
                f"({self.chunk_len + 1}); got {len(lyrics)}"
            )

        text_input = torch.zeros(self.batch_size, self.chunk_len, dtype=torch.long)
        text_target = torch.zeros(self.batch_size, self.chunk_len, dtype=torch.long)

        for row in range(self.batch_size):
            # Draw a fresh window per row so rows are not copies of each other.
            start_idx = random.randint(0, max_start)
            chunk = lyrics[start_idx:start_idx + self.chunk_len + 1]
            text_input[row] = self.char_tensor(chunk[:-1])
            text_target[row] = self.char_tensor(chunk[1:])

        return text_input, text_target

    def generate(self, temperature = 0.65, use_saved_weights = False):
        """Sample text from the model, print it with its BLEU scores, and return it.

        Sampling uses ``self.model``, ``self.initial_str`` and
        ``self.prediction_len``. The model is switched to eval mode with
        gradients disabled while sampling and restored afterwards.

        Args:
            temperature: divisor applied to the logits before sampling; lower
                values make the output more deterministic.
            use_saved_weights: when True, first load the state dict written by
                ``save_model_state_dict`` into ``self.model``. The default
                samples from the in-memory weights.

        Returns:
            The seed text followed by ``prediction_len`` sampled characters.

        Raises:
            ValueError: if ``initial_str`` is empty.
        """
        if not self.initial_str:
            raise ValueError("initial_str must contain at least one character")

        if use_saved_weights:
            state_dict = torch.load('/content/drive/MyDrive/Data_Files/LSTMmodelSD', map_location=device)
            self.model.load_state_dict(state_dict)

        was_training = self.model.training
        self.model.eval()
        predicted = self.initial_str
        try:
            with torch.no_grad():
                # Characters are fed one at a time, so the state is always batch size 1.
                hidden, cell = self.model.init_hidden(batch_size = 1)
                initial_input = self.char_tensor(self.initial_str)

                # Warm the state up on all but the last seed character.
                for p in range(len(self.initial_str) - 1):
                    _, (hidden, cell) = self.model(initial_input[p].view(1).to(device), hidden, cell)

                last_char = initial_input[-1]

                for _ in range(self.prediction_len):
                    output, (hidden, cell) = self.model(last_char.view(1).to(device), hidden, cell)
                    # Softmax gives the same sampling distribution as exp(logits / T)
                    # without overflowing for large logits.
                    char_probs = torch.softmax(output.view(-1) / temperature, dim=0)
                    top_char = int(torch.multinomial(char_probs, 1)[0])
                    predicted_char = all_characters[top_char]
                    predicted += predicted_char
                    last_char = self.char_tensor(predicted_char)
        finally:
            self.model.train(was_training)

        print("=> generated Lyrics :" , predicted)
        self._report_bleu(predicted)
        return predicted

    def _report_bleu(self, text):
        """Print cumulative 1- to 4-gram BLEU of ``text`` against ``reference``."""
        cleaned = text_prepare2(text)
        # Drop the empty tokens that runs of spaces produce; otherwise they are
        # scored as if they were words.
        candidate = [token for token in cleaned.split(" ") if token]
        weight_sets = (
            ('1-gram', (1, 0, 0, 0)),
            ('2-gram', (0.5, 0.5, 0, 0)),
            ('3-gram', (1 / 3, 1 / 3, 1 / 3, 0)),
            ('4-gram', (0.25, 0.25, 0.25, 0.25)),
        )
        for label, weights in weight_sets:
            score = sentence_bleu(reference, candidate, weights=weights, smoothing_function=chencherry.method1)
            print('Cumulative %s: %f' % (label, score))

    def train(self):
        """Train ``self.model`` for ``num_epochs`` iterations of one random batch each.

        Every ``print_every`` iterations the loss is printed, the model and its
        state dict are saved, and a sample is generated. The per-character loss
        is logged to TensorBoard under 'Training Loss' (log dir 'logs').
        """
        optimizer = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate)
        criterion = nn.CrossEntropyLoss()
        writer = SummaryWriter(log_dir='logs')

        print("=> Starting training")
        self.model.train()

        try:
            for epoch in range(1,self.num_epochs+1):
                inp, target = self.get_random_batch()
                inp = inp.to(device)
                target = target.to(device)
                hidden, cell = self.model.init_hidden(batch_size = self.batch_size)
                self.model.zero_grad()

                # Sum the per-step losses over the chunk, then backpropagate once.
                loss = 0
                for c in range(self.chunk_len):
                    outputs, (hidden, cell) = self.model(inp[:,c], hidden, cell)
                    loss += criterion(outputs, target[:,c])

                loss.backward()
                optimizer.step()
                loss = loss.item() / self.chunk_len

                if epoch % self.print_every == 0:
                    print(f'Epoch {epoch}, Loss: {loss:.4f}')
                    self.save_model()
                    self.save_model_state_dict()
                    self.generate()

                writer.add_scalar('Training Loss', loss, global_step = epoch)
        finally:
            # Flush and release the event file even if training is interrupted.
            writer.close()

    def save_model(self):
        """Pickle the whole model object to Drive."""
        print("=> Saving model")
        with open('/content/drive/MyDrive/Data_Files/LSTMmodel.pkl', 'wb') as f:
            pickle.dump(self.model, f)

    def save_model_state_dict(self):
        """Save only the model's state dict to Drive."""
        print("=> Saving model state dict")
        torch.save(self.model.state_dict(), '/content/drive/MyDrive/Data_Files/LSTMmodelSD')

# Seed text each generated sample starts with, and the number of characters sampled after it.
initial_str = 'new'
prediction_len = 60
gennames = Generator(initial_str, prediction_len)
# train() checkpoints and prints samples as it goes; it has no return statement, so train_output is None.
train_output = gennames.train()

=> Using cpu device
=> Starting training
Epoch 500, Loss: 2.0455
=> generated Lyrics : new i by thing i and wrown that the gored         in that the a
Cumulative 1-gram: 0.909091
Cumulative 2-gram: 0.624188
Cumulative 3-gram: 0.492445
Cumulative 4-gram: 0.418786
Epoch 1000, Loss: 1.8890
=> generated Lyrics : new i ve s be the hack you bay                                 
Cumulative 1-gram: 0.761905
Cumulative 2-gram: 0.681598
Cumulative 3-gram: 0.646873
Cumulative 4-gram: 0.623047
Epoch 1500, Loss: 1.3692
=> generated Lyrics : new the re to plate you sto arte befand the stand the under we 
Cumulative 1-gram: 0.733333
Cumulative 2-gram: 0.072375
Cumulative 3-gram: 0.035459
Cumulative 4-gram: 0.024072
Epoch 2000, Loss: 1.5320
=> generated Lyrics : new the will  i know the work                      cause to tur
Cumulative 1-gram: 0.968750
Cumulative 2-gram: 0.866025
Cumulative 3-gram: 0.782184
Cumulative 4-gram: 0.736872
Epoch 2500, Loss: 1.3950
=> generated Lyrics : newr whelle end you w

In [None]:
# tensorboard --logdir logs

# Lyrics generation by Logistic Regression

In [None]:
import pandas as pd
import numpy as np

In [None]:
#importing the data sets
from google.colab import drive
drive.mount('/content/drive')

#Colab Env
lyrics_file = pd.read_csv("/content/drive/MyDrive/Data_Files/taylor_swift_lyrics_new.csv")
#Juypter Env
# lyrics_file =  pd.read_csv("taylor_swift_lyrics_new.csv")
lyrics_file.head(2)

Mounted at /content/drive


Unnamed: 0,artist,album,track_title,track_n,lyric,line,year
0,Taylor Swift,Taylor Swift,Tim McGraw,1,He said the way my blue eyes shined,1,2006
1,Taylor Swift,Taylor Swift,Tim McGraw,1,Put those Georgia stars to shame that night,2,2006


In [None]:
# Train on a small slice of the `lyric` column: row positions 50 through 149.
data = lyrics_file[50:150].lyric

In [None]:
# Materialise the selected lyric lines as a plain Python list.
X_train = list(data)

In [None]:
import tensorflow  as tf
from tensorflow import keras
# from tensorflow.keras import layers
# Word-level tokenizer fitted on the training lines: lower-cases the text,
# strips the listed punctuation, and maps unseen words to '<OOV>'.
# (The stray bare `tf.keras.preprocessing.text.Tokenizer` expression that
# preceded this was a no-op and has been dropped.)
lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', oov_token='<OOV>', lower=True)
lang_tokenizer.fit_on_texts(X_train)

In [None]:
# create input sequences using list of tokens
input_sequences = []
for line in X_train:
    token_list = lang_tokenizer.texts_to_sequences([line])[0]
    # Every prefix of at least two tokens becomes one sample; its last token
    # is later split off as the label.
    input_sequences.extend(
        token_list[:prefix_len] for prefix_len in range(2, len(token_list) + 1)
    )

In [None]:
# Left-pad ('pre') every n-gram sequence with zeros to the length of the longest one,
# so the sequences form a rectangular array whose last column is the final token of each.
max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(tf.keras.utils.pad_sequences(input_sequences, maxlen = max_sequence_len, padding='pre'))

In [None]:
from scipy import sparse as sp_sparse

In [None]:
# Features are all columns but the last; the label is the last column (the next word's id).
predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
# Features stored as a sparse CSR matrix (vstack over a one-element list).
X_train_sparse = sp_sparse.vstack([sp_sparse.csr_matrix(predictors)])
# Going through a one-column DataFrame makes y_train a 2-D array with a single column.
label_df = pd.DataFrame({'name':label})
y_train = label_df.values

In [None]:
X_train_sparse

<588x12 sparse matrix of type '<class 'numpy.int32'>'
	with 2327 stored elements in Compressed Sparse Row format>

In [None]:
# Frequency of every label id found in y_train.
tags_count = {}

for tags in y_train:
    for tag in tags:
        tags_count[tag] = tags_count.get(tag, 0) + 1

# Same counts with id 0 left out.
tags_counts = {tag: count for tag, count in tags_count.items() if tag != 0}

In [None]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier

In [None]:
def train_classifier(X_train, y_train, C=1.0):
    """Fit a LogisticRegression wrapped in OneVsRestClassifier and return it.

    Args:
        X_train: feature matrix passed to ``fit``.
        y_train: labels passed to ``fit``.
        C: value forwarded as LogisticRegression's ``C`` argument.

    Returns:
        The fitted OneVsRestClassifier.
    """
    one_vs_rest = OneVsRestClassifier(LogisticRegression(C=C, max_iter=400000))
    one_vs_rest.fit(X_train, y_train)
    return one_vs_rest

In [None]:
classifier_lb = train_classifier(X_train_sparse, y_train, C=1.0)

In [None]:
def make_lyrics2(seed_text, next_words):
    """Extend ``seed_text`` by ``next_words`` words predicted one at a time.

    On each step the current text is tokenised with ``lang_tokenizer``,
    left-padded to ``max_sequence_len - 1`` ids, and passed to
    ``classifier_lb.predict``; the word for the predicted id is appended
    after a space.

    Args:
        seed_text: starting text (str).
        next_words: number of words to append.

    Returns:
        The seed text followed by the predicted words. When a predicted id has
        no word in the tokenizer's vocabulary, an empty word is appended.
    """
    for _ in range(next_words):
        token_list = lang_tokenizer.texts_to_sequences([seed_text])[0]
        token_list = tf.keras.utils.pad_sequences([token_list],
                     maxlen=max_sequence_len-1,padding='pre')
        predicted_array = classifier_lb.predict(token_list)

        # index_word is the tokenizer's id -> word map; a direct lookup
        # replaces scanning every (word, index) pair for each generated word.
        output_word = lang_tokenizer.index_word.get(int(predicted_array[0]), "")
        seed_text += " " + output_word
    return seed_text

In [None]:
output = make_lyrics2("its there summer",20)

In [None]:
output #Logistic Regression

"its there summer obvious should don't but a night guitar goes me star in in crazy in crazy in this in this world"