# Training LSTM for Song Lyrics Generation

In [None]:
#%pip install wandb

In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

In [1]:
import os
import pandas as pd
import numpy as np
import wandb
import pickle
from tqdm import tqdm
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, Dropout, LSTM, Dense, Bidirectional
from keras.callbacks import EarlyStopping
from keras.models import Sequential

### Data Preprocessing

In [3]:
colab_data_path = "/content/drive/MyDrive/SongLyricsGeneration/data/"
colab_tokenizer_path = "/content/drive/MyDrive/SongLyricsGeneration/tokenizer/"

In [4]:
df = pd.read_csv(colab_data_path + "dataset.csv", index_col=[0])

In [5]:
df.head()

Unnamed: 0,Artist,Lyric
0,aaliyah,Mmm yeh yeh \n A special smile \n A certain to...
1,steven-tyler,"Right now, nothing else matters \n You and me ..."
2,the-beatles,"Gonna tell Aunt Mary 'bout Uncle John, \n He s..."
3,norah-jones,I'm lonely \n 'Cause I'm looking at pictures o...
4,dexys-midnight-runners,"No, I don't want sympathy, \n I just want some..."


In [6]:
len(df)

198071

Subsample the dataset.

In [7]:
df = df.sample(n=130000, random_state=42)

Tokenization.

In [8]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df["Lyric"].astype(str).str.lower())
total_words = len(tokenizer.word_index) + 1
tokenized_sentences = tokenizer.texts_to_sequences(df["Lyric"].astype(str))

In [None]:
tokenizer_save_path = os.path.join(colab_tokenizer_path, 'tokenizer.pkl')
with open(tokenizer_save_path, 'wb') as f:
    pickle.dump(tokenizer, f)

Slash sequences into n gram sequence.

In [9]:
def generate_sequences(tokenized_sentences):
    for i in tqdm(tokenized_sentences, desc="Generating sequences"):
        for t in range(1, len(i)):
            n_gram_sequence = i[: t + 1]
            yield n_gram_sequence

Pre-padding.

In [10]:
sequence_generator = generate_sequences(tokenized_sentences)

# Find the maximum sequence length.
max_sequence_len = max(len(seq) for seq in tqdm(sequence_generator, desc="Calculating max sequence length"))

# Create a new generator for sequences.
sequence_generator = generate_sequences(tokenized_sentences)

# Pad sequences in smaller batches to save memory.
batch_size = 1000
padded_sequences = []

for batch in tqdm(iter(lambda: list(sequence_generator)[:batch_size], []), desc="Padding sequences"):
    padded_sequences.extend(keras.preprocessing.sequence.pad_sequences(batch, maxlen=max_sequence_len, padding="pre"))

input_sequences = np.array(padded_sequences)

Calculating max sequence length: 0it [00:00, ?it/s]
Calculating max sequence length: 109705it [00:00, 1096930.74it/s]
Calculating max sequence length: 225901it [00:00, 1135120.09it/s]
Calculating max sequence length: 343348it [00:00, 1153052.47it/s]
Calculating max sequence length: 461531it [00:00, 1164375.02it/s]
Calculating max sequence length: 577969it [00:00, 1163248.16it/s]
Calculating max sequence length: 699526it [00:00, 1180994.42it/s]
Calculating max sequence length: 820048it [00:00, 1188875.77it/s]
Calculating max sequence length: 938937it [00:00, 1179346.23it/s]
Calculating max sequence length: 1058477it [00:00, 1184306.48it/s]
Calculating max sequence length: 1176922it [00:01, 1178937.13it/s]
Calculating max sequence length: 1294829it [00:01, 1171361.77it/s]
Calculating max sequence length: 1415310it [00:01, 1181420.58it/s]
Calculating max sequence length: 1533472it [00:01, 1161352.71it/s]
Calculating max sequence length: 1650745it [00:01, 1164705.95it/s]
Calculating max se

Create predictors and labels.

In [11]:
X, labels = input_sequences[:, :-1], input_sequences[:, -1]
y = keras.utils.to_categorical(labels, num_classes=total_words)

dataset = dict()
dataset["features"] = X
dataset["labels"] = y

### Model training

In [13]:
colab_model_path = "/content/drive/MyDrive/SongLyricsGeneration/model/"

Configure the model.

In [14]:
model = Sequential()
model.add(Embedding(total_words, 40, input_length=max_sequence_len - 1))
model.add(Bidirectional(LSTM(250)))
model.add(Dropout(0.1))
model.add(Dense(total_words, activation="softmax"))
model.compile(
    loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
)

Set up training monitoring.

In [None]:
import wandb
wandb.login(key="")
run = wandb.init(project="Training LSTM for Lyrics Generation", job_type="training", anonymous="allow")

Set up callbacks.

In [None]:
earlystop = EarlyStopping(
        monitor="loss", min_delta=0, patience=3, verbose=0, mode="auto"
)
wandb_callback = wandb.keras.WandbCallback()

Train and save the model.

In [20]:
model.fit(
        dataset["features"],
        dataset["labels"],
        epochs=30,
        verbose=1,
        callbacks=[earlystop, wandb_callback],
    )

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x7dee5f3e52d0>

In [None]:
wandb.finish()

In [None]:
model.save(colab_model_path + "lstm_lyrics_generator.h5")