## Lyrics Generation Using RNN

### Imports

In [None]:
import os
import random
import pretty_midi
import numpy as np
import pandas as pd
import re
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import Dense, Dropout, LSTM, Bidirectional, Embedding, Concatenate, Reshape
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

### Global variables

In [None]:
GLOVE_PATH = "glove.6B.300d.txt"
TRAIN_PATH = "lyrics_train_set.csv"
TEST_PATH = "lyrics_test_set.csv"
MIDI_PATH = "midi_files"
MIDI_PKL_PATH = "midi_df.pkl"

WORD_DIM = 300
MAX_SEQUENCE_LENGTH = 14

### Functions for loading the dataset and creating the word-vector (GloVe) representations

In [None]:
def load_dataset(path):
    """Read a header-less lyrics CSV and return it as a cleaned DataFrame.

    Column 0 is used as the artist, column 1 as the title, and column 2
    together with every column after it is concatenated into the lyrics.

    Args:
        path: Location of the CSV file.

    Returns:
        A DataFrame with the columns ``artist``, ``title`` and ``lyrics``.
        Missing cells are treated as empty strings and duplicate rows are
        dropped. The lyrics keep only ASCII letters, spaces and ``&``, are
        lower-cased and stripped, lose every occurrence of "chorus", "verse"
        and "instrumental", and have whitespace runs collapsed to one space.
    """
    raw = pd.read_csv(path, header=None)
    raw = raw.fillna("").drop_duplicates().reset_index(drop=True)

    # Lyrics that spilled over into extra CSV columns are glued back together.
    joined_lyrics = raw.iloc[:, 2:].apply(lambda row: "".join(row), axis=1)

    def _clean(text):
        # Stripping happens before the marker words are removed, so a marker
        # at the very start can leave a single leading space behind.
        text = re.sub("[^a-zA-Z &]", "", text).lower().strip()
        for marker in ("chorus", "verse", "instrumental"):
            text = text.replace(marker, "")
        return re.sub(r"\s+", " ", text)

    return pd.DataFrame(
        {
            "artist": raw[0],
            "title": raw[1],
            "lyrics": joined_lyrics.map(_clean),
        }
    )


def load_glove(path):
    """Parse a GloVe-style text file into a ``word -> vector`` dictionary.

    Every line is split on single spaces; the first field is the token and
    the remaining fields are its coefficients. Tokens that are not purely
    alphabetic (``str.isalpha``) are skipped.

    Args:
        path: Location of the UTF-8 encoded embeddings file.

    Returns:
        A dict mapping each kept token to a ``float32`` numpy array.
    """
    embeddings = {}
    with open(path, "r", encoding="utf-8") as glove_file:
        for raw_line in glove_file:
            token, *coefficients = raw_line.split(" ")
            if not token.isalpha():
                continue
            embeddings[token] = np.asarray(coefficients, "float32")

    return embeddings


def get_embedding_mat(word2vec, tokenizer, vocab_size, dim):
    """Build the embedding matrix used to initialise the Embedding layer.

    Args:
        word2vec: Mapping from word to its pre-trained vector of length
            ``dim``.
        tokenizer: Object exposing a ``word_index`` dict (word -> integer
            index), such as a fitted Keras ``Tokenizer``.
        vocab_size: Number of rows of the matrix.
        dim: Number of columns of the matrix (embedding dimension).

    Returns:
        A ``(vocab_size, dim)`` array. Rows of words found in ``word2vec``
        hold the pre-trained vector; all other rows keep their random
        initialisation drawn uniformly from ``[0, 1)``.
    """
    embedding_mat = np.random.rand(vocab_size, dim)
    for word, ind in tokenizer.word_index.items():
        # ``word_index`` lists every word seen while fitting, which can be
        # more than ``vocab_size``. Indices past the end of the matrix would
        # raise IndexError, so they are skipped.
        # NOTE(review): a Tokenizer built with num_words=vocab_size is
        # expected to never emit such indices - confirm against Keras docs.
        if ind >= vocab_size:
            continue
        vec = word2vec.get(word)
        if vec is not None:
            embedding_mat[ind] = vec

    return embedding_mat


def get_vocab_size(df):
    """Return the number of distinct whitespace-separated tokens, plus one.

    The songs are joined with a space before splitting. Joining them with no
    separator would fuse the last word of one song with the first word of
    the next one into a token that does not exist in the corpus.

    Every token is counted, including the "sfin" line marker, because it is
    a regular entry of the tokenizer vocabulary.

    Args:
        df: DataFrame with a string ``lyrics`` column.

    Returns:
        ``number of distinct tokens + 1``. The extra slot is the one the
        original code reserved "for unknown word".
        NOTE(review): Keras' Tokenizer also never assigns index 0, so the
        extra slot is needed for that as well - confirm.
    """
    corpus = df["lyrics"].str.cat(sep=" ")
    vocab = set(corpus.split())

    return len(vocab) + 1

### Functions for loading the midi data and vectorizing it

In [None]:
def get_midi_dict(path):
    """Map ``(artist, title)`` to the MIDI file name found in ``path``.

    File names are expected to look like ``Artist_Name_-_Song_Title.mid``.
    The extension is dropped, the name is lower-cased, underscores become
    spaces, and the part before the first "-" is the artist. Of the
    remainder only the text before the next "-" is kept as the title, which
    removes suffixes such as "-live". Both parts are stripped of surrounding
    whitespace.

    Directory entries without any "-" cannot be split into artist and title
    and are skipped instead of aborting the whole scan.

    Args:
        path: Directory containing the MIDI files.

    Returns:
        A dict ``{(artist, title): original_file_name}``.
    """
    midi_dict = {}
    for file_name in os.listdir(path):
        # splitext copes with any extension length (".mid", ".midi", none).
        stem = os.path.splitext(file_name)[0].lower().replace("_", " ")
        artist, separator, title = stem.partition("-")
        if not separator:
            continue
        title = title.split("-", maxsplit=1)[0]  # To clean song name
        midi_dict[(artist.strip(), title.strip())] = file_name

    return midi_dict


def load_midi_file(file_name):
    """Load one MIDI file from ``MIDI_PATH`` as a ``PrettyMIDI`` object.

    Loading is best-effort: a file that cannot be parsed is reported on
    stdout and ``None`` is returned so the caller can skip it.

    Args:
        file_name: Name of the file inside ``MIDI_PATH``.

    Returns:
        The ``pretty_midi.PrettyMIDI`` object, or ``None`` on failure.
    """
    midi = None
    try:
        midi = pretty_midi.PrettyMIDI(os.path.join(MIDI_PATH, file_name))
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that Ctrl-C and
        # interpreter shutdown are not swallowed while scanning many files.
        print(f"Unable to create PrettyMIDI object from the song {file_name}")

    return midi


def get_midi_vec(midi_data):
    """Summarise a MIDI object as one flat feature vector.

    The vector is the concatenation, in this order, of:

    1. the estimated tempo divided by 360,
    2. the chroma matrix summed over axis 1 and divided by its total,
    3. the pitch-class histogram,
    4. the piano roll summed over axis 1 and divided by its total,
    5. the flattened, normalised pitch-class transition matrix.

    NOTE(review): with pretty_midi's documented shapes this should be
    1 + 12 + 12 + 128 + 144 = 297 values, which is what the ``[25:-144]``
    piano-roll slice used elsewhere in this file relies on - confirm.

    Args:
        midi_data: Object offering the ``pretty_midi.PrettyMIDI`` methods
            used below.

    Returns:
        A 1-D numpy array; NaN entries (e.g. from a silent file whose totals
        are zero) are replaced with 0.0.
    """
    # Chroma and piano roll are fetched once and reused for both the row
    # sums and the totals.
    chroma_mat = midi_data.get_chroma()
    roll_mat = midi_data.get_piano_roll()

    bpm = np.array([midi_data.estimate_tempo() / 360])
    # 0/0 is expected for silent files and is cleaned up by nan_to_num
    # below, so the corresponding warnings are suppressed.
    with np.errstate(divide="ignore", invalid="ignore"):
        chroma = np.array(chroma_mat.sum(axis=1) / chroma_mat.sum())
        piano_roll = np.array(roll_mat.sum(axis=1) / roll_mat.sum())
    pitch_hist = np.array(midi_data.get_pitch_class_histogram())
    pitch_mat = np.array(midi_data.get_pitch_class_transition_matrix(normalize=True)).reshape(-1)

    full_vector = np.hstack((bpm, chroma, pitch_hist, piano_roll, pitch_mat))
    np.nan_to_num(x=full_vector, copy=False)  # Replace NaN values with 0.0

    return full_vector


def create_midi_df(midi_dict):
    """Vectorise every loadable MIDI file into a DataFrame.

    Args:
        midi_dict: Mapping whose keys are indexable as ``(artist, title)``
            and whose values are MIDI file names.

    Returns:
        A DataFrame with an ``artist-title`` column (``"<artist>-<title>"``)
        and a ``vector`` column holding the result of ``get_midi_vec``.
        Files that ``load_midi_file`` could not load are left out.
    """
    names = []
    vectors = []
    for song_key, file_name in midi_dict.items():
        midi = load_midi_file(file_name)
        if midi is None:
            continue
        names.append(f"{song_key[0]}-{song_key[1]}")
        vectors.append(get_midi_vec(midi))

    return pd.DataFrame({"artist-title": names, "vector": vectors})

### Function for splitting the data

In [None]:
def train_val_split(features, labels, seq_indices, midi_data):
    """Split the samples into train and validation sets, song by song.

    A fifth of the songs (rounded down) is drawn at random for validation.
    All samples of a song land in the same set, paired with that song's
    MIDI vector.

    The songs are drawn without replacement (``random.sample``). Drawing
    with replacement could pick the same song several times and silently
    shrink the validation set.

    Args:
        features: Indexable collection of input sequences, one per sample.
        labels: Indexable collection of target words, one per sample.
        seq_indices: Dict mapping a song index to the list of sample
            indices (into ``features`` / ``labels``) that belong to it.
        midi_data: Indexable collection with one MIDI vector per song.

    Returns:
        A tuple of numpy arrays ``(train_features, train_labels,
        train_midis, val_features, val_labels, val_midis)``.
    """
    song_ids = list(seq_indices.keys())
    # A set gives O(1) membership checks in the loop below.
    val_songs = set(random.sample(song_ids, k=len(song_ids) // 5))

    train_features, train_labels, train_midis = [], [], []
    val_features, val_labels, val_midis = [], [], []

    for song_id in song_ids:
        if song_id in val_songs:
            feats, labs, midis = val_features, val_labels, val_midis
        else:
            feats, labs, midis = train_features, train_labels, train_midis
        for sample_ind in seq_indices[song_id]:
            feats.append(features[sample_ind])
            labs.append(labels[sample_ind])
            # The song's vector is repeated once per sample.
            midis.append(midi_data[song_id])

    return (
        np.array(train_features),
        np.array(train_labels),
        np.array(train_midis),
        np.array(val_features),
        np.array(val_labels),
        np.array(val_midis),
    )

### The models

In [None]:
def get_simple_model(vocab_size, embedding_mat, max_seq_size, midi_vec_size):
    """Build and compile the small next-word model.

    The lyrics sequence goes through a frozen, pre-initialised embedding and
    one bidirectional LSTM (8 units per direction). The LSTM output is
    concatenated with the raw MIDI vector and fed to a softmax layer over
    the vocabulary.

    Args:
        vocab_size: Vocabulary size; used as embedding input dimension and
            as the width of the output layer.
        embedding_mat: Initial weights of the embedding layer.
        max_seq_size: Length of the lyrics input sequence.
        midi_vec_size: Length of the MIDI input vector.

    Returns:
        A compiled Keras ``Model`` taking ``[lyrics, midi]`` as inputs.
    """
    lyrics_in = Input((max_seq_size,))
    midi_in = Input((midi_vec_size,))

    # Frozen embedding: the pre-trained vectors are not updated in training.
    embedding = Embedding(
        input_dim=vocab_size,
        output_dim=WORD_DIM,
        weights=[embedding_mat],
        input_length=max_seq_size,
        trainable=False,
    )
    embedded_lyrics = embedding(lyrics_in)
    encoded_lyrics = Bidirectional(LSTM(units=8))(embedded_lyrics)
    merged = Concatenate(axis=-1)([midi_in, encoded_lyrics])
    word_probs = Dense(vocab_size, activation="softmax")(merged)

    simple = Model(inputs=[lyrics_in, midi_in], outputs=word_probs)
    simple.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

    return simple


def get_advanced_model(vocab_size, embedding_mat, max_seq_size, midi_vec_size):
    """Build and compile the larger next-word model.

    Two parallel branches, each a stack of two bidirectional LSTMs with 128
    units per direction:

    * the MIDI vector, reshaped into a sequence of length 1;
    * the lyrics sequence, after a frozen, pre-initialised embedding.

    Both branch outputs are concatenated and passed through a 1024-unit ReLU
    layer, dropout (0.3) and a softmax layer over the vocabulary.

    Args:
        vocab_size: Vocabulary size; used as embedding input dimension and
            as the width of the output layer.
        embedding_mat: Initial weights of the embedding layer.
        max_seq_size: Length of the lyrics input sequence.
        midi_vec_size: Length of the MIDI input vector.

    Returns:
        A compiled Keras ``Model`` taking ``[lyrics, midi]`` as inputs.
    """
    lyrics_in = Input((max_seq_size,))
    midi_in = Input((midi_vec_size,))

    # Frozen embedding: the pre-trained vectors are not updated in training.
    embedding = Embedding(
        input_dim=vocab_size,
        output_dim=WORD_DIM,
        weights=[embedding_mat],
        input_length=max_seq_size,
        trainable=False,
    )
    embedded_lyrics = embedding(lyrics_in)

    # Melody branch: (midi_vec_size,) -> (1, midi_vec_size) so it can be
    # consumed by recurrent layers.
    midi_as_sequence = Reshape((1, -1))(midi_in)
    midi_hidden = Bidirectional(LSTM(units=128, return_sequences=True))(midi_as_sequence)
    midi_encoded = Bidirectional(LSTM(units=128))(midi_hidden)

    # Lyrics branch.
    lyrics_hidden = Bidirectional(LSTM(units=128, return_sequences=True))(embedded_lyrics)
    lyrics_encoded = Bidirectional(LSTM(units=128))(lyrics_hidden)

    merged = Concatenate(axis=-1)([midi_encoded, lyrics_encoded])
    hidden = Dense(1024, activation="relu")(merged)
    regularised = Dropout(0.3)(hidden)
    word_probs = Dense(vocab_size, activation="softmax")(regularised)

    advanced = Model(inputs=[lyrics_in, midi_in], outputs=word_probs)
    advanced.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

    return advanced

### Training the models

In [None]:
# ---- Lyrics ---------------------------------------------------------------
df = load_dataset(TRAIN_PATH)
# Join key "<artist>-<title>". Both parts are stripped because the keys
# derived from the MIDI file names are stripped as well (and the test cell
# strips them too); stray spaces in the CSV would otherwise make the inner
# merge below silently drop the song.
df["artist-title"] = df.apply(
    lambda row: str(row["artist"]).strip() + "-" + str(row["title"]).strip(), axis=1
)
# "&" is turned into the stand-alone token "sfin"; generated text is later
# split on that token to recover the individual lines.
df["lyrics"] = df["lyrics"].apply(lambda row: row.replace("&", " sfin "))

# ---- MIDI features (cached on disk) ---------------------------------------
midi_dict = get_midi_dict(MIDI_PATH)

# NOTE: unpickling executes arbitrary code - only load a cache file that was
# produced by this notebook.
try:
    midi_df = pd.read_pickle(MIDI_PKL_PATH)
except Exception:
    # Missing or unreadable cache: rebuild it. ``Exception`` (not a bare
    # ``except``) keeps Ctrl-C working during the slow rebuild.
    midi_df = create_midi_df(midi_dict)
    midi_df.to_pickle(MIDI_PKL_PATH)


# Keep only the songs that have both lyrics and a MIDI vector.
merged_df = pd.merge(df, midi_df, how="inner", on="artist-title")

# ---- Tokenisation ---------------------------------------------------------
vocab_size = get_vocab_size(merged_df)
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(merged_df["lyrics"].tolist())
sequences = tokenizer.texts_to_sequences(merged_df["lyrics"].tolist())
word_index = tokenizer.word_index

# ---- Training samples -----------------------------------------------------
# seq_indices maps a song (row of merged_df) to the positions of its samples
# in seq_list, so the train/validation split can be done per song.
seq_indices = {}
seq_list = []

count = 0
for ind, sequence in enumerate(sequences):
    seq_indices[ind] = []
    # For every start position j >= 1 emit MAX_SEQUENCE_LENGTH windows of
    # 2 .. MAX_SEQUENCE_LENGTH + 1 tokens. Near the end of a song the slices
    # are truncated, so shorter (even single-token) windows occur.
    for j in range(1, len(sequence)):
        for z in range(MAX_SEQUENCE_LENGTH):
            seq = sequence[j : j + z + 2]
            seq_list.append(np.array(seq))
            seq_indices[ind].append(count)
            count += 1

# Left-pad every window to a common length; the last token of each window is
# the label and everything before it is the model input.
max_seq_len = max(len(seq) for seq in seq_list)
padded_sequences = pad_sequences(seq_list, maxlen=max_seq_len, padding="pre")
padded_sequences = np.array(padded_sequences)

features = padded_sequences[:, :-1]
labels = padded_sequences[:, -1]
midi_data = merged_df["vector"].values

# ---- Embeddings -----------------------------------------------------------
word2vec = load_glove(GLOVE_PATH)
embed_mat = get_embedding_mat(word2vec, tokenizer, vocab_size, WORD_DIM)


# ---- Split ----------------------------------------------------------------
train_features, train_labels, train_midis, val_features, val_labels, val_midis = train_val_split(
    features, labels, seq_indices, midi_data
)

# Only the piano roll portion of the midi vector
simple_midis = np.array([midi[25:-144] for midi in train_midis])
simple_val_midis = np.array([midi[25:-144] for midi in val_midis])

# ---- Models ---------------------------------------------------------------
simple_model = get_simple_model(vocab_size, embed_mat, MAX_SEQUENCE_LENGTH, simple_midis.shape[1])

adv_model = get_advanced_model(vocab_size, embed_mat, MAX_SEQUENCE_LENGTH, train_midis.shape[1])

simple_model.fit(
    [train_features, simple_midis],
    train_labels,
    batch_size=256,
    epochs=1,
    validation_data=([val_features, simple_val_midis], val_labels),
)

adv_model.fit(
    [train_features, train_midis],
    train_labels,
    batch_size=256,
    epochs=1,
    validation_data=([val_features, val_midis], val_labels),
)

### Functions for word prediction, lyrics generation, and printing songs

In [None]:
def predict_word(model, sequence, midi_data, word_amount=4):
    """Sample the next word from the model's most likely candidates.

    The model is queried with a single-sample batch, the ``word_amount``
    words with the highest probability are kept, their probabilities are
    renormalised, and one of them is drawn at random accordingly.

    Args:
        model: Object with a Keras-style ``predict(inputs, verbose=...)``
            that returns an array of shape ``(1, vocab_size)``.
        sequence: Token indices of the current context.
        midi_data: Numpy array with the melody vector.
        word_amount: How many top candidates to sample from (default 4).

    Returns:
        The index of the chosen word (a numpy integer).
    """
    lyrics_batch = np.array(sequence).reshape(1, -1)
    midi_batch = midi_data.reshape(1, -1)
    word_probs = model.predict([lyrics_batch, midi_batch], verbose=0)[0]

    # Chooses a word_amount of the best words
    best_words = (-word_probs).argsort()[:word_amount]
    # Normalise in float64: ``np.random.choice`` checks that ``p`` sums to 1
    # with a tight tolerance, which float32 round-off can violate.
    best_probs = word_probs[best_words].astype(np.float64)
    norm_probs = best_probs / best_probs.sum()

    # Chooses a random word out of the best words, based on their probs
    return np.random.choice(best_words, p=norm_probs)


def generate_lyrics(model, initial_word, midi_data, lyrics_length):
    """Generate a sequence of word indices, one predicted word at a time.

    The model context is a window of ``MAX_SEQUENCE_LENGTH`` indices that
    starts as zeros with ``initial_word`` in the last slot. After every
    prediction the window drops its oldest entry and gains the new word.

    Args:
        model: Model handed on to ``predict_word``.
        initial_word: Index of the first word of the song.
        midi_data: Melody vector handed on to ``predict_word``.
        lyrics_length: Total number of words wanted, including the initial
            one.

    Returns:
        A list of word indices that starts with ``initial_word``.
    """
    window = np.zeros(MAX_SEQUENCE_LENGTH)
    window[-1] = initial_word
    generated = [initial_word]

    for _ in range(lyrics_length - 1):
        predicted = predict_word(model, window, midi_data)
        # Slide the context one step to the left and put the new word last.
        window = np.append(window[1:], predicted)
        generated.append(predicted)

    return generated


def print_song(song, midi_ind):
    """Print every generated text that belongs to one melody.

    Args:
        song: List of texts; each text is a list of sentence strings.
        midi_ind: Zero-based melody index; shown one-based in the header.
    """
    melody_number = midi_ind + 1
    for sentences in song:
        print("----------------------------------------------------------------------")
        # The header names the first word of the first sentence.
        opening_word = sentences[0].split(" ")[0]
        print(f"Song starts with '{opening_word}', melody #{melody_number}: ")
        for line in sentences:
            print(line.strip())

### Generating lyrics based on the test set

In [None]:
# Loads the test set
test_df = load_dataset(TEST_PATH)
test_sequences = tokenizer.texts_to_sequences(test_df["lyrics"].tolist())

# Chooses up to 3 initial words. A song with no token known to the tokenizer
# yields an empty sequence and has no first word, so it is ignored here.
test_input_lyrics = [seq[0] for seq in test_sequences if seq]
test_input_lyrics = random.sample(test_input_lyrics, k=min(3, len(test_input_lyrics)))

# Gets the vectorized form of each midi file. Songs without a matching MIDI
# vector are reported and skipped, because a missing entry cannot be sliced
# or fed to the models further down.
test_input_midis = []
for row in test_df.itertuples():
    artist_title = str(row[1]).strip() + "-" + str(row[2]).strip()
    matches = midi_df.loc[midi_df["artist-title"] == artist_title, "vector"]
    if matches.empty:
        print(f"No MIDI vector found for '{artist_title}', skipping this melody")
        continue
    # Several files may map to the same key; the first one is used.
    test_input_midis.append(matches.iloc[0])

# Takes only part of the vector (piano roll) for the simple model
simple_test_midis = [midi[25:-144] for midi in test_input_midis]

LYRICS_PER_SONG = 112
texts_smp = []
texts_adv = []
songs_smp = []
songs_adv = []

# Generating the lyrics using the trained models: for every melody, one text
# per initial word and per model.
for simple_midi, full_midi in zip(simple_test_midis, test_input_midis):
    lyrics_smp = []
    lyrics_adv = []
    for init_word in test_input_lyrics:
        lyrics_smp.append(
            generate_lyrics(simple_model, init_word, simple_midi, lyrics_length=LYRICS_PER_SONG)
        )
        lyrics_adv.append(
            generate_lyrics(adv_model, init_word, full_midi, lyrics_length=LYRICS_PER_SONG)
        )
    texts_smp.append(tokenizer.sequences_to_texts(np.array(lyrics_smp)))
    texts_adv.append(tokenizer.sequences_to_texts(np.array(lyrics_adv)))

# Separate the lyrics into sentences ("sfin" is the line-break token)
for melody_texts_smp, melody_texts_adv in zip(texts_smp, texts_adv):
    songs_smp.append([text.split("sfin") for text in melody_texts_smp])
    songs_adv.append([text.split("sfin") for text in melody_texts_adv])

# Print the songs generated by the simple model.
# The melody number counts only the melodies that had a MIDI vector.
print("************************SIMPLE SONGS*****************************")
for i, song in enumerate(songs_smp):
    print_song(song, i)

# Print the songs generated by the advanced model
print("************************ADVANCED SONGS*****************************")
for i, song in enumerate(songs_adv):
    print_song(song, i)