## Notebook config

mode = 'train' will load the data from train.csv and test.csv, train multiple models with a variety of hyperparameters, evaluate their performance, and save the top two performing models.

mode = 'demo' will load the previously saved top two performing models and will demonstrate their performance against data from test.csv

In [None]:
# Notebook mode switch ('train' or 'demo', as described above).
# Keep exactly one of the two assignments below active.
mode = 'train'
# mode = 'demo'

## Import dependencies

In [None]:
# Import libraries and set default options
import datetime
import calendar
import random
import math
import time
import pandas as pd
import numpy as np
from array import array
import pickle

import matplotlib.pyplot as plt
import seaborn as sb

import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import NMF, PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
from sklearn.metrics import confusion_matrix

from sklearn.neighbors import KNeighborsClassifier,KNeighborsRegressor
from sklearn.model_selection import cross_validate

import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing import text
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout, SimpleRNN, Input, concatenate, Flatten
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.preprocessing.sequence import pad_sequences

import pmdarima as pm
from pmdarima.model_selection import train_test_split as pmd_train_test_split

# Pandas display limits used when DataFrames are printed in the notebook:
# no cap on the number of columns, and up to 500 rows.
for _option_name, _option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 500),
):
    pd.set_option(_option_name, _option_value)

In [None]:
def loadTrainingData(limitSize = True, limit = 1000):
    """Read 'train.csv' and return only the rows without missing values.

    Args:
        limitSize: When truthy, only the first `limit` complete rows are
            returned; otherwise every complete row is returned.
        limit: Number of leading rows to keep when `limitSize` is truthy.

    Returns:
        A pandas DataFrame holding the (optionally truncated) training rows.
    """
    complete_rows = pd.read_csv('train.csv').dropna()
    # Truncation happens after dropna(), so `limit` counts complete rows only.
    return complete_rows.iloc[:limit] if limitSize else complete_rows

def loadTestData():
    """Read 'test.csv' and return only the rows without missing values.

    Returns:
        A pandas DataFrame with every row of 'test.csv' that contains no NaN.
    """
    return pd.read_csv('test.csv').dropna()

In [None]:
def splitArtistLyricsAndGenres(df_train, df_test):
    """Pull the 'Artist', 'Lyrics' and 'Genre' columns out of both frames.

    Args:
        df_train: Training DataFrame with 'Artist', 'Lyrics' and 'Genre' columns.
        df_test: Test DataFrame with the same three columns.

    Returns:
        A 6-tuple of column value arrays, ordered as
        (artist_train, lyrics_train, genres_train,
         artist_test, lyrics_test, genres_test).
    """
    wanted_columns = ('Lyrics', 'Genre', 'Artist')
    train_cols = {column: df_train[column].values for column in wanted_columns}
    test_cols = {column: df_test[column].values for column in wanted_columns}

    return (
        train_cols['Artist'], train_cols['Lyrics'], train_cols['Genre'],
        test_cols['Artist'], test_cols['Lyrics'], test_cols['Genre'],
    )

In [None]:
def generateEncodedPaddedSequences(artist_train, lyrics_train, genres_train, lyrics_test, artist_test, genres_test, max_length):
    """Tokenize artists, lyrics and genres and return them as integer arrays.

    NOTE: the test arguments are declared in the order
    (lyrics_test, artist_test, genres_test), which differs from the
    (artist, lyrics, genres) order used for the training arguments. Passing
    them by keyword avoids mixing the two up.

    Each of the three tokenizers is fitted on the unique values of its train
    and test texts combined (``np.union1d``).

    Args:
        artist_train, lyrics_train, genres_train: Training texts.
        lyrics_test, artist_test, genres_test: Test texts.
        max_length: Length every lyric sequence is padded/truncated to.

    Returns:
        A 7-tuple:
        (artist ids train, artist ids test,
         lyric ids train, lyric ids test,
         genre ids train, genre ids test,
         genre_index), where genre_index is the genre tokenizer's word_index.
    """
    # --- Artists: 1000-word vocabulary, sequences fixed at 10 tokens. ---
    artist_tok = Tokenizer(num_words=1000, oov_token='<OOV>')
    artist_tok.fit_on_texts(np.union1d(artist_train, artist_test))
    artist_ids_train = artist_tok.texts_to_sequences(artist_train)
    artist_ids_test = artist_tok.texts_to_sequences(artist_test)

    # --- Lyrics: 5000-word vocabulary. ---
    lyric_tok = Tokenizer(num_words=5000, oov_token='<OOV>')
    lyric_tok.fit_on_texts(np.union1d(lyrics_train, lyrics_test))
    lyric_ids_train = lyric_tok.texts_to_sequences(lyrics_train)
    lyric_ids_test = lyric_tok.texts_to_sequences(lyrics_test)

    # --- Genres: the filter list leaves out '&' and '-' so names such as
    # R&B or Hip-Hop are not split on those characters. ---
    genre_tok = Tokenizer(filters='!"#$%()*+,./:;<=>?@[\\]^_`{|}~\t\n')
    genre_tok.fit_on_texts(np.union1d(genres_train, genres_test))
    genre_index = genre_tok.word_index
    genre_ids_train = genre_tok.texts_to_sequences(genres_train)
    genre_ids_test = genre_tok.texts_to_sequences(genres_test)

    # Sequences longer than the target length lose their tail (truncating='post').
    artist_array_train = np.array(pad_sequences(artist_ids_train, maxlen=10, truncating='post'))
    artist_array_test = np.array(pad_sequences(artist_ids_test, maxlen=10, truncating='post'))
    lyric_array_train = np.array(pad_sequences(lyric_ids_train, maxlen=max_length, truncating='post'))
    lyric_array_test = np.array(pad_sequences(lyric_ids_test, maxlen=max_length, truncating='post'))

    genre_array_train = np.array(genre_ids_train)
    genre_array_test = np.array(genre_ids_test)

    return (
        artist_array_train,
        artist_array_test,
        lyric_array_train,
        lyric_array_test,
        genre_array_train,
        genre_array_test,
        genre_index,
    )

In [None]:
def compileModel(embeddingInputDim, embeddingOutputDim, layers, activationFunction, lossFunction, optimizerFunction, max_length, genre_index, include_artist):
    """Build and compile a genre classifier from a model configuration.

    Two architectures are supported:

    * ``include_artist`` falsy: a Sequential model of
      Embedding -> ``layers`` -> Dense.
    * ``include_artist`` truthy: a two-input functional model. The lyrics
      input goes through Embedding -> ``layers``; the artist input (10 token
      ids) goes through Embedding(1000, 32) -> Flatten; both are concatenated
      and fed to the Dense output.

    Args:
        embeddingInputDim: Vocabulary size of the lyrics embedding.
        embeddingOutputDim: Vector size of the lyrics embedding.
        layers: Keras layer instances applied, in order, to the embedded
            lyrics. Used by both architectures; when empty in the artist
            architecture a single LSTM(64) is used.
        activationFunction: Activation of the output Dense layer.
        lossFunction: Loss passed to ``model.compile``.
        optimizerFunction: Optimizer passed to ``model.compile``.
        max_length: Length of each padded lyric sequence.
        genre_index: Mapping of genre -> class id; its size sets the number
            of output units.
        include_artist: Selects the two-input architecture when truthy.

    Returns:
        The compiled Keras model (tracking the 'accuracy' metric).
    """
    # One extra output unit on top of the genres in genre_index
    # (presumably because tokenizer ids start at 1, leaving 0 unused -- confirm).
    num_classes = len(genre_index) + 1

    if not include_artist:
        model = Sequential()
        model.add(Embedding(embeddingInputDim, embeddingOutputDim, input_length=max_length))
        for layer in layers:
            model.add(layer)
        model.add(Dense(num_classes, activation=activationFunction))
    else:
        lyrics_input = Input(shape=(max_length,))
        lyrics_features = Embedding(embeddingInputDim, embeddingOutputDim, input_length=max_length)(lyrics_input)
        # Honour the configured recurrent stack instead of a hard-coded LSTM(64);
        # fall back to the previous default when no layers were configured.
        for layer in (layers or [LSTM(64)]):
            lyrics_features = layer(lyrics_features)

        artist_input = Input(shape=(10,))
        artist_embedding = Embedding(1000, 32)(artist_input)
        artist_flatten = Flatten()(artist_embedding)

        concatenated = concatenate([lyrics_features, artist_flatten])
        output = Dense(num_classes, activation=activationFunction)(concatenated)
        model = Model(inputs=[lyrics_input, artist_input], outputs=output)

    model.compile(loss=lossFunction, optimizer=optimizerFunction, metrics=['accuracy'])

    return model

In [None]:
def trainModel(model, batch_size, encoded_artist_array_train, padded_sequences_array_train, encoded_genres_array_train, epochs, include_artist):
    """Fit `model` on the training arrays, holding out 20% for validation.

    Args:
        model: Compiled model exposing a Keras-style ``fit`` method.
        batch_size: Positive integer batch size. Any other value (for example
            an object passed by mistake) triggers a warning and the default
            of 32 is used instead.
        encoded_artist_array_train: Artist token ids; only used when
            `include_artist` is truthy.
        padded_sequences_array_train: Padded lyric token ids.
        encoded_genres_array_train: Target genre ids.
        epochs: Number of training epochs.
        include_artist: When truthy the model is fed ``[lyrics, artist]``;
            otherwise the lyrics only.

    Returns:
        Whatever ``model.fit`` returns (the training history for Keras models).
    """
    import numbers
    import warnings

    default_batch_size = 32
    usable = (
        isinstance(batch_size, numbers.Integral)
        and not isinstance(batch_size, bool)
        and batch_size > 0
    )
    if not usable:
        # Previously the argument was always overwritten with 32; keep that as
        # the fallback, but make the misuse visible instead of hiding it.
        warnings.warn(
            f"trainModel: batch_size of type {type(batch_size).__name__} is not "
            f"a positive integer; using {default_batch_size} instead."
        )
        batch_size = default_batch_size

    if include_artist:
        inputs = [padded_sequences_array_train, encoded_artist_array_train]
    else:
        inputs = padded_sequences_array_train

    history = model.fit(inputs, encoded_genres_array_train, epochs=epochs, batch_size=batch_size, validation_split=0.2)

    return history

In [None]:
def plotTrainingAndValidationLoss(train_loss, val_loss, modelConfig ):
    """Draw the training and validation loss curves and display the figure.

    Args:
        train_loss: Loss values to plot as 'Training Loss'.
        val_loss: Loss values to plot as 'Validation Loss'.
        modelConfig: Accepted for call-site symmetry; not used here.
    """
    curves = (
        (train_loss, 'Training Loss'),
        (val_loss, 'Validation Loss'),
    )
    for values, legend_label in curves:
        plt.plot(values, label=legend_label)

    plt.title('Training and Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()
    

def plotTrainingAndValidationAccuracy(train_acc, val_acc, modelConfig ):
    """Draw the training and validation accuracy curves and display the figure.

    Args:
        train_acc: Accuracy values to plot as 'Training Accuracy'.
        val_acc: Accuracy values to plot as 'Validation Accuracy'.
        modelConfig: Accepted for call-site symmetry; not used here.
    """
    curves = (
        (train_acc, 'Training Accuracy'),
        (val_acc, 'Validation Accuracy'),
    )
    for values, legend_label in curves:
        plt.plot(values, label=legend_label)

    plt.title('Training and Validation Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.show()

def plotModel(history, modelConfig):
    """Plot the loss and accuracy curves recorded in a training history.

    Args:
        history: Object whose ``history`` attribute maps 'loss', 'val_loss',
            'accuracy' and 'val_accuracy' to their recorded values.
        modelConfig: Forwarded unchanged to the two plotting helpers.
    """
    recorded = history.history
    # All four series are looked up before anything is drawn.
    train_loss, val_loss, train_acc, val_acc = (
        recorded[key] for key in ('loss', 'val_loss', 'accuracy', 'val_accuracy')
    )

    plotTrainingAndValidationLoss(train_loss, val_loss, modelConfig)
    plotTrainingAndValidationAccuracy(train_acc, val_acc, modelConfig)


In [None]:
def evaluateModel(model, encoded_artist_array_test, padded_sequences_array_test, encoded_genres_array_test, include_artist):
    """Evaluate `model` on the test arrays.

    Args:
        model: Object exposing ``evaluate(inputs, targets)`` that returns a
            (loss, accuracy) pair.
        encoded_artist_array_test: Artist token ids; only used when
            `include_artist` is truthy.
        padded_sequences_array_test: Padded lyric token ids.
        encoded_genres_array_test: Target genre ids.
        include_artist: When truthy the model is fed ``[lyrics, artist]``;
            otherwise the lyrics only.

    Returns:
        Tuple ``(test_loss, test_acc)``.
    """
    if include_artist:
        features = [padded_sequences_array_test, encoded_artist_array_test]
    else:
        features = padded_sequences_array_test

    test_loss, test_acc = model.evaluate(features, encoded_genres_array_test)
    return test_loss, test_acc

In [None]:
def confusionMatrix(modelConfig, encoded_artist_array_test, padded_sequences_array_test, encoded_genres_array_test, genre_labels, include_artist):
    """Predict on the test set, then compute and plot the confusion matrix.

    The matrix is built for class ids ``1 .. len(genre_labels)`` explicitly so
    that it always has one row/column per entry of `genre_labels`, in the same
    order, even when a genre is missing from the test targets or predictions.
    Samples predicted as any other id (for example 0) are not counted.

    Args:
        modelConfig: Model configuration dict; its 'model' entry is used.
        encoded_artist_array_test: Artist token ids; only used when
            `include_artist` is truthy.
        padded_sequences_array_test: Padded lyric token ids.
        encoded_genres_array_test: True genre ids.
        genre_labels: Genre names used as axis tick labels. Assumed to be
            ordered so that ``genre_labels[i]`` is class id ``i + 1`` (as with
            ``list(genre_index.keys())`` from a Keras Tokenizer) -- confirm if
            labels come from elsewhere.
        include_artist: When truthy the model is fed ``[lyrics, artist]``;
            otherwise the lyrics only.

    Returns:
        The confusion matrix as returned by ``sklearn.metrics.confusion_matrix``.
    """
    model = modelConfig['model']
    if include_artist:
        inputs = [padded_sequences_array_test, encoded_artist_array_test]
    else:
        inputs = padded_sequences_array_test
    predictions = model.predict(inputs)

    # Plain NumPy arrays: highest-scoring class per sample, flat true labels.
    predicted_labels = np.argmax(predictions, axis=1)
    true_labels = np.ravel(encoded_genres_array_test)

    # Pin the label set so the matrix shape always matches the tick labels.
    class_ids = np.arange(1, len(genre_labels) + 1)
    cm = confusion_matrix(true_labels, predicted_labels, labels=class_ids)

    plt.figure(figsize=(8, 6))
    sb.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=genre_labels, yticklabels=genre_labels)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title('Confusion Matrix')
    plt.show()

    return cm

In [None]:
def loadModelConfigs():
    """Return the model configurations selected for this training run.

    Twelve configurations are defined (numbered 1-12 in their names); the
    ones listed in `enabled` below are returned, in that order. Every
    configuration is a dict with the hyperparameters ("embeddingInputDim",
    "embeddingOutputDim", "max_length", "batch_size", "epochs", "layers",
    "lossFunction", "activationFunction", "optimizerFunction",
    "includeArtist") plus empty result slots ("plots", "cm_plots", "model",
    "history", "test_loss", "test_acc").

    Name abbreviations: "scc" = sparse_categorical_crossentropy,
    "cc" = categorical_crossentropy.

    NOTE(review): configs 9 and 10 use embeddingInputDim=1000 while the lyric
    tokenizer elsewhere in this file is built with num_words=5000 -- confirm
    the embedding is large enough before enabling them.
    """
    def describe(name, layers, includeArtist=False, embeddingInputDim=5000,
                 embeddingOutputDim=128, epochs=10,
                 lossFunction='sparse_categorical_crossentropy',
                 activationFunction='softmax', optimizerFunction='adam'):
        # Fresh dict (and fresh result lists) for every configuration.
        return {
            "name": name,
            "includeArtist": includeArtist,
            "embeddingInputDim": embeddingInputDim,
            "embeddingOutputDim": embeddingOutputDim,
            "max_length": 500,
            "batch_size": 32,
            "epochs": epochs,
            "layers": layers,
            "lossFunction": lossFunction,
            "activationFunction": activationFunction,
            "optimizerFunction": optimizerFunction,
            "plots": [],
            "cm_plots": [],
            "model": None,
            "history": None,
            "test_loss": None,
            "test_acc": None,
        }

    catalogue = {
        1: describe(
            "1 Single LSTM 64 Layer - scc - softmax - adam",
            [LSTM(64)],
        ),
        2: describe(
            "2 Single LSTM 128 Layer - scc - softmax - adam",
            [LSTM(128)],
        ),
        3: describe(
            "3 Double LSTM 64 Layer dropout - scc - softmax - adam",
            [
                LSTM(units=64, dropout=0.2, return_sequences=True),
                LSTM(64, dropout=0.2),
            ],
        ),
        4: describe(
            "4 Double LSTM 128 Layer Dropout - scc - softmax - adam",
            [
                LSTM(units=128, dropout=0.2, return_sequences=True),
                LSTM(128, dropout=0.2),
            ],
        ),
        5: describe(
            "5 Single LSTM 128 Layer - cc - softmax - adam",
            [LSTM(128)],
            lossFunction='categorical_crossentropy',
        ),
        6: describe(
            "6 Single LSTM 128 Layer - scc - softmax - SGD",
            [LSTM(128)],
            optimizerFunction='sgd',
        ),
        7: describe(
            "7 Single LSTM 64 Layer - scc - relu - adam",
            [LSTM(64)],
            activationFunction='relu',
        ),
        8: describe(
            "8 Triple LSTM 128 Layer Dropout - scc - softmax - adam",
            [
                LSTM(units=128, dropout=0.2, return_sequences=True),
                LSTM(units=128, dropout=0.2, return_sequences=True),
                LSTM(128, dropout=0.2),
            ],
        ),
        9: describe(
            "9 Single RNN 64 Layer - scc - softmax - adam",
            [SimpleRNN(64)],
            embeddingInputDim=1000,
            embeddingOutputDim=64,
        ),
        10: describe(
            "10 Simple RNN 64 before LSTM 64 Layer - scc - softmax - adam",
            [
                SimpleRNN(64, return_sequences=True),
                LSTM(64),
            ],
            embeddingInputDim=1000,
            embeddingOutputDim=64,
        ),
        11: describe(
            "11 Single LSTM 64 Layer 20 epochs - scc - softmax - adam",
            [LSTM(64)],
            epochs=20,
        ),
        12: describe(
            "12 Single LSTM 64 Layer Include Artist - scc - softmax - adam",
            [LSTM(64)],
            includeArtist=True,
        ),
    }

    # Configurations 5-10 are defined above but currently switched off.
    enabled = (1, 2, 3, 4, 11, 12)

    return [catalogue[number] for number in enabled]

In [None]:

from pathlib import Path

# Training cell: load the data, encode it, then compile, train, plot, evaluate
# and save every configured model.

# Swap in the limited loader for a quick run on the first 1000 rows:
# df_train = loadTrainingData(limitSize = True, limit = 1000)
df_train = loadTrainingData(limitSize = False, limit = 0)
df_test = loadTestData()

print(f"Training for: {len(df_train)} rows")
print(f"Validating for: {len(df_test)} rows")

max_length = 500
artist_train, lyrics_train, genres_train, artist_test, lyrics_test, genres_test = splitArtistLyricsAndGenres(df_train, df_test)

# Keyword arguments on purpose: generateEncodedPaddedSequences declares its test
# parameters as (lyrics_test, artist_test, genres_test), so passing the values
# above positionally would swap the test lyrics and the test artists.
(
    encoded_artist_array_train,
    encoded_artist_array_test,
    padded_sequences_array_train,
    padded_sequences_array_test,
    encoded_genres_array_train,
    encoded_genres_array_test,
    genre_index,
) = generateEncodedPaddedSequences(
    artist_train=artist_train,
    lyrics_train=lyrics_train,
    genres_train=genres_train,
    lyrics_test=lyrics_test,
    artist_test=artist_test,
    genres_test=genres_test,
    max_length=max_length,
)
genre_labels = list(genre_index.keys())
print("Encoding and Sequencing completed\n\n")

# Make sure the output directory exists before any (long) training starts.
Path('saves').mkdir(parents=True, exist_ok=True)

modelConfigs = loadModelConfigs()
for modelConfig in modelConfigs:
    print(f"Model: {modelConfig['name']}")
    print(f"Compiling: {modelConfig['name']}")
    modelConfig['model'] = compileModel(
        embeddingInputDim=modelConfig['embeddingInputDim'],
        embeddingOutputDim=modelConfig['embeddingOutputDim'],
        layers=modelConfig['layers'],
        activationFunction=modelConfig['activationFunction'],
        lossFunction=modelConfig['lossFunction'],
        optimizerFunction=modelConfig['optimizerFunction'],
        max_length=max_length,
        genre_index=genre_index,
        include_artist=modelConfig['includeArtist']
    )

    print(f"Training: {modelConfig['name']}")
    modelConfig['history'] = trainModel(
        model=modelConfig['model'],
        # Pass the configured batch size (the model object was passed here before).
        batch_size=modelConfig['batch_size'],
        encoded_artist_array_train=encoded_artist_array_train,
        padded_sequences_array_train=padded_sequences_array_train,
        encoded_genres_array_train=encoded_genres_array_train,
        epochs=modelConfig['epochs'],
        include_artist=modelConfig['includeArtist']
    )

    print(f"Plotting: {modelConfig['name']}")
    plotModel(modelConfig['history'], modelConfig)

    print(f"Confusion Matrix: {modelConfig['name']}")
    cm = confusionMatrix(modelConfig, encoded_artist_array_test, padded_sequences_array_test, encoded_genres_array_test, genre_labels, modelConfig['includeArtist'])
    print(cm)

    print(f"Evaluating: {modelConfig['name']}")
    test_loss, test_acc = evaluateModel(
        model=modelConfig['model'],
        encoded_artist_array_test=encoded_artist_array_test,
        padded_sequences_array_test=padded_sequences_array_test,
        encoded_genres_array_test=encoded_genres_array_test,
        include_artist=modelConfig['includeArtist']
    )

    modelConfig['test_loss'] = test_loss
    modelConfig['test_acc'] = test_acc

    print("Training complete. Pickling results")
    modelFileName = 'saves/' + modelConfig['name'] + '.pkl'
    # 'wb' rather than 'ab': appending would leave the pickle from an earlier
    # run at the front of the file, which is the one pickle.load() returns.
    with open(modelFileName, 'wb') as modelFile:
        pickle.dump(modelConfig, modelFile)

    kerasFileName = 'saves/' + modelConfig['name'] + '.keras'
    modelConfig['model'].save(kerasFileName)
    print("\n\n")
        


In [None]:
# modelConfig

In [None]:
# modelFileName = 'saves/' + modelConfig['name'] + '.pkl'
# modelFileName = 'smtest.pkl'
# modelFileName

In [None]:
# Persist the full list of model configurations from this run in one pickle.
modelFileName = 'saves/allModelsRun1.pkl'
# 'wb' rather than 'ab': appending would leave the pickle from an earlier run at
# the front of the file, which is the one pickle.load() returns. The with-block
# also closes the file if dump() raises.
with open(modelFileName, 'wb') as modelFile:
    pickle.dump(modelConfigs, modelFile)