In [None]:
import os
import random
import shutil
import numpy as np
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from pandas import DataFrame
from keras import Sequential
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
from keras.layers import Dense, Dropout
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [None]:
SEED_VALUE = 42

# Seed the Python, NumPy and TensorFlow random generators with the same value
# so that repeated runs of the notebook draw the same random numbers.
for seed_rng in (random.seed, np.random.seed, tf.random.set_seed):
    seed_rng(SEED_VALUE)

# Dataset

In [None]:
# Folder holding the zipped embeddings: a local directory by default,
# the mounted Google Drive root when running on Colab.
root_dataset = "./dataset"

if os.getenv('COLAB_RELEASE_TAG'):
    from google.colab import drive
    drive.mount('/content/gdrive')
    root_dataset = '/content/gdrive/MyDrive'

# Decompress the archive into the local "embeddings" directory.
archive_path = f'{root_dataset}/embeddings.zip'
shutil.unpack_archive(archive_path, "embeddings")

# From here on the dataset root is the extracted directory.
root_dataset = "./embeddings"

In [None]:
import json

# Names of the dataset splits, matching the per-split folder names on disk.
SETS = ['training', 'validation', 'tests']

# Genre labels; this order is also the column order of the one-hot targets.
GENRES = [
    'blues', 'classical', 'country', 'disco', 'hiphop',
    'jazz', 'metal', 'pop', 'reggae', 'rock',
]

def load_data(root_path: str, set: str, genres=None, features=('lyrics', 'mfcc')):
    """Load one dataset split as a feature matrix plus one-hot genre labels.

    For every genre, each file name found under
    ``{root_path}/{features[0]}/{set}/{genre}`` is read once per feature from
    ``{root_path}/{feature}/{set}/{genre}/{file}``. Each file is parsed as a
    JSON list and the lists are concatenated, in ``features`` order, into one
    embedding row.

    Args:
        root_path: Directory containing one sub-directory per feature.
        set: Name of the split sub-directory (e.g. ``'training'``). The name
            shadows the built-in ``set``; it is kept so existing callers that
            pass it by keyword keep working.
        genres: Genre sub-directories to read, which also fixes the label
            column order. Defaults to the module-level ``GENRES``.
        features: Feature sub-directories whose JSON lists are concatenated.
            The first entry is the one used to enumerate file names.

    Returns:
        A tuple ``(X, y)`` where ``X`` is a float32 ``numpy.ndarray`` with one
        row per file and ``y`` is a ``pandas.DataFrame`` of float32 one-hot
        labels with exactly one column per genre, in ``genres`` order.

    Raises:
        ValueError: If ``features`` is empty or the embeddings do not all have
            the same length.
        FileNotFoundError: If an expected directory or feature file is missing.
    """
    # Resolve the default lazily so the module constant is only needed when
    # the caller does not supply its own genre list.
    genre_names = list(GENRES if genres is None else genres)
    feature_names = list(features)
    if not feature_names:
        raise ValueError("features must contain at least one feature name")

    embeddings = []
    label_ids = []
    for genre_id, genre in enumerate(genre_names):
        listing_dir = f'{root_path}/{feature_names[0]}/{set}/{genre}'
        # os.listdir returns entries in arbitrary order; sorting keeps the row
        # order (and therefore any seeded shuffling) reproducible.
        for file in sorted(os.listdir(listing_dir)):
            embedding = []
            for feature in feature_names:
                with open(f'{root_path}/{feature}/{set}/{genre}/{file}', "r") as f:
                    embedding.extend(json.load(f))
            embeddings.append(embedding)
            label_ids.append(genre_id)

    # Build the matrix directly in NumPy; rows of unequal length raise here
    # instead of silently producing an object array.
    X = np.asarray(embeddings, dtype=np.float32)

    # Fill the one-hot matrix by hand so every genre gets a column even when a
    # split contains no file for it (pd.get_dummies would drop that column).
    one_hot = np.zeros((len(label_ids), len(genre_names)), dtype=np.float32)
    one_hot[np.arange(len(label_ids)), np.asarray(label_ids, dtype=np.intp)] = 1.0
    y = pd.DataFrame(one_hot, columns=genre_names)

    return X, y


In [None]:
# Load the training split first, then the validation split, from the extracted embeddings.
(X_train, y_train), (X_val, y_val) = (
    load_data('./embeddings', split_name)
    for split_name in ('training', 'validation')
)

# Training and Validation

In [None]:
# Fully connected classifier: two ReLU hidden layers, each followed by 50%
# dropout, and a 10-way softmax output.
# NOTE(review): 886 is presumably the length of one concatenated embedding row;
# confirm it matches X_train.shape[1].
model = Sequential([
    Dense(units=128, activation='relu', input_shape=(886,)),
    Dropout(0.5),
    Dense(units=64, activation='relu'),
    Dropout(0.5),
    Dense(units=10, activation='softmax'),
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)


In [None]:
# Display Keras' textual summary of the model as a quick sanity check.
model.summary()

In [None]:
# Stop once validation loss has gone 250 epochs without improving, and ask
# Keras to put the best weights seen back into `model` afterwards.
early_stopping = EarlyStopping(monitor='val_loss', patience=250,
                               restore_best_weights=True)

# Store the checkpoint on the mounted Google Drive when running on Colab and
# in a local folder otherwise, creating the folder if it does not exist yet.
model_dir = ('/content/gdrive/MyDrive/model_storage'
             if os.getenv('COLAB_RELEASE_TAG') else './model_storage')
os.makedirs(model_dir, exist_ok=True)
best_model_path = f'{model_dir}/best_model.keras'

# Save the full model (not only the weights) each time val_loss improves:
# a `.keras` path is a whole-model format, and the file can then be reloaded
# later with load_model().
model_checkpoint = ModelCheckpoint(best_model_path, monitor='val_loss',
                                   save_best_only=True)

history = model.fit(x=X_train, y=y_train,
                    validation_data=(X_val, y_val),
                    epochs=500,
                    batch_size=100,
                    shuffle=True,
                    callbacks=[early_stopping, model_checkpoint])

# Deliberately no model.save(best_model_path) here: that would overwrite the
# best checkpoint with whatever weights the model holds at the end of training.

In [None]:
# Training vs. validation loss per epoch, drawn on an explicit Axes.
fig, ax = plt.subplots()
for curve in ('loss', 'val_loss'):
    ax.plot(history.history[curve])
ax.set_title('Model Loss')
ax.set_ylabel('Loss')
ax.set_xlabel('Epoch')
ax.legend(['Train', 'Validation'], loc='upper right')
plt.show()

In [None]:
# Training vs. validation accuracy per epoch, drawn on an explicit Axes.
fig, ax = plt.subplots()
train_acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
ax.plot(train_acc)
ax.plot(val_acc)
ax.set(title='Model Accuracy', ylabel='Accuracy', xlabel='Epoch')
ax.legend(['Train', 'Validation'], loc='upper right')
plt.show()

In [None]:
# Load the 'tests' split and score the in-memory model on it.
X_test, y_test = load_data('./embeddings', 'tests')

test_metrics = model.evaluate(x=X_test, y=y_test, verbose=1)
loss, accuracy = test_metrics

print(f"Loss: {loss:.4f}, Accuracy: {accuracy:.4f}")

In [None]:
# Reload the model saved at best_model_path so the metrics below describe the
# stored model rather than whatever is currently in memory.
model = tf.keras.models.load_model(best_model_path)

# Per-class scores -> index of the highest-scoring class for each sample.
y_prob = model.predict(X_test)
y_pred = np.argmax(y_prob, axis=1)

# Decode the one-hot targets into a new name. Overwriting y_test would make
# this cell fail on a second run (an ndarray has no .to_numpy()).
y_true = np.argmax(y_test.to_numpy(), axis=1)

# Compute the confusion matrix. Pinning the label set keeps it square with one
# row/column per genre even if a class never appears in y_true or y_pred.
class_ids = list(range(len(GENRES)))
cm = confusion_matrix(y_true, y_pred, labels=class_ids)

# Plot the confusion matrix. scikit-learn puts true classes on the rows and
# predicted classes on the columns, so the heatmap's y axis is the actual
# genre and its x axis the predicted one. fmt='d' prints counts as integers.
sns.heatmap(cm, annot=True, fmt='d', xticklabels=GENRES, yticklabels=GENRES)
plt.ylabel('Actual', fontsize=13)
plt.xlabel('Prediction', fontsize=13)
plt.title('Confusion Matrix', fontsize=17)
plt.show()

# Calculate accuracy
accuracy = accuracy_score(y_true, y_pred)
print("Accuracy   :", accuracy)