In [7]:
import numpy as np
import struct
from array import array
import pandas as pd
import os
from os.path  import join
import random as rn
from sklearn.neural_network import MLPClassifier
import graphviz
import pydotplus
from IPython.display import Image
from io import StringIO
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import learning_curve
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import log_loss
import random
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score
import time
import itertools
import matplotlib as mpl
from sklearn.model_selection import GridSearchCV

from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from scipy.stats import kurtosis
from sklearn.decomposition import FastICA
from sklearn.random_projection import SparseRandomProjection

In [2]:
"""
Loading in the dataset into a pandas dataframe object.

For the following segments the code snippets were retreved from: https://www.kaggle.com/code/anetakovacheva/interpreting-a-music-genre-classifier
"""

%matplotlib inline

input_path = 'data/musicgenre_datafolder'
file_path = join(input_path, 'music_genre.csv')

music_data = pd.read_csv(file_path)

"""
Cleaning and Pre-Processing all of the data
"""

"""
There are some duplicated data that needs to be cleaned up
"""
music_data.duplicated().any()
duplicated = music_data.duplicated()
music_data[duplicated]
music_data.iloc[9999:10006]
music_data.drop([10000, 10001, 10002, 10003, 10004], inplace = True)

"""
Removing some columns that don't matter or will complicated the training too much
"""
music_data.reset_index(inplace = True)
music_data = music_data.drop(["artist_name", "index", "instance_id", "track_name", "obtained_date"], axis = 1)

"""
Normalizing the music data such that it removes invalid values for 'tempo' and converts
the column values into a float
"""
music_data = music_data.drop(music_data[music_data["tempo"] == "?"].index)
music_data["tempo"] = music_data["tempo"].astype("float")
music_data["tempo"] = np.around(music_data["tempo"], decimals = 2)

"""
Encoding the columns that are strings with LabelEncoder since this will mess
up the algorithms that require numeric values
"""
key_encoder = LabelEncoder()
mode_encoder = LabelEncoder()
music_data["key"] = key_encoder.fit_transform(music_data["key"])
music_data["mode"] = mode_encoder.fit_transform(music_data["mode"])

"""
Separating out the column features from the music genre label
"""
music_features = music_data.drop("music_genre", axis = 1)
music_labels = music_data["music_genre"]

print(music_data)
print(np.unique(music_labels))
print(len(np.unique(music_labels)))

"""
Scaling the features out into a scale centered around 0 with a standard deviation of 1
"""
scaler = StandardScaler()
music_features_scaled = scaler.fit_transform(music_features)
# print(music_features_scaled[0])

"""
Splitting the data into Training and Testing Data Sets
"""
train_features, test_features, train_labels, test_labels = train_test_split(
    music_features_scaled, music_labels, test_size = 0.1, stratify = music_labels)


       popularity  acousticness  danceability  duration_ms  energy  \
0            27.0       0.00468         0.652         -1.0   0.941   
1            31.0       0.01270         0.622     218293.0   0.890   
2            28.0       0.00306         0.620     215613.0   0.755   
3            34.0       0.02540         0.774     166875.0   0.700   
4            32.0       0.00465         0.638     222369.0   0.587   
...           ...           ...           ...          ...     ...   
49995        59.0       0.03340         0.913         -1.0   0.574   
49996        72.0       0.15700         0.709     251860.0   0.362   
49997        51.0       0.00597         0.693     189483.0   0.763   
49998        65.0       0.08310         0.782     262773.0   0.472   
49999        67.0       0.10200         0.862     267267.0   0.642   

       instrumentalness  key  liveness  loudness  mode  speechiness   tempo  \
0               0.79200    1     0.115    -5.201     1       0.0748  100.89   
1

In [3]:
# Fix the seed used by the random number generators so runs are reproducible.
seed = 1234

# NOTE: Python reads PYTHONHASHSEED when the interpreter starts, so setting it
# here can only influence processes launched after this point.
os.environ['PYTHONHASHSEED'] = f"{seed}"

# Seed the standard-library generator and NumPy's global generator.
rn.seed(seed)
np.random.seed(seed)

In [4]:
# Number of rows (samples) in the training and in the testing feature arrays.
size_train_samples = len(train_features)
size_test_samples = len(test_features)

In [None]:
# Train the MLP classifier one epoch at a time and record, after every epoch,
# the accuracy and the log-loss on both the fitting data and a fixed hold-out
# validation split. The recorded lists are the data for the accuracy and loss
# curves over epochs, which show where extra training stops helping the
# validation score and where the model may start to overfit.

# Hyperparameters
hidden_layer_sizes = [100, 50, 20, 10]
activation = 'tanh'
learning_rate = 'constant'
max_iter = 1  # one pass per fit() call, since the loop below controls the epochs
warm_start = True  # stack the training across the successive fit() calls

number_of_epochs = 50

# Lists holding the per-epoch training / validation results
x_axis_list = []
avg_train_scores_list_reg = []
avg_validation_scores_list_reg = []
avg_train_loss_values_reg = []
avg_validation_loss_values_reg = []

# Declare the MLP classifier from scikit-learn
clf = MLPClassifier(
    hidden_layer_sizes=hidden_layer_sizes,
    activation=activation,
    learning_rate=learning_rate,
    max_iter=max_iter,
    warm_start=warm_start,
    random_state=seed
)

# cross_val_score does not accumulate training across epoch runs, so a manual
# hold-out split is used instead. The split is seeded and therefore identical
# for every epoch, so it is computed once here rather than inside the loop.
train_data, val_data, train_label, val_label = train_test_split(
    train_features, train_labels, test_size=0.2, random_state=seed)

for epoch_iteration in range(1, number_of_epochs + 1):

    # One more epoch on top of the weights learned so far.
    clf.fit(train_data, train_label)

    accuracy_score = clf.score(train_data, train_label)
    validation_score = clf.score(val_data, val_label)

    # Pass the classifier's class order explicitly so the predict_proba
    # columns are matched to the right labels even if a class happens to be
    # absent from one of the splits.
    train_loss_score = log_loss(
        train_label, clf.predict_proba(train_data), labels=clf.classes_)
    val_loss_score = log_loss(
        val_label, clf.predict_proba(val_data), labels=clf.classes_)

    x_axis_list.append(epoch_iteration)
    avg_train_scores_list_reg.append(accuracy_score)
    avg_validation_scores_list_reg.append(validation_score)
    avg_train_loss_values_reg.append(train_loss_score)
    avg_validation_loss_values_reg.append(val_loss_score)

    print("=============================================")
    print(f"Run for {epoch_iteration} epoch")
    print(f"Training Score: {accuracy_score}")
    print(f"Validation Score: {validation_score}")
    print(f"Training Loss Score: {train_loss_score}")
    print(f"Validation Loss Score: {val_loss_score}")
    print("=============================================")

    

In [None]:
# PCA dimensionality reduction: fit a 10-component PCA on the training
# features and project those same features onto the components. The reduced
# data is intended as input for the neural network.
svd_solver = "full"
pcafinal = PCA(svd_solver = svd_solver, n_components = 10).fit(train_features)

x_train_transformed_pca = pcafinal.transform(train_features)