In [1]:
import numpy as np
import mysql.connector as dbc
from pandas import DataFrame
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC, SVR
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.multioutput import MultiOutputClassifier, MultiOutputRegressor
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier

# Model Training, Validation, and Selection

In this notebook we are going to cover the model selection processed used to select the best possible model for our genre classifier. We will be using off-the-shelf models available through Scikit-Learn, all models that do not currently support multi-target output natively will be used in combination with the MultiOutputRegressor meta-estimator. Note that despite calling our problem 'classification,' we will be considering regression as well by treating 0's and 1's as floating point targets.

Models tested:

    Multi-layer Perceptron
    
    Gradient Boosting
    
    Ada Boosting
    
    Support Vector Machines
    
    Gaussian Processes

# Load In and Format the Data

First we must load the data in from our local database. The database was created in MySQL Workbench, so we will be using the MySQL database connector. Please ignore the login information, our database is local and very similar to a teenager: horribly insecure and easily breakable.

In [2]:
db = dbc.connect(port=3306,
                 user="root",
                 passwd="password",
                 db="SONG")
cursor = db.cursor()

query = "select * from SONG.FEATURES natural join SONG.GENRES"
cursor.execute(query)

scalar = StandardScaler()
data = []

# Data is far too big to call fetchall()
for row in cursor: 
    data.append(row)

In [3]:
frame = DataFrame(data)
frame.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17,18,19,20,21,22,23,24,25,26
0,80339,-164.711,114.635,-11.1596,16.9962,-39.1011,-14.3726,-11.561,-5.73267,-11.2177,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,52932,-194.625,161.465,-61.2707,32.1586,-17.0603,13.2025,-11.3829,4.30925,-2.09328,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,9354,85.7364,78.0495,6.39704,16.2382,8.94914,11.8529,3.28284,7.53779,-0.827754,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,40760,-268.904,151.385,-6.96333,10.9466,2.61376,-5.20309,10.5765,-1.63976,-8.4307,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,24190,-305.507,192.218,4.84985,43.6381,22.048,-6.72821,11.7671,0.295088,-20.1753,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


So as we can see in the table above, the right-most 16 columns are the target genres, the middle 10 columns are the MFCCs, and the left-most column is the song id. We can use numpy arrays to easily break these apart for model training.

In [4]:
X = np.array(data)[:,1:11]
y = np.array(data)[:,11:27]

x_frame = DataFrame(X)
y_frame = DataFrame(y)

In [5]:
x_frame.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-164.711,114.635,-11.1596,16.9962,-39.1011,-14.3726,-11.561,-5.73267,-11.2177,-4.23954
1,-194.625,161.465,-61.2707,32.1586,-17.0603,13.2025,-11.3829,4.30925,-2.09328,-4.11646
2,85.7364,78.0495,6.39704,16.2382,8.94914,11.8529,3.28284,7.53779,-0.827754,5.72616
3,-268.904,151.385,-6.96333,10.9466,2.61376,-5.20309,10.5765,-1.63976,-8.4307,-4.02119
4,-305.507,192.218,4.84985,43.6381,22.048,-6.72821,11.7671,0.295088,-20.1753,0.290169


In [6]:
y_frame.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


# Pick a Model

Now that the data is in the proper format we can begin training models and checking their accuracy. For this we will be using KFold cross validation on each of the models mentioned above, then averaging the scores for each model.

In [8]:
kf = KFold()

mlpc = MLPClassifier()
mlpr = MLPRegressor()
gbc = MultiOutputClassifier(GradientBoostingClassifier())
abc = MultiOutputClassifier(AdaBoostClassifier())
lin = MultiOutputRegressor(LinearRegression())
log = MultiOutputClassifier(LogisticRegression())
et = MultiOutputClassifier(ExtraTreesClassifier())

for train_idx, test_idx in kf.split(X):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    
    print("MLPC: {}".format(mlpc.fit(X_train, y_train).score(X_test, y_test)))
    print("MLPR: {}".format(mlpr.fit(X_train, y_train).score(X_test, y_test)))
    print("GBC: {}".format(gbc.fit(X_train, y_train).score(X_test, y_test)))
    print("ABC: {}".format(abc.fit(X_train, y_train).score(X_test, y_test)))
    print("LIN: {}".format(lin.fit(X_train, y_train).score(X_test, y_test)))
    print("LOG: {}".format(log.fit(X_train, y_train).score(X_test, y_test)))
    print("ET: {}".format(et.fit(X_train, y_train).score(X_test, y_test)))
    print("--------")

MLPC: 0.11094452773613193




MLPR: -1.9183070568189402
GBC: 0.09895052473763119
ABC: 0.10644677661169415
LIN: 0.008935176312173332
LOG: 0.08245877061469266
ET: 0.095952023988006
--------
MLPC: 0.08695652173913043




MLPR: -2.6683399829899304
GBC: 0.08245877061469266
ABC: 0.08395802098950525
LIN: 0.017935322149826766
LOG: 0.05997001499250375
ET: 0.0734632683658171
--------
MLPC: 0.0990990990990991




MLPR: -2.0212493868059664
GBC: 0.08708708708708708
ABC: 0.08858858858858859
LIN: 0.015328108588162012
LOG: 0.08108108108108109
ET: 0.07357357357357357
--------


Well, that's deeply dissapointing. None of our models reached an acceptably high accuracy, some even did worse than random chance. I'll have to look into other options and then come back to this.

# Do It Live

After reading some papers, I think I may be dealing with a non-linear system. As such, we may be able to develop a deep learning solution to our classification problem. I will cover this process in the next notebook: model_training.ipynb.