In [2]:
import os

import numpy as np
import mysql.connector as dbc
from pandas import DataFrame
from sklearn.model_selection import KFold
from sklearn.svm import SVC, SVR
from sklearn.multioutput import MultiOutputClassifier, MultiOutputRegressor
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier

# Model Training, Validation, and Selection

In this notebook we are going to cover the model selection process used to select the best possible model for our genre classifier. We will be using off-the-shelf models available through Scikit-Learn; any model that does not natively support multi-target output will be wrapped in the MultiOutputClassifier or MultiOutputRegressor meta-estimator. Note that despite calling our problem 'classification,' we will be considering regression as well by treating the 0's and 1's as floating point targets.

Models tested:

    Multi-layer Perceptron
    
    Gradient Boosting
    
    Ada Boosting
    
    Support Vector Machines
    
    Gaussian Processes

# Load In and Format the Data

First we must load the data in from our local database. The database was created in MySQL Workbench, so we will be using the MySQL database connector. Please ignore the login information; our database is local and very similar to a teenager: horribly insecure and easily breakable.

In [3]:
# Connection settings are read from the environment when present so that
# credentials do not have to live in the notebook. The fallbacks are the
# original local-development values, so the cell still runs unchanged on the
# machine it was written for.
db = dbc.connect(port=int(os.environ.get("SONG_DB_PORT", "3306")),
                 user=os.environ.get("SONG_DB_USER", "root"),
                 passwd=os.environ.get("SONG_DB_PASSWORD", "password"),
                 db="SONG")

# One row per (song id, features, genre flags); see the preview further down.
query = "select * from SONG.FEATURES natural join SONG.GENRES"

try:
    cursor = db.cursor()
    cursor.execute(query)

    # Data is far too big to call fetchall(), so the cursor is iterated
    # instead and the rows are collected into a plain list.
    data = list(cursor)
    cursor.close()
finally:
    # Always release the connection, even if the query or the fetch fails.
    db.close()

In [4]:
# Wrap the fetched rows in a DataFrame (columns keep their default integer
# labels) and preview the first five rows.
frame = DataFrame(data=data)
frame.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17,18,19,20,21,22,23,24,25,26
0,5052,-515.846,1.76513,-6.10513,0.538926,-3.27525,-2.62195,-2.70605,-3.74004,0.505801,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,5052,-60.3773,194.726,-68.8474,49.2787,13.2983,1.62496,-0.189355,16.3654,-7.21603,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,5052,-53.7426,193.482,-81.1924,46.3096,20.1474,19.2474,-3.89202,11.2145,-7.56853,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,5052,-50.4798,198.534,-79.677,42.5773,14.6366,7.98548,-6.82649,10.3821,-6.12184,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,5052,-82.8048,183.978,-75.459,53.1861,20.628,3.9369,-6.77132,9.3908,0.048497,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


So as we can see in the table above, the right-most 16 columns are the target genres, the middle 10 columns are the MFCCs, and the left-most column is the song id. We can use numpy arrays to easily break these apart for model training.

In [5]:
# Convert the fetched rows to a NumPy array once and slice that, rather than
# building the (large) array separately for the features and the targets.
rows = np.array(data)

# Column 0 is the song id and is dropped; columns 1-10 are the MFCC features
# and columns 11-26 are the genre targets, as described above.
X = rows[:, 1:11]
y = rows[:, 11:27]

# DataFrame copies are only used for the previews in the following cells.
x_frame = DataFrame(X)
y_frame = DataFrame(y)

In [6]:
# Preview the first five rows of the feature matrix.
x_frame.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-515.846,1.76513,-6.10513,0.538926,-3.27525,-2.62195,-2.70605,-3.74004,0.505801,2.55935
1,-60.3773,194.726,-68.8474,49.2787,13.2983,1.62496,-0.189355,16.3654,-7.21603,-0.07875
2,-53.7426,193.482,-81.1924,46.3096,20.1474,19.2474,-3.89202,11.2145,-7.56853,3.30256
3,-50.4798,198.534,-79.677,42.5773,14.6366,7.98548,-6.82649,10.3821,-6.12184,6.21249
4,-82.8048,183.978,-75.459,53.1861,20.628,3.9369,-6.77132,9.3908,0.048497,7.43601


In [7]:
# Preview the first five rows of the genre target matrix.
y_frame.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


# Pick a Model

Now that the data is in the proper format we can begin training models and checking their accuracy. For this we will be using KFold cross validation on each of the models mentioned above, then averaging the scores for each model.

In [9]:
# NOTE(review): KFold() uses its default, unshuffled splits, and the rows
# appear to be grouped by song id (see the preview of `frame`). Consider a
# group-aware splitter keyed on the song id so frames from one song never land
# in both the training and the test fold — confirm before changing, since it
# will alter the reported scores.
kf = KFold()

# Every candidate estimator is instantiated so the names stay available to
# other cells. Estimators without native multi-target support are wrapped in
# the matching MultiOutput* meta-estimator.
mlpc = MLPClassifier()
mlpr = MLPRegressor()
svc = MultiOutputClassifier(SVC())
svr = MultiOutputRegressor(SVR())
gbc = MultiOutputClassifier(GradientBoostingClassifier())
abc = MultiOutputClassifier(AdaBoostClassifier())

# Models actually evaluated, in print order. MLPC, MLPR, SVC and SVR were
# disabled in the original loop (presumably because of training time — TODO
# confirm); add e.g. "MLPC": mlpc here to include them again.
candidates = {
    "GBC": gbc,
    "ABC": abc,
}

# Per-model list of fold scores, filled in as the folds complete.
fold_scores = {label: [] for label in candidates}

for train_idx, test_idx in kf.split(X):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    for label, model in candidates.items():
        # NOTE(review): for the multi-output classifiers `score` should be
        # exact-match accuracy over all targets at once, which would explain
        # the low values — confirm against the scikit-learn documentation.
        score = model.fit(X_train, y_train).score(X_test, y_test)
        fold_scores[label].append(score)
        print("{}: {}".format(label, score))
    print("--------")

# Average the fold scores per model, as promised in the section introduction.
for label, scores in fold_scores.items():
    print("{} mean over {} folds: {}".format(label, len(scores), np.mean(scores)))

GBC: 0.08540951096563625
ABC: 0.07841126462455544
--------


KeyboardInterrupt: 