In [2]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import cross_val_score

In [174]:
df = pd.read_pickle('song_comps.pkl')

In [175]:
df = df.dropna()

In [176]:
binary_vars = ['mode', 'genre', 'Non-Standard Time Signature']

In [177]:
def normalize(data):
    if data.name in binary_vars:
        return data
    return (data - data.mean()) / data.std()
df = df.apply(normalize)

In [178]:
org_cols = df.columns
for col1 in org_cols:
    for col2 in org_cols:
        if col1 not in binary_vars and col2 not in binary_vars:
            if col1 != col2:
                sort_cols = sorted((col1, col2))
                col_name = sort_cols[0] + '_' + sort_cols[1]
                df[col_name] = df[col1] * df[col2]
            else:
                df[col1 + '_squared'] = df[col1]**2
                df[col1 + '_signsquared'] = (df[col1]**2) * np.sign(df[col1])
                df[col1 + '_sqrt'] = np.sqrt(np.abs(df[col1]))
                df[col1 + '_signsqrt'] = np.sqrt(np.abs(df[col1])) * np.sign(df[col1])

In [180]:
df.to_pickle('song_comps_added_features.pkl')

---

In [182]:
df = pd.read_pickle('song_comps_added_features.pkl')

In [183]:
X = df.drop('genre', axis=1).values
Y = df.genre.values

In [184]:
ind = np.random.permutation(len(Y))

In [185]:
# sub_X = X[ind[:25000], :]
# sub_Y = Y[ind[:25000]]
sub_X = X[ind[:5000], :]
sub_Y = Y[ind[:5000]]

In [186]:
# test_X = X[ind[25000:30000], :]
# test_Y = Y[ind[25000:30000]]
test_X = X[ind[5000:5500], :]
test_Y = Y[ind[5000:5500]]

In [187]:
1-test_Y.mean()

0.80800000000000005

In [97]:
clf = SVC(C=10)
res = cross_val_score(clf, sub_X, sub_Y, cv=5)
print "%.2f ± %.2f" % (res.mean(), res.std()*2)

0.86 ± 0.02


SVM Hyperparameter Tuning

In [27]:
param_grid = [
  {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
  {'C': [1, 10, 100, 1000], 'kernel': ['rbf']},
  {'C': [1, 10, 100, 1000], 'kernel': ['poly']}
 ]
svc = SVC()
grid_search = GridSearchCV(svc, param_grid)
grid_search.fit(sub_X, sub_Y)

GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid=[{'kernel': ['linear'], 'C': [1, 10, 100, 1000]}, {'kernel': ['rbf'], 'C': [1, 10, 100, 1000]}, {'kernel': ['poly'], 'C': [1, 10, 100, 1000]}],
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
       verbose=0)

In [34]:
grid_search.best_params_

{'C': 10, 'kernel': 'rbf'}

In [60]:
pred_Y = grid_search.predict(test_X)

In [62]:
precision_score(test_Y, pred_Y)

0.81052631578947365

In [63]:
recall_score(test_Y, pred_Y)

0.80208333333333337

In [64]:
f1_score(test_Y, pred_Y)

0.80628272251308897

Random Forest Hyperparameter Tuning

In [20]:
param_grid = {'max_features': ['sqrt', 'log2'], 'n_estimators': [10, 50, 100, 150]}
rf = RandomForestClassifier()
grid_search2 = GridSearchCV(rf, param_grid)
grid_search2.fit(sub_X, sub_Y)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'max_features': ['sqrt', 'log2'], 'n_estimators': [10, 50, 100, 150]},
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
       verbose=0)

In [21]:
grid_search2.best_params_

{'max_features': 'sqrt', 'n_estimators': 100}

In [22]:
grid_search2.score(test_X, test_Y)

0.876

In [23]:
pred_Y = grid_search2.predict(test_X)

In [24]:
precision_score(test_Y, pred_Y)

0.8936170212765957

In [25]:
recall_score(test_Y, pred_Y)

0.42424242424242425

In [26]:
f1_score(test_Y, pred_Y)

0.57534246575342463

---
Based on the precision/recall score results, it seems like the Random Forest is more picky with what it classifies as a genre match and, thus, is less likely to produce false positives.  As a result, though, it does not pick up every possible match and leaves a lot of false negatives.  It's possible to balance between precision and recall by adjusting the decision threshold, but it seems like the SVM will perform better than any tuning of a Random Forest based on the F1 scores.  The SVM only has slightly worse precision and vastly superior recall.  Based on these results from training on a small training set, I decided to go with an SVM to train on a larger set.

In [3]:
rbf_svm = joblib.load('rbf_svm.pkl')
test_X = np.load('test_X.npy')
test_Y = np.load('test_Y.npy')

In [4]:
rbf_svm.score(test_X[:500,:], test_Y[:500])

0.93400000000000005

In [5]:
pred_Y = rbf_svm.predict(test_X[:500,:])

In [8]:
precision_score(test_Y[:500], pred_Y)

0.86868686868686873

In [9]:
recall_score(test_Y[:500], pred_Y)

0.81132075471698117

In [11]:
f1_score(test_Y[:500], pred_Y)

0.83902439024390252