In [1]:
# import packages
import warnings
import pandas as pd
import numpy as np
import sklearn
import pickle
from sklearn import pipeline
from sklearn import model_selection
from sklearn import svm
from sklearn import metrics
from sklearn import preprocessing
from sklearn import linear_model
from sklearn import neighbors
from sklearn import naive_bayes
from sklearn import ensemble
from sklearn import neural_network

In [21]:
# toggle warnings
# 'ignore' with no further arguments suppresses every warning category from
# every module for the rest of the session, including any warnings raised
# while models are being fitted. Switch to the commented line below to bring
# back Python's 'default' warning behaviour.
warnings.filterwarnings('ignore')
# warnings.filterwarnings('default')

In [12]:
# Load the two smaller feature tables. The paths are relative, so this cell
# expects the kernel's working directory to sit next to the ../data folder.
features = pd.read_csv("../data/features.csv")
nn_features = pd.read_csv("../data/nn_features.csv")

In [13]:
# can take a while
# Kept in its own cell so the slow read does not have to be repeated when the
# smaller tables are reloaded.
# NOTE(review): presumably one column per audio frame - confirm against the file.
raw_features = pd.read_csv("../data/raw_features.csv")

In [14]:
# Place the base table side by side (column-wise) with each extracted-feature table.
combined = pd.concat([features, nn_features], axis=1)
combined_raw = pd.concat([features, raw_features], axis=1)

# Rebuild a "<title> - <artist>" key per row; it is compared with the 'song'
# column below to confirm that the concatenated rows describe the same songs.
combined['song_check'] = combined['title'] + ' - ' + combined['artist']
combined_raw['song_check'] = combined_raw['title'] + ' - ' + combined_raw['artist']

basic_rows_align = combined['song'].equals(combined['song_check'])
print("Basic features match." if basic_rows_align else "Basic features DO NOT match.")
if basic_rows_align:
    # The helper columns are only needed for the check itself.
    combined = combined.drop(columns=['song', 'song_check'])

raw_rows_align = combined_raw['song'].equals(combined_raw['song_check'])
print("Raw features match." if raw_rows_align else "Raw features DO NOT match.")
if raw_rows_align:
    combined_raw = combined_raw.drop(columns=['song', 'song_check'])

Basic features match.
Raw features match.


In [19]:
def build_xy(features, feature_type):
    """Split a combined table into model inputs and mood labels.

    Args:
        features: DataFrame holding the feature columns plus the 'primary'
            and 'secondary' label columns.
        feature_type: "basic" selects the five named summary columns;
            "neural_net" and "raw" both select every column from positional
            index 9 onwards.

    Returns:
        (x, y) where x is the selected feature frame and y is a new frame
        with 'primary', 'secondary' and 'combined' (the two labels cast to
        str and concatenated). For an unrecognised feature_type a message is
        printed and (None, None) is returned.
    """
    if feature_type in ("neural_net", "raw"):
        # 9 is the index of zcr_mean, the first neural network feature, and
        # also the index of the first frame for raw features.
        x = features.iloc[:, 9:]
    elif feature_type == "basic":
        x = features[['tempo', 'chroma_number', 'zero_crossing_rate', 'energy_entropy', 'spectral_centroid']]
    else:
        print ("Invalid feature type.")
        return (None, None)

    labels = features[['primary', 'secondary']]
    # assign() returns a fresh frame, so the new column is never written into
    # a slice of the caller's table (which triggers SettingWithCopyWarning).
    y = labels.assign(
        combined = labels['primary'].astype(str) + labels['secondary'].astype(str)
    )
    return (x, y)

In [16]:
def evaluate_mood(y_true, y_predict):
    """Score predictions against combined true-mood strings.

    Each prediction is cast to str and compared with the true string at the
    same position: 1 if it equals the string's first character, 0.5 if it
    equals any other character of the string, otherwise 0.

    Args:
        y_true: Series of indexable strings, one per sample (the notebook
            passes the 'combined' label column).
        y_predict: iterable of predicted labels, in the same order.

    Returns:
        pandas Series with one score per prediction.
    """
    # Matching is per character, so this assumes single-character mood labels.
    truths = y_true.tolist()

    def score(position, predicted):
        label = str(predicted)
        moods = truths[position]
        if label == moods[0]:
            return 1
        return 0.5 if label in list(moods) else 0

    return pd.Series([score(position, predicted) for position, predicted in enumerate(y_predict)])

In [None]:
# code for using my custom mood scoring function
# predictions = model_selection.cross_val_predict(pipe, x, y['primary'], cv = 5)
# scores = evaluate_mood(y['combined'], predictions)
# print ("Accuracy of", round(np.mean(scores), 3), "on both primary and secondary moods")

In [35]:
# Support Vector Machine
# The grid search is itself passed to cross_val_score, so hyper-parameters are
# tuned inside each of the 5 outer folds and the outer folds give the accuracy.
x, y = build_xy(combined, "basic")

scaler = preprocessing.MinMaxScaler()
svm_model = svm.SVC()
pipe = pipeline.Pipeline(
    steps=[
        ('scaler', scaler),
        ('svc', svm_model),
    ]
)

# Keys use the "<step name>__<parameter>" form to reach the SVC inside the pipeline.
param_grid = {
    'svc__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'svc__decision_function_shape': ['ovr', 'ovo'],
    'svc__C': [0.25, 0.5, 0.75, 1, 3, 5, 10, 25, 50],
    'svc__gamma': [0.0001, 0.001, 0.01, 0.1, 1, 5, 10],
}

model = model_selection.GridSearchCV(pipe, param_grid, cv=5)
accuracies = model_selection.cross_val_score(model, x, y['primary'], cv=5)
print("Average accuracy:", accuracies.mean())

Average accuracy: 0.39


In [32]:
# final SVM model
# Same pipeline and grid as the evaluation cell, but the search is fitted once
# on all rows so that the winning parameter combination can be reported.
x, y = build_xy(combined, "basic")

scaler = preprocessing.MinMaxScaler()
svm_model = svm.SVC()
pipe = pipeline.Pipeline(
    steps=[
        ('scaler', scaler),
        ('svc', svm_model),
    ]
)

param_grid = {
    'svc__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'svc__decision_function_shape': ['ovr', 'ovo'],
    'svc__C': [0.25, 0.5, 0.75, 1, 3, 5, 10, 25, 50],
    'svc__gamma': [0.0001, 0.001, 0.01, 0.1, 1, 5, 10],
}

# fit() returns the fitted search object itself.
model = model_selection.GridSearchCV(pipe, param_grid, cv=5).fit(x, y['primary'])
print("***SVM Best Parameters***", model.best_params_, sep="\n")

***SVM Best Parameters***
{'svc__C': 0.75, 'svc__decision_function_shape': 'ovr', 'svc__gamma': 5, 'svc__kernel': 'rbf'}


In [44]:
# Logistic Regression
# Nested cross-validation: the grid search tunes the model inside each of the
# 5 outer folds, and the outer folds provide the reported accuracy.
x, y = build_xy(combined, "basic")
lr_model = linear_model.LogisticRegression()
scaler = preprocessing.MinMaxScaler()
pipe = pipeline.Pipeline(steps=[('scaler', scaler), ('log_reg', lr_model)])

# Not every solver supports every penalty, so a full cross product makes many
# candidates fail on every fit (silently, because warnings are ignored in this
# notebook). Only supported penalty/solver pairs are listed, one sub-grid per
# penalty. 'elasticnet' is left out: it additionally needs l1_ratio, which the
# original grid never provided, so it could never be fitted.
# NOTE(review): 'none' is kept exactly as originally written; newer
# scikit-learn releases expect None instead - confirm against the installed version.
c_values = [0.25, 0.5, 0.75, 1, 3, 5, 10, 25, 50]
param_grid = [
    {
        'log_reg__penalty': ['l1'],
        'log_reg__solver': ['liblinear', 'saga'],
        'log_reg__C': c_values,
    },
    {
        'log_reg__penalty': ['l2'],
        'log_reg__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
        'log_reg__C': c_values,
    },
    {
        'log_reg__penalty': ['none'],
        'log_reg__solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
        'log_reg__C': c_values,
    },
]
model = model_selection.GridSearchCV(pipe, param_grid, cv=5)
accuracies = model_selection.cross_val_score(model, x, y['primary'], cv=5)
print("Average accuracy:", np.mean(accuracies))

Average accuracy: 0.44000000000000006


In [45]:
# final LR model
# Fits the grid search once on all rows to report the best parameter combination.
x, y = build_xy(combined, "basic")
lr_model = linear_model.LogisticRegression()
scaler = preprocessing.MinMaxScaler()
pipe = pipeline.Pipeline(steps=[('scaler', scaler), ('log_reg', lr_model)])

# Not every solver supports every penalty, so a full cross product makes many
# candidates fail on every fit (silently, because warnings are ignored in this
# notebook). Only supported penalty/solver pairs are listed, one sub-grid per
# penalty. 'elasticnet' is left out: it additionally needs l1_ratio, which the
# original grid never provided, so it could never be fitted.
# NOTE(review): 'none' is kept exactly as originally written; newer
# scikit-learn releases expect None instead - confirm against the installed version.
c_values = [0.25, 0.5, 0.75, 1, 3, 5, 10, 25, 50]
param_grid = [
    {
        'log_reg__penalty': ['l1'],
        'log_reg__solver': ['liblinear', 'saga'],
        'log_reg__C': c_values,
    },
    {
        'log_reg__penalty': ['l2'],
        'log_reg__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
        'log_reg__C': c_values,
    },
    {
        'log_reg__penalty': ['none'],
        'log_reg__solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
        'log_reg__C': c_values,
    },
]
model = model_selection.GridSearchCV(pipe, param_grid, cv=5)
model.fit(x, y['primary'])
print ("***LR Best Parameters***")
print (model.best_params_)

***LR Best Parameters***
{'log_reg__C': 50, 'log_reg__penalty': 'l1', 'log_reg__solver': 'saga'}


In [46]:
# K Nearest Neighbor
# Nested cross-validation: the grid search tunes the model inside each of the
# 5 outer folds, and the outer folds provide the reported accuracy.
x, y = build_xy(combined, "basic")
knn_model = neighbors.KNeighborsClassifier()
scaler = preprocessing.MinMaxScaler()
pipe = pipeline.Pipeline(steps=[('scaler', scaler), ('knn', knn_model)])
param_grid = {
    'knn__n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15],
    'knn__weights': ['uniform', 'distance'],
    # 'wminkowski', 'seuclidean' and 'mahalanobis' were removed from this list:
    # each needs extra metric arguments (weights / variances / a covariance
    # matrix) that this grid never supplies, so those candidates could not be
    # fitted and only produced suppressed failures. The remaining metrics keep
    # their original order. With the default p=2, 'minkowski' is the same
    # distance as 'euclidean'.
    'knn__metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski'],
}
model = model_selection.GridSearchCV(pipe, param_grid, cv=5)
accuracies = model_selection.cross_val_score(model, x, y['primary'], cv=5)
print("Average accuracy:", np.mean(accuracies))

Average accuracy: 0.4


In [47]:
# final KNN model
# Fits the grid search once on all rows to report the best parameter combination.
x, y = build_xy(combined, "basic")
knn_model = neighbors.KNeighborsClassifier()
scaler = preprocessing.MinMaxScaler()
pipe = pipeline.Pipeline(steps=[('scaler', scaler), ('knn', knn_model)])
param_grid = {
    'knn__n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15],
    'knn__weights': ['uniform', 'distance'],
    # 'wminkowski', 'seuclidean' and 'mahalanobis' were removed from this list:
    # each needs extra metric arguments (weights / variances / a covariance
    # matrix) that this grid never supplies, so those candidates could not be
    # fitted and only produced suppressed failures. The remaining metrics keep
    # their original order. With the default p=2, 'minkowski' is the same
    # distance as 'euclidean'.
    'knn__metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski'],
}
model = model_selection.GridSearchCV(pipe, param_grid, cv=5)
model.fit(x, y['primary'])
print ("***KNN Best Parameters***")
print (model.best_params_)

***KNN Best Parameters***
{'knn__metric': 'euclidean', 'knn__n_neighbors': 9, 'knn__weights': 'distance'}


In [60]:
# Gaussian Naive Bayes
# No scaler or pipeline here: the estimator is searched directly, so the grid
# key is the bare parameter name. The search is nested inside 5 outer folds.
x, y = build_xy(combined, "basic")

nb_model = naive_bayes.GaussianNB()
param_grid = {'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6]}

model = model_selection.GridSearchCV(nb_model, param_grid, cv=5)
accuracies = model_selection.cross_val_score(model, x, y['primary'], cv=5)
print("Average accuracy:", accuracies.mean())

Average accuracy: 0.43


In [58]:
# final GNB model
# Fits the grid search once on all rows to report the best smoothing value.
x, y = build_xy(combined, "basic")

nb_model = naive_bayes.GaussianNB()
param_grid = {'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6]}

# fit() returns the fitted search object itself.
model = model_selection.GridSearchCV(nb_model, param_grid, cv=5).fit(x, y['primary'])
print("***GNB Best Parameters***", model.best_params_, sep="\n")

***GNB Best Parameters***
{'var_smoothing': 1e-07}


In [69]:
# Random Forest
# Nested cross-validation: the grid search tunes the forest inside each of the
# 5 outer folds, and the outer folds provide the reported accuracy.
x, y = build_xy(combined, "basic")
# A fixed random_state makes the forest, and therefore the printed accuracy,
# reproducible from one run to the next.
rf_model = ensemble.RandomForestClassifier(random_state=42)
param_grid = {
    'max_depth': [5, 10, 15, 20, 25, 50, 75, None],
    'min_samples_leaf': [1, 2, 5, 7, 10, 12, 15],
    'max_features': ["sqrt", "log2"]
}
model = model_selection.GridSearchCV(rf_model, param_grid, cv=5)
accuracies = model_selection.cross_val_score(model, x, y['primary'], cv=5)
print("Average accuracy:", np.mean(accuracies))

Average accuracy: 0.4


In [68]:
# final RF model
# Fits the grid search once on all rows to report the best parameter combination.
x, y = build_xy(combined, "basic")
# A fixed random_state makes the forest, and therefore the selected
# parameters, reproducible from one run to the next.
rf_model = ensemble.RandomForestClassifier(random_state=42)
param_grid = {
    'max_depth': [5, 10, 15, 20, 25, 50, 75, None],
    'min_samples_leaf': [1, 2, 5, 7, 10, 12, 15],
    'max_features': ["sqrt", "log2"]
}
model = model_selection.GridSearchCV(rf_model, param_grid, cv=5)
model.fit(x, y['primary'])
print ("***RF Best Parameters***")
print (model.best_params_)

***RF Best Parameters***
{'max_depth': 25, 'max_features': 'sqrt', 'min_samples_leaf': 7}


In [26]:
# Multi-Layer Perceptron with neural network features
# (build_xy's "neural_net" selection, i.e. every column from index 9 onwards,
# not the five "basic" columns).
# Nested cross-validation: the grid search tunes the network inside each of the
# 5 outer folds, and the outer folds provide the reported accuracy.
x, y = build_xy(combined, "neural_net")
# A fixed random_state makes weight initialisation, and therefore the printed
# accuracy, reproducible from one run to the next.
mlp_model = neural_network.MLPClassifier(random_state=42)
scaler = preprocessing.MinMaxScaler()
pipe = pipeline.Pipeline(steps=[('scaler', scaler), ('mlp', mlp_model)])
param_grid = {
    'mlp__hidden_layer_sizes': [(10,), (20,), (30,), (40,), (50,), (60,)],
    'mlp__activation': ['logistic', 'tanh', 'relu', 'identity'],
    'mlp__solver': ['lbfgs', 'sgd', 'adam'],
    'mlp__alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1],
}
model = model_selection.GridSearchCV(pipe, param_grid, cv=5)
accuracies = model_selection.cross_val_score(model, x, y['primary'], cv=5)
print("Average accuracy:", np.mean(accuracies))

Average accuracy: 0.5650000000000001


In [27]:
# final MLP model with neural network features
# (build_xy's "neural_net" selection, i.e. every column from index 9 onwards,
# not the five "basic" columns).
# Fits the grid search once on all rows to report the best parameter combination.
x, y = build_xy(combined, "neural_net")
# A fixed random_state makes weight initialisation, and therefore the selected
# parameters, reproducible from one run to the next.
mlp_model = neural_network.MLPClassifier(random_state=42)
scaler = preprocessing.MinMaxScaler()
pipe = pipeline.Pipeline(steps=[('scaler', scaler), ('mlp', mlp_model)])
param_grid = {
    'mlp__hidden_layer_sizes': [(10,), (20,), (30,), (40,), (50,), (60,)],
    'mlp__activation': ['logistic', 'tanh', 'relu', 'identity'],
    'mlp__solver': ['lbfgs', 'sgd', 'adam'],
    'mlp__alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1],
}
model = model_selection.GridSearchCV(pipe, param_grid, cv=5)
model.fit(x, y['primary'])
print ("***MLP Best Parameters***")
print (model.best_params_)

***MLP Best Parameters***
{'mlp__activation': 'logistic', 'mlp__alpha': 0.1, 'mlp__hidden_layer_sizes': (60,), 'mlp__solver': 'adam'}


In [22]:
# Multi-Layer Perceptron with raw features
# Nested cross-validation: the grid search tunes the network inside each of the
# 5 outer folds, and the outer folds provide the reported accuracy.
x, y = build_xy(combined_raw, "raw")
# A fixed random_state makes weight initialisation, and therefore the results,
# reproducible from one run to the next.
mlp_model = neural_network.MLPClassifier(random_state=42)
scaler = preprocessing.MinMaxScaler()
pipe = pipeline.Pipeline(steps=[('scaler', scaler), ('mlp', mlp_model)])
param_grid = {
    'mlp__hidden_layer_sizes': [(10,), (20,), (30,), (40,), (50,), (60,)],
    'mlp__activation': ['logistic', 'tanh', 'relu', 'identity'],
    'mlp__solver': ['lbfgs', 'sgd', 'adam'],
    'mlp__alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1],
}
# 360 candidates x 5 inner folds x 5 outer folds is a very long sequential run
# (this cell was previously interrupted by hand). n_jobs=-1 spreads the
# candidate fits over all CPU cores without changing the results.
model = model_selection.GridSearchCV(pipe, param_grid, cv=5, n_jobs=-1)
accuracies = model_selection.cross_val_score(model, x, y['primary'], cv=5)
print("Average accuracy:", np.mean(accuracies))

KeyboardInterrupt: 

In [None]:
# final MLP model with raw features
# Fits the grid search once on all rows to report the best parameter combination.
x, y = build_xy(combined_raw, "raw")
# A fixed random_state makes weight initialisation, and therefore the selected
# parameters, reproducible from one run to the next.
mlp_model = neural_network.MLPClassifier(random_state=42)
scaler = preprocessing.MinMaxScaler()
pipe = pipeline.Pipeline(steps=[('scaler', scaler), ('mlp', mlp_model)])
param_grid = {
    'mlp__hidden_layer_sizes': [(10,), (20,), (30,), (40,), (50,), (60,)],
    'mlp__activation': ['logistic', 'tanh', 'relu', 'identity'],
    'mlp__solver': ['lbfgs', 'sgd', 'adam'],
    'mlp__alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1],
}
# 360 candidates x 5 folds is a long sequential run; n_jobs=-1 spreads the
# candidate fits over all CPU cores without changing the results.
model = model_selection.GridSearchCV(pipe, param_grid, cv=5, n_jobs=-1)
model.fit(x, y['primary'])
print ("***MLP Best Parameters***")
print (model.best_params_)

In [None]:
# ensemble/stacking

In [None]:
# use pickle to store the final model
# NOTE: stacked_model is still a placeholder, so the file written here
# contains a pickled None until a real model is assigned.
stacked_model = None
filename = 'mood_model.sav'
# The with-block closes the file handle (flushing the data to disk) even if
# pickle.dump raises; the previous bare open() call never closed it.
with open(filename, 'wb') as model_file:
    pickle.dump(stacked_model, model_file)