# Imports

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from scipy import stats

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import PCA, KernelPCA

from sklearn.svm import SVC

In [12]:
# Original export without the predicted mean_A / mean_V columns (kept for reference):
# df = pd.read_csv("dataset/phase_3_TRAIN_7d499bff69ca69b6_6372c3e_MLPC2021_generic.csv")

# Load the variant of the dataset that also contains our predicted meanA and meanV.
df = pd.read_csv(
    filepath_or_buffer="dataset/dataset_with_predicted_mean_A_mean_V.csv",
)


# Split dataset into inputs and targets

In [13]:
# Input features: every column except the target value, the student
# annotations and the string ID.
X = df.drop(
    labels=['quadrant', 'mean_A', 'mean_V', 'id', 'score_mode', 'score_key_strength', 'predicted_quadrant'],
    axis='columns',
)

# Prediction target: the quadrant of each row.
y = df.loc[:, 'quadrant'].values

In [14]:
# Preview the first five rows of the feature matrix.
X.head(n=5)

Unnamed: 0,essentia_dissonance_mean,essentia_dissonance_stdev,essentia_dynamic_complexity,essentia_loudness,essentia_onset_rate,essentia_pitch_salience_mean,essentia_pitch_salience_stdev,essentia_spectral_centroid_mean,essentia_spectral_centroid_stdev,essentia_spectral_complexity_mean,...,librosa_spectral_flatness_stdev,midlevel_features_melody,midlevel_features_articulation,midlevel_features_rhythm_complexity,midlevel_features_rhythm_stability,midlevel_features_dissonance,midlevel_features_tonal_stability,midlevel_features_minorness,predicted_mean_A,predicted_mean_V
0,0.206445,0.131383,13.998791,58.329521,3.1,0.539736,0.193987,3367.790527,3684.124268,2.986095,...,0.248388,0.065397,-0.02938,-0.303987,-0.053306,-0.325228,0.347745,-0.119278,45.216172,1.749838
1,0.145753,0.057971,3.533264,75.166183,4.2,0.476832,0.123564,1395.528809,401.98761,4.181923,...,0.000484,0.35304,-0.073957,-0.381182,0.12075,-0.479266,0.451572,-0.064252,42.078254,0.360194
2,0.149111,0.059011,3.337368,82.753929,4.3,0.457741,0.132359,1318.759644,289.306152,4.303592,...,0.000314,0.201885,0.016944,-0.337422,0.103853,-0.388214,0.430684,-0.226197,48.451175,1.963364
3,0.163914,0.057474,3.078172,79.024742,4.1,0.504427,0.121134,1258.258423,262.56131,5.271147,...,0.000257,0.181998,-0.019165,-0.314282,0.118872,-0.335902,0.314371,-0.006619,45.000775,1.033474
4,0.157382,0.054116,2.682208,59.633064,3.9,0.503377,0.121658,1244.375122,260.756195,4.122828,...,0.000241,0.256362,-0.151648,-0.302913,-0.00294,-0.395495,0.368037,-0.162711,31.551966,0.747443


In [15]:
# Hold out 20% of the rows as test data; the split is stratified on y
# and uses a fixed seed so it can be repeated.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)

# Simple SVM (no normalization)

In [16]:
# Support Vector Machine classifier with its default hyper-parameters.
clf = SVC()

# Train on the training split ...
clf.fit(X=X_train, y=y_train)

# ... and show the score on the test split as the cell output.
clf.score(X=X_test, y=y_test)

0.5287356321839081

# Normalizing data drastically improves performance for SVM

In [17]:
# Split the raw features FIRST. The arguments are identical to the earlier
# split (same seed, same stratification), so the same rows land in each split.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)

# Learn each feature's min/max from the training rows only. Fitting the scaler
# on the full dataset would leak information about the test rows into the
# preprocessing and make the test score overly optimistic.
scaler = MinMaxScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Scaled copy of the full feature matrix, kept under its previous name.
# NOTE: it is now scaled with the training-split statistics.
X_normalized = scaler.transform(X)

In [18]:
# Re-fit the same classifier on the normalized training split and show its
# score on the normalized test split (fit() returns the estimator itself).
clf.fit(X_train, y_train).score(X_test, y_test)

0.7835249042145593

In [22]:
# Names of the classifier's parameters, as returned by get_params().
clf.get_params(deep=True).keys()

dict_keys(['C', 'break_ties', 'cache_size', 'class_weight', 'coef0', 'decision_function_shape', 'degree', 'gamma', 'kernel', 'max_iter', 'probability', 'random_state', 'shrinking', 'tol', 'verbose'])

# SVM with Cross-Validation on Normalized Data with (optional) Feature Selection

In [None]:
# Pipeline: min-max scaling -> univariate feature selection -> SVM.
pipeline = Pipeline(steps=[
    ("normalizer", MinMaxScaler()),
    ("selector", SelectKBest()),
    ('svm', SVC()),
])

# Candidate values, addressed as <step name>__<parameter name>.
# 4 * 5 * 5 * 4 = 400 combinations in total.
param_grid = dict(
    selector__k=[4, 8, 15, "all"],
    svm__C=[0.1, 1, 10, 100, 1000],
    svm__gamma=[10, 1, 0.1, 0.01, 0.001],
    svm__kernel=['rbf', 'poly', 'linear', 'sigmoid'],
)

# Exhaustive search with 5-fold cross-validation; train scores are recorded
# as well because the evaluation plots need them.
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=4,
    return_train_score=True,
)
#grid = RandomizedSearchCV(pipeline, param_grid, cv=5, return_train_score=True, n_jobs=-1, verbose=4, n_iter=1000)

grid.fit(X, y)

# Mean cross-validated test score of every candidate.
scores_all_features = grid.cv_results_["mean_test_score"]

print(
    "\n"+"#"*50,
    "\nbest estimator: ", grid.best_estimator_,
    "\n"+"#"*50,
    "\nbest params: ", grid.best_params_,
    "\n"+"#"*50,
    "\nbest score: ", grid.best_score_,
    "\n",
)


Fitting 5 folds for each of 400 candidates, totalling 2000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 148 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 368 tasks      | elapsed:   13.7s
[Parallel(n_jobs=-1)]: Done 542 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 765 tasks      | elapsed:  6.8min


# Standardization (StandardScaler) => PCA => SVM

In [None]:
# Pipeline: standard scaling -> PCA -> SVM.
pipeline = Pipeline(steps=[
    ("normalizer", StandardScaler()),
    ("pca", PCA()),
    ('svm', SVC()),
])

# Candidate values, addressed as <step name>__<parameter name>.
param_grid = dict(
    pca__n_components=[.5, .75, .9, None],
    svm__C=[0.1, 1, 10, 100, 1000],
    svm__gamma=[10, 1, 0.1, 0.01, 0.001],
    svm__kernel=['rbf', 'poly', 'linear', 'sigmoid'],
)

# Exhaustive search with 5-fold cross-validation; train scores are recorded
# as well because the evaluation plots need them.
grid_pca = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=4,
    return_train_score=True,
)
#grid = RandomizedSearchCV(pipeline, param_grid, cv=5, return_train_score=True, n_jobs=-1, verbose=4, n_iter=1000)

grid_pca.fit(X, y)

# Mean cross-validated test score of every candidate.
scores_all_features = grid_pca.cv_results_["mean_test_score"]

print(
    "\n"+"#"*50,
    "\nbest estimator: ", grid_pca.best_estimator_,
    "\n"+"#"*50,
    "\nbest params: ", grid_pca.best_params_,
    "\n"+"#"*50,
    "\nbest score: ", grid_pca.best_score_,
    "\n",
)


# Evaluation

In [32]:
def plot_search_results(grid, title="Score per parameter", log_list=None):
    """
    Plot mean train/test cross-validation scores against each hyper-parameter.

    One subplot is drawn per hyper-parameter. In each subplot the other
    hyper-parameters are fixed to their best values (``grid.best_params_``)
    and the mean score of every candidate value is plotted, with the standard
    deviation of the score as error bars.

    Params:
        grid: A trained GridSearchCV object. It must have been fitted with
            ``return_train_score=True``, because the train scores are read
            from ``cv_results_``.
        title: Title placed above the figure.
        log_list: Names of the hyper-parameters whose x-axis should use a log
            scale. ``None`` (the default) means no log-scaled axes. Only
            applied to hyper-parameters whose candidate values are all numeric.
    """
    # None instead of a mutable [] default argument.
    log_list = () if log_list is None else log_list

    ## Results from grid search
    results = grid.cv_results_
    means_test = np.asarray(results['mean_test_score'])
    stds_test = np.asarray(results['std_test_score'])
    means_train = np.asarray(results['mean_train_score'])
    stds_train = np.asarray(results['std_train_score'])

    ## One boolean mask per hyper-parameter: True where a candidate uses that
    ## hyper-parameter's best value. The comparison is done value by value so
    ## mixed-type candidates (e.g. 4, 8, "all", None) compare reliably.
    param_names = list(grid.best_params_.keys())
    masks = []
    for name in param_names:
        best_value = grid.best_params_[name]
        candidates = np.ma.getdata(results['param_' + name])
        masks.append(np.array([bool(value == best_value) for value in candidates], dtype=bool))

    ## Plotting results
    # squeeze=False always yields a 2-D array of axes, so a search over a
    # single hyper-parameter can be indexed like any other.
    fig, axes = plt.subplots(1, len(param_names), sharex='none', sharey='all',
                             figsize=(20, 5), squeeze=False)
    axes = axes[0]
    fig.suptitle(title, fontsize=26)
    fig.text(0.04, 0.5, 'MEAN SCORE', va='center', rotation='vertical', fontsize=18)

    for i, name in enumerate(param_names):
        # Candidates where every OTHER hyper-parameter has its best value.
        other_masks = masks[:i] + masks[i + 1:]
        if other_masks:
            keep = np.logical_and.reduce(other_masks)
        else:
            # Only one hyper-parameter was searched: every candidate qualifies.
            keep = np.ones(len(means_test), dtype=bool)
        best_index = np.flatnonzero(keep)

        # Take the x values from the results themselves so they are aligned
        # with the scores by construction.
        values = list(np.ma.getdata(results['param_' + name])[best_index])
        is_numeric = all(
            isinstance(v, (int, float, np.integer, np.floating))
            and not isinstance(v, (bool, np.bool_))
            for v in values
        )
        # Non-numeric candidates (strings, None, ...) have no position on a
        # numeric axis; plot them at 0..n-1 and label the ticks instead.
        x = np.array(values, dtype=float) if is_numeric else np.arange(len(values))

        axes[i].errorbar(x, means_test[best_index], stds_test[best_index],
                         linestyle='--', marker='o', label='test')
        axes[i].errorbar(x, means_train[best_index], stds_train[best_index],
                         linestyle='-', marker='^', label='train')
        axes[i].set_xlabel(name.upper(), fontsize=18)

        if not is_numeric:
            axes[i].set_xticks(x)
            axes[i].set_xticklabels([str(v) for v in values])
        elif name in log_list:
            axes[i].set_xscale('log')

    plt.legend()
    plt.show()



In [None]:
# Score curves for the first grid search (`grid`); the C axis is log-scaled.
plot_search_results(
    grid,
    title="Effects of Parameter Choice on Mean Test Score for SVM",
    log_list=["svm__C"],
)

In [None]:
# Score curves for the PCA grid search (`grid_pca`); the C axis is log-scaled.
plot_search_results(
    grid_pca,
    title="Effects of Parameter Choice on Mean Test Score for SVM",
    log_list=["svm__C"],
)