## Machine Learning Engineering Career Track Capstone
    
### Step 6: Experiment With Various Models

The purpose of this step is for you to rigorously test how to build the best model for analyzing the patterns found in your dataset. 
Perform some of the following activities:
- Build an automated process to test many modeling techniques and ML algorithms with your data to see which one yields the best results
- Define the performance metric(s) best applied to your problem (accuracy, F1, RSME, LOC, etc.)
- Test various loss functions across models to see which one yields the best result
- Perform tuning of one or more model, across one or multiple hyperparameters
- Build a robust cross-validation process for your problem
- Ensemble multiple models together, and demonstrate the superior results
- Analyze the prediction results to confirm how some of your models ended up properly generalizing or overfitting the data
- Present your best model(s)

In [85]:
import os
import pickle
import sys
from random import sample
from time import time

from IPython.display import Audio, display
import librosa
from librosa.display import waveplot
import moviepy.editor as mp
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score, silhouette_score
from sklearn.preprocessing import scale, StandardScaler

# models
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifier

In [86]:
train_df = pd.read_pickle('data/features/train/labels.pkl')
dev_df = pd.read_pickle('data/features/dev/labels.pkl')
test_df = pd.read_pickle('data/features/test/labels.pkl')

datasets = (train_df, dev_df, test_df)

y_train = train_df['emotion_class']
y_dev = dev_df['emotion_class']
y_test = test_df['emotion_class']
print(y_train.shape, y_dev.shape, y_test.shape)

(9986,) (1108,) (2610,)


In [87]:
def load_feats(stage: str):
    outpath = 'data/features/{}/{}.pkl'
    feat_fnames = ['mel', 'mfcc', 'chroma']
    tmp = [pickle.load(open(outpath.format(stage, fn), 'rb')) for fn in feat_fnames]
    return np.hstack(tmp)

X_train, X_dev, X_test = load_feats('train'), load_feats('dev'), load_feats('test')
print(X_train.shape, X_dev.shape, X_test.shape)

(9986, 160) (1108, 160) (2610, 160)


## Experiment: All data, RandomForest

In [88]:
# run PCA to get top features, collect as X_train and get y_train

model = RandomForestClassifier(class_weight='balanced', random_state=0, n_jobs=-1)
model.fit(X_train, y_train)
y_pred = model.predict(X_dev)
print('\n', classification_report(y_dev, y_pred))


               precision    recall  f1-score   support

           0       0.00      0.00      0.00       153
           1       0.00      0.00      0.00        22
           2       0.00      0.00      0.00        40
           3       0.00      0.00      0.00       163
           4       0.42      1.00      0.60       469
           5       0.00      0.00      0.00       111
           6       0.00      0.00      0.00       150

    accuracy                           0.42      1108
   macro avg       0.06      0.14      0.09      1108
weighted avg       0.18      0.42      0.25      1108



In [89]:
classifiers = [
#     MultinomialNB(),  # ValueError: Negative values in data passed to MultinomialNB (input X)
    GaussianNB(),
    DecisionTreeClassifier(random_state=0, class_weight='balanced'),
    RandomForestClassifier(n_jobs=-1, random_state=0, class_weight='balanced'),
    KNeighborsClassifier(n_jobs=-1),
    LinearSVC(multi_class='ovr', class_weight='balanced', random_state=0),
    # TODO increase max_iter
    LogisticRegression(multi_class='multinomial', class_weight='balanced', random_state=0, n_jobs=-1),  
    LogisticRegression(multi_class='ovr', class_weight='balanced', random_state=0, n_jobs=-1),   
    MLPClassifier(random_state=0),
    RidgeClassifier(class_weight='balanced', random_state=0)
]


In [57]:
import time

predictions = dict()
for model in classifiers:
    start = time.time()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_dev)
    elapsed = time.time() - start
    name = model.__class__.__name__
    predictions[name] = y_pred
    print(name, ' in ', round(elapsed, 2),)

GaussianNB  in  0.03
DecisionTreeClassifier  in  2.19
RandomForestClassifier  in  2.19
KNeighborsClassifier  in  0.32
LinearSVC  in  36.18
LogisticRegression  in  1.99
LogisticRegression  in  4.1
MLPClassifier  in  2.51
RidgeClassifier  in  0.02
done


In [74]:
tmp = [(name, round(f1_score(y_dev, y_pred, average='weighted'), 3)) for name, y_pred in predictions.items()]
tmp.sort(key=lambda x: x[1], reverse=True)
pd.DataFrame(tmp, columns=["Model", "F1 weighted"])

Unnamed: 0,Model,F1 weighted
0,KNeighborsClassifier,0.273
1,DecisionTreeClassifier,0.254
2,MLPClassifier,0.254
3,RandomForestClassifier,0.252
4,LogisticRegression,0.166
5,RidgeClassifier,0.115
6,GaussianNB,0.072
7,LinearSVC,0.04


In [75]:
for name, y_pred in predictions.items():
    report = classification_report(y_dev, y_pred)
    print(name)
    print(report)


GaussianNB
              precision    recall  f1-score   support

           0       0.09      0.04      0.05       153
           1       0.02      0.05      0.03        22
           2       0.04      0.82      0.08        40
           3       0.10      0.02      0.04       163
           4       0.45      0.05      0.09       469
           5       0.09      0.03      0.04       111
           6       0.12      0.07      0.09       150

    accuracy                           0.07      1108
   macro avg       0.13      0.15      0.06      1108
weighted avg       0.25      0.07      0.07      1108

DecisionTreeClassifier
              precision    recall  f1-score   support

           0       0.10      0.08      0.09       153
           1       0.02      0.05      0.03        22
           2       0.09      0.07      0.08        40
           3       0.17      0.23      0.20       163
           4       0.44      0.45      0.45       469
           5       0.05      0.03      0.03 

## Custom Transformer

In [91]:
from sklearn.base import BaseEstimator, TransformerMixin


class ExtractTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y = None):
        return self
    
    def transform(self, X, y = None):
        return X


## Pipeline PoC


In [100]:
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from tempfile import mkdtemp
from shutil import rmtree


extract = ExtractTransformer()
pca = PCA()
logistic = LogisticRegression(max_iter=10000, tol=0.1)
cachedir = mkdtemp()

pipe = Pipeline(steps=[('extract', extract), ('pca', pca), ('logistic', logistic)], memory=cachedir)

param_grid = {
    'pca__n_components': [5, 10],
    'logistic__C': np.logspace(-4, 4, 4),
}
search = GridSearchCV(pipe, param_grid)  #  n_jobs=-1  error?

search.fit(X_train, y_train)
rmtree(cachedir)  # TODO convert into context manager

print('Best parameter (CV score=%0.3f):' % search.best_score_)
print(search.best_params_)


model = search.best_estimator_

[Pipeline] ........... (step 1 of 3) Processing extract, total=   0.0s
[Pipeline] ............... (step 2 of 3) Processing pca, total=   0.0s
[Pipeline] .......... (step 3 of 3) Processing logistic, total=   0.6s
[Pipeline] ........... (step 1 of 3) Processing extract, total=   0.0s
[Pipeline] ............... (step 2 of 3) Processing pca, total=   0.0s
[Pipeline] .......... (step 3 of 3) Processing logistic, total=   0.8s
[Pipeline] ........... (step 1 of 3) Processing extract, total=   0.0s
[Pipeline] ............... (step 2 of 3) Processing pca, total=   0.0s
[Pipeline] .......... (step 3 of 3) Processing logistic, total=   0.9s
[Pipeline] ........... (step 1 of 3) Processing extract, total=   0.0s
[Pipeline] ............... (step 2 of 3) Processing pca, total=   0.0s
[Pipeline] .......... (step 3 of 3) Processing logistic, total=   0.9s
[Pipeline] ........... (step 1 of 3) Processing extract, total=   0.0s
[Pipeline] ............... (step 2 of 3) Processing pca, total=   0.0s
[Pipel

In [93]:
y_dev_pred = model.predict(X_dev)
f1_score(y_dev, y_dev_pred, average='weighted')

0.2517701434657498

### Optimizing KNeighborsClassifier

### Optimizing MLPClassifier

### Optimizing RandomForestClassifier