In [167]:
import pandas as pd
import numpy as np
import scipy
import sys, os, pickle
import matplotlib.pyplot as plt


from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.preprocessing import LabelEncoder


from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
# from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential

#classifier models
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.svm import SVC


#metrics
from sklearn.metrics import accuracy_score, log_loss, roc_curve, auc

In [168]:
df = pd.read_csv(r'./data/features_3_sec.csv')
df = df.drop(labels='filename', axis=1)

In [172]:
y = LabelEncoder().fit_transform(df.iloc[:,-1])
X = StandardScaler().fit_transform(np.array(df.iloc[:,:-1], dtype=float))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

## Deep Learning model

In [40]:
def trainModel(model, epochs, optimizer):
    batch_size = 128
    #callback = myCallback()
    model.compile(optimizer=optimizer,
                  loss="sparse_categorical_crossentropy",
                  metrics=['accuracy'])
    return model.fit(X_train, y_train, validation_data=(X_test, y_test),
                     epochs=epochs, batch_size=batch_size)

def plotValidate(history):
    print("Validation Accuracy", max(history.history['val_accuracy']))
    pd.DataFrame(history.history).plot(figsize=(12,6))
    plt.show()

In [41]:
units, rate = 64, 0.2
model = keras.models.Sequential([
    keras.layers.Dense(units*2**3, activation='relu', input_shape=(X_train.shape[1],)),
    keras.layers.Dropout(rate),
    
    keras.layers.Dense(units*2**2, activation='relu'),
    keras.layers.Dropout(rate),
    
    keras.layers.Dense(units*2**1, activation='relu'),
    keras.layers.Dropout(rate),
    
    keras.layers.Dense(units, activation='relu'),
    keras.layers.Dropout(rate),
    
    keras.layers.Dense(10, activation='softmax'),
])

In [42]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_5 (Dense)              (None, 512)               30208     
_________________________________________________________________
dropout_4 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 256)               131328    
_________________________________________________________________
dropout_5 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 128)               32896     
_________________________________________________________________
dropout_6 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 64)               

In [43]:
model_history = trainModel(model=model, epochs=600, optimizer='adam')

Train on 6693 samples, validate on 3297 samples
Epoch 1/600
Epoch 2/600
Epoch 3/600
Epoch 4/600
Epoch 5/600
Epoch 6/600
Epoch 7/600
Epoch 8/600
Epoch 9/600
Epoch 10/600
Epoch 11/600
Epoch 12/600
Epoch 13/600
Epoch 14/600
Epoch 15/600
Epoch 16/600
Epoch 17/600
Epoch 18/600
Epoch 19/600
Epoch 20/600
Epoch 21/600
Epoch 22/600
Epoch 23/600
Epoch 24/600
Epoch 25/600
Epoch 26/600
Epoch 27/600
Epoch 28/600
Epoch 29/600
Epoch 30/600
Epoch 31/600
Epoch 32/600
Epoch 33/600
Epoch 34/600
Epoch 35/600
Epoch 36/600
Epoch 37/600
Epoch 38/600
Epoch 39/600
Epoch 40/600
Epoch 41/600
Epoch 42/600
Epoch 43/600
Epoch 44/600
Epoch 45/600
Epoch 46/600
Epoch 47/600
Epoch 48/600
Epoch 49/600
Epoch 50/600
Epoch 51/600
Epoch 52/600
Epoch 53/600
Epoch 54/600
Epoch 55/600
Epoch 56/600
Epoch 57/600
Epoch 58/600
Epoch 59/600
Epoch 60/600
Epoch 61/600
Epoch 62/600
Epoch 63/600
Epoch 64/600
Epoch 65/600
Epoch 66/600
Epoch 67/600
Epoch 68/600
Epoch 69/600
Epoch 70/600
Epoch 71/600
Epoch 72/600
Epoch 73/600
Epoch 74/600

In [44]:
test_loss, test_acc = model.evaluate(X_test, y_test, batch_size=128)



In [45]:
print('The test loss is: ', test_loss)
print('The Best test Accuracy is: ', test_acc*100)

The test loss is:  0.4977346794656598
The Best test Accuracy is:  92.44768023490906


In [46]:
model

<tensorflow.python.keras.engine.sequential.Sequential at 0x1677081be88>

# Other ML models

In [47]:
categorylist = list(df.select_dtypes(['object']))
tempdf = pd.get_dummies(df, columns=categorylist)
label_columns = [x for x in tempdf.columns if x.startswith('label')]
X = tempdf.drop(label_columns, axis=1)
y = tempdf[label_columns].copy()

In [122]:
encoder = LabelEncoder()
categorylist = list(df.select_dtypes(['object']))
y = df[categorylist]
y_encoded = encoder.fit_transform(y)

  return f(*args, **kwargs)


In [47]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

In [148]:
def from_multilabel_to_onehot(array, column='label', return_columns=False):
    df = pd.DataFrame(y_test, columns=[column])
    df = pd.get_dummies(df, columns=[column])
    
    if return_columns:
        return [x.replace('label_','') for x in df.columns]
    return df.to_numpy()

In [48]:
num_attribs = list(X.select_dtypes(['int64', 'float64']))
pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

In [49]:
X_prepared = pipeline.fit_transform(X)
# y_prepared = y.to_numpy()[:,1]
y_prepared = y.to_numpy()

In [50]:
sss = StratifiedShuffleSplit(n_splits=10, test_size= 0.33, random_state=0)

classifiers = [
    KNeighborsClassifier(),
    ExtraTreeClassifier(),
    DecisionTreeClassifier(),
]

In [51]:
res = []
labels = [x.replace('label_','') for x in y.columns]
for train_idx, test_idx in sss.split(X_prepared, y_prepared):
    X_train, X_test = X_prepared[train_idx], X_prepared[test_idx]
    y_train, y_test = y_prepared[train_idx], y_prepared[test_idx]
    for clf in classifiers:
        name = clf.__class__.__name__
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        y_pred_probas = clf.predict_proba(X_test)
        acc = accuracy_score(y_test, y_pred)
        loss = log_loss(y_test, y_pred)
        
        # Compute ROC curve and ROC area for each class
        fpr = dict()
        tpr = dict()
        roc_auc = dict()
        for i, label in enumerate(labels):
            fpr[label], tpr[label], _ = roc_curve(y_test[:,i], y_pred_probas[i][:,1])
            roc_auc[label] = auc(fpr[label], tpr[label])
            res.append([name, label, acc, loss, roc_auc[label]])

In [52]:
res_svc = []
clf = SVC(probability=True)
for fold_idx, (train_idx, test_idx) in enumerate(sss.split(X_prepared, y_encoded)):
    print(fold_idx)
    X_train, X_test = X_prepared[train_idx], X_prepared[test_idx]
    y_train, y_test = y_encoded[train_idx], y_encoded[test_idx]
    name = clf.__class__.__name__
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    y_pred_probas = clf.predict_proba(X_test)
    acc = accuracy_score(y_test, y_pred)
    loss = log_loss(y_test, y_pred_probas)
    
    labels = from_multilabel_to_onehot(y_test, return_columns=True)
    y_test = from_multilabel_to_onehot(y_test)
    # y_pred_probas = from_multilabel_to_onehot(y_pred_probas)
    fpr, tpr, roc_auc = dict(), dict(), dict()
    for i, label in enumerate(labels):
        fpr[label], tpr[label], _ = roc_curve(y_test[:,i], y_pred_probas[:,i])
        roc_auc[label] = auc(fpr[label], tpr[label])
        res_svc.append([name, label, acc, loss, roc_auc[label]])

0
1
2
3
4
5
6
7
8
9


In [53]:
log_svc = pd.DataFrame(res_svc, columns=['Classifier','Label', 'Accuracy','Log Loss','AUC'])
log = pd.DataFrame(res, columns=['Classifier','Label', 'Accuracy','Log Loss','AUC'])
a = log_svc.groupby(['Classifier'], as_index=False).mean()
b = log.groupby(['Classifier'], as_index=False).mean()
pd.concat([b,a],axis=0).sort_values(by='Accuracy', ascending=False)

Unnamed: 0,Classifier,Accuracy,Log Loss,AUC
0,SVC,0.848286,0.457025,0.986206
2,KNeighborsClassifier,0.83391,2.89817,0.981071
0,DecisionTreeClassifier,0.627237,12.851316,0.793185
1,ExtraTreeClassifier,0.534577,16.056568,0.74162
