In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split, RandomizedSearchCV, ParameterGrid
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report
from sklearn.svm import SVC

import pickle
import numpy as np

import sys
sys.path.append('..')

from process_dataset import speech_features

## Methods

In [10]:
def get_train_test():
    """Load the pickled speech features and split them into train and test parts.

    Reads ``../data/speech_features.pkl``; element 0 of the unpickled object is
    used as the feature rows and element 1 as the labels.

    Returns:
        tuple: ``(x_train, x_test, y_train, y_test)`` from ``train_test_split``
        with ``test_size=0.2`` and ``random_state=42``.
    """
    with open('../data/speech_features.pkl', 'rb') as pickle_file:
        dataset = pickle.load(pickle_file)

    features, labels = np.array(dataset[0]), np.array(dataset[1])

    # The fixed seed makes every call return the same partition.
    split = train_test_split(features, labels, test_size=0.2, random_state=42)
    return tuple(split)

def get_x_y():
    """Return the min-max scaled training features together with their labels.

    The scaler is fitted on the training part of the split only; the test part
    is discarded here.

    Returns:
        tuple: ``(x_train_scaled, y_train)``.
    """
    train_features, _, train_labels, _ = get_train_test()
    return MinMaxScaler().fit_transform(train_features), train_labels

# Module-level scaled training data shared by the hyper-parameter searches below.
# NOTE(review): get_x_y() fits its scaler on the whole training split, so the
# cross-validation folds used by those searches share its min/max statistics.
x, y = get_x_y()

def check_accuracy(model):
    """Fit ``model`` on the scaled training split and print its test-set report.

    Args:
        model: estimator exposing ``fit(X, y)`` and ``predict(X)``; it is fitted
            in place.

    Prints:
        The ``classification_report`` for the test split, with 4 digits.
    """
    train_x, test_x, train_y, test_y = get_train_test()

    # Scaling statistics come from the training split only and are reused for test.
    scaler = MinMaxScaler().fit(train_x)

    model.fit(scaler.transform(train_x), train_y)
    predictions = model.predict(scaler.transform(test_x))

    print(classification_report(test_y, predictions, digits=4))

## Logistic Regression

In [3]:
# Tuned configuration: C=0.6 with balanced class weights matches the best
# estimator reported by the grid search later in this notebook.
# (A preceding L1-penalised instance was built and immediately overwritten; that
# dead assignment has been removed.)
lr = LogisticRegression(C=0.6, class_weight='balanced', random_state=42, solver='liblinear')

check_accuracy(lr)

              precision    recall  f1-score   support

         ang       0.64      0.71      0.67       208
         hap       0.65      0.57      0.61       317
         neu       0.71      0.59      0.64       369
         sad       0.58      0.80      0.67       213

    accuracy                           0.65      1107
   macro avg       0.65      0.67      0.65      1107
weighted avg       0.66      0.65      0.64      1107



#### Random Search

In [38]:
# Candidate values sampled by the randomized search over LogisticRegression.
# NOTE(review): solver, penalty and multi_class are sampled independently here, and
# LogisticRegression does not accept every pairing (e.g. 'liblinear' has no
# 'elasticnet' penalty, and 'elasticnet' also needs an l1_ratio that is not part of
# this space) — confirm how many of the n_iter draws end up as failed fits.
params = {
    'solver': ['liblinear', 'saga', 'sag', 'newton-cg'],
    'penalty' : ['l1', 'l2', 'elasticnet', 'none'],
    'C' : [0.001, 0.01, 0.1, 0.6, 1, 10, 30, 50],
    'fit_intercept': [True, False],
    'class_weight': ['balanced', None],
    'multi_class': ['auto', 'ovr', 'multinomial']
}

In [None]:
# Randomized search over `params`: 50 sampled candidates, 5-fold CV, all cores.
lr_g = RandomizedSearchCV(
    estimator=LogisticRegression(random_state=42),
    param_distributions=params,
    n_iter=50,
    cv=5,
    n_jobs=-1,
    verbose=5,
    random_state=42,
)

lr_g.fit(x, y)

In [44]:
# Best parameters, best CV score and best estimator, one per line.
print(lr_g.best_params_, lr_g.best_score_, lr_g.best_estimator_, sep='\n')

{'solver': 'liblinear', 'penalty': 'l1', 'multi_class': 'auto', 'fit_intercept': False, 'class_weight': 'balanced', 'C': 1}
0.633877135636938
LogisticRegression(C=1, class_weight='balanced', fit_intercept=False,
                   penalty='l1', random_state=42, solver='liblinear')


#### Grid Search

In [29]:
# Exhaustive grid over C for an L2-penalised liblinear model.
params = {
    'penalty': ['l2'],
    'solver': ['liblinear'],
    'C': [0.75, 0.5, 0.85, 0.7, 0.6],
}
lr_g = GridSearchCV(
    estimator=LogisticRegression(
        class_weight='balanced',
        fit_intercept=True,
        multi_class='auto',
        random_state=42,
    ),
    param_grid=params,
    cv=5,
    n_jobs=-1,
    verbose=5,
    return_train_score=False,
)

# Number of candidate settings the grid expands to.
pg = ParameterGrid(params)
print(len(pg), 'combinations per fold')


5 combinations per fold


In [None]:
# Run the logistic-regression grid search on the module-level scaled training data.
lr_g.fit(x, y)


In [31]:
# Best CV score, best parameters and best estimator, one per line.
print(lr_g.best_score_, lr_g.best_params_, lr_g.best_estimator_, sep='\n')

0.6355033152501507
{'C': 0.6, 'penalty': 'l2', 'solver': 'liblinear'}
LogisticRegression(C=0.6, class_weight='balanced', random_state=42,
                   solver='liblinear')


## SVM

In [4]:
# Untuned baseline for comparison: SVC(random_state=42)
# The configuration below is the best estimator reported by the SVM searches in this notebook.
# NOTE(review): check_accuracy() only calls fit() and predict(); degree=5 and
# probability=True look unnecessary for a linear kernel — confirm before removing.
svm = SVC(C=0.5, decision_function_shape='ovo', degree=5, kernel='linear', probability=True, random_state=42)
check_accuracy(svm)

              precision    recall  f1-score   support

         ang       0.64      0.66      0.65       208
         hap       0.59      0.57      0.58       317
         neu       0.70      0.62      0.65       369
         sad       0.62      0.75      0.68       213

    accuracy                           0.64      1107
   macro avg       0.64      0.65      0.64      1107
weighted avg       0.64      0.64      0.64      1107



#### Random Search

In [56]:
# Candidate values sampled by the randomized search over SVC.
# NOTE(review): only the linear kernel is searched, and per the scikit-learn docs
# `degree` is used by the 'poly' kernel only — presumably the four degree values
# produce duplicate candidates; confirm and drop if so.
params = {
    'C': [0.01, 0.1, 0.5, 1, 5, 10, 20],
    'kernel' : ['linear'],
    'degree': [1, 3, 5, 7],
    'shrinking': [True, False],
    'probability': [True],
    'class_weight': ['balanced', None],
    'decision_function_shape': ['ovo', 'ovr']
}

In [None]:
# Randomized search over `params`: 25 sampled candidates, 3-fold CV, all cores.
svm_g = RandomizedSearchCV(
    estimator=SVC(random_state=42),
    param_distributions=params,
    n_iter=25,
    cv=3,
    n_jobs=-1,
    verbose=5,
    random_state=42,
)

svm_g.fit(x, y)

In [58]:
# Best parameters, best CV score and best estimator, one per line.
print(svm_g.best_params_, svm_g.best_score_, svm_g.best_estimator_, sep='\n')

{'shrinking': True, 'probability': True, 'kernel': 'linear', 'degree': 5, 'decision_function_shape': 'ovo', 'class_weight': None, 'C': 0.5}
0.6228514490152298
SVC(C=0.5, decision_function_shape='ovo', degree=5, kernel='linear',
    probability=True, random_state=42)


#### Grid Search

In [65]:
# Exhaustive grid over C, degree, class weighting and decision-function shape
# for a linear-kernel SVC.
params = {
    'C': [0.5, 0.75, 0.85],
    'kernel': ['linear'],
    'degree': [5, 6, 7],
    'probability': [True],
    'class_weight': ['balanced', None],
    'decision_function_shape': ['ovo', 'ovr'],
}
svm_g = GridSearchCV(
    estimator=SVC(random_state=42),
    param_grid=params,
    cv=3,
    n_jobs=-1,
    verbose=5,
    return_train_score=False,
)

# Number of candidate settings the grid expands to.
pg = ParameterGrid(params)
print(len(pg), 'combinations per fold')

36 combinations per fold


In [None]:
# Run the SVM grid search on the module-level scaled training data.
svm_g.fit(x, y)

In [67]:
# Best parameters, best CV score and best estimator, one per line.
print(svm_g.best_params_, svm_g.best_score_, svm_g.best_estimator_, sep='\n')

{'shrinking': True, 'probability': True, 'kernel': 'linear', 'degree': 5, 'decision_function_shape': 'ovo', 'class_weight': None, 'C': 0.5}
0.6228514490152298
SVC(C=0.5, decision_function_shape='ovo', degree=5, kernel='linear',
    probability=True, random_state=42)


## Random Forest

In [7]:
# Untuned baseline for comparison: RandomForestClassifier(random_state=42)
rf = RandomForestClassifier(
    n_estimators=150,
    criterion='gini',
    max_depth=20,
    max_features=0.3,
    min_samples_split=10,
    bootstrap=False,
    n_jobs=-1,
    random_state=42,
)
check_accuracy(rf)

              precision    recall  f1-score   support

         ang       0.73      0.64      0.68       208
         hap       0.59      0.54      0.56       317
         neu       0.60      0.66      0.63       369
         sad       0.63      0.66      0.65       213

    accuracy                           0.62      1107
   macro avg       0.64      0.63      0.63      1107
weighted avg       0.63      0.62      0.62      1107



#### Random Search

In [None]:
# Candidate values sampled by the randomized search over RandomForestClassifier.
params = {
    'n_estimators': [10, 50, 85, 100, 150, 200, 500, 1000, 1500],
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True, False],
    # Depths 10, 20, ..., 120
    'max_depth': list(range(10, 121, 10)),
    'max_features': ['auto', 'sqrt', 'log2', 0.2],
    'min_samples_leaf': [1, 3, 5, 8, 12],
    'min_samples_split': [2, 6, 10, 15, 20],
}

# 50 sampled candidates, 5-fold CV, all cores.
rf_g = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=params,
    n_iter=50,
    cv=5,
    n_jobs=-1,
    verbose=5,
    random_state=42,
)

rf_g.fit(x, y)

In [6]:
# Best parameters, best CV score and best estimator, one per line.
print(rf_g.best_params_, rf_g.best_score_, rf_g.best_estimator_, sep='\n')

{'n_estimators': 1500, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 0.2, 'max_depth': 120, 'criterion': 'entropy', 'bootstrap': False}
0.621584818620941
RandomForestClassifier(bootstrap=False, criterion='entropy', max_depth=120,
                       max_features=0.2, min_samples_split=10,
                       n_estimators=1500, random_state=42)
