In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split, RandomizedSearchCV, ParameterGrid
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report
from sklearn.svm import SVC

import pickle
import numpy as np

import sys
sys.path.append('..')

from process_dataset import speech_features

## Methods

In [2]:
def get_train_test():
    with open('../data/speech_features.pkl', 'rb') as f:
        data = pickle.load(f)

    x = np.array(data[0])
    y = np.array(data[1])
    
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42, test_size=0.2)

    return x_train, x_test, y_train, y_test

def get_x_y():
    x_train, x_test, y_train, y_test = get_train_test()

    scaler = MinMaxScaler()
    x_train = scaler.fit_transform(x_train)

    return x_train, y_train

def check_accuracy(model):
    x_train, x_test, y_train, y_test = get_train_test()

    scaler = MinMaxScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    model.fit(x_train, y_train)
    results = model.predict(x_test)

    print(classification_report(y_test, results, digits=4))


x, y = get_x_y()

## Logistic Regression

In [3]:
lr = LogisticRegression(penalty='l1', solver='liblinear')
# lr = LogisticRegression(C=0.6, class_weight='balanced', random_state=42, solver='liblinear')

check_accuracy(lr)

              precision    recall  f1-score   support

         ang     0.6603    0.6635    0.6619       208
         hap     0.6117    0.5962    0.6038       317
         neu     0.6727    0.6070    0.6382       369
         sad     0.6094    0.7324    0.6652       213

    accuracy                         0.6387      1107
   macro avg     0.6385    0.6498    0.6423      1107
weighted avg     0.6407    0.6387    0.6380      1107



#### Random Search

In [17]:
params = {
    'solver': ['liblinear', 'saga', 'sag'],
    'penalty' : ['l1', 'l2', 'elasticnet', 'none'],
    'C' : [0.01, 0.1, 0.6, 1, 10, 30],
    'fit_intercept': [True, False],
    'class_weight': ['balanced', None],
    'multi_class': ['auto', 'ovr', 'multinomial'],
    'max_iter': [2000]
}

In [None]:
lr_g = RandomizedSearchCV(LogisticRegression(random_state=42), param_distributions=params, n_iter=50, n_jobs=-1, cv=5, random_state=42, verbose=5)

lr_g.fit(x, y)

In [14]:
print(lr_g.best_params_)
print(lr_g.best_score_)
print(lr_g.best_estimator_)

{'solver': 'saga', 'penalty': 'l1', 'multi_class': 'auto', 'max_iter': 2000, 'fit_intercept': True, 'class_weight': None, 'C': 0.6}
0.636981874888156
LogisticRegression(C=0.6, max_iter=2000, penalty='l1', random_state=42,
                   solver='saga')


#### Grid Search

In [18]:
params1 = {
    'penalty': ['l2', 'l1'],
    'solver' : ['liblinear', 'saga'],
    'C': [0.5, 0.7, 0.6],
    'max_iter': [2000]
}
lr_g1 = GridSearchCV(LogisticRegression(random_state=42), param_grid=params1, cv=5, return_train_score=False, verbose=5, n_jobs=-1)

pg = ParameterGrid(params1)
print(len(pg), 'combinations per fold')


12 combinations per fold


In [None]:
lr_g1.fit(x, y)


In [21]:
print(lr_g1.best_score_)
print(lr_g1.best_params_)
print(lr_g1.best_estimator_)

0.636981874888156
{'C': 0.6, 'max_iter': 2000, 'penalty': 'l1', 'solver': 'saga'}
LogisticRegression(C=0.6, max_iter=2000, penalty='l1', random_state=42,
                   solver='saga')


In [24]:
lr = LogisticRegression(C=1, max_iter=2000, penalty='l1', random_state=42, solver='saga')
check_accuracy(lr)

              precision    recall  f1-score   support

         ang     0.6618    0.6490    0.6553       208
         hap     0.5981    0.5868    0.5924       317
         neu     0.6706    0.6233    0.6461       369
         sad     0.6305    0.7371    0.6797       213

    accuracy                         0.6396      1107
   macro avg     0.6402    0.6490    0.6434      1107
weighted avg     0.6404    0.6396    0.6389      1107



## SVM

In [4]:
# svm = SVC(random_state=42)
# svm = SVC(C=0.5, decision_function_shape='ovo', degree=5, kernel='linear', probability=True, random_state=42)
svm = SVC(C=0.5, decision_function_shape='ovo', degree=5, kernel='linear', probability=True, random_state=42)
check_accuracy(svm)

              precision    recall  f1-score   support

         ang       0.64      0.66      0.65       208
         hap       0.59      0.57      0.58       317
         neu       0.70      0.62      0.65       369
         sad       0.62      0.75      0.68       213

    accuracy                           0.64      1107
   macro avg       0.64      0.65      0.64      1107
weighted avg       0.64      0.64      0.64      1107



#### Random Search

In [56]:
params = {
    'C': [0.01, 0.1, 0.5, 1, 5, 10, 20],
    'kernel' : ['linear'],
    'degree': [1, 3, 5, 7],
    'shrinking': [True, False],
    'probability': [True],
    'class_weight': ['balanced', None],
    'decision_function_shape': ['ovo', 'ovr']
}

In [None]:
svm_g = RandomizedSearchCV(SVC(random_state=42), param_distributions=params, n_iter=25, n_jobs=-1, cv=3, random_state=42, verbose=5)

svm_g.fit(x, y)

In [58]:
print(svm_g.best_params_)
print(svm_g.best_score_)
print(svm_g.best_estimator_)

{'shrinking': True, 'probability': True, 'kernel': 'linear', 'degree': 5, 'decision_function_shape': 'ovo', 'class_weight': None, 'C': 0.5}
0.6228514490152298
SVC(C=0.5, decision_function_shape='ovo', degree=5, kernel='linear',
    probability=True, random_state=42)


#### Grid Search

In [65]:
params = {
    'C': [0.5, 0.75, 0.85],
    'kernel' : ['linear'],
    'degree': [5, 6, 7],
    'probability': [True],
    'class_weight': ['balanced', None],
    'decision_function_shape': ['ovo', 'ovr']
}
svm_g = GridSearchCV(SVC(random_state=42), param_grid=params, cv=3, return_train_score=False, verbose=5, n_jobs=-1)

pg = ParameterGrid(params)
print(len(pg), 'combinations per fold')

36 combinations per fold


In [None]:
svm_g.fit(x, y)

In [67]:
print(svm_g.best_params_)
print(svm_g.best_score_)
print(svm_g.best_estimator_)

{'shrinking': True, 'probability': True, 'kernel': 'linear', 'degree': 5, 'decision_function_shape': 'ovo', 'class_weight': None, 'C': 0.5}
0.6228514490152298
SVC(C=0.5, decision_function_shape='ovo', degree=5, kernel='linear',
    probability=True, random_state=42)


## Random Forest

In [3]:
rf = RandomForestClassifier(random_state=42)
# rf = RandomForestClassifier(bootstrap=False, criterion='gini', max_depth=20, max_features=0.3, min_samples_split=10, n_estimators=150, random_state=42, n_jobs=-1)
check_accuracy(rf)

              precision    recall  f1-score   support

         ang     0.7195    0.5673    0.6344       208
         hap     0.5855    0.5615    0.5733       317
         neu     0.5829    0.6477    0.6136       369
         sad     0.6114    0.6573    0.6335       213

    accuracy                         0.6098      1107
   macro avg     0.6248    0.6084    0.6137      1107
weighted avg     0.6148    0.6098    0.6098      1107



#### Random Search

In [None]:
params = {
    'n_estimators': [10, 50, 85, 100, 150, 200, 500, 1000, 1500],
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True, False],
    'max_depth': [int(x) for x in np.linspace(10, 120, num = 12)],
    'max_features': ['auto', 'sqrt', 'log2', 0.2,],
    'min_samples_leaf': [1, 3, 5, 8, 12],
    'min_samples_split' : [2, 6, 10, 15, 20]
}

rf_g = RandomizedSearchCV(RandomForestClassifier(random_state=42), param_distributions=params, n_iter=50, n_jobs=-1, cv=5, random_state=42, verbose=5)

rf_g.fit(x, y)

In [6]:
print(rf_g.best_params_)
print(rf_g.best_score_)
print(rf_g.best_estimator_)

{'n_estimators': 1500, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 0.2, 'max_depth': 120, 'criterion': 'entropy', 'bootstrap': False}
0.6200273538359281
RandomForestClassifier(bootstrap=False, criterion='entropy', max_depth=120,
                       max_features=0.2, min_samples_split=10,
                       n_estimators=1500, random_state=42)


#### Grid Search

In [9]:
params1 = {
    'n_estimators': [1500, 150, 200],
    'min_samples_split': [10],
    'min_samples_leaf': [1, 5],
    'max_features': [0.2, 0.3],
    'max_depth': [120, 500],
    'criterion': ['entropy', 'gini'],
    'bootstrap': [False],
    'n_jobs': [-1]
}

rf_g1 = GridSearchCV(RandomForestClassifier(random_state=42), param_grid=params1, cv=5, return_train_score=False, verbose=5, n_jobs=-1)

pg = ParameterGrid(params1)
print(len(pg), 'combinations per fold')


48 combinations per fold


In [None]:
rf_g1.fit(x, y)

In [11]:
print(rf_g1.best_score_)
print(rf_g1.best_params_)
print(rf_g1.best_estimator_)

0.6209307973515352
{'bootstrap': False, 'criterion': 'gini', 'max_depth': 120, 'max_features': 0.3, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 1500, 'n_jobs': -1}
RandomForestClassifier(bootstrap=False, max_depth=120, max_features=0.3,
                       min_samples_split=10, n_estimators=1500, n_jobs=-1,
                       random_state=42)


In [17]:
rf = RandomForestClassifier(bootstrap=False, max_depth=120, max_features=0.3,
                       min_samples_split=10, n_estimators=1500, n_jobs=-1,
                       random_state=42)
check_accuracy(rf)

              precision    recall  f1-score   support

         ang     0.7557    0.6394    0.6927       208
         hap     0.6087    0.5741    0.5909       317
         neu     0.6019    0.6721    0.6351       369
         sad     0.6364    0.6573    0.6467       213

    accuracy                         0.6350      1107
   macro avg     0.6507    0.6357    0.6413      1107
weighted avg     0.6394    0.6350    0.6355      1107

