In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate, cross_val_score, StratifiedKFold

import pickle
import numpy as np

import sys
sys.path.append('..')

from process_dataset import speech_features

In [2]:
def print_scores(scores):
    print('Accuracy: ', np.mean(scores['test_accuracy']))
    print('F1 Macro: ', np.mean(scores['test_f1_macro']))
    print('Precision Macro: ', np.mean(scores['test_precision_macro']))
    print('Recall Macro: ', np.mean(scores['test_recall_macro']))

def get_data():
    with open('../data/speech_features.pkl', 'rb') as f:
        data = pickle.load(f)

    x = np.array(data[0])
    y = np.array(data[1])
    
    x = MinMaxScaler().fit_transform(x)

    return x, y

x, y = get_data()

def cross_validate_model(model):
    x, y = get_data()
    scoring = {'accuracy': 'accuracy',
           'f1_macro': 'f1_macro',
           'precision_macro': 'precision_macro',
           'recall_macro' : 'recall_macro'}

    scores = cross_validate(model, x, y, cv=5, scoring=scoring, n_jobs=-1)
    print_scores(scores)

def check_accuracy(model):
    x, y = get_data()
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
    model.fit(x_train, y_train)
    results = model.predict(x_test)

    print(classification_report(y_test, results))

## Logistic Regression

In [125]:
lr = LogisticRegression(penalty='l1', solver='liblinear')
# cross_validate_model(lr)
check_accuracy(lr)

Accuracy:  0.6320684906668235
F1 Macro:  0.6377622648377621
Precision Macro:  0.6391974740933859
Recall Macro:  0.6382607778550325


### Random Search

In [72]:
params = {
    'solver': ['liblinear', 'saga', 'sag'],
    'penalty' : ['l1', 'l2', 'elasticnet'],
    'C' : [0.001, 0.01, 0.1, 1, 10, 100]
}

In [73]:
lr_g = RandomizedSearchCV(LogisticRegression(random_state=42), param_distributions=params, n_iter=40, n_jobs=-1, cv=5, random_state=42, verbose=5)

lr_g.fit(x, y)

Fitting 5 folds for each of 9 candidates, totalling 45 fits




[CV 4/5] END ......penalty=l1, solver=liblinear;, score=0.505 total time=  34.7s
[CV 5/5] END ......penalty=l1, solver=liblinear;, score=0.482 total time=  41.1s
[CV 1/5] END ......penalty=l1, solver=liblinear;, score=0.467 total time=  42.9s
[CV 1/5] END ..............penalty=l1, solver=sag;, score=nan total time=   0.0s
[CV 2/5] END ..............penalty=l1, solver=sag;, score=nan total time=   0.1s
[CV 3/5] END ..............penalty=l1, solver=sag;, score=nan total time=   0.0s
[CV 4/5] END ..............penalty=l1, solver=sag;, score=nan total time=   0.0s
[CV 5/5] END ..............penalty=l1, solver=sag;, score=nan total time=   0.0s




[CV 1/5] END ...........penalty=l1, solver=saga;, score=0.468 total time=  48.4s
[CV 3/5] END ......penalty=l1, solver=liblinear;, score=0.428 total time=  49.1s
[CV 1/5] END ......penalty=l2, solver=liblinear;, score=0.472 total time=   6.4s




[CV 3/5] END ...........penalty=l1, solver=saga;, score=0.437 total time=  49.4s
[CV 2/5] END ...........penalty=l1, solver=saga;, score=0.441 total time=  49.7s
[CV 2/5] END ......penalty=l1, solver=liblinear;, score=0.445 total time=  52.0s
[CV 2/5] END ......penalty=l2, solver=liblinear;, score=0.440 total time=   9.1s
[CV 3/5] END ......penalty=l2, solver=liblinear;, score=0.426 total time=   9.1s
[CV 5/5] END ......penalty=l2, solver=liblinear;, score=0.467 total time=   8.6s
[CV 4/5] END ......penalty=l2, solver=liblinear;, score=0.495 total time=   9.3s




[CV 1/5] END ............penalty=l2, solver=sag;, score=0.469 total time=  19.5s




[CV 4/5] END ...........penalty=l1, solver=saga;, score=0.505 total time=  46.2s




[CV 1/5] END ...........penalty=l2, solver=saga;, score=0.467 total time=  32.3s




[CV 5/5] END ...........penalty=l1, solver=saga;, score=0.472 total time=  45.6s
[CV 2/5] END ...........penalty=l2, solver=saga;, score=0.428 total time=  34.9s
[CV 1/5] END penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 2/5] END penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 3/5] END penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 4/5] END penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 5/5] END penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 1/5] END .....penalty=elasticnet, solver=saga;, score=nan total time=   0.0s
[CV 2/5] END .....penalty=elasticnet, solver=saga;, score=nan total time=   0.0s
[CV 3/5] END .....penalty=elasticnet, solver=saga;, score=nan total time=   0.0s
[CV 4/5] END .....penalty=elasticnet, solver=saga;, score=nan total time=   0.0s
[CV 5/5] END .....penalty=elasticnet, solver=saga;, score=nan total time=   0.0s
[CV 1/5] END ......penalty=e



[CV 4/5] END ...........penalty=l2, solver=saga;, score=0.491 total time=  30.7s




[CV 5/5] END ...........penalty=l2, solver=saga;, score=0.463 total time=  32.4s




[CV 3/5] END ...........penalty=l2, solver=saga;, score=0.432 total time=  33.9s




[CV 2/5] END ............penalty=l2, solver=sag;, score=0.430 total time=  16.8s




[CV 4/5] END ............penalty=l2, solver=sag;, score=0.491 total time=  15.2s




[CV 3/5] END ............penalty=l2, solver=sag;, score=0.430 total time=  16.8s


20 fits failed out of a total of 45.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/rafid/Documents/github/CSE400-NLP/env/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/rafid/Documents/github/CSE400-NLP/env/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/rafid/Documents/github/CSE400-NLP/env/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver sag s

[CV 5/5] END ............penalty=l2, solver=sag;, score=0.463 total time=  13.6s


RandomizedSearchCV(cv=5, estimator=LogisticRegression(random_state=42),
                   n_iter=40, n_jobs=-1,
                   param_distributions={'penalty': ['l1', 'l2', 'elasticnet'],
                                        'solver': ['liblinear', 'saga', 'sag']},
                   random_state=42, verbose=5)

In [74]:
print(lr_g.best_params_)
print(lr_g.best_score_)
print(lr_g.best_estimator_)

{'solver': 'liblinear', 'penalty': 'l1'}
0.4655826558265582
LogisticRegression(penalty='l1', random_state=42, solver='liblinear')


In [None]:
lr_g = GridSearchCV(LogisticRegression(solver='newton-cg', random_state=42), param_grid={
    'penalty': ['l2', 'none'],
    'multi_class': ['auto', 'ovr', 'multinomial'],
    'class_weight': ['balanced', None],
    'C': np.logspace(-4, 4, 7)
}, cv=3, return_train_score=False, verbose=10, n_jobs=-1)

lr_g.fit(x, y)


In [None]:
print(lr_g.best_score_)
print(lr_g.best_params_)

## SVM

In [None]:
svm = SVC(kernel='linear', probability=True, random_state=42)
test_accuracy(svm)

## Random Forest

In [None]:
rf = RandomForestClassifier(random_state=42)
test_accuracy(rf)