# Logistic Regression Hyperparameter Testing

## Imports

In [20]:
import pandas as pd
import numpy as np
import nltk.stem
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.model_selection import KFold
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Load Data

In [2]:
import os

# Change directory to load preprocess script.
# The start directory is captured only once per kernel session, so re-running
# this cell always ends up in the same place (the parent of the start
# directory) instead of climbing one level higher on every execution.
if 'NOTEBOOK_DIR' not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, os.pardir))

In [4]:
# Project-local loader; it is importable because the working directory was
# moved to the parent folder in the previous cell.
from data_preprocess import getTrainData
# train_data is the module-level frame consumed by cross_validation(), which
# reads its 'text' and 'label' columns.
# NOTE(review): the meaning of include_random=True is defined in
# data_preprocess and is not visible from this notebook — confirm before
# changing it.
train_data = getTrainData(include_random=True)

## Test

In [5]:
def cross_validation(word_vectorizer, model, n_splits=6, seed=42):
    """Score a vectorizer/model pair with shuffled K-fold cross-validation.

    The documents and targets come from the module-level ``train_data`` frame
    (columns ``'text'`` and ``'label'``). In every fold the vectorizer is
    fitted on the training part only and then applied to the held-out part,
    and the model is re-fitted from the training part before predicting.

    Parameters
    ----------
    word_vectorizer : object
        Text vectorizer exposing ``fit_transform`` and ``transform``.
    model : object
        Classifier exposing ``fit`` and ``predict``.
    n_splits : int, default 6
        Number of folds.
    seed : int, default 42
        ``random_state`` of the shuffling ``KFold`` splitter.

    Returns
    -------
    tuple
        Mean accuracy, precision, recall, F1 and Matthews correlation
        coefficient over the folds, in that order.
    """
    # Order matters: it defines the order of the returned tuple.
    # NOTE(review): precision/recall/F1 are called with their default
    # arguments, which presumably expect binary labels — confirm.
    metric_fns = (accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef)

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)

    X = train_data['text'].array
    y = train_data['label'].array

    # One row per metric, one column per fold.
    fold_scores = np.zeros((len(metric_fns), n_splits))

    for fold, (train_index, test_index) in enumerate(kf.split(X)):
        X_train, X_val = X[train_index], X[test_index]
        y_train, y_val = y[train_index], y[test_index]

        # Fit the vocabulary on the training fold only to avoid leaking
        # validation documents into the features.
        X_train_cv = word_vectorizer.fit_transform(X_train)
        X_val_cv = word_vectorizer.transform(X_val)

        model.fit(X_train_cv, y_train)
        predictions = model.predict(X_val_cv)

        fold_scores[:, fold] = [metric(y_val, predictions) for metric in metric_fns]

    return tuple(np.mean(metric_row) for metric_row in fold_scores)

In [7]:
def printResults(result_name, accuracy, precision, recall, f1, mcc):
    """Print a heading followed by one labelled line per metric.

    Parameters
    ----------
    result_name : str
        Heading printed on the first line.
    accuracy, precision, recall, f1, mcc
        Metric values printed after their respective labels.
    """
    labelled_scores = (
        ('Accuracy score: ', accuracy),
        ('Precision score: ', precision),
        ('Recall score: ', recall),
        ('F1 score: ', f1),
        ('MCC score: ', mcc),
    )

    print(result_name)
    for label, score in labelled_scores:
        print(label, score)

### Word vectorizers

#### CountVectorizer

In [29]:
# Baseline features: raw term counts with ASCII accent folding, lower-casing
# and English stop-word removal.
cv = CountVectorizer(
    stop_words='english',
    lowercase=True,
    strip_accents='ascii',
)
logr = LogisticRegression(C=10, penalty='l2', max_iter=1000, random_state=0)

# Fold-averaged metrics for this vectorizer/model pair.
accuracy, precision, recall, f1, mcc = cross_validation(word_vectorizer=cv, model=logr)

printResults(
    "LogisticRegression + CountVectorizer",
    accuracy=accuracy,
    precision=precision,
    recall=recall,
    f1=f1,
    mcc=mcc,
)

LogisticRegression + CountVectorizer
Accuracy score:  0.9190688986617487
Precision score:  0.9146718423474244
Recall score:  0.926079369818992
F1 score:  0.9198532440009762
MCC score:  0.8385098563883505


#### TfidfVectorizer

In [30]:
# Same preprocessing as the count-based run, but with TF-IDF weighting.
cv = TfidfVectorizer(
    stop_words='english',
    lowercase=True,
    strip_accents='ascii',
)
logr = LogisticRegression(C=10, penalty='l2', max_iter=1000, random_state=0)

# Fold-averaged metrics for this vectorizer/model pair.
accuracy, precision, recall, f1, mcc = cross_validation(word_vectorizer=cv, model=logr)

printResults(
    "LogisticRegression + TfidfVectorizer",
    accuracy=accuracy,
    precision=precision,
    recall=recall,
    f1=f1,
    mcc=mcc,
)

LogisticRegression + TfidfVectorizer
Accuracy score:  0.9347472454721711
Precision score:  0.9292959273188618
Recall score:  0.9420055403013351
F1 score:  0.9353809309578166
MCC score:  0.8694109721122997


#### TfidfVectorizer - ngram_range=(1, 2)

In [19]:
# TF-IDF over unigrams and bigrams; everything else matches the plain TF-IDF run.
cv = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
    lowercase=True,
    strip_accents='ascii',
)
logr = LogisticRegression(C=10, penalty='l2', max_iter=1000, random_state=0)

# Fold-averaged metrics for this vectorizer/model pair.
accuracy, precision, recall, f1, mcc = cross_validation(word_vectorizer=cv, model=logr)

printResults(
    "LogisticRegression + TfidfVectorizer - ngram_range=(1, 2)",
    accuracy=accuracy,
    precision=precision,
    recall=recall,
    f1=f1,
    mcc=mcc,
)

LogisticRegression + TfidfVectorizer - ngram_range=(1, 2)
Accuracy score:  0.9303565517567504
Precision score:  0.9307241176262702
Recall score:  0.9305920465706405
F1 score:  0.9304381948793295
MCC score:  0.8605531379382239


>**Note:** Using n-grams larger than 1 always decreased performance.

#### TfidfVectorizer - stemming words

In [21]:
# Single Snowball stemmer instance shared by every analyzer built below.
stemmer = nltk.stem.SnowballStemmer('english')


class StemmedTfidfVectorizer(TfidfVectorizer):
    """TfidfVectorizer variant whose analyzer stems every token it emits."""

    def build_analyzer(self):
        """Return the parent analyzer wrapped so that each token is stemmed."""
        base_analyzer = super().build_analyzer()

        def stemmed_analyzer(doc):
            # Stemming runs on the tokens the parent analyzer yields.
            return [stemmer.stem(token) for token in base_analyzer(doc)]

        return stemmed_analyzer

In [27]:
# TF-IDF with Snowball-stemmed tokens; preprocessing options match the plain run.
cv = StemmedTfidfVectorizer(
    stop_words='english',
    lowercase=True,
    strip_accents='ascii',
)
logr = LogisticRegression(C=10, penalty='l2', max_iter=1000, random_state=0)

# Fold-averaged metrics for this vectorizer/model pair.
accuracy, precision, recall, f1, mcc = cross_validation(word_vectorizer=cv, model=logr)

printResults(
    "LogisticRegression + TfidfVectorizer - stemming words",
    accuracy=accuracy,
    precision=precision,
    recall=recall,
    f1=f1,
    mcc=mcc,
)

LogisticRegression + TfidfVectorizer - stemming words
Accuracy score:  0.9322362510048707
Precision score:  0.9312328417706325
Recall score:  0.9347597837346795
F1 score:  0.9326467159130019
MCC score:  0.8646352354991617


Stemming also decreased performance and is much slower than plain `TfidfVectorizer`, so we went with `TfidfVectorizer` using unigrams only (`ngram_range=(1, 1)`).

### Hyperparameter testing

In [45]:
# Inverse regularisation strengths tried for every solver: 5, 7, ..., 21.
C_range = np.arange(5, 22, 2)

# Extra hyperparameters swept per solver; each sweep cell below reads its own entry.
solvers = {}
solvers['liblinear'] = {
    'penalty': ['l1', 'l2'],
    'intercept_scaling': np.arange(0.8, 1.3, 0.1)
}

solvers['lbfgs'] = {
    'penalty': ['l2']
}

solvers['saga'] = {
    'penalty': ['l1', 'l2']
}

# Elastic-net mixing ratios for the saga solver. The key/value separator must
# be ':' — an '=' inside a dict literal is a SyntaxError.
solvers['saga_elastic'] = {
    'l1_ratio': np.arange(0, 0.2, 0.02)
}

In [34]:
# Vectorizer shared by every solver sweep in this section: plain TF-IDF with
# the default n-gram range, the feature setup chosen in the vectorizer
# comparison above.
cv = TfidfVectorizer(strip_accents='ascii', lowercase=True, stop_words='english')

 #### liblinear solver

In [35]:
# Every (penalty, intercept_scaling, C) combination, one per row. The penalty
# names are strings, so the numeric entries are parsed back with float()/int()
# before being handed to the model.
liblinear_axes = np.meshgrid(
    solvers['liblinear']['penalty'],
    solvers['liblinear']['intercept_scaling'],
    C_range,
)
mg = np.array(liblinear_axes).T.reshape(-1, 3)

liblinear_rows = []
for penalty, scaling_text, c_text in mg:
    logr = LogisticRegression(
        solver='liblinear',
        penalty=penalty,
        intercept_scaling=float(scaling_text),
        C=int(c_text),
        max_iter=1000,
        random_state=0,
    )

    scores = cross_validation(cv, logr)

    # Label each row with the grid values exactly as they appear in the grid.
    liblinear_rows.append([f'liblinear-{penalty}-{scaling_text}-{c_text}', *scores])

results_liblinear = pd.DataFrame(
    liblinear_rows,
    columns=['model', 'accuracy', 'precision', 'recall', 'f1', 'mcc'],
)

 #### lbfgs solver

In [43]:
# Sweep C for the lbfgs solver with its l2 penalty; one result row per C value.
lbfgs_l2_rows = []
for C in C_range:
    logr = LogisticRegression(solver='lbfgs', penalty='l2', C=C, max_iter=1000, random_state=0)

    scores = cross_validation(cv, logr)

    lbfgs_l2_rows.append([f'lbfgs-l2-{C}', *scores])

results_lbfgs_l2 = pd.DataFrame(
    lbfgs_l2_rows,
    columns=['model', 'accuracy', 'precision', 'recall', 'f1', 'mcc'],
)

In [48]:
# Unregularised lbfgs run: a single configuration, so a single result row.
# NOTE(review): newer scikit-learn releases spell this option `penalty=None`;
# the string form is kept to match the version the notebook was run with —
# confirm against the installed version.
logr = LogisticRegression(solver='lbfgs', penalty='none', max_iter=1000, random_state=0)

scores = cross_validation(cv, logr)

results_lbfgs_none = pd.DataFrame(
    [['lbfgs-none', *scores]],
    columns=['model', 'accuracy', 'precision', 'recall', 'f1', 'mcc'],
)

 #### saga solver

##### l1/l2 penalty

In [51]:
# Every (penalty, C) combination, one per row. The penalty names are strings,
# so C is parsed back with int() before being handed to the model.
saga_axes = np.meshgrid(solvers['saga']['penalty'], C_range)
mg = np.array(saga_axes).T.reshape(-1, 2)

saga_rows = []
for penalty, c_text in mg:
    logr = LogisticRegression(
        solver='saga',
        penalty=penalty,
        C=int(c_text),
        max_iter=1000,
        random_state=0,
    )

    scores = cross_validation(cv, logr)

    saga_rows.append([f'saga-{penalty}-{c_text}', *scores])

results_saga = pd.DataFrame(
    saga_rows,
    columns=['model', 'accuracy', 'precision', 'recall', 'f1', 'mcc'],
)





##### elastic net penalty

In [59]:
# Every (l1_ratio, C) combination, one per row. The grid comes from the
# shared `solvers` configuration; no separate local copy of the l1_ratio range
# is needed.
mg = np.array(np.meshgrid(solvers['saga_elastic']['l1_ratio'], C_range)).T.reshape(-1, 2)

elasticnet_rows = []
for ratio_value, c_value in mg:
    l1_ratio = float(ratio_value)
    # The combined grid is floating point, so C is converted back to an int.
    C = int(c_value)
    logr = LogisticRegression(solver='saga', penalty='elasticnet', C=C, l1_ratio=l1_ratio,
                              max_iter=1000, random_state=0)

    accuracy, precision, recall, f1, mcc = cross_validation(cv, logr)

    # Label C as an integer ("...-5", not "...-5.0") so the model names follow
    # the same convention as the other solver sweeps.
    name = f'saga-elasticnet-{l1_ratio}-{C}'
    elasticnet_rows.append([name, accuracy, precision, recall, f1, mcc])

results_saga_elasticnet = pd.DataFrame(
    elasticnet_rows,
    columns=['model', 'accuracy', 'precision', 'recall', 'f1', 'mcc'],
)

#### Results

In [60]:
# Stack the per-solver tables into one frame with a fresh 0..n-1 index, then
# rank the configurations by F1 (ties broken by MCC), best first.
solver_tables = [
    results_liblinear,
    results_lbfgs_l2,
    results_lbfgs_none,
    results_saga,
    results_saga_elasticnet,
]
results = (
    pd.concat(solver_tables, ignore_index=True)
    .sort_values(by=['f1', 'mcc'], ascending=False)
)

results.head(15)

Unnamed: 0,model,accuracy,precision,recall,f1,mcc
65,liblinear-l2-0.8-17,0.935998,0.93073,0.94335,0.936779,0.871859
66,liblinear-l2-0.9-17,0.935998,0.93073,0.94335,0.936779,0.871859
67,liblinear-l2-1.0-17,0.935998,0.93073,0.94335,0.936779,0.871859
68,liblinear-l2-1.1-17,0.935998,0.93073,0.94335,0.936779,0.871859
69,liblinear-l2-1.2-17,0.935998,0.93073,0.94335,0.936779,0.871859
96,lbfgs-l2-17,0.935998,0.93073,0.94335,0.936779,0.871859
85,liblinear-l2-0.8-21,0.935998,0.931851,0.942106,0.936743,0.87181
86,liblinear-l2-0.9-21,0.935998,0.931851,0.942106,0.936743,0.87181
87,liblinear-l2-1.0-21,0.935998,0.931851,0.942106,0.936743,0.87181
88,liblinear-l2-1.1-21,0.935998,0.931851,0.942106,0.936743,0.87181


##### Try parameters `C=16` and `C=18` for `lbfgs`

In [63]:
# C=16 lies between the grid points 15 and 17 of the odd-only C_range sweep.
logr = LogisticRegression(
    solver='lbfgs',
    penalty='l2',
    C=16,
    max_iter=1000,
    random_state=0,
)

cross_validation(word_vectorizer=cv, model=logr)

(0.9353714474866411,
 0.9306110133467745,
 0.9420055403013351,
 0.9360738964027515,
 0.8705455628422282)

In [64]:
# C=18 lies between the grid points 17 and 19 of the odd-only C_range sweep.
logr = LogisticRegression(
    solver='lbfgs',
    penalty='l2',
    C=18,
    max_iter=1000,
    random_state=0,
)

cross_validation(word_vectorizer=cv, model=logr)

(0.9359980139026812,
 0.9307302739311515,
 0.9433496263228404,
 0.9367793418189124,
 0.8718586154325507)

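Neither of the previously untested values `C=16` and `C=18` is better than `C=17`: `C=16` scores slightly lower, and `C=18` matches the `C=17` scores to the displayed precision.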

## Conclusion

* **Word vectorizer:** Best results were obtained by `TfidfVectorizer`.

* **Model:** The best results were produced by the `lbfgs` and `liblinear` solvers (with identical scores) using `C=17` and `penalty='l2'`. We will use `lbfgs` for our models.