# Logistic Regression Ensembles

>**Note:** This notebook was run in Google Colab, so there is no direct reference to the data files. The data used is the same as in the repository.

## Imports

In [7]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.model_selection import KFold
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier

## Load Data

In [2]:
import os
# Change directory to load the preprocess script from the parent folder.
# The starting directory is remembered the first time this cell runs, so
# re-running the cell in the same kernel always lands in the same place
# instead of climbing one level higher on every execution.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, os.pardir))

In [4]:
from data_preprocess import getTrainData

# Two views of the training set, both requested with include_random=True:
#   train_data_all  - article title + body
#   train_data_body - article body only (no_title=True)
train_data_all = getTrainData(include_random=True)
train_data_body = getTrainData(no_title=True, include_random=True)

## Test

In [5]:
def cross_validation(word_vectorizer, model, train_data, n_splits=6, seed=42):
    """Score a vectorizer + classifier pair with shuffled k-fold cross-validation.

    For every fold the vectorizer is fitted on the training texts only and then
    applied to the held-out texts; the model is fitted on the vectorized
    training split and evaluated on the held-out split.

    Parameters
    ----------
    word_vectorizer : object
        Must expose ``fit_transform`` and ``transform``. It is re-fitted on
        every fold.
    model : object
        Must expose ``fit`` and ``predict``. It is re-fitted on every fold.
    train_data : DataFrame-like
        Must provide ``'text'`` and ``'label'`` columns whose values expose
        ``.array``.
    n_splits : int, default 6
        Number of folds passed to ``KFold``.
    seed : int, default 42
        ``random_state`` used by ``KFold`` when shuffling.

    Returns
    -------
    tuple
        Mean accuracy, precision, recall, F1 and Matthews correlation
        coefficient over all folds, in that order. The metric functions are
        called with their default arguments.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)

    X = train_data['text'].array
    y = train_data['label'].array

    # One slot per fold for each metric.
    accuracy = np.zeros(n_splits)
    precision = np.zeros(n_splits)
    recall = np.zeros(n_splits)
    f1 = np.zeros(n_splits)
    mcc = np.zeros(n_splits)

    for fold, (train_index, test_index) in enumerate(kf.split(X)):
        X_train, X_val = X[train_index], X[test_index]
        y_train, y_val = y[train_index], y[test_index]

        # Fit the vocabulary on the training split only, then reuse it for the
        # held-out split.
        X_train_cv = word_vectorizer.fit_transform(X_train)
        X_val_cv = word_vectorizer.transform(X_val)

        model.fit(X_train_cv, y_train)
        predictions = model.predict(X_val_cv)

        accuracy[fold] = accuracy_score(y_val, predictions)
        precision[fold] = precision_score(y_val, predictions)
        recall[fold] = recall_score(y_val, predictions)
        f1[fold] = f1_score(y_val, predictions)
        mcc[fold] = matthews_corrcoef(y_val, predictions)

    return np.mean(accuracy), np.mean(precision), np.mean(recall), np.mean(f1), np.mean(mcc)

In [6]:
def printResults(result_name, results):
    """Print a heading followed by the five metric scores, one per line.

    ``results`` is indexed at positions 0-4, which are labelled accuracy,
    precision, recall, F1 and MCC in that order.
    """
    print(result_name)
    metric_labels = (
        'Accuracy score: ',
        'Precision score: ',
        'Recall score: ',
        'F1 score: ',
        'MCC score: ',
    )
    for position, label in enumerate(metric_labels):
        print(label, results[position])

### BaggingClassifier

In [21]:
# TF-IDF features with ASCII accent stripping, lower-casing and English stop words.
cv = TfidfVectorizer(strip_accents='ascii', lowercase=True, stop_words='english')
logr = LogisticRegression(random_state=0, C=17, penalty='l2', max_iter=1000)

# The base learner is passed positionally on purpose: scikit-learn 1.2 renamed
# the keyword from `base_estimator` to `estimator` and later removed the old
# spelling, while the first positional slot is the base learner in both APIs.
bagger = BaggingClassifier(logr, n_estimators=7, max_samples=0.8, random_state=0)
results_bagging = cross_validation(cv, bagger, train_data_all)

### AdaBoostClassifier

In [22]:
# TF-IDF features with ASCII accent stripping, lower-casing and English stop words.
cv = TfidfVectorizer(strip_accents='ascii', lowercase=True, stop_words='english')
logr = LogisticRegression(random_state=0, C=17, penalty='l2', max_iter=1000)

# The base learner is passed positionally on purpose: scikit-learn 1.2 renamed
# the keyword from `base_estimator` to `estimator` and later removed the old
# spelling, while the first positional slot is the base learner in both APIs.
abc = AdaBoostClassifier(logr, n_estimators=100, learning_rate=1, random_state=0)
results_abc = cross_validation(cv, abc, train_data_all)

### Results

In [23]:
# One row per ensemble, one column per metric, in the order returned by cross_validation.
metric_columns = ['accuracy', 'precision', 'recall', 'f1', 'mcc']
score_rows = np.array([results_bagging, results_abc])

results = pd.DataFrame(score_rows, columns=metric_columns)
# Put the model label in front of the metric columns.
results.insert(0, "models", ["results_bagging", "results_abc"])

# Best model first: rank by F1, break ties with MCC.
results = results.sort_values(by=['f1', 'mcc'], ascending=False)
results

Unnamed: 0,models,accuracy,precision,recall,f1,mcc
0,results_bagging,0.926588,0.927157,0.927668,0.927026,0.853414
1,results_abc,0.924715,0.924422,0.925626,0.924709,0.849522


## Conclusion

Neither tested ensemble improves on the base logistic regression score (F1: 0.936779): bagging reaches an F1 of 0.927026 and AdaBoost 0.924709.

> Note: We tried different parameters as well.