# Logistic Regression Results

## Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.model_selection import KFold

## Load Data

In [2]:
import os

# Step up one directory so that the preprocess script imported in the next
# cell can be found. The change is relative to the current working directory,
# so re-running this cell moves up one more level each time.
os.chdir(os.pardir)

In [8]:
from data_preprocess import getTrainData
# Load four variants of the training set; each one is evaluated separately
# further down. Every variant is requested with include_random=True.
# NOTE(review): the exact meaning of include_random / n_sentences / no_title
# is defined in data_preprocess (not shown here); the per-line notes describe
# the intended text content of each variant - confirm against that module.
train_data_all = getTrainData(include_random=True) # article title + body
train_data_title = getTrainData(include_random=True, n_sentences=0) # article title
train_data_body = getTrainData(include_random=True, no_title=True) # article body
train_data_titleplus = getTrainData(include_random=True, n_sentences=1) # article title + 1st sentence

## Test

In [9]:
def cross_validation(word_vectorizer, model, train_data, n_splits=6, seed=42):
    """Score a vectorizer + model pair with shuffled K-fold cross-validation.

    For every fold the vectorizer is fitted on the training split only and
    then applied to the validation split, the model is fitted on the
    vectorized training split, and its predictions on the validation split
    are scored. Both ``word_vectorizer`` and ``model`` are re-fitted in place
    on each fold, so after the call they hold the state of the last fold.

    Args:
        word_vectorizer: Object providing ``fit_transform`` and ``transform``
            (for example a ``TfidfVectorizer``).
        model: Object providing ``fit`` and ``predict``.
        train_data: Table indexable by ``'text'`` and ``'label'``, where each
            column exposes ``.array`` (for example a pandas ``DataFrame``).
        n_splits: Number of folds. Defaults to 6.
        seed: ``random_state`` passed to ``KFold`` so that the shuffled
            splits are reproducible. Defaults to 42.

    Returns:
        A 5-tuple ``(accuracy, precision, recall, f1, mcc)`` holding the mean
        of each metric over all folds. Each metric is called with its default
        arguments.
    """
    # Order here defines the order of the returned tuple.
    metrics = (
        accuracy_score,
        precision_score,
        recall_score,
        f1_score,
        matthews_corrcoef,
    )
    # One row per metric, one column per fold.
    scores = np.zeros((len(metrics), n_splits))

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)

    X = train_data['text'].array
    y = train_data['label'].array

    for fold, (train_index, test_index) in enumerate(kf.split(X)):
        X_train, X_val = X[train_index], X[test_index]
        y_train, y_val = y[train_index], y[test_index]

        # Fit the vectorizer on the training split only; the validation split
        # is merely transformed with the fitted vectorizer.
        X_train_cv = word_vectorizer.fit_transform(X_train)
        X_val_cv = word_vectorizer.transform(X_val)

        model.fit(X_train_cv, y_train)
        predictions = model.predict(X_val_cv)

        for row, metric in enumerate(metrics):
            scores[row, fold] = metric(y_val, predictions)

    return tuple(np.mean(per_fold) for per_fold in scores)

In [15]:
def printResults(result_name, results):
    """Print a titled, line-per-metric summary of one evaluation.

    Args:
        result_name: Heading printed on the first line.
        results: Indexable holding, at positions 0-4, the accuracy,
            precision, recall, F1 and MCC values to print.
    """
    headings = (
        'Accuracy score: ',
        'Precision score: ',
        'Recall score: ',
        'F1 score: ',
        'MCC score: ',
    )

    print(result_name)
    for position, heading in enumerate(headings):
        # Index explicitly (rather than zip) so a too-short `results`
        # still raises instead of silently printing fewer lines.
        print(heading, results[position])

### All - title + body

In [51]:
# TF-IDF features: ASCII accent stripping, lowercasing, English stop words.
cv = TfidfVectorizer(
    stop_words='english',
    lowercase=True,
    strip_accents='ascii',
)
# L2-penalised logistic regression with a fixed random_state.
logr = LogisticRegression(
    penalty='l2',
    C=17,
    max_iter=1000,
    random_state=0,
)

# Cross-validate on the title + body variant of the training data.
results_all = cross_validation(word_vectorizer=cv, model=logr, train_data=train_data_all)

### Title

In [44]:
# TF-IDF features: ASCII accent stripping, lowercasing, English stop words.
cv = TfidfVectorizer(
    stop_words='english',
    lowercase=True,
    strip_accents='ascii',
)
# L2-penalised logistic regression with a fixed random_state.
logr = LogisticRegression(
    penalty='l2',
    C=17,
    max_iter=1000,
    random_state=0,
)

# Cross-validate on the title-only variant of the training data.
results_title = cross_validation(word_vectorizer=cv, model=logr, train_data=train_data_title)

### Body

In [45]:
# TF-IDF features: ASCII accent stripping, lowercasing, English stop words.
cv = TfidfVectorizer(
    stop_words='english',
    lowercase=True,
    strip_accents='ascii',
)
# L2-penalised logistic regression with a fixed random_state.
logr = LogisticRegression(
    penalty='l2',
    C=17,
    max_iter=1000,
    random_state=0,
)

# Cross-validate on the body-only variant of the training data.
results_body = cross_validation(word_vectorizer=cv, model=logr, train_data=train_data_body)

### Titleplus - title + 1st sentence

In [46]:
# TF-IDF features: ASCII accent stripping, lowercasing, English stop words.
cv = TfidfVectorizer(
    stop_words='english',
    lowercase=True,
    strip_accents='ascii',
)
# L2-penalised logistic regression with a fixed random_state.
logr = LogisticRegression(
    penalty='l2',
    C=17,
    max_iter=1000,
    random_state=0,
)

# Cross-validate on the title + first sentence variant of the training data.
results_titleplus = cross_validation(word_vectorizer=cv, model=logr, train_data=train_data_titleplus)

### Result

In [47]:
# One row per training-data variant; the metric columns follow the order of
# the tuple returned by cross_validation.
results = pd.DataFrame(
    np.array([results_all, results_title, results_body, results_titleplus]),
    columns=['accuracy', 'precision', 'recall', 'f1', 'mcc'],
)
# Put the variant label in the first column.
results.insert(0, "models", ["results_all", "results_title", "results_body", "results_titleplus"])

# Best variant first: order by F1, then by MCC.
results = results.sort_values(by=['f1', 'mcc'], ascending=False)
results

Unnamed: 0,models,accuracy,precision,recall,f1,mcc
2,results_body,0.937258,0.931932,0.944421,0.937907,0.874391
0,results_all,0.935998,0.93073,0.94335,0.936779,0.871859
3,results_titleplus,0.885795,0.89866,0.871399,0.884333,0.772303
1,results_title,0.853797,0.867991,0.839682,0.852568,0.709411


## Conclusion

In this run, the model trained on the article body alone (`results_body`) scores best, but only by a small margin over the model trained on title + body (`results_all`). Throughout the rest of the project, however, we noticed that title + body gave the best results. For the main logistic regression model we therefore chose the one trained on title + body (`results_all`).
