### Quora Insincere Questions: Logistic Regression

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import sys
import warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")
import os
os.chdir('/home/roman/Documents/Projects/Quora/data')

In [2]:
# load and prepare data:
data = pd.read_csv('train.csv')
X_train, X_val, y_train, y_val = train_test_split(data['question_text'], data['target'], test_size=0.1)

In [3]:
# --------------- CountVectorizer --------------- #
count_vectorizer = CountVectorizer(stop_words='english')
count_vectorizer.fit(data['question_text'])
X_train_count_vec = count_vectorizer.transform(X_train)
X_val_count_vec = count_vectorizer.transform(X_val)
model = LogisticRegression()
model.fit(X_train_count_vec, y_train)
y_train_count_pred = model.predict(X_train_count_vec)
y_val_count_pred = model.predict(X_val_count_vec)

In [4]:
# --------------- TFIDF Vectorizer --------------- #
tfidf_vectorizer = TfidfVectorizer(min_df=3, strip_accents='unicode', token_pattern=r'\w{1,}', \
                        ngram_range=(1, 3), sublinear_tf=1, stop_words='english')
tfidf_vectorizer.fit(data['question_text'])
X_train_tfidf_vec = tfidf_vectorizer.transform(X_train)
X_val_tfidf_vec = tfidf_vectorizer.transform(X_val)
model = LogisticRegression()
model.fit(X_train_tfidf_vec, y_train)
y_train_tfidf_pred = model.predict(X_train_tfidf_vec)
y_val_tfidf_pred = model.predict(X_val_tfidf_vec)

In [5]:
# --------------- Evaluate Results --------------- #
predicted_positive = np.round(np.mean(y_train_count_pred), 3)
y_pred_random = np.random.binomial(1, predicted_positive, y_train.shape[0])
actual_positve = np.round(np.mean(y_train), 3)
precision_train = np.round(metrics.precision_score(y_train, y_train_count_pred), 3)
precision_val = np.round(metrics.precision_score(y_val, y_val_count_pred), 3)
precision_random = np.round(metrics.precision_score(y_train, y_pred_random), 3)
recall_train = np.round(metrics.recall_score(y_train, y_train_count_pred), 3)
recall_val = np.round(metrics.recall_score(y_val, y_val_count_pred), 3)
recall_random = np.round(metrics.recall_score(y_train, y_pred_random), 3)
f1score_train = np.round(metrics.f1_score(y_train, y_train_count_pred), 3)
f1score_val = np.round(metrics.f1_score(y_val, y_val_count_pred), 3)
f1score_random = np.round(metrics.f1_score(y_train, y_pred_random), 3)
print('Evaluation Logistic Regression with CountVectorizer:')
print('actual positive:    ' + str(np.round(np.mean(y_train), 3)))
print('predicted positive: ' + str(np.round(np.mean(y_train_count_pred), 3)))
print('precision (train/val/random): ' + str(precision_train) + ' / ' + \
      str(precision_val) + ' / ' + str(precision_random))
print('recall (train/val/random):    ' + str(recall_train) + ' / ' + \
      str(recall_val) + ' / ' + str(recall_random))
print('f1 score (train/val/random):  ' + str(f1score_train) + ' / ' + \
      str(f1score_val) + ' / ' + str(f1score_random))
print('\n')
predicted_positive = np.round(np.mean(y_train_tfidf_pred), 3)
y_pred_random = np.random.binomial(1, predicted_positive, y_train.shape[0])
precision_train = np.round(metrics.precision_score(y_train, y_train_tfidf_pred), 3)
precision_val = np.round(metrics.precision_score(y_val, y_val_tfidf_pred), 3)
precision_random = np.round(metrics.precision_score(y_train, y_pred_random), 3)
recall_train = np.round(metrics.recall_score(y_train, y_train_tfidf_pred), 3)
recall_val = np.round(metrics.recall_score(y_val, y_val_tfidf_pred), 3)
recall_random = np.round(metrics.recall_score(y_train, y_pred_random), 3)
f1score_train = np.round(metrics.f1_score(y_train, y_train_tfidf_pred), 3)
f1score_val = np.round(metrics.f1_score(y_val, y_val_tfidf_pred), 3)
f1score_random = np.round(metrics.f1_score(y_train, y_pred_random), 3)
print('Evaluation Logistic Regression with TFIDFVectorizer:')
print('actual positive:    ' + str(np.round(np.mean(y_train), 3)))
print('predicted positive: ' + str(np.round(np.mean(y_train_tfidf_pred), 3)))
print('precision (train/val/random): ' + str(precision_train) + ' / ' + \
      str(precision_val) + ' / ' + str(precision_random))
print('recall (train/val/random):    ' + str(recall_train) + ' / ' + \
      str(recall_val) + ' / ' + str(recall_random))
print('f1 score (train/val/random):  ' + str(f1score_train) + ' / ' + \
      str(f1score_val) + ' / ' + str(f1score_random))
print('\n')

Evaluation Logistic Regression with CountVectorizer:
actual positive:    0.062
predicted positive: 0.039
precision (train/val/random): 0.753 / 0.684 / 0.06
recall (train/val/random):    0.472 / 0.414 / 0.038
f1 score (train/val/random):  0.58 / 0.516 / 0.046


Evaluation Logistic Regression with TFIDFVectorizer:
actual positive:    0.062
predicted positive: 0.034
precision (train/val/random): 0.774 / 0.693 / 0.061
recall (train/val/random):    0.426 / 0.363 / 0.034
f1 score (train/val/random):  0.55 / 0.477 / 0.043




With a logistic regression model the simpler and faster CountVectorizer performs slightly better than a TFIDF Vectorizer