### Quora Insincere Questions: Logistic Regression

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, fbeta_score, make_scorer
from sklearn import metrics
from pylab import rcParams
import sys
import warnings
# Hide library warnings, but only when no warning options were passed to the
# interpreter (sys.warnoptions is empty in that case).
if not sys.warnoptions:
    warnings.simplefilter("ignore")
import os
# Every data file below is opened by bare file name, so the notebook switches
# into the data directory once here. Set the QUORA_DATA_DIR environment
# variable to run on another machine; the original location is the default.
os.chdir(os.environ.get('QUORA_DATA_DIR', '/home/roman/Documents/Projects/Quora/data'))

In [2]:
# --------------- Embedding / Vectorizer --------------- #
# Display names of the supported feature representations, keyed by short code.
embedding_dict = dict(cv='CountVectorizer', tfidf='TFIDF Vectorizer', w2v='Word2Vec')
# Short code of the representation to use: one of 'cv', 'tfidf' or 'w2v'.
embedding = 'w2v'

In [3]:
# --------------- Prepare data --------------- #
# Labels are stored on disk; the feature matrices are either built from the
# raw questions ('cv', 'tfidf') or loaded pre-computed ('w2v').
y_train = np.load('y_train.npy')
y_val = np.load('y_val.npy')

if embedding in ('cv', 'tfidf'):
    data = pd.read_csv('train.csv')
    # NOTE(review): the saved y_train / y_val are presumably the labels of this
    # exact split (test_size=0.1, random_state=123) -- confirm against the
    # script that wrote the .npy files. The row-count check below catches a
    # size mismatch, not a different shuffling.
    X_train_text, X_val_text, _, _ = train_test_split(
        data['question_text'], data['target'], test_size=0.1, random_state=123)
    if embedding == 'cv':
        # Count Vectorizer:
        count_vectorizer = CountVectorizer(stop_words='english')
        vectorizer = count_vectorizer
    else:
        # TFIDF Vectorizer:
        tfidf_vectorizer = TfidfVectorizer(min_df=3, strip_accents='unicode', token_pattern=r'\w{1,}',
                                           ngram_range=(1, 3), sublinear_tf=True, stop_words='english')
        vectorizer = tfidf_vectorizer
    # Fit on the training questions only: fitting on the full data set would
    # leak the validation vocabulary / document frequencies into the features.
    X_train = vectorizer.fit_transform(X_train_text)
    X_val = vectorizer.transform(X_val_text)
elif embedding == 'w2v':
    # Word2Vec:
    X_train = np.load('X_train_w2v_w5_s150.npy')
    X_val = np.load('X_val_w2v_w5_s150.npy')
else:
    raise ValueError("Unknown embedding {0!r}; expected 'cv', 'tfidf' or 'w2v'".format(embedding))

# Fail early (and clearly) if features and labels do not line up.
assert X_train.shape[0] == y_train.shape[0], 'X_train / y_train row counts differ'
assert X_val.shape[0] == y_val.shape[0], 'X_val / y_val row counts differ'

In [None]:
# --------------- Find parameters with grid search --------------- #
# Cross-validated search over the class weights, scored with F1 on the
# positive class.
f1_scorer = make_scorer(f1_score)
# Every value in a parameter grid must be a list of candidates, even when only
# a single value is tried (hence 'C': [1], not 'C': 1).
param_grid = [{'C': [1], 'class_weight': [{0:.09, 1:.91}, {0:.08, 1:.92},
                {0:.07, 1:.93}, {0:.06, 1:.94}, {0:.05, 1:.95}]}]
# return_train_score=True is required for 'mean_train_score' to be present in
# cv_results_; it is not computed by default.
clf = GridSearchCV(LogisticRegression(), param_grid, scoring=f1_scorer,
                   return_train_score=True)
model = clf.fit(X_train, y_train)
labels = model.cv_results_['params']
tr_score = model.cv_results_['mean_train_score']
t_score = model.cv_results_['mean_test_score']

# One pair of bars (train / cross-validated test) per parameter combination.
rcParams['figure.figsize'] = 8, 4
plt.bar(x=np.arange(len(tr_score)) - 0.2, width=0.4, height=tr_score, color='lightblue', label='train score')
plt.bar(x=np.arange(len(t_score)) + 0.2, width=0.4, height=t_score, color='blue', label='test score')
# The scorer above is plain F1, so the title says F-1 (it used to claim F-beta).
plt.title('Find best parameters with F-1 score')
# Tick labels are the parameter dicts rendered as text.
plt.xticks(np.arange(len(labels)), [str(params) for params in labels], rotation=90)
plt.ylabel('F-1 score')
plt.legend()
plt.grid()
plt.show()

In [4]:
# --------------- Logistic Regression --------------- #
# Fit the classifier with fixed class weights (0.07 for class 0, 0.93 for
# class 1), then predict labels for the training and validation features.
model = LogisticRegression(C=1, class_weight={0: .07, 1: .93}).fit(X_train, y_train)
y_train_pred, y_val_pred = (model.predict(features) for features in (X_train, X_val))

In [5]:
# --------------- Evaluate Results --------------- #
# Share of positive labels in the training set, actual and as predicted.
actual_positive = np.round(np.mean(y_train), 3)
actual_positve = actual_positive  # original (misspelled) name, kept as an alias
predicted_positive = np.round(np.mean(y_train_pred), 3)

# Chance baseline: random labels that are positive at the same rate as the
# model's training predictions. A dedicated, seeded generator keeps the
# printed baseline numbers reproducible across runs.
baseline_rng = np.random.RandomState(123)
y_pred_random = baseline_rng.binomial(1, predicted_positive, y_train.shape[0])


def _train_val_random(metric):
    """Return `metric`, rounded to 3 decimals, for (train, validation, random baseline)."""
    pairs = ((y_train, y_train_pred), (y_val, y_val_pred), (y_train, y_pred_random))
    return [np.round(metric(y_true, y_pred), 3) for y_true, y_pred in pairs]


def _fmt(scores):
    """Join scores as 'a / b / c' for printing."""
    return ' / '.join(str(score) for score in scores)


precision_train, precision_val, precision_random = _train_val_random(metrics.precision_score)
recall_train, recall_val, recall_random = _train_val_random(metrics.recall_score)
f1score_train, f1score_val, f1score_random = _train_val_random(metrics.f1_score)

print('Evaluation Logistic Regression with {0}:'.format(embedding_dict[embedding]))
print('actual positive:    ' + str(actual_positive))
print('predicted positive: ' + str(predicted_positive))
print('precision (train/val/random): ' + _fmt((precision_train, precision_val, precision_random)))
print('recall (train/val/random):    ' + _fmt((recall_train, recall_val, recall_random)))
print('f1 score (train/val/random):  ' + _fmt((f1score_train, f1score_val, f1score_random)))

Evaluation Logistic Regression with Word2Vec:
actual positive:    0.062
predicted positive: 0.207
precision (train/val/random): 0.247 / 0.244 / 0.062
recall (train/val/random):    0.827 / 0.832 / 0.206
f1 score (train/val/random):  0.38 / 0.377 / 0.095
