### Quora Insincere Questions: Train word2vec

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from collections import defaultdict
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn import metrics
import sys
import warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")
import os
os.chdir('/home/roman/Documents/Projects/Quora/data')

In [2]:
data = pd.read_csv('train.csv')
X_train, X_val, y_train, y_val = train_test_split(data['question_text'], data['target'], test_size=0.1) # 2%

In [None]:
# --------------- train word2vec --------------- #
# tokenization
texts = [[word for word in document.lower().split()] for document in data['question_text']]

os.chdir('../models')
path = get_tmpfile("word2vec.model")
model = Word2Vec(size=100, window=5, min_count=1, workers=4)
model.build_vocab(texts, update=False)
model.train(texts, total_examples=model.corpus_count, epochs=1)

# save and load model:
model.save("word2vec.model")

In [3]:
# load model:
os.chdir('../models')
model = Word2Vec.load("word2vec.model")
w2v = dict(zip(model.wv.index2word, model.wv.syn0))   # <- create dictionary (9%)

In [12]:
class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.dim = len(word2vec[next(iter(word2vec))])
    def transform(self, X_all):
        for X in X_all:
            yield np.array(
                np.mean([self.word2vec[w] if w in self.word2vec else np.zeros(self.dim) for w in X], axis=0))

In [15]:
vectorizer = MeanEmbeddingVectorizer(w2v)
X_train_vec = vectorizer.transform(X_train)
X_val_vec = vectorizer.transform(X_val)

<generator object MeanEmbeddingVectorizer.transform at 0x7f77ac9f99e8>

In [14]:
clf = LogisticRegression()
clf.fit(X_train_vec, y_train)
y_train_w2v_pred = clf.predict(X_train_vec)
y_val_w2v_pred = clf.predict(X_val_vec)

TypeError: float() argument must be a string or a number, not 'generator'

In [9]:
# --------------- Evaluate Results --------------- #
predicted_positive = np.round(np.mean(y_train_w2v_pred), 3)
y_pred_random = np.random.binomial(1, predicted_positive, y_train.shape[0])
actual_positve = np.round(np.mean(y_train), 3)
precision_train = np.round(metrics.precision_score(y_train, y_train_w2v_pred), 3)
precision_val = np.round(metrics.precision_score(y_val, y_val_w2v_pred), 3)
precision_random = np.round(metrics.precision_score(y_train, y_pred_random), 3)
recall_train = np.round(metrics.recall_score(y_train, y_train_w2v_pred), 3)
recall_val = np.round(metrics.recall_score(y_val, y_val_w2v_pred), 3)
recall_random = np.round(metrics.recall_score(y_train, y_pred_random), 3)
f1score_train = np.round(metrics.f1_score(y_train, y_train_w2v_pred), 3)
f1score_val = np.round(metrics.f1_score(y_val, y_val_w2v_pred), 3)
f1score_random = np.round(metrics.f1_score(y_train, y_pred_random), 3)
print('Evaluation Logistic Regression with Word2Vec:')
print('actual positive:    ' + str(np.round(np.mean(y_train), 3)))
print('predicted positive: ' + str(np.round(np.mean(y_train_w2v_pred), 3)))
print('precision (train/val/random): ' + str(precision_train) + ' / ' + \
      str(precision_val) + ' / ' + str(precision_random))
print('recall (train/val/random):    ' + str(recall_train) + ' / ' + \
      str(recall_val) + ' / ' + str(recall_random))
print('f1 score (train/val/random):  ' + str(f1score_train) + ' / ' + \
      str(f1score_val) + ' / ' + str(f1score_random))

Evaluation Logistic Regression with Word2Vec:
actual positive:    0.062
predicted positive: 0.411
precision (train/val/random): 0.11 / 0.11 / 0.061
recall (train/val/random):    0.728 / 0.723 / 0.408
f1 score (train/val/random):  0.19 / 0.191 / 0.107


In [None]:
y_train_w2v_pred.sum()
