In [1]:
# text manipulation
import re
import string

# Data management
import pandas as pd
import numpy as np
from scipy.sparse import *
import scipy

# NLP
import nltk
import nltk.collocations as collocations
from nltk.tag import tnt
import spacy
import gensim
from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
from collections import defaultdict
import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)


# modelling
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, classification_report
from sklearn.cluster import MeanShift


#visualisation
import matplotlib.pyplot as plt
import seaborn as sns

import multiprocessing

%matplotlib inline

ModuleNotFoundError: No module named 'spacy'

In [6]:
train = pd.read_csv('./train.csv')

In [7]:
train.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [8]:
train.shape

(1306122, 3)

In [9]:
no_insincere = train[train['target']==1].target.count()
no_sincere = train[train['target']==0].target.count()

print('No. of insincere questions:', no_insincere)
print('No. of sincere questions:', no_sincere)
print('% of insincere questions:', train.target.mean())
print('Null score:', 1- train.target.mean())

No. of insincere questions: 80810
No. of sincere questions: 1225312
% of insincere questions: 0.06187017751787352
Null score: 0.9381298224821265


In [10]:
clean_questions = (re.sub("[^A-Za-z']+", ' ', q).lower() for q in train['question_text'])

In [11]:
stopwords = list(nltk.corpus.stopwords.words('english'))

In [12]:
%%time
# remove stop words and lower all characters
clean_questions = [' '.join(w for w in nltk.word_tokenize(q.lower()) if w not in stopwords) for q in clean_questions]

Wall time: 2min 25s


In [13]:
df = pd.DataFrame()
df['clean_question'] = clean_questions
df['target'] = train.target

In [31]:
token_list = [ q.split() for q in clean_questions]

In [32]:
cores = multiprocessing.cpu_count() # Count the number of cores in a computer

In [33]:
w2v = Word2Vec(min_count=5,
                    window=4,
                     workers=cores-1)

In [34]:
w2v.build_vocab(token_list, progress_per=100000)

INFO - 00:01:38: collecting all words and their counts
INFO - 00:01:38: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 00:01:38: PROGRESS: at sentence #100000, processed 650623 words, keeping 49387 word types
INFO - 00:01:38: PROGRESS: at sentence #200000, processed 1302394 words, keeping 70054 word types
INFO - 00:01:38: PROGRESS: at sentence #300000, processed 1953757 words, keeping 85849 word types
INFO - 00:01:39: PROGRESS: at sentence #400000, processed 2606613 words, keeping 99390 word types
INFO - 00:01:39: PROGRESS: at sentence #500000, processed 3256576 words, keeping 111596 word types
INFO - 00:01:39: PROGRESS: at sentence #600000, processed 3908891 words, keeping 122456 word types
INFO - 00:01:39: PROGRESS: at sentence #700000, processed 4561044 words, keeping 132503 word types
INFO - 00:01:39: PROGRESS: at sentence #800000, processed 5209772 words, keeping 141886 word types
INFO - 00:01:39: PROGRESS: at sentence #900000, processed 5862606 words, ke

In [35]:
words = list(w2v.wv.vocab)
len(words)

48636

In [36]:
w2v.wv.most_similar('horrible')

INFO - 00:01:44: precomputing L2-norms of word weight vectors


[('parallely', 0.42080217599868774),
 ('witch', 0.3682255148887634),
 ('strictest', 0.36116117238998413),
 ('roof', 0.3587818145751953),
 ('limbaugh', 0.3473680019378662),
 ('devise', 0.3449663519859314),
 ('structured', 0.34145742654800415),
 ('tended', 0.3388858437538147),
 ('corned', 0.3384595215320587),
 ('envelope', 0.33783653378486633)]

In [39]:
w2v.train(token_list, total_examples=w2v.corpus_count, epochs=30, report_delay=1)

INFO - 00:02:19: training model with 11 workers on 48636 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=4
INFO - 00:02:20: EPOCH 1 - PROGRESS: at 16.46% examples, 1314225 words/s, in_qsize 19, out_qsize 2
INFO - 00:02:21: EPOCH 1 - PROGRESS: at 33.12% examples, 1328802 words/s, in_qsize 20, out_qsize 2
INFO - 00:02:22: EPOCH 1 - PROGRESS: at 50.74% examples, 1357275 words/s, in_qsize 22, out_qsize 0
INFO - 00:02:23: EPOCH 1 - PROGRESS: at 67.78% examples, 1354294 words/s, in_qsize 22, out_qsize 4
INFO - 00:02:24: EPOCH 1 - PROGRESS: at 86.11% examples, 1378127 words/s, in_qsize 21, out_qsize 0
INFO - 00:02:25: worker thread finished; awaiting finish of 10 more threads
INFO - 00:02:25: worker thread finished; awaiting finish of 9 more threads
INFO - 00:02:25: worker thread finished; awaiting finish of 8 more threads
INFO - 00:02:25: worker thread finished; awaiting finish of 7 more threads
INFO - 00:02:25: worker thread finished; awaiting finish of 6 more th

INFO - 00:03:27: EPOCH 12 - PROGRESS: at 68.26% examples, 1366017 words/s, in_qsize 21, out_qsize 0
INFO - 00:03:29: EPOCH 12 - PROGRESS: at 85.75% examples, 1374119 words/s, in_qsize 21, out_qsize 0
INFO - 00:03:29: worker thread finished; awaiting finish of 10 more threads
INFO - 00:03:29: worker thread finished; awaiting finish of 9 more threads
INFO - 00:03:29: worker thread finished; awaiting finish of 8 more threads
INFO - 00:03:29: worker thread finished; awaiting finish of 7 more threads
INFO - 00:03:29: worker thread finished; awaiting finish of 6 more threads
INFO - 00:03:29: worker thread finished; awaiting finish of 5 more threads
INFO - 00:03:29: worker thread finished; awaiting finish of 4 more threads
INFO - 00:03:29: worker thread finished; awaiting finish of 3 more threads
INFO - 00:03:29: worker thread finished; awaiting finish of 2 more threads
INFO - 00:03:29: worker thread finished; awaiting finish of 1 more threads
INFO - 00:03:29: worker thread finished; awaiting

INFO - 00:04:34: worker thread finished; awaiting finish of 9 more threads
INFO - 00:04:34: worker thread finished; awaiting finish of 8 more threads
INFO - 00:04:34: worker thread finished; awaiting finish of 7 more threads
INFO - 00:04:34: worker thread finished; awaiting finish of 6 more threads
INFO - 00:04:34: worker thread finished; awaiting finish of 5 more threads
INFO - 00:04:34: worker thread finished; awaiting finish of 4 more threads
INFO - 00:04:34: worker thread finished; awaiting finish of 3 more threads
INFO - 00:04:34: worker thread finished; awaiting finish of 2 more threads
INFO - 00:04:34: worker thread finished; awaiting finish of 1 more threads
INFO - 00:04:34: worker thread finished; awaiting finish of 0 more threads
INFO - 00:04:34: EPOCH - 23 : training on 8509118 raw words (8069553 effective words) took 5.8s, 1393833 effective words/s
INFO - 00:04:35: EPOCH 24 - PROGRESS: at 16.45% examples, 1316166 words/s, in_qsize 21, out_qsize 0
INFO - 00:04:36: EPOCH 24 -

(242082083, 255273540)

In [40]:
w2v.init_sims(replace=True) #increase memory efficiency

INFO - 00:05:14: precomputing L2-norms of word weight vectors


In [41]:
w2v_model.syn0norm[wv.vocab[word].index]

NameError: name 'w2v_model' is not defined

In [None]:
%%time

# model pre-trained by Google
wv_google = gensim.models.KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin.gz", binary=True)
wv_google.init_sims(replace=True)

In [18]:
wv_google.wv.most_similar('horrible')

  """Entry point for launching an IPython kernel.


[('terrible', 0.92439204454422),
 ('horrendous', 0.8412425518035889),
 ('dreadful', 0.770708441734314),
 ('horrid', 0.7669703960418701),
 ('awful', 0.7597668766975403),
 ('atrocious', 0.7203570604324341),
 ('horrific', 0.7141486406326294),
 ('horrible_horrible', 0.7102487087249756),
 ('hideous', 0.6976836323738098),
 ('appalling', 0.6743218898773193)]

In [42]:
def word_averaging(wv, words):
    all_words, mean = set(), []
    
    for word in words:
        if isinstance(word, np.ndarray):
            mean.append(word)
        elif word in wv.vocab:
            mean.append(wv.syn0norm[wv.vocab[word].index])
            all_words.add(wv.vocab[word].index)

    if not mean:
        logging.warning("cannot compute similarity with no input %s", words)
        # FIXME: remove these examples in pre-processing
        return np.zeros(wv.vector_size,)

    mean = gensim.matutils.unitvec(np.array(mean).mean(axis=0)).astype(np.float32)
    return mean

def  word_averaging_list(wv, text_list):
    return np.vstack([word_averaging(wv, post) for post in text_list ])

In [43]:
%%time
def w2v_tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text, language='english'):
        for word in nltk.word_tokenize(sent, language='english'):
            if len(word) < 2:
                continue
            tokens.append(word)
    return tokens
    
train, test = train_test_split(df, random_state = 42)

test_tokenized = test.apply(lambda r: w2v_tokenize_text(r['clean_question']), axis=1).values
train_tokenized = train.apply(lambda r: w2v_tokenize_text(r['clean_question']), axis=1).values

X_train_word_average = word_averaging_list(w2v.wv,train_tokenized)
X_test_word_average = word_averaging_list(w2v.wv,test_tokenized)

  






























Wall time: 3min 43s


In [44]:
%%time
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg = logreg.fit(X_train_word_average, train['target'])
y_pred = logreg.predict(X_test_word_average)
print('accuracy %s' % accuracy_score(y_pred, test.target))
print(classification_report(test.target, y_pred))
print(f1_score(test.target, y_pred))



accuracy 0.9489175606603967
              precision    recall  f1-score   support

           0       0.96      0.99      0.97    306592
           1       0.65      0.36      0.46     19939

   micro avg       0.95      0.95      0.95    326531
   macro avg       0.80      0.67      0.72    326531
weighted avg       0.94      0.95      0.94    326531

0.46245568804382853
Wall time: 28.2 s


In [14]:
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from sklearn import utils
import gensim
from gensim.models.doc2vec import TaggedDocument
import re

In [15]:


def label_sentences(corpus, label_type):
    """
    Gensim's Doc2Vec implementation requires each document/paragraph to have a label associated with it.
    We do this by using the TaggedDocument method. The format will be "TRAIN_i" or "TEST_i" where "i" is
    a dummy index of the post.
    """
    labeled = []
    for i, v in enumerate(corpus):
        label = label_type + '_' + str(i)
        labeled.append(TaggedDocument(v.split(), [label]))
    return labeled
X_train, X_test, y_train, y_test = train_test_split(df.clean_question, df.target, random_state=0, test_size=0.3)
X_train = label_sentences(X_train, 'Train')
X_test = label_sentences(X_test, 'Test')
all_data = X_train + X_test

In [16]:
all_data[:2]

[TaggedDocument(words=['financial', 'companies', 'optimise', 'showing', 'prices', 'websites', 'consumers', 'fluctuate', 'lot'], tags=['Train_0']),
 TaggedDocument(words=['different', 'ways', 'list', 'products', 'multiple', 'woocommerce', 'websites', 'without', 'adding', 'products', 'one', 'website', 'time'], tags=['Train_1'])]

In [23]:
model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, min_count=1, alpha=0.065, min_alpha=0.065)
model_dbow.build_vocab([x for x in tqdm(all_data)])

for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(all_data)]), total_examples=len(all_data), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha
    epoch += 1

100%|██████████| 1306122/1306122 [00:01<00:00, 1003946.26it/s]
INFO - 23:27:17: collecting all words and their counts
INFO - 23:27:17: PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
INFO - 23:27:17: PROGRESS: at example #10000, processed 65137 words (855948/s), 14574 word types, 10000 tags
INFO - 23:27:17: PROGRESS: at example #20000, processed 130091 words (1546855/s), 21482 word types, 20000 tags
INFO - 23:27:17: PROGRESS: at example #30000, processed 195287 words (1625033/s), 26572 word types, 30000 tags
INFO - 23:27:17: PROGRESS: at example #40000, processed 260678 words (1555517/s), 30734 word types, 40000 tags
INFO - 23:27:17: PROGRESS: at example #50000, processed 326013 words (1522655/s), 34486 word types, 50000 tags
INFO - 23:27:17: PROGRESS: at example #60000, processed 391699 words (1556459/s), 37876 word types, 60000 tags
INFO - 23:27:17: PROGRESS: at example #70000, processed 456987 words (1450187/s), 41053 word types, 70000 tags
INFO - 23:27:17: PR

INFO - 23:27:50: EPOCH 1 - PROGRESS: at 7.19% examples, 162389 words/s, in_qsize 5, out_qsize 0
INFO - 23:27:51: EPOCH 1 - PROGRESS: at 8.95% examples, 163497 words/s, in_qsize 6, out_qsize 0
INFO - 23:27:52: EPOCH 1 - PROGRESS: at 10.81% examples, 165607 words/s, in_qsize 6, out_qsize 0
INFO - 23:27:53: EPOCH 1 - PROGRESS: at 12.80% examples, 166929 words/s, in_qsize 6, out_qsize 0
INFO - 23:27:54: EPOCH 1 - PROGRESS: at 14.69% examples, 167455 words/s, in_qsize 6, out_qsize 0
INFO - 23:27:55: EPOCH 1 - PROGRESS: at 16.68% examples, 169723 words/s, in_qsize 5, out_qsize 0
INFO - 23:27:56: EPOCH 1 - PROGRESS: at 18.56% examples, 169352 words/s, in_qsize 5, out_qsize 0
INFO - 23:27:57: EPOCH 1 - PROGRESS: at 20.45% examples, 170066 words/s, in_qsize 5, out_qsize 0
INFO - 23:27:58: EPOCH 1 - PROGRESS: at 22.33% examples, 170704 words/s, in_qsize 5, out_qsize 0
INFO - 23:27:59: EPOCH 1 - PROGRESS: at 24.20% examples, 170550 words/s, in_qsize 5, out_qsize 0
INFO - 23:28:00: EPOCH 1 - PROGR

INFO - 23:30:31: EPOCH - 1 : training on 8509118 raw words (9581028 effective words) took 54.0s, 177467 effective words/s
INFO - 23:30:31: training on a 8509118 raw words (9581028 effective words) took 54.0s, 177441 effective words/s
100%|██████████| 1306122/1306122 [00:00<00:00, 4030907.03it/s]
INFO - 23:30:32: training model with 3 workers on 182747 vocabulary and 300 features, using sg=1 hs=0 sample=0.001 negative=5 window=5
INFO - 23:30:33: EPOCH 1 - PROGRESS: at 1.89% examples, 157398 words/s, in_qsize 5, out_qsize 0
INFO - 23:30:34: EPOCH 1 - PROGRESS: at 3.77% examples, 166359 words/s, in_qsize 5, out_qsize 0
INFO - 23:30:35: EPOCH 1 - PROGRESS: at 5.65% examples, 169582 words/s, in_qsize 6, out_qsize 0
INFO - 23:30:36: EPOCH 1 - PROGRESS: at 7.63% examples, 170086 words/s, in_qsize 5, out_qsize 0
INFO - 23:30:37: EPOCH 1 - PROGRESS: at 9.51% examples, 171918 words/s, in_qsize 5, out_qsize 0
INFO - 23:30:38: EPOCH 1 - PROGRESS: at 11.39% examples, 172282 words/s, in_qsize 5, out

INFO - 23:33:12: EPOCH 1 - PROGRESS: at 86.21% examples, 172802 words/s, in_qsize 6, out_qsize 0
INFO - 23:33:13: EPOCH 1 - PROGRESS: at 88.08% examples, 172775 words/s, in_qsize 5, out_qsize 0
INFO - 23:33:14: EPOCH 1 - PROGRESS: at 90.07% examples, 172898 words/s, in_qsize 5, out_qsize 0
INFO - 23:33:15: EPOCH 1 - PROGRESS: at 91.84% examples, 172761 words/s, in_qsize 5, out_qsize 0
INFO - 23:33:16: EPOCH 1 - PROGRESS: at 93.72% examples, 172860 words/s, in_qsize 5, out_qsize 0
INFO - 23:33:17: EPOCH 1 - PROGRESS: at 95.61% examples, 172847 words/s, in_qsize 6, out_qsize 0
INFO - 23:33:18: EPOCH 1 - PROGRESS: at 97.37% examples, 172736 words/s, in_qsize 5, out_qsize 0
INFO - 23:33:19: EPOCH 1 - PROGRESS: at 99.26% examples, 172782 words/s, in_qsize 5, out_qsize 0
INFO - 23:33:20: worker thread finished; awaiting finish of 2 more threads
INFO - 23:33:20: worker thread finished; awaiting finish of 1 more threads
INFO - 23:33:20: worker thread finished; awaiting finish of 0 more threads

INFO - 23:35:51: EPOCH 1 - PROGRESS: at 70.95% examples, 175127 words/s, in_qsize 6, out_qsize 0
INFO - 23:35:52: EPOCH 1 - PROGRESS: at 72.95% examples, 175348 words/s, in_qsize 5, out_qsize 0
INFO - 23:35:53: EPOCH 1 - PROGRESS: at 74.82% examples, 175113 words/s, in_qsize 6, out_qsize 0
INFO - 23:35:54: EPOCH 1 - PROGRESS: at 76.69% examples, 175212 words/s, in_qsize 5, out_qsize 0
INFO - 23:35:55: EPOCH 1 - PROGRESS: at 78.57% examples, 175315 words/s, in_qsize 6, out_qsize 0
INFO - 23:35:56: EPOCH 1 - PROGRESS: at 80.45% examples, 175147 words/s, in_qsize 6, out_qsize 0
INFO - 23:35:57: EPOCH 1 - PROGRESS: at 82.34% examples, 175178 words/s, in_qsize 6, out_qsize 0
INFO - 23:35:58: EPOCH 1 - PROGRESS: at 84.22% examples, 175269 words/s, in_qsize 5, out_qsize 0
INFO - 23:35:59: EPOCH 1 - PROGRESS: at 85.98% examples, 175097 words/s, in_qsize 5, out_qsize 0
INFO - 23:36:00: EPOCH 1 - PROGRESS: at 87.86% examples, 175139 words/s, in_qsize 5, out_qsize 0
INFO - 23:36:01: EPOCH 1 - PRO

INFO - 23:37:08: EPOCH 1 - PROGRESS: at 7.51% examples, 168533 words/s, in_qsize 5, out_qsize 0
INFO - 23:37:09: EPOCH 1 - PROGRESS: at 9.50% examples, 171123 words/s, in_qsize 5, out_qsize 0
INFO - 23:37:10: EPOCH 1 - PROGRESS: at 11.38% examples, 170773 words/s, in_qsize 5, out_qsize 0
INFO - 23:37:11: EPOCH 1 - PROGRESS: at 13.24% examples, 170935 words/s, in_qsize 5, out_qsize 0
INFO - 23:37:12: EPOCH 1 - PROGRESS: at 15.13% examples, 171856 words/s, in_qsize 5, out_qsize 0
INFO - 23:37:13: EPOCH 1 - PROGRESS: at 17.00% examples, 172302 words/s, in_qsize 5, out_qsize 0
INFO - 23:37:14: EPOCH 1 - PROGRESS: at 18.76% examples, 171388 words/s, in_qsize 5, out_qsize 0
INFO - 23:37:15: EPOCH 1 - PROGRESS: at 20.64% examples, 171812 words/s, in_qsize 5, out_qsize 0
INFO - 23:37:16: EPOCH 1 - PROGRESS: at 22.53% examples, 171788 words/s, in_qsize 6, out_qsize 0
INFO - 23:37:17: EPOCH 1 - PROGRESS: at 24.30% examples, 171424 words/s, in_qsize 5, out_qsize 0
INFO - 23:37:18: EPOCH 1 - PROGR

INFO - 23:39:51: EPOCH 1 - PROGRESS: at 99.51% examples, 175052 words/s, in_qsize 5, out_qsize 0
INFO - 23:39:51: worker thread finished; awaiting finish of 2 more threads
INFO - 23:39:51: worker thread finished; awaiting finish of 1 more threads
INFO - 23:39:51: worker thread finished; awaiting finish of 0 more threads
INFO - 23:39:51: EPOCH - 1 : training on 8509118 raw words (9581420 effective words) took 54.6s, 175335 effective words/s
INFO - 23:39:51: training on a 8509118 raw words (9581420 effective words) took 54.7s, 175292 effective words/s
100%|██████████| 1306122/1306122 [00:00<00:00, 4044121.94it/s]
INFO - 23:39:52: training model with 3 workers on 182747 vocabulary and 300 features, using sg=1 hs=0 sample=0.001 negative=5 window=5
INFO - 23:39:53: EPOCH 1 - PROGRESS: at 1.88% examples, 154555 words/s, in_qsize 5, out_qsize 0
INFO - 23:39:54: EPOCH 1 - PROGRESS: at 3.89% examples, 171555 words/s, in_qsize 5, out_qsize 0
INFO - 23:39:55: EPOCH 1 - PROGRESS: at 5.66% examples

INFO - 23:42:29: EPOCH 1 - PROGRESS: at 83.84% examples, 173253 words/s, in_qsize 5, out_qsize 0
INFO - 23:42:30: EPOCH 1 - PROGRESS: at 85.73% examples, 173377 words/s, in_qsize 5, out_qsize 0
INFO - 23:42:31: EPOCH 1 - PROGRESS: at 87.61% examples, 173431 words/s, in_qsize 5, out_qsize 0
INFO - 23:42:32: EPOCH 1 - PROGRESS: at 89.38% examples, 173338 words/s, in_qsize 6, out_qsize 0
INFO - 23:42:33: EPOCH 1 - PROGRESS: at 91.26% examples, 173315 words/s, in_qsize 6, out_qsize 0
INFO - 23:42:34: EPOCH 1 - PROGRESS: at 93.26% examples, 173268 words/s, in_qsize 5, out_qsize 0
INFO - 23:42:35: EPOCH 1 - PROGRESS: at 95.13% examples, 173372 words/s, in_qsize 5, out_qsize 0
INFO - 23:42:36: EPOCH 1 - PROGRESS: at 96.91% examples, 173280 words/s, in_qsize 5, out_qsize 0
INFO - 23:42:37: EPOCH 1 - PROGRESS: at 98.79% examples, 173395 words/s, in_qsize 5, out_qsize 0
INFO - 23:42:38: worker thread finished; awaiting finish of 2 more threads
INFO - 23:42:38: worker thread finished; awaiting fi

INFO - 23:45:08: EPOCH 1 - PROGRESS: at 65.45% examples, 172700 words/s, in_qsize 5, out_qsize 0
INFO - 23:45:09: EPOCH 1 - PROGRESS: at 67.45% examples, 172961 words/s, in_qsize 5, out_qsize 0
INFO - 23:45:10: EPOCH 1 - PROGRESS: at 69.21% examples, 172761 words/s, in_qsize 5, out_qsize 0
INFO - 23:45:11: EPOCH 1 - PROGRESS: at 71.09% examples, 172732 words/s, in_qsize 5, out_qsize 0
INFO - 23:45:12: EPOCH 1 - PROGRESS: at 72.97% examples, 172886 words/s, in_qsize 5, out_qsize 0
INFO - 23:45:13: EPOCH 1 - PROGRESS: at 74.96% examples, 172727 words/s, in_qsize 6, out_qsize 0
INFO - 23:45:14: EPOCH 1 - PROGRESS: at 76.95% examples, 172828 words/s, in_qsize 6, out_qsize 0
INFO - 23:45:15: EPOCH 1 - PROGRESS: at 78.83% examples, 172874 words/s, in_qsize 5, out_qsize 0
INFO - 23:45:16: EPOCH 1 - PROGRESS: at 80.71% examples, 172822 words/s, in_qsize 6, out_qsize 0
INFO - 23:45:17: EPOCH 1 - PROGRESS: at 82.59% examples, 172975 words/s, in_qsize 5, out_qsize 0
INFO - 23:45:18: EPOCH 1 - PRO

INFO - 23:47:45: EPOCH 1 - PROGRESS: at 49.58% examples, 173976 words/s, in_qsize 6, out_qsize 0
INFO - 23:47:46: EPOCH 1 - PROGRESS: at 51.47% examples, 174054 words/s, in_qsize 6, out_qsize 0
INFO - 23:47:47: EPOCH 1 - PROGRESS: at 53.34% examples, 174038 words/s, in_qsize 5, out_qsize 0
INFO - 23:47:49: EPOCH 1 - PROGRESS: at 55.33% examples, 173851 words/s, in_qsize 5, out_qsize 0
INFO - 23:47:50: EPOCH 1 - PROGRESS: at 57.45% examples, 173945 words/s, in_qsize 5, out_qsize 0
INFO - 23:47:51: EPOCH 1 - PROGRESS: at 59.45% examples, 174268 words/s, in_qsize 5, out_qsize 0
INFO - 23:47:52: EPOCH 1 - PROGRESS: at 61.32% examples, 174056 words/s, in_qsize 6, out_qsize 0
INFO - 23:47:53: EPOCH 1 - PROGRESS: at 63.21% examples, 173815 words/s, in_qsize 5, out_qsize 0
INFO - 23:47:54: EPOCH 1 - PROGRESS: at 65.21% examples, 174188 words/s, in_qsize 5, out_qsize 0
INFO - 23:47:55: EPOCH 1 - PROGRESS: at 67.09% examples, 174264 words/s, in_qsize 5, out_qsize 0
INFO - 23:47:56: EPOCH 1 - PRO

INFO - 23:50:24: EPOCH 1 - PROGRESS: at 30.59% examples, 173771 words/s, in_qsize 5, out_qsize 0
INFO - 23:50:25: EPOCH 1 - PROGRESS: at 32.58% examples, 174564 words/s, in_qsize 6, out_qsize 0
INFO - 23:50:26: EPOCH 1 - PROGRESS: at 34.33% examples, 174199 words/s, in_qsize 6, out_qsize 0
INFO - 23:50:27: EPOCH 1 - PROGRESS: at 36.32% examples, 174226 words/s, in_qsize 5, out_qsize 0
INFO - 23:50:28: EPOCH 1 - PROGRESS: at 38.31% examples, 174654 words/s, in_qsize 5, out_qsize 0
INFO - 23:50:29: EPOCH 1 - PROGRESS: at 40.20% examples, 174841 words/s, in_qsize 6, out_qsize 0
INFO - 23:50:30: EPOCH 1 - PROGRESS: at 42.08% examples, 174292 words/s, in_qsize 6, out_qsize 0
INFO - 23:50:31: EPOCH 1 - PROGRESS: at 44.08% examples, 174446 words/s, in_qsize 6, out_qsize 0
INFO - 23:50:32: EPOCH 1 - PROGRESS: at 45.95% examples, 174633 words/s, in_qsize 5, out_qsize 0
INFO - 23:50:33: EPOCH 1 - PROGRESS: at 47.72% examples, 174412 words/s, in_qsize 5, out_qsize 0
INFO - 23:50:34: EPOCH 1 - PRO

INFO - 23:53:03: EPOCH 1 - PROGRESS: at 16.68% examples, 171182 words/s, in_qsize 6, out_qsize 0
INFO - 23:53:04: EPOCH 1 - PROGRESS: at 18.55% examples, 171987 words/s, in_qsize 6, out_qsize 0
INFO - 23:53:05: EPOCH 1 - PROGRESS: at 20.44% examples, 171689 words/s, in_qsize 5, out_qsize 0
INFO - 23:53:06: EPOCH 1 - PROGRESS: at 22.43% examples, 171649 words/s, in_qsize 6, out_qsize 0
INFO - 23:53:07: EPOCH 1 - PROGRESS: at 24.43% examples, 172175 words/s, in_qsize 5, out_qsize 0
INFO - 23:53:08: EPOCH 1 - PROGRESS: at 26.30% examples, 172568 words/s, in_qsize 6, out_qsize 0
INFO - 23:53:09: EPOCH 1 - PROGRESS: at 28.06% examples, 172214 words/s, in_qsize 6, out_qsize 0
INFO - 23:53:10: EPOCH 1 - PROGRESS: at 30.07% examples, 172036 words/s, in_qsize 5, out_qsize 0
INFO - 23:53:11: EPOCH 1 - PROGRESS: at 31.95% examples, 172136 words/s, in_qsize 6, out_qsize 0
INFO - 23:53:12: EPOCH 1 - PROGRESS: at 33.83% examples, 172362 words/s, in_qsize 6, out_qsize 0
INFO - 23:53:13: EPOCH 1 - PRO

INFO - 23:55:44: EPOCH - 1 : training on 8509118 raw words (9580236 effective words) took 55.5s, 172513 effective words/s
INFO - 23:55:44: training on a 8509118 raw words (9580236 effective words) took 55.5s, 172481 effective words/s


In [24]:

def get_vectors(model, corpus_size, vectors_size, vectors_type):
    """
    Get vectors from trained doc2vec model
    :param doc2vec_model: Trained Doc2Vec model
    :param corpus_size: Size of the data
    :param vectors_size: Size of the embedding vectors
    :param vectors_type: Training or Testing vectors
    :return: list of vectors
    """
    vectors = np.zeros((corpus_size, vectors_size))
    for i in range(0, corpus_size):
        prefix = vectors_type + '_' + str(i)
        vectors[i] = model.docvecs[prefix]
    return vectors
    
train_vectors_dbow = get_vectors(model_dbow, len(X_train), 300, 'Train')
test_vectors_dbow = get_vectors(model_dbow, len(X_test), 300, 'Test')

In [25]:
%%time
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg.fit(train_vectors_dbow, y_train)
logreg = logreg.fit(train_vectors_dbow, y_train)
y_pred = logreg.predict(test_vectors_dbow)
print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))



accuracy 0.9448137873656648
              precision    recall  f1-score   support

           0       0.95      0.99      0.97    367711
           1       0.62      0.27      0.38     24126

   micro avg       0.94      0.94      0.94    391837
   macro avg       0.79      0.63      0.67    391837
weighted avg       0.93      0.94      0.93    391837

Wall time: 2min 9s
