In [78]:
# text manipulation
import re
import string

# Data management
import pandas as pd
import numpy as np
from scipy.sparse import *
import scipy

# NLP
import nltk
import nltk.collocations as collocations
from nltk.tag import tnt
import spacy
import gensim
from gensim.models.phrases import Phrases, Phraser
from gensim.models import Doc2Vec, FastText
from collections import defaultdict
import logging  # Setting up the loggings to monitor gensim
# Emit log records at INFO level and above, formatted as
# "LEVEL - HH:MM:SS: message".
logging.basicConfig(
    format="%(levelname)s - %(asctime)s: %(message)s",
    datefmt='%H:%M:%S',
    level=logging.INFO,
)

# sklearn
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV
from sklearn.externals import joblib
from sklearn.feature_extraction.text import CountVectorizer


# keras
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation, Embedding

#visualisation
import matplotlib.pyplot as plt
import seaborn as sns

import multiprocessing
from IPython.display import display, clear_output

%matplotlib inline

In [2]:
# Load the training set from a CSV file in the current working directory.
TRAIN_CSV_PATH = './train.csv'
train = pd.read_csv(TRAIN_CSV_PATH)

In [3]:
# Preview the first five rows of the training frame.
train.head(n=5)

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [4]:
# (number of rows, number of columns) of the training frame.
train.shape

(1306122, 3)

In [5]:
# Class balance of the binary `target` column: rows equal to 1 are counted as
# insincere questions, rows equal to 0 as sincere ones.
target_values = train['target']
no_insincere = (target_values == 1).sum()
no_sincere = (target_values == 0).sum()
insincere_share = target_values.mean()

print('No. of insincere questions:', no_insincere)
print('No. of sincere questions:', no_sincere)
# NOTE(review): the value printed on the next line is the mean of `target`,
# i.e. a fraction, even though the label says "%".
print('% of insincere questions:', insincere_share)
# One minus the mean of `target`, reported as the "null score".
print('Null score:', 1 - insincere_share)

No. of insincere questions: 80810
No. of sincere questions: 1225312
% of insincere questions: 0.06187017751787352
Null score: 0.9381298224821265


In [6]:
# Normalise every question: each run of characters other than ASCII letters and
# apostrophes is replaced by a single space, then the text is lower-cased.
# Built as a list rather than a one-shot generator so that any cell consuming
# clean_questions can be re-run; an exhausted generator would silently yield
# nothing the second time.
clean_questions = [re.sub("[^A-Za-z']+", ' ', q).lower() for q in train['question_text']]

In [7]:
%%time
token_questions = [nltk.word_tokenize(q) for q in clean_questions]

Wall time: 4min 38s


In [23]:
# Store the token lists as a new column on the training frame. This relies on
# token_questions being in the same order as train's rows (it was produced by
# iterating over train['question_text']).
train['token_questions'] = token_questions

In [30]:
# Split the tokenised questions by label. Both Series keep the index labels of
# `train` (the index is not reset), so positional and label-based lookups on
# them are not interchangeable.
sincere = train.loc[train['target'] == 0, 'token_questions']
insincere = train.loc[train['target'] == 1, 'token_questions']

In [19]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [28]:
# Wrap each sincere question's tokens in a TaggedDocument whose single tag is
# the question's position within `sincere`.
documents = [
    TaggedDocument(words=tokens, tags=[position])
    for position, tokens in enumerate(sincere)
]

### Doc2Vec embedding using the sincere questions from the training data.

In [29]:
%%time
model = Doc2Vec(documents=documents, size=300, window=5, min_count=5, workers=6)

INFO - 10:56:47: collecting all words and their counts
INFO - 10:56:47: PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
INFO - 10:56:47: PROGRESS: at example #10000, processed 125088 words (1721621/s), 14734 word types, 10000 tags
INFO - 10:56:48: PROGRESS: at example #20000, processed 251458 words (1875446/s), 21468 word types, 20000 tags
INFO - 10:56:48: PROGRESS: at example #30000, processed 376882 words (1757512/s), 26574 word types, 30000 tags
INFO - 10:56:48: PROGRESS: at example #40000, processed 502018 words (1754863/s), 30800 word types, 40000 tags
INFO - 10:56:48: PROGRESS: at example #50000, processed 627366 words (1850810/s), 34568 word types, 50000 tags
INFO - 10:56:48: PROGRESS: at example #60000, processed 753089 words (1846247/s), 37949 word types, 60000 tags
INFO - 10:56:48: PROGRESS: at example #70000, processed 879812 words (1761590/s), 41070 word types, 70000 tags
INFO - 10:56:48: PROGRESS: at example #80000, processed 1005336 words (1766928/s

INFO - 10:56:53: PROGRESS: at example #690000, processed 8688539 words (1187057/s), 131355 word types, 690000 tags
INFO - 10:56:53: PROGRESS: at example #700000, processed 8813925 words (1756261/s), 132318 word types, 700000 tags
INFO - 10:56:53: PROGRESS: at example #710000, processed 8939503 words (1885303/s), 133303 word types, 710000 tags
INFO - 10:56:53: PROGRESS: at example #720000, processed 9065685 words (1871375/s), 134281 word types, 720000 tags
INFO - 10:56:53: PROGRESS: at example #730000, processed 9190492 words (2057046/s), 135176 word types, 730000 tags
INFO - 10:56:53: PROGRESS: at example #740000, processed 9315803 words (1857579/s), 136114 word types, 740000 tags
INFO - 10:56:53: PROGRESS: at example #750000, processed 9440730 words (1834113/s), 137086 word types, 750000 tags
INFO - 10:56:53: PROGRESS: at example #760000, processed 9566573 words (1675839/s), 137971 word types, 760000 tags
INFO - 10:56:53: PROGRESS: at example #770000, processed 9694261 words (1805813/

INFO - 10:57:50: EPOCH 1 - PROGRESS: at 7.97% examples, 73257 words/s, in_qsize 11, out_qsize 0
INFO - 10:57:51: EPOCH 1 - PROGRESS: at 8.62% examples, 73276 words/s, in_qsize 11, out_qsize 0
INFO - 10:57:52: EPOCH 1 - PROGRESS: at 9.34% examples, 73301 words/s, in_qsize 11, out_qsize 0
INFO - 10:57:53: EPOCH 1 - PROGRESS: at 9.99% examples, 73373 words/s, in_qsize 11, out_qsize 0
INFO - 10:57:54: EPOCH 1 - PROGRESS: at 10.70% examples, 74051 words/s, in_qsize 11, out_qsize 0
INFO - 10:57:55: EPOCH 1 - PROGRESS: at 11.28% examples, 73287 words/s, in_qsize 12, out_qsize 0
INFO - 10:57:56: EPOCH 1 - PROGRESS: at 11.93% examples, 73557 words/s, in_qsize 11, out_qsize 0
INFO - 10:57:57: EPOCH 1 - PROGRESS: at 12.45% examples, 73081 words/s, in_qsize 11, out_qsize 0
INFO - 10:57:58: EPOCH 1 - PROGRESS: at 13.09% examples, 73248 words/s, in_qsize 11, out_qsize 0
INFO - 10:57:59: EPOCH 1 - PROGRESS: at 13.81% examples, 73625 words/s, in_qsize 11, out_qsize 0
INFO - 10:58:00: EPOCH 1 - PROGRES

INFO - 10:59:24: EPOCH 1 - PROGRESS: at 66.38% examples, 75389 words/s, in_qsize 12, out_qsize 0
INFO - 10:59:25: EPOCH 1 - PROGRESS: at 66.97% examples, 75285 words/s, in_qsize 12, out_qsize 0
INFO - 10:59:26: EPOCH 1 - PROGRESS: at 67.54% examples, 75206 words/s, in_qsize 12, out_qsize 0
INFO - 10:59:28: EPOCH 1 - PROGRESS: at 68.06% examples, 74991 words/s, in_qsize 12, out_qsize 0
INFO - 10:59:29: EPOCH 1 - PROGRESS: at 68.65% examples, 74946 words/s, in_qsize 12, out_qsize 0
INFO - 10:59:30: EPOCH 1 - PROGRESS: at 69.23% examples, 74806 words/s, in_qsize 11, out_qsize 0
INFO - 10:59:31: EPOCH 1 - PROGRESS: at 70.01% examples, 74746 words/s, in_qsize 12, out_qsize 0
INFO - 10:59:32: EPOCH 1 - PROGRESS: at 70.72% examples, 74836 words/s, in_qsize 11, out_qsize 0
INFO - 10:59:33: EPOCH 1 - PROGRESS: at 71.18% examples, 74546 words/s, in_qsize 11, out_qsize 0
INFO - 10:59:35: EPOCH 1 - PROGRESS: at 71.95% examples, 74471 words/s, in_qsize 11, out_qsize 0
INFO - 10:59:36: EPOCH 1 - PRO

INFO - 11:00:53: EPOCH 2 - PROGRESS: at 17.63% examples, 67838 words/s, in_qsize 11, out_qsize 0
INFO - 11:00:54: EPOCH 2 - PROGRESS: at 18.41% examples, 68275 words/s, in_qsize 11, out_qsize 0
INFO - 11:00:55: EPOCH 2 - PROGRESS: at 19.19% examples, 68664 words/s, in_qsize 11, out_qsize 0
INFO - 11:00:57: EPOCH 2 - PROGRESS: at 19.96% examples, 69010 words/s, in_qsize 11, out_qsize 0
INFO - 11:00:58: EPOCH 2 - PROGRESS: at 20.73% examples, 69368 words/s, in_qsize 12, out_qsize 0
INFO - 11:00:59: EPOCH 2 - PROGRESS: at 21.44% examples, 69819 words/s, in_qsize 12, out_qsize 0
INFO - 11:01:00: EPOCH 2 - PROGRESS: at 21.96% examples, 69652 words/s, in_qsize 12, out_qsize 0
INFO - 11:01:01: EPOCH 2 - PROGRESS: at 22.67% examples, 70030 words/s, in_qsize 12, out_qsize 0
INFO - 11:01:02: EPOCH 2 - PROGRESS: at 23.45% examples, 70280 words/s, in_qsize 11, out_qsize 0
INFO - 11:01:03: EPOCH 2 - PROGRESS: at 24.15% examples, 70674 words/s, in_qsize 11, out_qsize 0
INFO - 11:01:04: EPOCH 2 - PRO

INFO - 11:02:26: EPOCH 2 - PROGRESS: at 75.56% examples, 74110 words/s, in_qsize 11, out_qsize 0
INFO - 11:02:28: EPOCH 2 - PROGRESS: at 76.33% examples, 74139 words/s, in_qsize 11, out_qsize 0
INFO - 11:02:29: EPOCH 2 - PROGRESS: at 77.11% examples, 74178 words/s, in_qsize 11, out_qsize 0
INFO - 11:02:30: EPOCH 2 - PROGRESS: at 77.88% examples, 74221 words/s, in_qsize 12, out_qsize 0
INFO - 11:02:31: EPOCH 2 - PROGRESS: at 78.66% examples, 74247 words/s, in_qsize 12, out_qsize 0
INFO - 11:02:32: EPOCH 2 - PROGRESS: at 79.43% examples, 74292 words/s, in_qsize 11, out_qsize 0
INFO - 11:02:34: EPOCH 2 - PROGRESS: at 80.20% examples, 74365 words/s, in_qsize 11, out_qsize 0
INFO - 11:02:35: EPOCH 2 - PROGRESS: at 80.92% examples, 74456 words/s, in_qsize 12, out_qsize 0
INFO - 11:02:36: EPOCH 2 - PROGRESS: at 81.64% examples, 74470 words/s, in_qsize 12, out_qsize 0
INFO - 11:02:37: EPOCH 2 - PROGRESS: at 82.28% examples, 74505 words/s, in_qsize 11, out_qsize 0
INFO - 11:02:38: EPOCH 2 - PRO

INFO - 11:03:54: EPOCH 3 - PROGRESS: at 31.08% examples, 75414 words/s, in_qsize 12, out_qsize 0
INFO - 11:03:56: EPOCH 3 - PROGRESS: at 31.87% examples, 75478 words/s, in_qsize 12, out_qsize 0
INFO - 11:03:57: EPOCH 3 - PROGRESS: at 32.64% examples, 75610 words/s, in_qsize 12, out_qsize 0
INFO - 11:03:58: EPOCH 3 - PROGRESS: at 33.41% examples, 75749 words/s, in_qsize 11, out_qsize 0
INFO - 11:03:59: EPOCH 3 - PROGRESS: at 34.19% examples, 75805 words/s, in_qsize 11, out_qsize 0
INFO - 11:04:00: EPOCH 3 - PROGRESS: at 34.90% examples, 75998 words/s, in_qsize 11, out_qsize 0
INFO - 11:04:01: EPOCH 3 - PROGRESS: at 35.55% examples, 75959 words/s, in_qsize 11, out_qsize 0
INFO - 11:04:02: EPOCH 3 - PROGRESS: at 36.20% examples, 75864 words/s, in_qsize 11, out_qsize 0
INFO - 11:04:03: EPOCH 3 - PROGRESS: at 36.92% examples, 76034 words/s, in_qsize 12, out_qsize 0
INFO - 11:04:04: EPOCH 3 - PROGRESS: at 37.63% examples, 76195 words/s, in_qsize 11, out_qsize 0
INFO - 11:04:05: EPOCH 3 - PRO

INFO - 11:05:28: EPOCH 3 - PROGRESS: at 90.88% examples, 77090 words/s, in_qsize 12, out_qsize 0
INFO - 11:05:30: EPOCH 3 - PROGRESS: at 91.54% examples, 77036 words/s, in_qsize 12, out_qsize 0
INFO - 11:05:31: EPOCH 3 - PROGRESS: at 92.25% examples, 77041 words/s, in_qsize 11, out_qsize 0
INFO - 11:05:32: EPOCH 3 - PROGRESS: at 92.90% examples, 77023 words/s, in_qsize 11, out_qsize 0
INFO - 11:05:33: EPOCH 3 - PROGRESS: at 93.54% examples, 77036 words/s, in_qsize 12, out_qsize 0
INFO - 11:05:34: EPOCH 3 - PROGRESS: at 94.19% examples, 77033 words/s, in_qsize 12, out_qsize 0
INFO - 11:05:35: EPOCH 3 - PROGRESS: at 94.83% examples, 77015 words/s, in_qsize 12, out_qsize 0
INFO - 11:05:36: EPOCH 3 - PROGRESS: at 95.48% examples, 77013 words/s, in_qsize 12, out_qsize 0
INFO - 11:05:37: EPOCH 3 - PROGRESS: at 96.20% examples, 77051 words/s, in_qsize 11, out_qsize 0
INFO - 11:05:38: EPOCH 3 - PROGRESS: at 96.85% examples, 77041 words/s, in_qsize 11, out_qsize 0
INFO - 11:05:39: EPOCH 3 - PRO

INFO - 11:06:55: EPOCH 4 - PROGRESS: at 44.63% examples, 76045 words/s, in_qsize 11, out_qsize 0
INFO - 11:06:56: EPOCH 4 - PROGRESS: at 45.28% examples, 75945 words/s, in_qsize 12, out_qsize 0
INFO - 11:06:57: EPOCH 4 - PROGRESS: at 45.98% examples, 76064 words/s, in_qsize 11, out_qsize 0
INFO - 11:06:58: EPOCH 4 - PROGRESS: at 46.62% examples, 76073 words/s, in_qsize 11, out_qsize 0
INFO - 11:06:59: EPOCH 4 - PROGRESS: at 47.33% examples, 76125 words/s, in_qsize 11, out_qsize 0
INFO - 11:07:00: EPOCH 4 - PROGRESS: at 48.05% examples, 76241 words/s, in_qsize 11, out_qsize 0
INFO - 11:07:01: EPOCH 4 - PROGRESS: at 48.63% examples, 76150 words/s, in_qsize 12, out_qsize 0
INFO - 11:07:02: EPOCH 4 - PROGRESS: at 49.28% examples, 76141 words/s, in_qsize 11, out_qsize 0
INFO - 11:07:03: EPOCH 4 - PROGRESS: at 50.05% examples, 76162 words/s, in_qsize 11, out_qsize 0
INFO - 11:07:05: EPOCH 4 - PROGRESS: at 50.83% examples, 76198 words/s, in_qsize 11, out_qsize 0
INFO - 11:07:06: EPOCH 4 - PRO

INFO - 11:08:24: worker thread finished; awaiting finish of 1 more threads
INFO - 11:08:24: worker thread finished; awaiting finish of 0 more threads
INFO - 11:08:24: EPOCH - 4 : training on 15433920 raw words (12271046 effective words) took 161.2s, 76128 effective words/s
INFO - 11:08:25: EPOCH 5 - PROGRESS: at 0.45% examples, 44573 words/s, in_qsize 11, out_qsize 0
INFO - 11:08:26: EPOCH 5 - PROGRESS: at 1.23% examples, 62068 words/s, in_qsize 11, out_qsize 0
INFO - 11:08:28: EPOCH 5 - PROGRESS: at 2.01% examples, 68638 words/s, in_qsize 11, out_qsize 0
INFO - 11:08:29: EPOCH 5 - PROGRESS: at 2.79% examples, 71544 words/s, in_qsize 12, out_qsize 0
INFO - 11:08:30: EPOCH 5 - PROGRESS: at 3.58% examples, 73064 words/s, in_qsize 12, out_qsize 0
INFO - 11:08:31: EPOCH 5 - PROGRESS: at 4.35% examples, 73117 words/s, in_qsize 11, out_qsize 0
INFO - 11:08:32: EPOCH 5 - PROGRESS: at 5.06% examples, 74842 words/s, in_qsize 12, out_qsize 0
INFO - 11:08:33: EPOCH 5 - PROGRESS: at 5.51% examples

INFO - 11:09:55: EPOCH 5 - PROGRESS: at 55.24% examples, 74494 words/s, in_qsize 12, out_qsize 0
INFO - 11:09:56: EPOCH 5 - PROGRESS: at 56.02% examples, 74586 words/s, in_qsize 11, out_qsize 0
INFO - 11:09:57: EPOCH 5 - PROGRESS: at 56.79% examples, 74695 words/s, in_qsize 11, out_qsize 0
INFO - 11:09:58: EPOCH 5 - PROGRESS: at 57.58% examples, 74789 words/s, in_qsize 11, out_qsize 0
INFO - 11:10:00: EPOCH 5 - PROGRESS: at 58.35% examples, 74883 words/s, in_qsize 12, out_qsize 0
INFO - 11:10:01: EPOCH 5 - PROGRESS: at 59.13% examples, 74943 words/s, in_qsize 11, out_qsize 0
INFO - 11:10:02: EPOCH 5 - PROGRESS: at 59.85% examples, 75049 words/s, in_qsize 11, out_qsize 0
INFO - 11:10:03: EPOCH 5 - PROGRESS: at 60.43% examples, 75014 words/s, in_qsize 11, out_qsize 0
INFO - 11:10:04: EPOCH 5 - PROGRESS: at 61.09% examples, 75045 words/s, in_qsize 11, out_qsize 0
INFO - 11:10:05: EPOCH 5 - PROGRESS: at 61.80% examples, 75075 words/s, in_qsize 11, out_qsize 0
INFO - 11:10:06: EPOCH 5 - PRO

Wall time: 14min 17s


In [None]:
%%time
# Find most similiar sentences
similar_questions = pd.DataFrame()
insincere_list = []
sincere_list = []
sincere_index = []

for n in range(0, len(insincere)):
    new_vector = model.infer_vector(insincere.iloc[n])
    similarity = model.docvecs.most_similar([new_vector])
    if n % 100 == 0:
        clear_output(wait=True)
        print(n)
        print(len(insincere_list))
    for i in similarity:
        if i[1] > 0.90:
            print('Insincere:', insincere.iloc[n])
            print(i[0])
            print('Sincere:', sincere.iloc[i[0]])
            insincere_list.append(insincere.iloc[n])
            sincere_list.append(sincere.iloc[i[0]])
            sincere_index.append(i[0])

similar_questions['insincere'] = insincere_list
similar_questions['sincere'] = sincere_list
similar_questions['sincere_index'] = sincere_index


44300
25


In [75]:
# Re-query the nearest sincere documents for `new_vector` (whatever vector the
# similarity-search loop inferred last) and show the similarity score of the
# first returned match.
similarity = model.docvecs.most_similar([new_vector])
first_match = similarity[0]
first_match[1]

0.5065800547599792

In [65]:
# Look up the sincere question whose index LABEL is 151365.
# NOTE(review): if 151365 is a Doc2Vec tag printed by the similarity search,
# tags are positions from enumerate(sincere) and `sincere` keeps train's index
# labels, so the matching accessor would be .iloc rather than .loc. Confirm
# which one is intended.
sincere.loc[151365]

['is', 'petrole', 'liquid', 'or', 'gas', 'at', 'room', 'temperature']

In [58]:
# Raw text of the question stored at index label 1 of the training frame.
train.loc[1, 'question_text']

'Do you have an adopted dog, how would you encourage people to adopt and not shop?'