In [1]:
import pandas as pd
import nltk
import operator
import langid
from textblob import TextBlob

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier

In [2]:
train = pd.read_csv('./train.csv')

In [3]:
train.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [4]:
train.shape

(1306122, 3)

In [5]:
no_insincere = train[train['target']==1].target.count()
no_sincere = train[train['target']==0].target.count()

print('No. of insincere questions:', no_insincere)
print('No. of sincere questions:', no_sincere)
print('% of insincere questions:', train.target.mean())
print('Null score:', 1- train.target.mean())

No. of insincere questions: 80810
No. of sincere questions: 1225312
% of insincere questions: 0.06187017751787352
Null score: 0.9381298224821265


Combining Questions into single element

In [None]:
# Full text in one corpus
full_text = str()
for s in train['question_text']:
    full_text = full_text + s + ' '


In [None]:
full_text

In [62]:
# Full text into one list of single tokens, removing capitalisation and non-alpha values.
# Worth reconsidering capitalisation could be useful in determining insincerity.

full_text_list = [ w.lower() for s in train['question_text'] for w in nltk.word_tokenize(s) if w.isalpha()]

In [97]:
# these bigrams include end of one question and start of another
list(bigrams(full_text))

[('how', 'did'),
 ('did', 'quebec'),
 ('quebec', 'nationalists'),
 ('nationalists', 'see'),
 ('see', 'their'),
 ('their', 'province'),
 ('province', 'as'),
 ('as', 'a'),
 ('a', 'nation'),
 ('nation', 'in'),
 ('in', 'the'),
 ('the', 'do'),
 ('do', 'you'),
 ('you', 'have'),
 ('have', 'an'),
 ('an', 'adopted'),
 ('adopted', 'dog'),
 ('dog', 'how'),
 ('how', 'would'),
 ('would', 'you'),
 ('you', 'encourage'),
 ('encourage', 'people'),
 ('people', 'to'),
 ('to', 'adopt'),
 ('adopt', 'and'),
 ('and', 'not'),
 ('not', 'shop'),
 ('shop', 'why'),
 ('why', 'does'),
 ('does', 'velocity'),
 ('velocity', 'affect'),
 ('affect', 'time'),
 ('time', 'does'),
 ('does', 'velocity'),
 ('velocity', 'affect'),
 ('affect', 'space'),
 ('space', 'geometry'),
 ('geometry', 'how'),
 ('how', 'did'),
 ('did', 'otto'),
 ('otto', 'von'),
 ('von', 'guericke'),
 ('guericke', 'used'),
 ('used', 'the'),
 ('the', 'magdeburg'),
 ('magdeburg', 'hemispheres'),
 ('hemispheres', 'can'),
 ('can', 'i'),
 ('i', 'convert'),
 ('co

In [89]:
temp = [s for s in train['question_text'][:100] ]

In [95]:
nltk.Text(temp[1:100]).collocations()




In [25]:
f_dist = FreqDist([train['question_text'])
print(f_dist)

<FreqDist with 1306122 samples and 1306122 outcomes>


In [7]:
f_dist.plot(50)

<Figure size 640x480 with 1 Axes>

Collocations

In [98]:
bigram_vect = CountVectorizer(ngram_range=(2,2))
bigram_dtm = vect.fit_transform(train.question_text)

Number of lanuages

In [None]:
# lang_list = set([langid.classify(s)[0] for s in train['question_text']])


In [147]:
# Some are questions in different lanuages, some are questions regarding different lanuages.
basic_stats['no_of_lang'] = len(lang_list)
basic_stats

Unnamed: 0,no_of_questions,no_of_insincere,no_of_sincere,%_insinere,null_score,no_of_lang
basic_stats,1306122.0,80810.0,1225312.0,6.187018,0.93813,84


Prelim Models

In [8]:
X = train.question_text
y = train.target

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

#### Default count vectorizer

In [10]:
vect = CountVectorizer()
vect.fit(X_train)
raw_train_dtm = vect.transform(X_train)
raw_test_dtm = vect.transform(X_test)
raw_train_dtm

<979591x167408 sparse matrix of type '<class 'numpy.int64'>'
	with 11321413 stored elements in Compressed Sparse Row format>

In [11]:
nb = MultinomialNB()
nb.fit(raw_train_dtm, y_train)
print('train score:', nb.score(raw_train_dtm, y_train))
print('test score:', nb.score(raw_test_dtm, y_test))

train score: 0.9352168404977179
test score: 0.9342788280438917


#### Remove 'English' stop words

In [12]:
vect = CountVectorizer( stop_words='english')
vect.fit(X_train)
raw_train_dtm = vect.transform(X_train)
raw_test_dtm = vect.transform(X_test)
raw_train_dtm

<979591x167099 sparse matrix of type '<class 'numpy.int64'>'
	with 5741332 stored elements in Compressed Sparse Row format>

In [13]:
nb = MultinomialNB()
nb.fit(raw_train_dtm, y_train)
print('train score:', nb.score(raw_train_dtm, y_train))
print('test score:', nb.score(raw_test_dtm, y_test))

train score: 0.9388193644082071
test score: 0.9375036367144314


#### Bi-gram

In [14]:
vect = CountVectorizer(ngram_range=(1,2))
vect.fit(X_train)
raw_train_dtm = vect.transform(X_train)
raw_test_dtm = vect.transform(X_test)
raw_train_dtm

<979591x2617452 sparse matrix of type '<class 'numpy.int64'>'
	with 22271663 stored elements in Compressed Sparse Row format>

In [15]:
%%time
nb = MultinomialNB()
nb.fit(raw_train_dtm, y_train)
print('train score:', nb.score(raw_train_dtm, y_train))
print('test score:', nb.score(raw_test_dtm, y_test))

train score: 0.9612521960695841
test score: 0.9483540613295521
Wall time: 1.12 s


In [16]:
%%time
# dct = DecisionTreeClassifier(max_depth=20)
# dct.fit(raw_train_dtm, y_train)
# print('train score:', nb.score(raw_train_dtm, y_train))
# print('test score:', nb.score(raw_test_dtm, y_test))

train score: 0.9612521960695841
test score: 0.9483540613295521
Wall time: 6min 28s


In [17]:
%%time
# dct = DecisionTreeClassifier(max_depth=40)
# dct.fit(raw_train_dtm, y_train)
# print('train score:', nb.score(raw_train_dtm, y_train))
# print('test score:', nb.score(raw_test_dtm, y_test))

train score: 0.9612521960695841
test score: 0.9483540613295521
Wall time: 20min 26s


#### Tri-gram

In [15]:
vect = CountVectorizer(ngram_range=(1,3))
vect.fit(X_train)
raw_train_dtm = vect.transform(X_train)
raw_test_dtm = vect.transform(X_test)
raw_train_dtm

<979591x8587254 sparse matrix of type '<class 'numpy.int64'>'
	with 32284653 stored elements in Compressed Sparse Row format>

In [16]:
nb = MultinomialNB()
nb.fit(raw_train_dtm, y_train)
print('train score:', nb.score(raw_train_dtm, y_train))
print('test score:', nb.score(raw_test_dtm, y_test))

train score: 0.9786757942855743
test score: 0.9465594384606668


#### Min document frequency

In [25]:
vect = CountVectorizer(min_df = 2)
vect.fit(X_train)
raw_train_dtm = vect.transform(X_train)
raw_test_dtm = vect.transform(X_test)
raw_train_dtm

<979591x79069 sparse matrix of type '<class 'numpy.int64'>'
	with 11230041 stored elements in Compressed Sparse Row format>

In [26]:
nb = MultinomialNB()
nb.fit(raw_train_dtm, y_train)
print('train score:', nb.score(raw_train_dtm, y_train))
print('test score:', nb.score(raw_test_dtm, y_test))

train score: 0.9269725834557484
test score: 0.9244635271995615


#### Max document frequency

In [21]:
vect = CountVectorizer(max_df = 1e3)
vect.fit(X_train)
raw_train_dtm = vect.transform(X_train)
raw_test_dtm = vect.transform(X_test)
raw_train_dtm

<979591x167271 sparse matrix of type '<class 'numpy.int64'>'
	with 11318243 stored elements in Compressed Sparse Row format>

In [22]:
nb = MultinomialNB()
nb.fit(raw_train_dtm, y_train)
print('train score:', nb.score(raw_train_dtm, y_train))
print('test score:', nb.score(raw_test_dtm, y_test))

train score: 0.9353669031258964
test score: 0.9338347660712153
