In [1]:
# text manipulation
import re
import string

# Data management
import pandas as pd
import numpy as np
from scipy.sparse import *
import scipy

# NLP
import nltk
import nltk.collocations as collocations
from nltk.tag import tnt
import spacy

# modelling
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import f1_score, confusion_matrix

#visualisation
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
train = pd.read_csv('./train.csv')

In [3]:
# Class balance of the labels: rows with target == 1 are counted as insincere,
# rows with target == 0 as sincere.
no_insincere = train.loc[train['target'] == 1, 'target'].count()
no_sincere = train.loc[train['target'] == 0, 'target'].count()

print('No. of insincere questions:', no_insincere)
print('No. of sincere questions:', no_sincere)
# The mean of a 0/1 column is the share of ones (a fraction, not a percentage).
print('% of insincere questions:', train['target'].mean())
# One minus that share: the accuracy obtained by always predicting target == 0.
print('Null score:', 1 - train['target'].mean())

No. of insincere questions: 80810
No. of sincere questions: 1225312
% of insincere questions: 0.06187017751787352
Null score: 0.9381298224821265


# Define functions and pipelines

In [4]:
def vect_trans(vectorizer, X_train, X_test):
    """Fit ``vectorizer`` on the training split and transform both splits.

    Any object exposing ``fit`` and ``transform`` is accepted, so this works
    for vectorizers as well as transformers. The object passed in is fitted
    in place (no copy is made).

    Returns a ``(transformed_train, transformed_test)`` tuple.
    """
    # Fit on the training data only; the test split is merely transformed.
    vectorizer.fit(X_train)
    train_features = vectorizer.transform(X_train)
    test_features = vectorizer.transform(X_test)
    return train_features, test_features

In [5]:
# Hold-out evaluation helper: fits any estimator, prints its scores and stores them in a df.
def model_score(model, X_train, X_test, y_train, y_test, score_df, model_label):
    """Fit ``model`` on the training split and record its hold-out scores.

    ``model`` is fitted in place. The values returned by ``model.score`` on
    the train and test splits and the F1 score of the test predictions are
    printed; the two test values are also written into row ``model_label``
    of ``score_df`` (columns ``Test_Accuracy`` and ``Test_F1_score``).
    Nothing is returned.
    """
    model.fit(X_train, y_train)

    test_accuracy = model.score(X_test, y_test)
    test_f1 = f1_score(y_test, model.predict(X_test))
    train_accuracy = model.score(X_train, y_train)

    report = (
        ('Train Accuracy :', train_accuracy),
        ('Test Accuracy:', test_accuracy),
        ('Test F1 score:', test_f1),
    )
    for caption, value in report:
        print(caption, value)

    # Only the test-set metrics are kept in the comparison table.
    for column, value in (('Test_Accuracy', test_accuracy), ('Test_F1_score', test_f1)):
        score_df.loc[model_label, column] = value

In [6]:
# Cross-validation helper: prints mean/std accuracy and mean F1 and stores them in a df.
def cv_score(model, X, y, model_label, cv=5, score_df=None):
    """Cross-validate ``model`` and record the aggregated scores.

    Parameters
    ----------
    model : estimator (or pipeline) handed to ``cross_validate``.
    X, y : features and labels to cross-validate on.
    model_label : row label under which the scores are stored.
    cv : cross-validation setting forwarded to ``cross_validate`` (default 5).
    score_df : DataFrame that receives the scores. When omitted, the
        notebook-level ``score_df`` variable is used, which keeps existing
        calls working. Passing it explicitly (as ``model_score`` already
        requires) avoids depending on hidden notebook state.

    The mean and standard deviation of the per-fold accuracy and the mean
    per-fold F1 are printed and written to columns ``CV_Accuracy``,
    ``CV_Acc_STD`` and ``CV_F1_score`` of row ``model_label``.

    Raises
    ------
    NameError
        If ``score_df`` is omitted and no notebook-level ``score_df`` exists.
    """
    if score_df is None:
        # Fall back to the module-level table, as the original implementation did.
        try:
            score_df = globals()['score_df']
        except KeyError:
            raise NameError("name 'score_df' is not defined") from None

    cv_result = cross_validate(model, X, y, cv=cv, n_jobs=-1, scoring=['accuracy', 'f1'])

    accuracy_mean = cv_result['test_accuracy'].mean()
    accuracy_std = cv_result['test_accuracy'].std()
    f1_mean = cv_result['test_f1'].mean()

    print('Test Accuracy Mean:', accuracy_mean)
    print('Test Accuracy STD:', accuracy_std)
    print('Test F1:', f1_mean)

    score_df.loc[model_label, 'CV_Accuracy'] = accuracy_mean
    score_df.loc[model_label, 'CV_Acc_STD'] = accuracy_std
    score_df.loc[model_label, 'CV_F1_score'] = f1_mean

In [7]:
# GridSearchCV helper: runs the search, prints the best parameters/score and returns the fitted search.
def gridcv(model, X, y, params, cv=5, n_jobs=6, scoring=None):
    """Run an exhaustive grid search and report the best configuration.

    Parameters
    ----------
    model : estimator or pipeline to tune.
    X, y : data the search is fitted on.
    params : parameter grid forwarded as ``param_grid``.
    cv : cross-validation setting forwarded to ``GridSearchCV`` (default 5).
    n_jobs : number of parallel workers (default 6, the previously
        hard-coded value).
    scoring : optional scoring specification forwarded to ``GridSearchCV``;
        ``None`` keeps the search's default scoring.

    Returns
    -------
    The fitted ``GridSearchCV`` object, so ``best_estimator_`` and
    ``cv_results_`` stay available after the (long) search. End the calling
    line with ``;`` in a notebook to suppress the object's repr.
    """
    # A distinct local name avoids shadowing this function inside its own body.
    search = GridSearchCV(estimator=model, param_grid=params, cv=cv,
                          verbose=10, n_jobs=n_jobs, scoring=scoring)
    search.fit(X, y)

    print(search.best_params_)
    print(search.best_score_)
    return search
    

In [8]:
# Combined stop list: NLTK's English stop words, every single character of
# string.punctuation, and a few extra quote/contraction tokens.
# Kept as a list because it is passed to the vectorizers as `stop_words`.
stopwords = list(nltk.corpus.stopwords.words('english')) + list(string.punctuation) + ["''", '``','’', "'s", "'d", "'ll", "'t"]

In [9]:
# CountVectorizer -> MultinomialNB pipeline and its grid-search parameters.
# Step 'CV' is the vectorizer (using the custom stop list), step 'NB' the classifier.
pipeCVNB = Pipeline([('CV',CountVectorizer(stop_words=stopwords)), 
                    ('NB',MultinomialNB())])

# Candidate values per vectorizer parameter; keys are prefixed with the
# pipeline step name ('CV__...') so they address the 'CV' step.
paramsCVNB = {'CV__max_df':(1.0, 0.9, 0.8, 0.7),
       'CV__min_df': (1, 2, 0.01 , 0.1, 0.2),
         'CV__ngram_range':((1,1), (1,2), (1,3))}

In [10]:
# TfidfVectorizer -> MultinomialNB pipeline and its grid-search parameters.
# Step 'TV' is the vectorizer (using the custom stop list), step 'NB' the classifier.
pipeTVNB = Pipeline([('TV',TfidfVectorizer(stop_words=stopwords)), 
                    ('NB',MultinomialNB())])

# Candidate values per vectorizer parameter; keys are prefixed with the
# pipeline step name ('TV__...') so they address the 'TV' step.
paramsTVNB = {'TV__max_df':(1.0, 0.9, 0.8, 0.7, 0.6),
       'TV__min_df': (1, 2, 0.01, 0.05, 0.1),
         'TV__ngram_range':((1,1), (1,2), (1,3), (2,2), (2,3))}

In [11]:
score_df = pd.DataFrame()

# Lemmatization

In [14]:
# Use spaCy to lemmatize with POS tags in one step, without having to convert
# between WordNet and Treebank tags as an NLTK-based approach would.
# The parser and NER components are disabled when loading the model.
spac = spacy.load('en', disable=['parser', 'ner'])

# Hash-based snapshot of the stop list: `stopwords` is a list, so testing each
# token against it directly is a linear scan repeated for every token of every
# question. Rebuild this set if `stopwords` is changed afterwards.
_stopword_set = frozenset(stopwords)


def lemmatizer(text):
    """Return ``text`` as a space-joined string of lemmas without stop words.

    ``text`` is run through the ``spac`` pipeline; each token's ``lemma_`` is
    kept unless that lemma appears in the stop list.
    """
    return ' '.join(token.lemma_ for token in spac(text)
                    if token.lemma_ not in _stopword_set)

In [23]:
%%time
lemma_q = [lemmatizer(q) for q in train.question_text]

Wall time: 48min 33s


In [37]:
lemma_train = pd.DataFrame(lemma_q, columns = ['question_text'])
lemma_train['target'] = train.target
lemma_train

Unnamed: 0,question_text,target
0,quebec nationalist see -PRON- province nation ...,0
1,-PRON- adopt dog would -PRON- encourage people...,0
2,velocity affect time velocity affect space geo...,0
3,otto von guericke use magdeburg hemisphere,0
4,-PRON- convert montra helicon mountain bike ch...,0
5,gaza slowly become auschwitz dachau treblinka ...,0
6,quora automatically ban conservative opinion r...,0
7,-PRON- crazy -PRON- wash wipe -PRON- grocery g...,0
8,thing dress moderately different dress modestly,0
9,-PRON- -PRON- -PRON- ever phase wherein -PRON-...,0


In [61]:
# lemma_train.to_pickle('./lemma_train.pkl')

In [62]:
# Reload the cached lemmatized frame instead of repeating the slow lemmatization.
# NOTE(review): this rebinds `lemma_train`, replacing the frame built in memory
# earlier in the notebook, and the pickle is only written by the commented-out
# `to_pickle` line in the previous cell - on a fresh checkout that line has to
# be run once before this cell can succeed.
lemma_train = pd.read_pickle('./lemma_train.pkl')

In [38]:
X = lemma_train.question_text
y = lemma_train.target

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state = 495)

#### Default count vectorizer

In [40]:
%%time
X_train_raw, X_test_raw =  vect_trans(CountVectorizer(), X_train, X_test)

Wall time: 17 s


In [42]:
model = MultinomialNB()
model_label = 'Raw_token_NB'

model_score(model, X_train_raw, X_test_raw, y_train, y_test, score_df, model_label)
cv_score(model, X_train_raw, y_train, model_label)
score_df

Train Accuracy : 0.9367031751006287
Test Accuracy: 0.9367257626381569
Test F1 score: 0.5438972162740899
Test Accuracy Mean: 0.9316684201196621
Test Accuracy STD: 0.0005027082113257391
Test F1: 0.5144468147496293


Unnamed: 0,Test_Accuracy,Test_F1_score,CV_Accuracy,CV_Acc_STD,CV_F1_score
Raw_token_NB,0.936726,0.543897,0.931668,0.000503,0.514447


In [43]:
nb = MultinomialNB()
nb.fit(X_train_raw, y_train)  
test_score =  nb.score(X_test_raw, y_test)
print('train score:', nb.score(X_train_raw, y_train))
print('test score:', test_score)
y_pred = nb.predict(X_test_raw)

train score: 0.9367031751006287
test score: 0.9367257626381569


In [44]:
print(f1_score(y_test, y_pred) )
print(f1_score(y_test, y_pred, average='macro') )
print(f1_score(y_test, y_pred, average='micro') )
print(f1_score(y_test, y_pred, average='weighted') )
confusion_matrix(y_test, y_pred)

0.5438972162740899
0.754951028488399
0.9367257626381569
0.9398883606860133


array([[293551,  12777],
       [  7884,  12319]], dtype=int64)

### Remove stop words

In [45]:
%%time
X_train_t, X_test_t=  vect_trans(CountVectorizer(max_df=1.0, min_df=1, ngram_range=(1,1), 
                                                    stop_words=stopwords), X_train, X_test)

Wall time: 18.9 s


In [46]:
model = MultinomialNB()
model_label = 'Token_NB'

model_score(model, X_train_t, X_test_t, y_train, y_test, score_df, model_label)
cv_score(model, X_train_t, y_train, model_label)
score_df

Train Accuracy : 0.9366899042559599
Test Accuracy: 0.936716575149067
Test F1 score: 0.5438209192458828
Test Accuracy Mean: 0.9316520867713353
Test Accuracy STD: 0.0005082745707158826
Test F1: 0.5143515648115624


Unnamed: 0,Test_Accuracy,Test_F1_score,CV_Accuracy,CV_Acc_STD,CV_F1_score
Raw_token_NB,0.936726,0.543897,0.931668,0.000503,0.514447
Token_NB,0.936717,0.543821,0.931652,0.000508,0.514352


### Bi-gram

In [47]:
%%time
X_train_bi, X_test_bi=  vect_trans(CountVectorizer(ngram_range=(1,2), stop_words=stopwords), X_train, X_test)

Wall time: 47.9 s


In [48]:
model = MultinomialNB()
model_label = 'Bigram_NB'

model_score(model, X_train_bi, X_test_bi, y_train, y_test, score_df, model_label)
cv_score(model, X_train_bi, y_train, model_label)
score_df

Train Accuracy : 0.9644290321164649
Test Accuracy: 0.9460020641225488
Test F1 score: 0.41823940873696713
Test Accuracy Mean: 0.9378036343865782
Test Accuracy STD: 0.00016942306706332178
Test F1: 0.44701183939132355


Unnamed: 0,Test_Accuracy,Test_F1_score,CV_Accuracy,CV_Acc_STD,CV_F1_score
Raw_token_NB,0.936726,0.543897,0.931668,0.000503,0.514447
Token_NB,0.936717,0.543821,0.931652,0.000508,0.514352
Bigram_NB,0.946002,0.418239,0.937804,0.000169,0.447012


### Tri-gram

In [49]:
%%time
X_train_tri, X_test_tri=  vect_trans(CountVectorizer(ngram_range=(1,3), stop_words=stopwords), X_train, X_test)

Wall time: 1min 18s


In [50]:
model = MultinomialNB()
model_label = 'Trigram_NB'

model_score(model, X_train_tri, X_test_tri, y_train, y_test, score_df, model_label)
cv_score(model, X_train_tri, y_train, model_label)
score_df

Train Accuracy : 0.9810921088495096
Test Accuracy: 0.944179878786394
Test F1 score: 0.27725127879773187
Test Accuracy Mean: 0.939216468581791
Test Accuracy STD: 0.0002987832989750409
Test F1: 0.4579298632261599


Unnamed: 0,Test_Accuracy,Test_F1_score,CV_Accuracy,CV_Acc_STD,CV_F1_score
Raw_token_NB,0.936726,0.543897,0.931668,0.000503,0.514447
Token_NB,0.936717,0.543821,0.931652,0.000508,0.514352
Bigram_NB,0.946002,0.418239,0.937804,0.000169,0.447012
Trigram_NB,0.94418,0.277251,0.939216,0.000299,0.45793


### GridSearch min/max df

In [52]:
# params = {'CV__max_df':(1.0, 0.9),
#        'CV__min_df': (1, 2, 0.01, 0.02),
#         'CV__ngram_range':((1,1), (1,2), (1,3))}

# gridcv(pipeCVNB, X_train, y_train, params)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   1 tasks      | elapsed:   22.2s
[Parallel(n_jobs=6)]: Done   6 tasks      | elapsed:   54.8s
[Parallel(n_jobs=6)]: Done  13 tasks      | elapsed:  2.1min
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:  2.6min
[Parallel(n_jobs=6)]: Done  29 tasks      | elapsed:  4.2min
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:  5.3min
[Parallel(n_jobs=6)]: Done  49 tasks      | elapsed:  6.5min
[Parallel(n_jobs=6)]: Done  60 tasks      | elapsed:  8.0min
[Parallel(n_jobs=6)]: Done  73 tasks      | elapsed: 10.1min
[Parallel(n_jobs=6)]: Done  86 tasks      | elapsed: 11.8min
[Parallel(n_jobs=6)]: Done 101 tasks      | elapsed: 13.7min
[Parallel(n_jobs=6)]: Done 120 out of 120 | elapsed: 16.4min finished


{'CV__max_df': 1.0, 'CV__min_df': 1, 'CV__ngram_range': (1, 2)}
0.9460979122919667


In [53]:
# %%time
# X_train_t, X_test_t=  vect_trans(CountVectorizer(max_df=1.0, min_df=1, ngram_range=(1,2), 
#                                                     stop_words=stopwords), X_train, X_test)

Wall time: 52.3 s


In [54]:
# model = MultinomialNB()
# model_label = 'Grid_DFNB'

# model_score(model, X_train_t, X_test_t, y_train, y_test, score_df, model_label)
# cv_score(model, X_train_t, y_train, model_label)
# score_df

train score: 0.9744383114993911
test score: 0.9471045628133317
Test Accuracy Mean: 0.9444359946999341
Test Accuracy STD: 0.0004057190754223053
Test F1: 0.47813979276026675


Unnamed: 0,Test_score,CV_Accuracy,CV_Acc_STD,F1_score
token_NB,0.934414,0.932161,0.000523,0.548922
stop_token_NB,0.937608,0.932782,0.000462,0.527635
stop_bi_NB,0.947105,0.944436,0.000406,0.47814
stop_bi_RF,0.938128,0.944436,0.000406,0.47814
stop_tri_NB,0.944734,0.944596,0.000425,0.505501
stop_tri_RF,0.938128,0.944596,0.000425,0.505501
stop_df_NB,0.947105,0.944436,0.000406,0.47814


### TFIDF

In [51]:
X_train_tf_t, X_test_tf_t = vect_trans(TfidfTransformer(), X_train_t, X_test_t)

In [52]:
model = MultinomialNB()
model_label = 'Tfidf_t_NB'

model_score(model, X_train_tf_t , X_test_tf_t, y_train, y_test, score_df, model_label)
cv_score(model, X_train_tf_t, y_train, model_label)
score_df

Train Accuracy : 0.941965575428929
Test Accuracy: 0.9408693202176822
Test F1 score: 0.11746960416857119
Test Accuracy Mean: 0.939631948814068
Test Accuracy STD: 0.00010638318001755925
Test F1: 0.0881597738101094


Unnamed: 0,Test_Accuracy,Test_F1_score,CV_Accuracy,CV_Acc_STD,CV_F1_score
Raw_token_NB,0.936726,0.543897,0.931668,0.000503,0.514447
Token_NB,0.936717,0.543821,0.931652,0.000508,0.514352
Bigram_NB,0.946002,0.418239,0.937804,0.000169,0.447012
Trigram_NB,0.94418,0.277251,0.939216,0.000299,0.45793
Tfidf_t_NB,0.940869,0.11747,0.939632,0.000106,0.08816


### NLTK Best Bigrams

In [55]:
# create one flat list of all (lower-cased, stop-word-free) question tokens
# Set lookup: `stopwords` is a list, and scanning it for each token is needlessly slow.
_stopword_set = frozenset(stopwords)
full_text = []

for text in X_train:
    full_text.extend(w for w in nltk.word_tokenize(text.lower()) if w not in _stopword_set)

In [56]:
len(full_text)

7226905

In [57]:
# create bigram vocabulary
bigram_measures = collocations.BigramAssocMeasures()

finder = nltk.BigramCollocationFinder.from_words(full_text)
# scored = finder.score_ngrams( bigram_measures.likelihood_ratio  )
bigram_vocab = finder.nbest(bigram_measures.likelihood_ratio, 40)
bigram_vocab

[('-pron-', '-pron-'),
 ('-pron-', 'get'),
 ('united', 'states'),
 ('year', 'old'),
 ('good', 'way'),
 ('donald', 'trump'),
 ('-pron-', 'think'),
 ('would', '-pron-'),
 ('-pron-', 'want'),
 ('-pron-', 'possible'),
 ('-pron-', 'find'),
 ('computer', 'science'),
 ('even', 'though'),
 ('north', 'korea'),
 ('high', 'school'),
 ('-pron-', 'feel'),
 ('social', 'medium'),
 ('-pron-', 'know'),
 ('would', 'happen'),
 ('get', 'rid'),
 ('major', 'accomplishment'),
 ('look', 'like'),
 ('jee', 'mains'),
 ('pro', 'con'),
 ('-pron-', 'ever'),
 ('-pron-', 'take'),
 ('tell', '-pron-'),
 ('new', 'york'),
 ('-pron-', 'need'),
 ('feel', 'like'),
 ('would', 'win'),
 ('tv', 'show'),
 ('harry', 'potter'),
 ('real', 'estate'),
 ('ssc', 'cgl'),
 ('saudi', 'arabia'),
 ('good', '-pron-'),
 ('star', 'wars'),
 ('good', 'place'),
 ('mechanical', 'engineering')]

In [58]:
# Rank bigrams by pointwise mutual information (PMI) and print the 30 highest-scoring ones.
# (No vocabulary is stored here; this cell only inspects the ranking.)
bigram_measures = collocations.BigramAssocMeasures()


finder3 = nltk.BigramCollocationFinder.from_words(full_text)
# Drops bigrams containing a stop word. `full_text` was already built without
# stop words, so this filter should not remove anything further.
finder3.apply_word_filter(lambda x: x in stopwords)
# NOTE(review): no frequency filter is applied before scoring; the printed top
# entries all share the same score and look like very rare token pairs.
# Consider `apply_freq_filter` before using PMI - TODO confirm intent.
scored = finder3.score_ngrams(bigram_measures.pmi)
for bscore in scored[:30]:
    print (bscore)



# finder = nltk.BigramCollocationFinder.from_words(full_text)
# # scored = finder.score_ngrams( bigram_measures.likelihood_ratio  )
# bigram_vocab = finder.nbest(bigram_measures.pmi, 40)
# bigram_vocab

(('\x02tñ\x7f¼é\x1aaùõ\x8d¶rwìiìñó', '\x10œø'), 22.78494649905416)
(('\x10œø', '\x17'), 22.78494649905416)
(('\x17', 'y.¾ƒe'), 22.78494649905416)
(('+5', '=3\\sqrt'), 22.78494649905416)
(('+\\sum_', '\\theta=8'), 22.78494649905416)
(('+z^2', 'dzdydx'), 22.78494649905416)
((',13sin', '13sin'), 22.78494649905416)
(('-.1', 'b=3.4'), 22.78494649905416)
(('-1,6', '-1,0'), 22.78494649905416)
(('-12288', '-61440'), 22.78494649905416)
(('-16t^2', '14t+4'), 22.78494649905416)
(('-2.50', '-2.75.both'), 22.78494649905416)
(('-207.113', '0.00206x'), 22.78494649905416)
(('-210j', 'k^-1'), 22.78494649905416)
(('-250', 'azithromycin-500'), 22.78494649905416)
(('-2x+y', 'dx=0'), 22.78494649905416)
(("-2y'-3y", 'te^'), 22.78494649905416)
(('-3009', '-416'), 22.78494649905416)
(('-4.4', '-4.04'), 22.78494649905416)
(('-43.309439', '-97.978697'), 22.78494649905416)
(('-5x3', '-33x2=3x=18'), 22.78494649905416)
(('-6,8', '1,3,7'), 22.78494649905416)
(('-61440', '-7290'), 22.78494649905416)
(('-65610', '-12

In [59]:
# create trigram vocabulary
trigram_measures = collocations.TrigramAssocMeasures()
finder = nltk.TrigramCollocationFinder.from_words(full_text)
trigram_vocab = finder.nbest(trigram_measures.likelihood_ratio, 20)
trigram_vocab

[('-pron-', '-pron-', '-pron-'),
 ('-pron-', '-pron-', 'get'),
 ('-pron-', '-pron-', 'think'),
 ('would', '-pron-', '-pron-'),
 ('-pron-', '-pron-', 'want'),
 ('would', '-pron-', 'get'),
 ('-pron-', '-pron-', 'find'),
 ('-pron-', '-pron-', 'possible'),
 ('-pron-', '-pron-', 'feel'),
 ('-pron-', '-pron-', 'know'),
 ('tell', '-pron-', '-pron-'),
 ('-pron-', 'get', '-pron-'),
 ('-pron-', '-pron-', 'ever'),
 ('-pron-', '-pron-', 'take'),
 ('good', '-pron-', '-pron-'),
 ('-pron-', '-pron-', 'need'),
 ('-pron-', 'get', 'rid'),
 ('-pron-', '-pron-', 'see'),
 ('-pron-', '-pron-', 'favorite'),
 ('-pron-', '-pron-', 'true')]

In [None]:
# recreate text using ngrams
def ngram_to_corpus(df, text_col, ngram_list, n, new_col):
    """Rewrite each text so that known n-grams become single ``_``-joined tokens.

    Every value of ``df[text_col]`` is tokenized with ``nltk.word_tokenize``.
    Scanning left to right, whenever the next ``n`` tokens form an n-gram
    contained in ``ngram_list`` they are merged into one token joined by
    underscores and the scan continues after them; otherwise the current
    token is kept unchanged. Tokens are re-joined with single spaces.

    Example (n=2, ngram_list={('let', 'us'), ('as', 'soon')}):
        'please let us know as soon as possible'
        -> 'please let_us know as_soon as possible'

    Parameters
    ----------
    df : DataFrame holding the text; modified in place.
    text_col : name of the column to read the text from.
    ngram_list : iterable of n-grams, each a sequence of ``n`` tokens.
    n : size of the n-grams to merge. Any ``n >= 2`` is supported (previously
        only 2 and 3 had an effect); smaller values leave the tokens unmerged.
    new_col : name of the column that receives the rewritten text (may be the
        same as ``text_col``).

    Returns nothing; the result is stored in ``df[new_col]``.
    """
    # Set of tuples: constant-time membership instead of scanning a list for
    # every token position.
    ngram_set = {tuple(gram) for gram in ngram_list}

    new_data = []
    for text in df[text_col]:
        tokens = nltk.word_tokenize(text)
        output = []
        idx = 0
        while idx < len(tokens):
            candidate = tuple(tokens[idx:idx + n]) if n >= 2 else ()
            # The length check rejects truncated windows at the end of the text.
            if len(candidate) == n and candidate in ngram_set:
                output.append('_'.join(candidate))
                idx += n  # skip the tokens consumed by the merged n-gram
            else:
                output.append(tokens[idx])
                idx += 1
        new_data.append(' '.join(output))
    df[new_col] = new_data

In [205]:
# create text with bigram replacement (adds column 'bigram_question_lkhd' to `train` in place)
# NOTE(review): `bigram_vocab` was built from the lower-cased, lemmatized
# training split, while this call matches it against the raw `question_text`
# of `train` without lower-casing - verify that this mismatch is intended.
ngram_to_corpus(train, 'question_text', bigram_vocab, 2, 'bigram_question_lkhd')

In [206]:
# create text with both tri- and bigrams merged, by applying the trigram pass first
ngram_to_corpus(train, 'question_text', trigram_vocab, 3, 'trigram_question_lkhd')
# The bigram pass must read the trigram output: reading 'question_text' again
# would overwrite the column and discard the trigram merges made just above.
ngram_to_corpus(train, 'trigram_question_lkhd', bigram_vocab, 2,  'trigram_question_lkhd')

In [207]:
# Select the n-gram text columns created by the ngram_to_corpus calls above.
# The names must match those columns exactly ('..._question_lkhd', singular);
# the former '..._questions_lkhd' spelling raised a KeyError.
X = train[['bigram_question_lkhd','trigram_question_lkhd']]
y = train.target

In [208]:
# Stratified 80/20 split of the n-gram text features.
# NOTE(review): this rebinds `y_train` / `y_test`, which were created by the
# earlier split (random_state=495, default test size). Feature matrices built
# from that earlier split no longer line up with these labels once this cell
# has run - consider distinct names such as y_train_f / y_test_f.
X_train_f, X_test_f, y_train, y_test = train_test_split(X, y, stratify=y, random_state=90, test_size=0.2)

#### Bigram Model

283.33333333333337