In [6]:
# text manipulation
import re
import string

# Data management
import pandas as pd
import numpy as np
from scipy.sparse import *
import scipy

# NLP
import nltk
import nltk.collocations as collocations
from nltk.tag import tnt
import spacy

# modelling
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import f1_score, confusion_matrix

#visualisation
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [7]:
train = pd.read_csv('./train.csv')

In [8]:
train.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [9]:
train.shape

(1306122, 3)

In [10]:
no_insincere = train[train['target']==1].target.count()
no_sincere = train[train['target']==0].target.count()

print('No. of insincere questions:', no_insincere)
print('No. of sincere questions:', no_sincere)
print('% of insincere questions:', train.target.mean())
print('Null score:', 1- train.target.mean())

No. of insincere questions: 80810
No. of sincere questions: 1225312
% of insincere questions: 0.06187017751787352
Null score: 0.9381298224821265


Number of lanuages

In [11]:
# lang_list = set([langid.classify(s)[0] for s in train['question_text']])


In [12]:
# # Some are questions in different lanuages, some are questions regarding different lanuages.
# basic_stats['no_of_lang'] = len(lang_list)
# basic_stats

Creating train/test sets

In [13]:
X = train.question_text
y = train.target

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state = 495)

# Define functions and pipelines

In [15]:
def vect_trans(vectorizer, X_train, X_test):
    # can also take a transformer
    vect = vectorizer
    vect.fit(X_train)
    return vect.transform(X_train), vect.transform(X_test)

In [16]:
# MultinominalNB function for printing scores and storing into df.
def model_score(model, X_train, X_test, y_train, y_test, score_df, model_label):
    estimator = model
    estimator.fit(X_train, y_train)
    test_score =  estimator.score(X_test, y_test)
    f1 = f1_score(y_test, estimator.predict(X_test))
    
    print('Train Accuracy :', estimator.score(X_train, y_train))
    print('Test Accuracy:', test_score)
    print('Test F1 score:', f1)
    score_df.loc[model_label, 'Test_Accuracy'] = test_score
    score_df.loc[model_label, 'Test_F1_score'] = f1

In [17]:
# Cross Validate function for printing scores and storing into df.
def cv_score(model, X, y, model_label,  cv=5, ):    
    
    # instantiating model
    estimator = model
    
    cv_result = cross_validate(estimator, X, y, cv = cv, n_jobs=-1, scoring=['accuracy', 'f1'])
    
    print('Test Accuracy Mean:',cv_result['test_accuracy'].mean())
    print('Test Accuracy STD:',cv_result['test_accuracy'].std())
    print('Test F1:', cv_result['test_f1'].mean())
    score_df.loc[model_label, 'CV_Accuracy'] = cv_result['test_accuracy'].mean()
    score_df.loc[model_label, 'CV_Acc_STD'] = cv_result['test_accuracy'].std()
    score_df.loc[model_label, 'CV_F1_score'] = cv_result['test_f1'].mean()

In [18]:
# GridSearchCV function, auto display best score and parameters and storing in df
def gridcv(model, X, y, params, cv= 5 ):
    
    # instantiating model can also be a pipeline
    estimator = model
    
    gridcv = GridSearchCV(estimator=estimator, param_grid=params, cv = cv, verbose=10, n_jobs=6)
    gridcv.fit(X, y)
    
    print(gridcv.best_params_)
    print(gridcv.best_score_)
    

In [19]:
stopwords = list(nltk.corpus.stopwords.words('english')) + list(string.punctuation) + ["''", '``','â€™', "'s", "'d", "'ll", "'t"]

In [20]:
# CountVectorizer pipeline and parameters
pipeCVNB = Pipeline([('CV',CountVectorizer(stop_words=stopwords)), 
                    ('NB',MultinomialNB())])

paramsCVNB = {'CV__max_df':(1.0, 0.9, 0.8, 0.7),
       'CV__min_df': (1, 2, 0.01 , 0.1, 0.2),
         'CV__ngram_range':((1,1), (1,2), (1,3))}

In [21]:
# TfidfVectorizer pipeline and parameters
pipeTVNB = Pipeline([('TV',TfidfVectorizer(stop_words=stopwords)), 
                    ('NB',MultinomialNB())])

paramsTVNB = {'TV__max_df':(1.0, 0.9, 0.8, 0.7, 0.6),
       'TV__min_df': (1, 2, 0.01, 0.05, 0.1),
         'TV__ngram_range':((1,1), (1,2), (1,3), (2,2), (2,3))}

In [22]:
score_df = pd.DataFrame()

# Basic Token and Ngram modelling

#### Default count vectorizer

In [23]:
%%time
X_train_raw, X_test_raw=  vect_trans(CountVectorizer(), X_train, X_test)

Wall time: 32.2 s


In [24]:
model = MultinomialNB()
model_label = 'Raw_token_NB'

model_score(model, X_train_raw, X_test_raw, y_train, y_test, score_df, model_label)
cv_score(model, X_train_raw, y_train, model_label)
score_df

Train Accuracy : 0.9350739237089765
Test Accuracy: 0.9344135778838762
Test F1 score: 0.5646092542896641
Test Accuracy Mean: 0.9321614838567932
Test Accuracy STD: 0.0005228593021778298
Test F1: 0.5489215714930169


Unnamed: 0,Test_Accuracy,Test_F1_score,CV_Accuracy,CV_Acc_STD,CV_F1_score
Raw_token_NB,0.934414,0.564609,0.932161,0.000523,0.548922


In [25]:
nb = MultinomialNB()
nb.fit(X_train_raw, y_train)  
test_score =  nb.score(X_test_raw, y_test)
print('train score:', nb.score(X_train_raw, y_train))
print('test score:', test_score)
y_pred = nb.predict(X_test_raw)

train score: 0.9350739237089765
test score: 0.9344135778838762


In [26]:
print(f1_score(y_test, y_pred) )
print(f1_score(y_test, y_pred, average='macro') )
print(f1_score(y_test, y_pred, average='micro') )
print(f1_score(y_test, y_pred, average='weighted') )
confusion_matrix(y_test, y_pred)

0.5646092542896641
0.7645724512273393
0.9344135778838762
0.9397915566837656


array([[291229,  15099],
       [  6317,  13886]], dtype=int64)

### Remove stop words

In [27]:
%%time
X_train_t, X_test_t=  vect_trans(CountVectorizer(max_df=1.0, min_df=1, ngram_range=(1,1), 
                                                    stop_words=stopwords), X_train, X_test)

Wall time: 26.4 s


In [28]:
model = MultinomialNB()
model_label = 'Token_NB'

model_score(model, X_train_t, X_test_t, y_train, y_test, score_df, model_label)
cv_score(model, X_train_t, y_train, model_label)
score_df

Train Accuracy : 0.938349780673771
Test Accuracy: 0.9376077615907831
Test F1 score: 0.5548150252387299
Test Accuracy Mean: 0.9327821506346978
Test Accuracy STD: 0.00046156009254156113
Test F1: 0.52763522293454


Unnamed: 0,Test_Accuracy,Test_F1_score,CV_Accuracy,CV_Acc_STD,CV_F1_score
Raw_token_NB,0.934414,0.564609,0.932161,0.000523,0.548922
Token_NB,0.937608,0.554815,0.932782,0.000462,0.527635


### Bi-gram

In [29]:
%%time
X_train_bi, X_test_bi=  vect_trans(CountVectorizer(ngram_range=(1,2), stop_words=stopwords), X_train, X_test)

Wall time: 58.5 s


Naive Bayes

In [30]:
model = MultinomialNB()
model_label = 'Bigram_NB'

model_score(model, X_train_bi, X_test_bi, y_train, y_test, score_df, model_label)
cv_score(model, X_train_bi, y_train, model_label)
score_df

Train Accuracy : 0.9744383114993911
Test Accuracy: 0.9471045628133317
Test F1 score: 0.39144528222112607
Test Accuracy Mean: 0.9444359946999341
Test Accuracy STD: 0.0004057190754223053
Test F1: 0.47813979276026675


Unnamed: 0,Test_Accuracy,Test_F1_score,CV_Accuracy,CV_Acc_STD,CV_F1_score
Raw_token_NB,0.934414,0.564609,0.932161,0.000523,0.548922
Token_NB,0.937608,0.554815,0.932782,0.000462,0.527635
Bigram_NB,0.947105,0.391445,0.944436,0.000406,0.47814


Random Forest

In [31]:
# %%time
# model = RandomForestClassifier(n_estimators= 100, max_depth=20, n_jobs=-1)
# model_label = 'Bigram_RF'


# model_score(model, X_train_bi, X_test_bi, y_train, y_test, score_df, model_label)
# cv_score(model, X_train_bi, y_train, model_label)
# score_df

In [32]:
# score_df

### Tri-gram

In [33]:
%%time
X_train_tri, X_test_tri=  vect_trans(CountVectorizer(ngram_range=(1,3), stop_words=stopwords), X_train, X_test)

Wall time: 1min 27s


Naive Bayes

In [34]:
model = MultinomialNB()
model_label = 'Trigram_NB'

model_score(model, X_train_tri, X_test_tri, y_train, y_test, score_df, model_label)
cv_score(model, X_train_tri, y_train, model_label)
score_df

Train Accuracy : 0.9880582814664488
Test Accuracy: 0.9447341906281487
Test F1 score: 0.26915600194394945
Test Accuracy Mean: 0.9445962656878848
Test Accuracy STD: 0.0004246498388815797
Test F1: 0.5055013527650977


Unnamed: 0,Test_Accuracy,Test_F1_score,CV_Accuracy,CV_Acc_STD,CV_F1_score
Raw_token_NB,0.934414,0.564609,0.932161,0.000523,0.548922
Token_NB,0.937608,0.554815,0.932782,0.000462,0.527635
Bigram_NB,0.947105,0.391445,0.944436,0.000406,0.47814
Trigram_NB,0.944734,0.269156,0.944596,0.000425,0.505501


Random Forest

In [35]:
# %%time
# model = RandomForestClassifier(n_estimators= 100, max_depth=20, n_jobs=-1)
# model_label = 'Trigram_RF'

# model_score(model, X_train_tri, X_test_tri, y_train, y_test, score_df, model_label)
# cv_score(model, X_train_tri, y_train, model_label)


In [36]:
# score_df

### GridSearch min/max df

In [37]:
# params = {'CV__max_df':(1.0, 0.9),
#        'CV__min_df': (1, 2, 0.01, 0.02),
#         'CV__ngram_range':((1,1), (1,2), (1,3))}

# gridcv(pipeCVNB, X_train, y_train, params)

In [38]:
# %%time
# X_train_t, X_test_t=  vect_trans(CountVectorizer(max_df=1.0, min_df=1, ngram_range=(1,2), 
#                                                     stop_words=stopwords), X_train, X_test)

In [39]:
# model = MultinomialNB()
# model_label = 'Grid_DFNB'

# model_score(model, X_train_t, X_test_t, y_train, y_test, score_df, model_label)
# cv_score(model, X_train_t, y_train, model_label)
# score_df

### TFIDF

In [40]:
X_train_tf_t, X_test_tf_t = vect_trans(TfidfTransformer(), X_train_t, X_test_t)

In [41]:
model = MultinomialNB()
model_label = 'Tfidf_t_NB'

model_score(model, X_train_tf_t , X_test_tf_t, y_train, y_test, score_df, model_label)
cv_score(model, X_train_tf_t, y_train, model_label)
score_df

Train Accuracy : 0.9426291176623713
Test Accuracy: 0.9413470696503548
Test F1 score: 0.13276580329650425
Test Accuracy Mean: 0.9400800951073152
Test Accuracy STD: 0.00015736604219783085
Test F1: 0.09913679748013234


Unnamed: 0,Test_Accuracy,Test_F1_score,CV_Accuracy,CV_Acc_STD,CV_F1_score
Raw_token_NB,0.934414,0.564609,0.932161,0.000523,0.548922
Token_NB,0.937608,0.554815,0.932782,0.000462,0.527635
Bigram_NB,0.947105,0.391445,0.944436,0.000406,0.47814
Trigram_NB,0.944734,0.269156,0.944596,0.000425,0.505501
Tfidf_t_NB,0.941347,0.132766,0.94008,0.000157,0.099137


### NLTK Best Bigrams

In [42]:
# create one list of all question tokens
full_text = []

for text in X_train:
    full_text += [w for w in nltk.word_tokenize(text.lower()) if w not in stopwords]

In [43]:
len(full_text)

6390312

In [44]:
# create bigram vocabulary
bigram_measures = collocations.BigramAssocMeasures()

finder = nltk.BigramCollocationFinder.from_words(full_text)
# scored = finder.score_ngrams( bigram_measures.likelihood_ratio  )
bigram_vocab = finder.nbest(bigram_measures.likelihood_ratio, 40)
bigram_vocab

[('ca', "n't"),
 ('united', 'states'),
 ('best', 'way'),
 ('donald', 'trump'),
 ('year', 'old'),
 ('computer', 'science'),
 ('even', 'though'),
 ('high', 'school'),
 ('would', 'happen'),
 ('social', 'media'),
 ('north', 'korea'),
 ('pros', 'cons'),
 ('get', 'rid'),
 ('major', 'accomplishments'),
 ('jee', 'mains'),
 ('look', 'like'),
 ('wo', "n't"),
 ('would', 'win'),
 ('new', 'york'),
 ('machine', 'learning'),
 ('harry', 'potter'),
 ('years', 'old'),
 ('real', 'estate'),
 ('long', 'take'),
 ('feel', 'like'),
 ('saudi', 'arabia'),
 ('star', 'wars'),
 ('ssc', 'cgl'),
 ('mechanical', 'engineering'),
 ('elon', 'musk'),
 ('tv', 'show'),
 ('hillary', 'clinton'),
 ('hong', 'kong'),
 ('tamil', 'nadu'),
 ('president', 'trump'),
 ('useful', 'tips'),
 ('san', 'francisco'),
 ('different', 'types'),
 ('hotels', 'short-term'),
 ('artificial', 'intelligence')]

In [45]:
# create bigram vocabulary
bigram_measures = collocations.BigramAssocMeasures()


finder3 = nltk.BigramCollocationFinder.from_words(full_text)
finder3.apply_word_filter(lambda x: x in stopwords)
scored = finder3.nbest(bigram_measures.raw_freq, 40)
for bscore in scored[:30]:
    print (bscore)



# finder = nltk.BigramCollocationFinder.from_words(full_text)
# # scored = finder.score_ngrams( bigram_measures.likelihood_ratio  )
# bigram_vocab = finder.nbest(bigram_measures.pmi, 40)
# bigram_vocab

('ca', "n't")
('best', 'way')
('year', 'old')
('would', 'happen')
('united', 'states')
('donald', 'trump')
('look', 'like')
('high', 'school')
('feel', 'like')
('computer', 'science')
('many', 'people')
('get', 'rid')
('would', 'win')
('even', 'though')
('social', 'media')
('would', 'like')
('get', 'job')
('long', 'take')
('years', 'old')
('north', 'korea')
("n't", 'want')
('wo', "n't")
("n't", 'know')
('people', 'think')
('jee', 'mains')
('major', 'accomplishments')
('best', 'ways')
('make', 'money')
('much', 'time')
('much', 'money')


In [46]:
# create trigram vocabulary
trigram_measures = collocations.TrigramAssocMeasures()
finder = nltk.TrigramCollocationFinder.from_words(full_text)
trigram_vocab = finder.nbest(trigram_measures.likelihood_ratio, 20)
trigram_vocab

[('ca', "n't", 'want'),
 ('ca', "n't", 'know'),
 ('ca', "n't", 'like'),
 ('ca', "n't", 'see'),
 ('ca', "n't", 'understand'),
 ('ca', "n't", 'care'),
 ('ca', "n't", 'afford'),
 ('ca', "n't", 'even'),
 ('ca', "n't", 'get'),
 ('ca', "n't", 'find'),
 ('ca', "n't", 'seem'),
 ('ca', "n't", 'feel'),
 ('ca', "n't", 'exist'),
 ('ca', "n't", 'believe'),
 ('ca', "n't", 'remember'),
 ('ca', "n't", 'let'),
 ('ca', "n't", 'stop'),
 ('ca', "n't", 'talk'),
 ('ca', "n't", 'think'),
 ('ca', "n't", 'would')]

In [47]:
# recreate text using ngrams
def ngram_to_corpus(df, text_col, ngram_list, n, new_col):
#     ngram_list = set({('let', 'us'), ('as', 'soon')})  # {('let', 'us'), ('as', 'soon')}
#     tokens = ['please', 'let', 'us', 'know', 'as', 'soon', 'as', 'possible']
    new_data = []
    for text in df[text_col]:
        tokens = nltk.word_tokenize(text)
        output = []
        q_iter = iter(range(len(tokens)))
        
        for idx in q_iter:
            output.append(tokens[idx])
            if n == 2:
                if idx < (len(tokens) - 1) and (tokens[idx], tokens[idx+1]) in ngram_list:
                    output[-1] += '_' + tokens[idx+1]
                    next(q_iter)
            elif n == 3:
                if idx < (len(tokens) - 2) and (tokens[idx], tokens[idx+1], tokens[idx+2] ) in ngram_list:
                    output[-1] += '_' + tokens[idx+1] + '_' + tokens[idx+2]
                    next(q_iter)
                    next(q_iter)
        new_data.append( ' '.join(output))
    df[new_col] = new_data

In [48]:
%%time
# create text with bigram replacement
ngram_to_corpus(train, 'question_text', bigram_vocab, 2, 'bigram_question_lkhd')

Wall time: 3min 5s


In [49]:
%%time
# create text with both tri and bigram in text, by applying trigram first
ngram_to_corpus(train, 'question_text', trigram_vocab, 3, 'trigram_question_lkhd')
ngram_to_corpus(train, 'trigram_question_lkhd', bigram_vocab, 2,  'trigram_question_lkhd')

Wall time: 5min 54s


In [50]:
X = train[['bigram_question_lkhd','trigram_question_lkhd']]
y = train.target

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=90, test_size=0.2)

In [52]:
len(X_train)

1044897

In [53]:
X_train.head()

Unnamed: 0,bigram_question_lkhd,trigram_question_lkhd
488601,How do you respond to a best friend criticisin...,How do you respond to a best friend criticisin...
1214018,What are some funny excuses that you have come...,What are some funny excuses that you have come...
144139,"What laws on the national , state and local le...","What laws on the national , state and local le..."
694377,If you vote for a democrat because you are a d...,If you vote for a democrat because you are a d...
268735,"What do you think about the new album of BTS ,...","What do you think about the new album of BTS ,..."


#### Bigram Model

In [54]:
# model using bigram text
X_train_bi, X_test_bi =  vect_trans(CountVectorizer(max_df=1.0,  min_df=1, ngram_range=(1,1), stop_words=stopwords),
                                   X_train.bigram_question_lkhd, X_test.bigram_question_lkhd,)



In [55]:
X_train_bi

<1044897x173235 sparse matrix of type '<class 'numpy.int64'>'
	with 6514901 stored elements in Compressed Sparse Row format>

In [56]:
X_test_bi

<261225x173235 sparse matrix of type '<class 'numpy.int64'>'
	with 1604440 stored elements in Compressed Sparse Row format>

In [57]:
model = MultinomialNB()
model_label = 'Bigram_best_NB'

model_score(model, X_train_bi, X_test_bi, y_train, y_test, score_df, model_label)
cv_score(model, X_train_bi, y_train, model_label)
score_df

Train Accuracy : 0.9382618573888144
Test Accuracy: 0.936824576514499
Test F1 score: 0.5497012196785723
Test Accuracy Mean: 0.9329656418376958
Test Accuracy STD: 0.0005744725754048589
Test F1: 0.5304411678616323


Unnamed: 0,Test_Accuracy,Test_F1_score,CV_Accuracy,CV_Acc_STD,CV_F1_score
Raw_token_NB,0.934414,0.564609,0.932161,0.000523,0.548922
Token_NB,0.937608,0.554815,0.932782,0.000462,0.527635
Bigram_NB,0.947105,0.391445,0.944436,0.000406,0.47814
Trigram_NB,0.944734,0.269156,0.944596,0.000425,0.505501
Tfidf_t_NB,0.941347,0.132766,0.94008,0.000157,0.099137
Bigram_best_NB,0.936825,0.549701,0.932966,0.000574,0.530441


#### Trigram Model

In [58]:
# model using bigram text
X_train_tri, X_test_tri =  vect_trans(CountVectorizer(max_df=1.0,  min_df=1, ngram_range=(1,1), stop_words=stopwords),
                                   X_train.trigram_question_lkhd, X_test.trigram_question_lkhd)



In [59]:
model = MultinomialNB()
model_label = 'Trigram_best_NB'

model_score(model, X_train_tri, X_test_tri, y_train, y_test, score_df, model_label)
cv_score(model, X_train_tri, y_train, model_label)
score_df

Train Accuracy : 0.9382647284851999
Test Accuracy: 0.9368169202794526
Test F1 score: 0.5496712231589861
Test Accuracy Mean: 0.932959899611036
Test Accuracy STD: 0.0005660897195800612
Test F1: 0.5304195331100028


Unnamed: 0,Test_Accuracy,Test_F1_score,CV_Accuracy,CV_Acc_STD,CV_F1_score
Raw_token_NB,0.934414,0.564609,0.932161,0.000523,0.548922
Token_NB,0.937608,0.554815,0.932782,0.000462,0.527635
Bigram_NB,0.947105,0.391445,0.944436,0.000406,0.47814
Trigram_NB,0.944734,0.269156,0.944596,0.000425,0.505501
Tfidf_t_NB,0.941347,0.132766,0.94008,0.000157,0.099137
Bigram_best_NB,0.936825,0.549701,0.932966,0.000574,0.530441
Trigram_best_NB,0.936817,0.549671,0.93296,0.000566,0.53042
