In [1]:
# text manipulation
import re
import string

# Data management
import pandas as pd
import numpy as np
from scipy.sparse import *
import scipy

# NLP
import nltk
import nltk.collocations as collocations
from nltk.tag import tnt
import spacy

# modelling
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.cluster import MeanShift

#visualisation
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, clear_output

%matplotlib inline

In [2]:
train = pd.read_csv('./train.csv')

In [3]:
train.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [4]:
train.shape

(1306122, 3)

In [5]:
no_insincere = train[train['target']==1].target.count()
no_sincere = train[train['target']==0].target.count()

print('No. of insincere questions:', no_insincere)
print('No. of sincere questions:', no_sincere)
print('% of insincere questions:', train.target.mean())
print('Null score:', 1- train.target.mean())

No. of insincere questions: 80810
No. of sincere questions: 1225312
% of insincere questions: 0.06187017751787352
Null score: 0.9381298224821265


# Data Cleaning

In [6]:
%%time
# remove digits
clean_questions = [''.join(c for c in q if not c.isdigit()) for q in train.question_text]

Wall time: 10.1 s


In [8]:
# replacing all identity labels each question with a common labels
def labels_to_question(data, label_list, label_type):
         
    new_data = []
    
    # For every questions
    i_data = 0
    for i_data in range(len(data)):
        question = data[i_data].lower()
        output = []
        
        # compare each label to question
        for label in label_list:

            if label in question:

                que_t = nltk.word_tokenize(question)
                lab_t = nltk.word_tokenize(label)

                i_que = 0
                while i_que < len(que_t):
                    i_lab = 0
                    
                    # If current token is same as first label token, continue compare rest of the tokens. 
                    if que_t[i_que] == lab_t[0]:
                        que_t[i_que] = label_type
                        i_lab += 1
                        i_que += 1

                        # Remove trailing question tokens if they match trailing label tokens
                        while i_lab < len(lab_t):
                            if que_t[i_que] == lab_t[i_lab]:
                                que_t.pop(i_que)
                                i_lab += 1
                            else:
                                break
#                     elif que_t[i_que] == lab_t[0]:
#                         print('Question: ',question, i_data)
#                         print('label: ', label)
                    i_que += 1
                question = ' '.join(que_t)
#                 print('after: ', question)
#                 print('label: ', label)
        new_data.append(question)                   
        if i_data % 1000 == 0:
            clear_output(wait=True)
            display(i_data)
    return new_data


In [9]:
f = open('nationalities.txt', 'r')
nationalities = []
for n in f:
    nationalities.append(n.strip().lower())
f.close()
nationalities = set(nationalities)

In [16]:
# Identity groupd filters, created from online lists, most frequent insincere words and manual editing.
ID_filter = pd.read_csv('ID_filter.csv')

In [17]:
ID_filter.head(2)

Unnamed: 0,RELIGIOUS_ID,RACIAL_ID,NATIONAL_ID,NATIONALITY_ID,GENDER_ID,Unnamed: 5,Political_groups,Political_figure
0,buddhist,white people,Afghanistan,Afghans,girls,,trump supporters,donald trump
1,catholic,black people,Albania,Albanians,boys,,democrate,president trump


In [24]:
religious_ID = ID_filter.RELIGIOUS_ID.dropna()
racial_ID = ID_filter.RACIAL_ID.dropna()
national_ID = ID_filter.NATIONAL_ID.dropna()
nationality_ID = ID_filter.NATIONALITY_ID.dropna()



In [26]:
%%time
clean_questions = labels_to_question(clean_questions, religious_ID, 'RELIGIOUS_ID')
clean_questions = labels_to_question(clean_questions, racial_ID, 'RACIAL_ID')
clean_questions = labels_to_question(clean_questions, national_ID, 'NATIONAL_ID')
clean_questions = labels_to_question(clean_questions, nationality_ID, 'NATIONALITY_ID')

1306000

Wall time: 2min 16s


In [143]:
# q_sample = ['today united states will raise taxes', 'How did donald trump junior manage this']
# label_sample = ('raise','united states', 'donald trump junior')
# temp = labels_to_question(q_sample, label_sample, 'REPLACED')
# temp

0

['today REPLACED will REPLACED taxes', 'how did REPLACED manage this']

In [27]:
stopwords = list(nltk.corpus.stopwords.words('english')) + list(string.punctuation) + ["''", '``','’','“','”', "'s", "'d", "'ll", "'t", "n't", "ca", 'wo']

In [28]:
%%time
# remove stop words and lower all characters
clean_questions = [' '.join(w for w in nltk.word_tokenize(q.lower()) if w not in stopwords) for q in clean_questions]

Wall time: 3min 6s


In [29]:
len(clean_questions)

1306122

In [30]:
train.question_text[1]

'Do you have an adopted dog, how would you encourage people to adopt and not shop?'

In [31]:
clean_questions[:10]

['quebec nationalists see province nation',
 'adopted dog would encourage people adopt shop',
 'velocity affect time velocity affect space geometry',
 'otto von guericke used magdeburg hemispheres',
 'convert montra helicon mountain bike changing tyres',
 'gaza slowly becoming auschwitz dachau treblinka palestinians',
 'quora automatically ban conservative opinions reported liberal views',
 'crazy wash wipe groceries germs everywhere',
 'thing dressing moderately different dressing modestly',
 'ever phase wherein became ignorant people loved completely disregarding feelings/lives get something go way feel temporarily ease things change']

# Define functions and pipelines

In [32]:
def vect_trans(vectorizer, X_train, X_test):
    # can also take a transformer
    vect = vectorizer
    vect.fit(X_train)
    return vect.transform(X_train), vect.transform(X_test)

In [33]:
# MultinominalNB function for printing scores and storing into df.
def model_score(model, X_train, X_test, y_train, y_test, score_df, model_label):
    estimator = model
    estimator.fit(X_train, y_train)
    test_score =  estimator.score(X_test, y_test)
    f1 = f1_score(y_test, estimator.predict(X_test))
    
    print('Train Accuracy :', estimator.score(X_train, y_train))
    print('Test Accuracy:', test_score)
    print('Test F1 score:', f1)
    score_df.loc[model_label, 'Test_Accuracy'] = test_score
    score_df.loc[model_label, 'Test_F1_score'] = f1

In [34]:
# Cross Validate function for printing scores and storing into df.
def cv_score(model, X, y, model_label,  cv=5, ):    
    
    # instantiating model
    estimator = model
    
    cv_result = cross_validate(estimator, X, y, cv = cv, n_jobs=-1, scoring=['accuracy', 'f1'])
    
    print('Test Accuracy Mean:',cv_result['test_accuracy'].mean())
    print('Test Accuracy STD:',cv_result['test_accuracy'].std())
    print('Test F1:', cv_result['test_f1'].mean())
    score_df.loc[model_label, 'CV_Accuracy'] = cv_result['test_accuracy'].mean()
    score_df.loc[model_label, 'CV_Acc_STD'] = cv_result['test_accuracy'].std()
    score_df.loc[model_label, 'CV_F1_score'] = cv_result['test_f1'].mean()

In [35]:
# GridSearchCV function, auto display best score and parameters and storing in df
def gridcv(model, X, y, params, cv= 5 ):
    
    # instantiating model can also be a pipeline
    estimator = model
    
    gridcv = GridSearchCV(estimator=estimator, param_grid=params, cv = cv, verbose=10, n_jobs=6)
    gridcv.fit(X, y)
    
    print(gridcv.best_params_)
    print(gridcv.best_score_)
    

In [36]:
# CountVectorizer pipeline and parameters
pipeCVNB = Pipeline([('CV',CountVectorizer(stop_words=stopwords)), 
                    ('NB',MultinomialNB())])

paramsCVNB = {'CV__max_df':(1.0, 0.9, 0.8, 0.7),
       'CV__min_df': (1, 2, 0.01 , 0.1, 0.2),
         'CV__ngram_range':((1,1), (1,2), (1,3))}

In [37]:
# TfidfVectorizer pipeline and parameters
pipeTVNB = Pipeline([('TV',TfidfVectorizer(stop_words=stopwords)), 
                    ('NB',MultinomialNB())])

paramsTVNB = {'TV__max_df':(1.0, 0.9, 0.8, 0.7, 0.6),
       'TV__min_df': (1, 2, 0.01, 0.05, 0.1),
         'TV__ngram_range':((1,1), (1,2), (1,3), (2,2), (2,3))}

In [38]:
score_df = pd.DataFrame()

# Default count vectorizer on raw text

In [39]:
%%time

X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(train.question_text, train.target,
                                                                    stratify=train.target, random_state = 495)

X_train_raw_t, X_test_raw_t=  vect_trans(CountVectorizer(), X_train_raw, X_test_raw)

Wall time: 30.9 s


In [40]:
model = MultinomialNB()
model_label = 'Raw_token_NB'

model_score(model, X_train_raw_t, X_test_raw_t, y_train_raw, y_test_raw, score_df, model_label)
cv_score(model, X_train_raw_t, y_train_raw, model_label)
score_df

Train Accuracy : 0.9350739237089765
Test Accuracy: 0.9344135778838762
Test F1 score: 0.5646092542896641
Test Accuracy Mean: 0.9321614838567932
Test Accuracy STD: 0.0005228593021778298
Test F1: 0.5489215714930169


Unnamed: 0,Test_Accuracy,Test_F1_score,CV_Accuracy,CV_Acc_STD,CV_F1_score
Raw_token_NB,0.934414,0.564609,0.932161,0.000523,0.548922


In [41]:
nb = MultinomialNB()
nb.fit(X_train_raw_t, y_train_raw)  
test_score =  nb.score(X_test_raw_t, y_test_raw)
print('train score:', nb.score(X_train_raw_t, y_train_raw))
print('test score:', test_score)
y_pred = nb.predict(X_test_raw_t)

train score: 0.9350739237089765
test score: 0.9344135778838762


In [42]:
print(f1_score(y_test_raw, y_pred) )
print(f1_score(y_test_raw, y_pred, average='macro') )
print(f1_score(y_test_raw, y_pred, average='micro') )
print(f1_score(y_test_raw, y_pred, average='weighted') )
confusion_matrix(y_test_raw, y_pred)

0.5646092542896641
0.7645724512273393
0.9344135778838762
0.9397915566837656


array([[291229,  15099],
       [  6317,  13886]], dtype=int64)

# Basic Token and Ngram modelling on cleaned data

Creating train/test sets

In [43]:
X = clean_questions
y = train.target

In [44]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state = 495)

### Tokens only

In [45]:
%%time
X_train_t, X_test_t=  vect_trans(CountVectorizer(max_df=1.0, min_df=1, ngram_range=(1,1)), X_train, X_test)

Wall time: 17.2 s


In [46]:
model = MultinomialNB()
model_label = 'Token_NB'
X_train_arg = X_train_t
X_test_arg = X_test_t

model_score(model, X_train_arg, X_test_arg, y_train, y_test, score_df, model_label)
cv_score(model, X_train_arg, y_train, model_label)
score_df

Train Accuracy : 0.937569863340925
Test Accuracy: 0.9369248249017704
Test F1 score: 0.553018794218499
Test Accuracy Mean: 0.9321635251469212
Test Accuracy STD: 0.00044987149065455986
Test F1: 0.5265939444870608


Unnamed: 0,Test_Accuracy,Test_F1_score,CV_Accuracy,CV_Acc_STD,CV_F1_score
Raw_token_NB,0.934414,0.564609,0.932161,0.000523,0.548922
Token_NB,0.936925,0.553019,0.932164,0.00045,0.526594


### All Bigrams

In [47]:
%%time
X_train_bi, X_test_bi=  vect_trans(CountVectorizer(ngram_range=(1,2)), X_train, X_test)

Wall time: 50 s


Naive Bayes

In [48]:
model = MultinomialNB()
model_label = 'Bigram_NB'
X_train_arg = X_train_bi
X_test_arg = X_test_bi

model_score(model, X_train_arg, X_test_arg, y_train, y_test, score_df, model_label)
cv_score(model, X_train_arg, y_train, model_label)
score_df

Train Accuracy : 0.9739656652623391
Test Accuracy: 0.9469300005206244
Test F1 score: 0.393348503413268
Test Accuracy Mean: 0.9441930363487323
Test Accuracy STD: 0.00040737926477786937
Test F1: 0.4762995403630869


Unnamed: 0,Test_Accuracy,Test_F1_score,CV_Accuracy,CV_Acc_STD,CV_F1_score
Raw_token_NB,0.934414,0.564609,0.932161,0.000523,0.548922
Token_NB,0.936925,0.553019,0.932164,0.00045,0.526594
Bigram_NB,0.94693,0.393349,0.944193,0.000407,0.4763


Random Forest

In [49]:
# %%time
# model = RandomForestClassifier(n_estimators= 100, max_depth=20, n_jobs=-1)
# model_label = 'Bigram_RF'


# model_score(model, X_train_bi, X_test_bi, y_train, y_test, score_df, model_label)
# cv_score(model, X_train_bi, y_train, model_label)
# score_df

In [50]:
# score_df

### Tri-gram

In [51]:
%%time
X_train_tri, X_test_tri=  vect_trans(CountVectorizer(ngram_range=(1,3), stop_words=stopwords), X_train, X_test)

Wall time: 1min 36s


Naive Bayes

In [52]:
model = MultinomialNB()
model_label = 'Trigram_NB'
X_train_arg = X_train_tri
X_test_arg = X_test_tri

model_score(model, X_train_arg, X_test_arg, y_train, y_test, score_df, model_label)
cv_score(model, X_train_arg, y_train, model_label)
score_df

Train Accuracy : 0.9877683645521447
Test Accuracy: 0.9445780033136211
Test F1 score: 0.26895576651181574
Test Accuracy Mean: 0.9444165985332346
Test Accuracy STD: 0.00039801435185916374
Test F1: 0.5027383149124014


Unnamed: 0,Test_Accuracy,Test_F1_score,CV_Accuracy,CV_Acc_STD,CV_F1_score
Raw_token_NB,0.934414,0.564609,0.932161,0.000523,0.548922
Token_NB,0.936925,0.553019,0.932164,0.00045,0.526594
Bigram_NB,0.94693,0.393349,0.944193,0.000407,0.4763
Trigram_NB,0.944578,0.268956,0.944417,0.000398,0.502738


Random Forest

In [53]:
# %%time
# model = RandomForestClassifier(n_estimators= 100, max_depth=20, n_jobs=-1)
# model_label = 'Trigram_RF'

# model_score(model, X_train_tri, X_test_tri, y_train, y_test, score_df, model_label)
# cv_score(model, X_train_tri, y_train, model_label)


In [54]:
# score_df

### GridSearch min/max df

In [55]:
# params = {'CV__max_df':(1.0, 0.9),
#        'CV__min_df': (1, 2, 0.01, 0.02),
#         'CV__ngram_range':((1,1), (1,2), (1,3))}

# gridcv(pipeCVNB, X_train, y_train, params)

In [56]:
# %%time
# X_train_t, X_test_t=  vect_trans(CountVectorizer(max_df=1.0, min_df=1, ngram_range=(1,2), 
#                                                     stop_words=stopwords), X_train, X_test)

In [57]:
# model = MultinomialNB()
# model_label = 'Grid_DFNB'

# model_score(model, X_train_t, X_test_t, y_train, y_test, score_df, model_label)
# cv_score(model, X_train_t, y_train, model_label)
# score_df

### TFIDF

In [58]:
X_train_tf_t, X_test_tf_t = vect_trans(TfidfTransformer(), X_train_t, X_test_t)

In [59]:
model = MultinomialNB()
model_label = 'Tfidf_t_NB'

model_score(model, X_train_tf_t , X_test_tf_t, y_train, y_test, score_df, model_label)
cv_score(model, X_train_tf_t, y_train, model_label)
score_df

Train Accuracy : 0.9425709301126695
Test Accuracy: 0.9411265699121982
Test F1 score: 0.1277676950998185
Test Accuracy Mean: 0.939962699105968
Test Accuracy STD: 0.0001597900786319155
Test F1: 0.09666475487668197


Unnamed: 0,Test_Accuracy,Test_F1_score,CV_Accuracy,CV_Acc_STD,CV_F1_score
Raw_token_NB,0.934414,0.564609,0.932161,0.000523,0.548922
Token_NB,0.936925,0.553019,0.932164,0.00045,0.526594
Bigram_NB,0.94693,0.393349,0.944193,0.000407,0.4763
Trigram_NB,0.944578,0.268956,0.944417,0.000398,0.502738
Tfidf_t_NB,0.941127,0.127768,0.939963,0.00016,0.096665


In [60]:
nb = MultinomialNB()
nb.fit(X_train_tf_t, y_train)  
test_score =  nb.score(X_test_tf_t, y_test)
print('train score:', nb.score(X_train_tf_t, y_train))
print('test score:', test_score)
y_pred = nb.predict(X_test_tf_t)

train score: 0.9425709301126695
test score: 0.9411265699121982


In [61]:
print(f1_score(y_test, y_pred) )
print(f1_score(y_test, y_pred, average='macro') )
print(f1_score(y_test, y_pred, average='micro') )
print(f1_score(y_test, y_pred, average='weighted') )
confusion_matrix(y_test, y_pred)

0.1277676950998185
0.5486514150832124
0.9411265699121982
0.9174536249200995


array([[305899,    429],
       [ 18795,   1408]], dtype=int64)

### NLTK Best Bigrams

In [62]:
# recreate text using ngrams
def ngram_to_corpus(data, ngram_list, n):
#     ngram_list = set({('let', 'us'), ('as', 'soon')})  # {('let', 'us'), ('as', 'soon')}
#     tokens = ['please', 'let', 'us', 'know', 'as', 'soon', 'as', 'possible']
    new_data = []
    for text in data:
        tokens = nltk.word_tokenize(text)
        output = []
        q_iter = iter(range(len(tokens)))
        
        for idx in q_iter:
            output.append(tokens[idx])
            if n == 2:
                if idx < (len(tokens) - 1) and (tokens[idx], tokens[idx+1]) in ngram_list:
                    output[-1] += '_' + tokens[idx+1]
                    next(q_iter)
            elif n == 3:
                if idx < (len(tokens) - 2) and (tokens[idx], tokens[idx+1], tokens[idx+2] ) in ngram_list:
                    output[-1] += '_' + tokens[idx+1] + '_' + tokens[idx+2]
                    next(q_iter)
                    next(q_iter)
        new_data.append( ' '.join(output))

    return new_data

In [63]:
%%time
# create one list of all question tokens
full_text = []

for text in X_train:
    full_text += [w for w in nltk.word_tokenize(text) if w not in stopwords]

Wall time: 1min 48s


In [64]:
len(full_text)

6223102

In [65]:
if 'would' in stopwords:
    print(True)

In [66]:
%%time
# create bigram vocabulary
bigram_measures = collocations.BigramAssocMeasures()

finder = nltk.BigramCollocationFinder.from_words(full_text)
# scored = finder.score_ngrams( bigram_measures.likelihood_ratio  )
bigram_vocab = finder.nbest(bigram_measures.likelihood_ratio, 80)
print(bigram_vocab)

[('united', 'states'), ('best', 'way'), ('donald', 'trump'), ('year', 'old'), ('computer', 'science'), ('even', 'though'), ('high', 'school'), ('would', 'happen'), ('social', 'media'), ('north', 'korea'), ('pros', 'cons'), ('get', 'rid'), ('major', 'accomplishments'), ('jee', 'mains'), ('look', 'like'), ('would', 'win'), ('new', 'york'), ('machine', 'learning'), ('harry', 'potter'), ('years', 'old'), ('real', 'estate'), ('long', 'take'), ('saudi', 'arabia'), ('feel', 'like'), ('star', 'wars'), ('ssc', 'cgl'), ('mechanical', 'engineering'), ('elon', 'musk'), ('tv', 'show'), ('hillary', 'clinton'), ('hong', 'kong'), ('tamil', 'nadu'), ('president', 'trump'), ('useful', 'tips'), ('san', 'francisco'), ('different', 'types'), ('hotels', 'short-term'), ('artificial', 'intelligence'), ('prime', 'minister'), ('years', 'ago'), ('literary', 'devices'), ('tv', 'series'), ('credit', 'card'), ('narendra', 'modi'), ('many', 'people'), ('digital', 'marketing'), ('new', 'zealand'), ('los', 'angeles'),

In [67]:
%%time
# create bigram vocabulary
bigram_measures = collocations.BigramAssocMeasures()


finder3 = nltk.BigramCollocationFinder.from_words(full_text)
finder3.apply_freq_filter(10)
finder3.apply_word_filter(lambda x: x in stopwords)
best_pmi = finder3.nbest(bigram_measures.pmi, 200)
print(best_pmi)

[('jiu', 'jitsu'), ('muhoozi', 'kainerugaba'), ('neman', 'ashraf'), ('roald', 'dahl'), ('rudyard', 'kipling'), ('michio', 'kaku'), ('avada', 'kedavra'), ('aam', 'aadmi'), ('buenos', 'aires'), ('jaggi', 'vasudev'), ('disha', 'patani'), ('deng', 'xiaoping'), ('ronda', 'rousey'), ('abercrombie', 'fitch'), ('zaira', 'wasim'), ('endoplasmic', 'reticulum'), ('nathuram', 'godse'), ('sushma', 'swaraj'), ('jiang', 'zemin'), ('vande', 'mataram'), ('meryl', 'streep'), ('pakatan', 'harapan'), ('asim', 'qureshi'), ('sylvia', 'plath'), ('lata', 'mangeshkar'), ('dima', 'vorobiev'), ('kalpit', 'veerwal'), ('sindhu', 'satish'), ('pradhan', 'mantri'), ('aldous', 'huxley'), ('narsee', 'monjee'), ('ulcerative', 'colitis'), ('gauri', 'lankesh'), ('hadron', 'collider'), ('ballon', "d'or"), ('mitt', 'romney'), ('petyr', 'baelish'), ('shel', 'silverstein'), ('khaled', 'hosseini'), ('sourav', 'ganguly'), ('satoshi', 'nakamoto'), ('tubal', 'ligation'), ('satya', 'nadella'), ('agatha', 'christie'), ('klux', 'kla

In [68]:
%%time
# create trigram vocabulary
trigram_measures = collocations.TrigramAssocMeasures()
finder = nltk.TrigramCollocationFinder.from_words(full_text)
trigram_vocab = finder.nbest(trigram_measures.likelihood_ratio, 20)
print(trigram_vocab)

[('united', 'states', 'america'), ('president', 'united', 'states'), ('united', 'states', 'india'), ('states', 'united', 'states'), ('history', 'united', 'states'), ('united', 'states', 'constitution'), ('united', 'states', 'government'), ('united', 'states', 'matter'), ('united', 'states', 'like'), ('united', 'states', 'army'), ('united', 'states', 'us'), ('united', 'states', 'usa'), ('united', 'states', 'united'), ('south', 'united', 'states'), ('coast', 'united', 'states'), ('canada', 'united', 'states'), ('united', 'states', 'military'), ('united', 'states', 'marine'), ('outside', 'united', 'states'), ('happen', 'united', 'states')]
Wall time: 7min 25s


In [69]:
%%time
# create text with bigram replacement
train['bigram_question_lkhd'] = ngram_to_corpus(clean_questions, bigram_vocab, 2)

Wall time: 2min 4s


In [70]:
%%time
# create text with both tri and bigram in text, by applying trigram first
train['trigram_question_lkhd'] = ngram_to_corpus(clean_questions, trigram_vocab, 3)
train['trigram_question_lkhd'] = ngram_to_corpus(train['trigram_question_lkhd'], bigram_vocab, 2)

Wall time: 3min 55s


In [71]:
train['trigram_question_lkhd'][20:25].values

array(['know whether girl done sex sex',
       'become fast learner professional career personal life',
       'united_states become largest dictatorship world',
       'strangest phenomenon know witnessed generated area electronics explanation terms modern physics',
       'leave friends find new ones'], dtype=object)

In [72]:
X = train[['bigram_question_lkhd','trigram_question_lkhd']]
y = train.target

In [73]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=495, test_size=0.2)

In [74]:
len(X_train)

1044897

In [75]:
X_train.head()

Unnamed: 0,bigram_question_lkhd,trigram_question_lkhd
1077331,procedure officially changing name india,procedure officially changing name india
334276,ancient egypt polytheism,ancient egypt polytheism
620299,whenever put blood pressure monitor get scared...,whenever put blood pressure monitor get scared...
1098236,ego react suicide,ego react suicide
548923,join tcs fresher missed campus hiring,join tcs fresher missed campus hiring


#### Bigram Model

In [76]:
# model using bigram text
X_train_bi, X_test_bi =  vect_trans(CountVectorizer(max_df=1.0,  min_df=1, ngram_range=(1,1), stop_words=stopwords),
                                   X_train.bigram_question_lkhd, X_test.bigram_question_lkhd,)



In [77]:
model = MultinomialNB()
model_label = 'Bigram_best_NB'

model_score(model, X_train_bi, X_test_bi, y_train, y_test, score_df, model_label)
cv_score(model, X_train_bi, y_train, model_label)
score_df

Train Accuracy : 0.938080978316523
Test Accuracy: 0.9373030912049
Test F1 score: 0.5532216705766818
Test Accuracy Mean: 0.9328144311909551
Test Accuracy STD: 0.00045544676447581347
Test F1: 0.5289402582650158


Unnamed: 0,Test_Accuracy,Test_F1_score,CV_Accuracy,CV_Acc_STD,CV_F1_score
Raw_token_NB,0.934414,0.564609,0.932161,0.000523,0.548922
Token_NB,0.936925,0.553019,0.932164,0.00045,0.526594
Bigram_NB,0.94693,0.393349,0.944193,0.000407,0.4763
Trigram_NB,0.944578,0.268956,0.944417,0.000398,0.502738
Tfidf_t_NB,0.941127,0.127768,0.939963,0.00016,0.096665
Bigram_best_NB,0.937303,0.553222,0.932814,0.000455,0.52894


#### Trigram Model

In [78]:
# model using bigram text
X_train_tri, X_test_tri =  vect_trans(CountVectorizer(max_df=1.0,  min_df=1, ngram_range=(1,1), stop_words=stopwords),
                                   X_train.trigram_question_lkhd, X_test.trigram_question_lkhd)



In [79]:
model = MultinomialNB()
model_label = 'Trigram_best_NB'

model_score(model, X_train_tri, X_test_tri, y_train, y_test, score_df, model_label)
cv_score(model, X_train_tri, y_train, model_label)
score_df

Train Accuracy : 0.9380943767663225
Test Accuracy: 0.9373069193224232
Test F1 score: 0.5531880064387635
Test Accuracy Mean: 0.9328211303966206
Test Accuracy STD: 0.00045313500457456573
Test F1: 0.5289273823841426


Unnamed: 0,Test_Accuracy,Test_F1_score,CV_Accuracy,CV_Acc_STD,CV_F1_score
Raw_token_NB,0.934414,0.564609,0.932161,0.000523,0.548922
Token_NB,0.936925,0.553019,0.932164,0.00045,0.526594
Bigram_NB,0.94693,0.393349,0.944193,0.000407,0.4763
Trigram_NB,0.944578,0.268956,0.944417,0.000398,0.502738
Tfidf_t_NB,0.941127,0.127768,0.939963,0.00016,0.096665
Bigram_best_NB,0.937303,0.553222,0.932814,0.000455,0.52894
Trigram_best_NB,0.937307,0.553188,0.932821,0.000453,0.528927


# Regenerate bi/trigrams after replacing previous ngrams

In [80]:
%%time
full_text_ngrams = []

for text in X_train.trigram_question_lkhd:
    full_text_ngrams += [w for w in nltk.word_tokenize(text) if w not in stopwords]

Wall time: 1min 39s


In [81]:
%%time
# create bigram vocabulary
bigram_measures = collocations.BigramAssocMeasures()

finder = nltk.BigramCollocationFinder.from_words(full_text_ngrams)
# scored = finder.score_ngrams( bigram_measures.likelihood_ratio  )
bigram_vocab = finder.nbest(bigram_measures.likelihood_ratio, 40)
print(bigram_vocab)

[('would_win', 'fight'), ('useful_tips', 'someone'), ('hotels_short-term', 'business'), ('many', 'times'), ('bad', 'neighborhoods'), ('business', 'travelers'), ('world', 'war'), ('much', 'cost'), ('advice', 'would'), ('good', 'idea'), ('supreme', 'court'), ('civil', 'war'), ('bits', 'pilani'), ('video', 'games'), ('las', 'vegas'), ('th', 'century'), ('mechanical', 'engineer'), ('bank', 'account'), ('best', 'friend'), ('personality', 'disorder'), ('data', 'science'), ('gordon', 'miller'), ('much', 'time'), ('first', 'time'), ('mutual', 'funds'), ('would', 'recommend'), ('good', 'hotels_short-term'), ('soviet', 'union'), ('best', 'place'), ('lesser', 'known'), ('electrical', 'engineering'), ('questions', 'asked'), ('sri', 'lanka'), ('someone', 'starting'), ('starting', 'work'), ("'ve", 'ever'), ('mental', 'health'), ('acting', 'style'), ('twin', 'flame'), ('rahul', 'gandhi')]
Wall time: 44.5 s


In [82]:
%%time
# create trigram vocabulary
trigram_measures = collocations.TrigramAssocMeasures()
finder = nltk.TrigramCollocationFinder.from_words(full_text_ngrams)
finder.apply_freq_filter(100)
trigram_vocab = finder.nbest(trigram_measures.pmi, 20)
print(trigram_vocab)

[('kim', 'jong', 'un'), ('controversial', 'events', 'mentioned'), ('borderline', 'personality', 'disorder'), ('hotels_short-term', 'business', 'travelers'), ('take_consideration', 'writing', 'biography'), ('rbi', 'grade', 'b'), ('avengers', 'infinity', 'war'), ('lesser', 'known', 'facts'), ('manufacturing', 'process', 'improved'), ('fifa', 'world', 'cup'), ('tips', 'write', 'summary'), ('characters', 'change', 'throughout'), ('useful_tips', 'someone', 'starting'), ('student', 'organizations', 'join'), ('writing', 'style', 'structure'), ('useful_tips', 'students', 'starting'), ('starting', 'first', 'semester'), ('download', 'test', 'bank'), ('things', 'weekends', 'student'), ('world', 'war', 'ii')]
Wall time: 30.8 s


In [83]:
%%time
# create text with bigram replacement
train['bigram_question_2'] = ngram_to_corpus(clean_questions, bigram_vocab, 2)

Wall time: 2min 3s


In [84]:
%%time
# create text with both tri and bigram in text, by applying trigram first
train['trigram_question_2'] = ngram_to_corpus(clean_questions, trigram_vocab, 3)
train['trigram_question_2'] = ngram_to_corpus(train['trigram_question_2'], bigram_vocab, 2)

Wall time: 3min 55s


In [85]:
X = train[['bigram_question_2','trigram_question_2']]
y = train.target

In [86]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=495, test_size=0.2)

In [87]:
len(X_train)

1044897

#### Bigram Model

In [88]:
# model using bigram text
X_train_bi, X_test_bi =  vect_trans(CountVectorizer(max_df=1.0,  min_df=1, ngram_range=(1,1), stop_words=stopwords),
                                   X_train.bigram_question_2, X_test.bigram_question_2,)



In [89]:
model = MultinomialNB()
model_label = 'Bigram_best_NB2'

model_score(model, X_train_bi, X_test_bi, y_train, y_test, score_df, model_label)
cv_score(model, X_train_bi, y_train, model_label)
score_df

Train Accuracy : 0.9374627355614955
Test Accuracy: 0.9365144989951192
Test F1 score: 0.5515898767034393
Test Accuracy Mean: 0.9322583954308508
Test Accuracy STD: 0.00044042110104467624
Test F1: 0.5289119949539701


Unnamed: 0,Test_Accuracy,Test_F1_score,CV_Accuracy,CV_Acc_STD,CV_F1_score
Raw_token_NB,0.934414,0.564609,0.932161,0.000523,0.548922
Token_NB,0.936925,0.553019,0.932164,0.00045,0.526594
Bigram_NB,0.94693,0.393349,0.944193,0.000407,0.4763
Trigram_NB,0.944578,0.268956,0.944417,0.000398,0.502738
Tfidf_t_NB,0.941127,0.127768,0.939963,0.00016,0.096665
Bigram_best_NB,0.937303,0.553222,0.932814,0.000455,0.52894
Trigram_best_NB,0.937307,0.553188,0.932821,0.000453,0.528927
Bigram_best_NB2,0.936514,0.55159,0.932258,0.00044,0.528912


#### Trigram Model

In [90]:
# model using bigram text
X_train_tri, X_test_tri =  vect_trans(CountVectorizer(max_df=1.0,  min_df=1, ngram_range=(1,1), stop_words=stopwords),
                                   X_train.trigram_question_2, X_test.trigram_question_2)



In [91]:
model = MultinomialNB()
model_label = 'Trigram_best_NB2'

model_score(model, X_train_tri, X_test_tri, y_train, y_test, score_df, model_label)
cv_score(model, X_train_tri, y_train, model_label)
score_df

Train Accuracy : 0.9374895324610942
Test Accuracy: 0.936529811465212
Test F1 score: 0.5515525262360705
Test Accuracy Mean: 0.9322995477858168
Test Accuracy STD: 0.0004353257097245323
Test F1: 0.5290324660119806


Unnamed: 0,Test_Accuracy,Test_F1_score,CV_Accuracy,CV_Acc_STD,CV_F1_score
Raw_token_NB,0.934414,0.564609,0.932161,0.000523,0.548922
Token_NB,0.936925,0.553019,0.932164,0.00045,0.526594
Bigram_NB,0.94693,0.393349,0.944193,0.000407,0.4763
Trigram_NB,0.944578,0.268956,0.944417,0.000398,0.502738
Tfidf_t_NB,0.941127,0.127768,0.939963,0.00016,0.096665
Bigram_best_NB,0.937303,0.553222,0.932814,0.000455,0.52894
Trigram_best_NB,0.937307,0.553188,0.932821,0.000453,0.528927
Bigram_best_NB2,0.936514,0.55159,0.932258,0.00044,0.528912
Trigram_best_NB2,0.93653,0.551553,0.9323,0.000435,0.529032
