In [1]:
# text manipulation
import re
import string

# Data management
import pandas as pd
import numpy as np
from scipy.sparse import *
import scipy

# NLP
import nltk
import nltk.collocations as collocations
from nltk.tag import tnt
import spacy

# modelling
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import f1_score, confusion_matrix

#visualisation
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
train = pd.read_csv('./train.csv')

In [3]:
no_insincere = train[train['target']==1].target.count()
no_sincere = train[train['target']==0].target.count()

print('No. of insincere questions:', no_insincere)
print('No. of sincere questions:', no_sincere)
print('% of insincere questions:', train.target.mean())
print('Null score:', 1- train.target.mean())

No. of insincere questions: 80810
No. of sincere questions: 1225312
% of insincere questions: 0.06187017751787352
Null score: 0.9381298224821265


# Define functions and pipelines

In [4]:
def vect_trans(vectorizer, X_train, X_test):
    # can also take a transformer
    vect = vectorizer
    vect.fit(X_train)
    return vect.transform(X_train), vect.transform(X_test)

In [5]:
# MultinominalNB function for printing scores and storing into df.
def model_score(model, X_train, X_test, y_train, y_test, score_df, model_label):
    estimator = model
    estimator.fit(X_train, y_train)
    test_score =  estimator.score(X_test, y_test)
    f1 = f1_score(y_test, estimator.predict(X_test))
    
    print('Train Accuracy :', estimator.score(X_train, y_train))
    print('Test Accuracy:', test_score)
    print('Test F1 score:', f1)
    score_df.loc[model_label, 'Test_Accuracy'] = test_score
    score_df.loc[model_label, 'Test_F1_score'] = f1

In [6]:
# Cross Validate function for printing scores and storing into df.
def cv_score(model, X, y, model_label,  cv=5, ):    
    
    # instantiating model
    estimator = model
    
    cv_result = cross_validate(estimator, X, y, cv = cv, n_jobs=-1, scoring=['accuracy', 'f1'])
    
    print('Test Accuracy Mean:',cv_result['test_accuracy'].mean())
    print('Test Accuracy STD:',cv_result['test_accuracy'].std())
    print('Test F1:', cv_result['test_f1'].mean())
    score_df.loc[model_label, 'CV_Accuracy'] = cv_result['test_accuracy'].mean()
    score_df.loc[model_label, 'CV_Acc_STD'] = cv_result['test_accuracy'].std()
    score_df.loc[model_label, 'CV_F1_score'] = cv_result['test_f1'].mean()

In [7]:
# GridSearchCV function, auto display best score and parameters and storing in df
def gridcv(model, X, y, params, cv= 5 ):
    
    # instantiating model can also be a pipeline
    estimator = model
    
    gridcv = GridSearchCV(estimator=estimator, param_grid=params, cv = cv, verbose=10, n_jobs=6)
    gridcv.fit(X, y)
    
    print(gridcv.best_params_)
    print(gridcv.best_score_)
    

In [8]:
# CountVectorizer pipeline and parameters
pipeCVNB = Pipeline([('CV',CountVectorizer(stop_words=stopwords)), 
                    ('NB',MultinomialNB())])

paramsCVNB = {'CV__max_df':(1.0, 0.9, 0.8, 0.7),
       'CV__min_df': (1, 2, 0.01 , 0.1, 0.2),
         'CV__ngram_range':((1,1), (1,2), (1,3))}

NameError: name 'stopwords' is not defined

In [9]:
# TfidfVectorizer pipeline and parameters
pipeTVNB = Pipeline([('TV',TfidfVectorizer(stop_words=stopwords)), 
                    ('NB',MultinomialNB())])

paramsTVNB = {'TV__max_df':(1.0, 0.9, 0.8, 0.7, 0.6),
       'TV__min_df': (1, 2, 0.01, 0.05, 0.1),
         'TV__ngram_range':((1,1), (1,2), (1,3), (2,2), (2,3))}

NameError: name 'stopwords' is not defined

In [10]:
score_df = pd.DataFrame()

# Lemmatization

In [14]:
# using spaCy to lemmatize using POS tags in one step, with out converting between WordNet and Treebank tags, using NLTK
spac = spacy.load('en', disable=['parser', 'ner'])
def lemmatizer(text):
    text = spac(text)
    return ' '.join([token.lemma_ for token in text if token.lemma_ not in stopwords])

In [23]:
# %%time
# lemma_q = [lemmatizer(q) for q in train.question_text]

Wall time: 48min 33s


In [37]:
# lemma_train = pd.DataFrame(lemma_q, columns = ['question_text'])
# lemma_train['target'] = train.target
# lemma_train.head()

Unnamed: 0,question_text,target
0,quebec nationalist see -PRON- province nation ...,0
1,-PRON- adopt dog would -PRON- encourage people...,0
2,velocity affect time velocity affect space geo...,0
3,otto von guericke use magdeburg hemisphere,0
4,-PRON- convert montra helicon mountain bike ch...,0
5,gaza slowly become auschwitz dachau treblinka ...,0
6,quora automatically ban conservative opinion r...,0
7,-PRON- crazy -PRON- wash wipe -PRON- grocery g...,0
8,thing dress moderately different dress modestly,0
9,-PRON- -PRON- -PRON- ever phase wherein -PRON-...,0


In [61]:
# lemma_train.to_pickle('./lemma_train.pkl')

In [12]:
lemma_train = pd.read_pickle('./lemma_train.pkl')

In [13]:
%%time
# remove digits
clean_questions = [''.join(c for c in q if not c.isdigit()) for q in lemma_train.question_text]

Wall time: 6.2 s


In [11]:
stopwords = list(nltk.corpus.stopwords.words('english')) + list(string.punctuation) + ["''", '``','’','“','”', "'s", "'d", "'ll", "'t", "n't", "ca", 'wo']

In [14]:
%%time
# remove stop words and lower all characters
clean_questions = [' '.join(w for w in nltk.word_tokenize(q.lower()) if w not in stopwords) for q in clean_questions]

Wall time: 2min 14s


In [15]:
# replacing all identity labels each question with a common labels
def labels_to_question(data, label_list, label_type):
         
    new_data = []
    
    # For every questions
    i_data = 0
    for i_data in range(len(data)):
        question = data[i_data].lower()
        output = []
        
        # compare each label to question
        for label in label_list:

            if label in question:

                que_t = nltk.word_tokenize(question)
                lab_t = nltk.word_tokenize(label)

                i_que = 0
                while i_que < len(que_t):
                    i_lab = 0
                    
                    # If current token is same as first label token, continue compare rest of the tokens. 
                    if que_t[i_que] == lab_t[0]:
                        que_t[i_que] = label_type
                        i_lab += 1
                        i_que += 1

                        # Remove trailing question tokens if they match trailing label tokens
                        while i_lab < len(lab_t):
                            if que_t[i_que] == lab_t[i_lab]:
                                que_t.pop(i_que)
                                i_lab += 1
                            else:
                                break
#                     elif que_t[i_que] == lab_t[0]:
#                         print('Question: ',question, i_data)
#                         print('label: ', label)
                    i_que += 1
                question = ' '.join(que_t)
#                 print('after: ', question)
#                 print('label: ', label)
        new_data.append(question)                   
        if i_data % 1000 == 0:
            clear_output(wait=True)
            display(i_data)
    return new_data


In [16]:
f = open('nationalities.txt', 'r')
nationalities = []
for n in f:
    nationalities.append(n.strip().lower())
f.close()
nationalities = set(nationalities)

In [17]:
# Identity groupd filters, created from online lists, most frequent insincere words and manual editing.
ID_filter = pd.read_csv('ID_filter.csv')

In [18]:
ID_filter.head(2)

Unnamed: 0,RELIGIOUS_ID,RACIAL_ID,NATIONAL_ID,NATIONALITY_ID,GENDER_ID,Unnamed: 5,Political_groups,Political_figure
0,buddhist,white people,Afghanistan,Afghans,girls,,trump supporters,donald trump
1,catholic,black people,Albania,Albanians,boys,,democrate,president trump


In [19]:
religious_ID = ID_filter.RELIGIOUS_ID.dropna()
racial_ID = ID_filter.RACIAL_ID.dropna()
national_ID = ID_filter.NATIONAL_ID.dropna()
nationality_ID = ID_filter.NATIONALITY_ID.dropna()



In [20]:
%%time
clean_questions = labels_to_question(clean_questions, religious_ID, 'RELIGIOUS_ID')
clean_questions = labels_to_question(clean_questions, racial_ID, 'RACIAL_ID')
clean_questions = labels_to_question(clean_questions, national_ID, 'NATIONAL_ID')
clean_questions = labels_to_question(clean_questions, nationality_ID, 'NATIONALITY_ID')

NameError: name 'clear_output' is not defined

In [21]:
X = clean_questions
y = lemma_train.target

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state = 495)

# Default count vectorizer on raw text

In [23]:
%%time

X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(train.question_text, train.target,
                                                                    stratify=train.target, random_state = 495)

X_train_raw_t, X_test_raw_t=  vect_trans(CountVectorizer(), X_train_raw, X_test_raw)

Wall time: 29.5 s


In [24]:
model = MultinomialNB()
model_label = 'Raw_token_NB'

model_score(model, X_train_raw_t, X_test_raw_t, y_train_raw, y_test_raw, score_df, model_label)
cv_score(model, X_train_raw_t, y_train_raw, model_label)
score_df

Train Accuracy : 0.9350739237089765
Test Accuracy: 0.9344135778838762
Test F1 score: 0.5646092542896641
Test Accuracy Mean: 0.9321614838567932
Test Accuracy STD: 0.0005228593021778298
Test F1: 0.5489215714930169


Unnamed: 0,Test_Accuracy,Test_F1_score,CV_Accuracy,CV_Acc_STD,CV_F1_score
Raw_token_NB,0.934414,0.564609,0.932161,0.000523,0.548922


In [25]:
nb = MultinomialNB()
nb.fit(X_train_raw_t, y_train_raw)  
test_score =  nb.score(X_test_raw_t, y_test_raw)
print('train score:', nb.score(X_train_raw_t, y_train_raw))
print('test score:', test_score)
y_pred = nb.predict(X_test_raw_t)

train score: 0.9350739237089765
test score: 0.9344135778838762


In [26]:
print(f1_score(y_test_raw, y_pred) )
print(f1_score(y_test_raw, y_pred, average='macro') )
print(f1_score(y_test_raw, y_pred, average='micro') )
print(f1_score(y_test_raw, y_pred, average='weighted') )
confusion_matrix(y_test_raw, y_pred)

0.5646092542896641
0.7645724512273393
0.9344135778838762
0.9397915566837656


array([[291229,  15099],
       [  6317,  13886]], dtype=int64)

# Basic Token and Ngram modelling on cleaned data

Creating train/test sets

In [27]:
X = clean_questions
y = train.target

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state = 495)

### Tokens only

In [29]:
%%time
X_train_t, X_test_t=  vect_trans(CountVectorizer(max_df=1.0, min_df=1, ngram_range=(1,1)), X_train, X_test)

Wall time: 19.5 s


In [30]:
model = MultinomialNB()
model_label = 'Token_NB'
X_train_arg = X_train_t
X_test_arg = X_test_t

model_score(model, X_train_arg, X_test_arg, y_train, y_test, score_df, model_label)
cv_score(model, X_train_arg, y_train, model_label)
score_df

Train Accuracy : 0.9363397581235434
Test Accuracy: 0.9363429505927462
Test F1 score: 0.5443665059184568
Test Accuracy Mean: 0.9314601699428223
Test Accuracy STD: 0.000542872136791639
Test F1: 0.516455592315815


Unnamed: 0,Test_Accuracy,Test_F1_score,CV_Accuracy,CV_Acc_STD,CV_F1_score
Raw_token_NB,0.934414,0.564609,0.932161,0.000523,0.548922
Token_NB,0.936343,0.544367,0.93146,0.000543,0.516456


### All Bigrams

In [31]:
%%time
X_train_bi, X_test_bi=  vect_trans(CountVectorizer(ngram_range=(1,2)), X_train, X_test)

Wall time: 52.1 s


Naive Bayes

In [32]:
model = MultinomialNB()
model_label = 'Bigram_NB'
X_train_arg = X_train_bi
X_test_arg = X_test_bi

model_score(model, X_train_arg, X_test_arg, y_train, y_test, score_df, model_label)
cv_score(model, X_train_arg, y_train, model_label)
score_df

Train Accuracy : 0.9642411986226905
Test Accuracy: 0.9461031265025373
Test F1 score: 0.4236072446205745
Test Accuracy Mean: 0.9378138426556836
Test Accuracy STD: 0.00017382141980152566
Test F1: 0.4507294031911434


Unnamed: 0,Test_Accuracy,Test_F1_score,CV_Accuracy,CV_Acc_STD,CV_F1_score
Raw_token_NB,0.934414,0.564609,0.932161,0.000523,0.548922
Token_NB,0.936343,0.544367,0.93146,0.000543,0.516456
Bigram_NB,0.946103,0.423607,0.937814,0.000174,0.450729


Random Forest

In [33]:
# %%time
# model = RandomForestClassifier(n_estimators= 100, max_depth=20, n_jobs=-1)
# model_label = 'Bigram_RF'


# model_score(model, X_train_bi, X_test_bi, y_train, y_test, score_df, model_label)
# cv_score(model, X_train_bi, y_train, model_label)
# score_df

In [34]:
# score_df

### Tri-gram

In [35]:
%%time
X_train_tri, X_test_tri=  vect_trans(CountVectorizer(ngram_range=(1,3), stop_words=stopwords), X_train, X_test)

Wall time: 1min 28s


Naive Bayes

In [36]:
model = MultinomialNB()
model_label = 'Trigram_NB'
X_train_arg = X_train_tri
X_test_arg = X_test_tri

model_score(model, X_train_arg, X_test_arg, y_train, y_test, score_df, model_label)
cv_score(model, X_train_arg, y_train, model_label)
score_df

Train Accuracy : 0.9808746711637816
Test Accuracy: 0.9442441912100229
Test F1 score: 0.28170125463584
Test Accuracy Mean: 0.9393644896978772
Test Accuracy STD: 0.0003049995625848121
Test F1: 0.46121678736240074


Unnamed: 0,Test_Accuracy,Test_F1_score,CV_Accuracy,CV_Acc_STD,CV_F1_score
Raw_token_NB,0.934414,0.564609,0.932161,0.000523,0.548922
Token_NB,0.936343,0.544367,0.93146,0.000543,0.516456
Bigram_NB,0.946103,0.423607,0.937814,0.000174,0.450729
Trigram_NB,0.944244,0.281701,0.939364,0.000305,0.461217


Random Forest

In [37]:
# %%time
# model = RandomForestClassifier(n_estimators= 100, max_depth=20, n_jobs=-1)
# model_label = 'Trigram_RF'

# model_score(model, X_train_tri, X_test_tri, y_train, y_test, score_df, model_label)
# cv_score(model, X_train_tri, y_train, model_label)


In [38]:
# score_df

### GridSearch min/max df

In [39]:
# params = {'CV__max_df':(1.0, 0.9),
#        'CV__min_df': (1, 2, 0.01, 0.02),
#         'CV__ngram_range':((1,1), (1,2), (1,3))}

# gridcv(pipeCVNB, X_train, y_train, params)

In [40]:
# %%time
# X_train_t, X_test_t=  vect_trans(CountVectorizer(max_df=1.0, min_df=1, ngram_range=(1,2), 
#                                                     stop_words=stopwords), X_train, X_test)

In [41]:
# model = MultinomialNB()
# model_label = 'Grid_DFNB'

# model_score(model, X_train_t, X_test_t, y_train, y_test, score_df, model_label)
# cv_score(model, X_train_t, y_train, model_label)
# score_df

### TFIDF

In [42]:
X_train_tf_t, X_test_tf_t = vect_trans(TfidfTransformer(), X_train_t, X_test_t)

In [43]:
model = MultinomialNB()
model_label = 'Tfidf_t_NB'

model_score(model, X_train_tf_t , X_test_tf_t, y_train, y_test, score_df, model_label)
cv_score(model, X_train_tf_t, y_train, model_label)
score_df

Train Accuracy : 0.9422473256695907
Test Accuracy: 0.9411357574012881
Test F1 score: 0.13007467752885268
Test Accuracy Mean: 0.9398626572621417
Test Accuracy STD: 9.97902774433833e-05
Test F1: 0.09848172230931387


Unnamed: 0,Test_Accuracy,Test_F1_score,CV_Accuracy,CV_Acc_STD,CV_F1_score
Raw_token_NB,0.934414,0.564609,0.932161,0.000523,0.548922
Token_NB,0.936343,0.544367,0.93146,0.000543,0.516456
Bigram_NB,0.946103,0.423607,0.937814,0.000174,0.450729
Trigram_NB,0.944244,0.281701,0.939364,0.000305,0.461217
Tfidf_t_NB,0.941136,0.130075,0.939863,0.0001,0.098482


In [44]:
nb = MultinomialNB()
nb.fit(X_train_tf_t, y_train)  
test_score =  nb.score(X_test_tf_t, y_test)
print('train score:', nb.score(X_train_tf_t, y_train))
print('test score:', test_score)
y_pred = nb.predict(X_test_tf_t)

train score: 0.9422473256695907
test score: 0.9411357574012881


In [45]:
print(f1_score(y_test, y_pred) )
print(f1_score(y_test, y_pred, average='macro') )
print(f1_score(y_test, y_pred, average='micro') )
print(f1_score(y_test, y_pred, average='weighted') )
confusion_matrix(y_test, y_pred)

0.13007467752885268
0.5498059558236386
0.9411357574012881
0.9175983308266111


array([[305873,    455],
       [ 18766,   1437]], dtype=int64)

### NLTK Best Bigrams

In [46]:
# recreate text using ngrams
def ngram_to_corpus(data, ngram_list, n):
#     ngram_list = set({('let', 'us'), ('as', 'soon')})  # {('let', 'us'), ('as', 'soon')}
#     tokens = ['please', 'let', 'us', 'know', 'as', 'soon', 'as', 'possible']
    new_data = []
    for text in data:
        tokens = nltk.word_tokenize(text)
        output = []
        q_iter = iter(range(len(tokens)))
        
        for idx in q_iter:
            output.append(tokens[idx])
            if n == 2:
                if idx < (len(tokens) - 1) and (tokens[idx], tokens[idx+1]) in ngram_list:
                    output[-1] += '_' + tokens[idx+1]
                    next(q_iter)
            elif n == 3:
                if idx < (len(tokens) - 2) and (tokens[idx], tokens[idx+1], tokens[idx+2] ) in ngram_list:
                    output[-1] += '_' + tokens[idx+1] + '_' + tokens[idx+2]
                    next(q_iter)
                    next(q_iter)
        new_data.append( ' '.join(output))

    return new_data

In [47]:
%%time
# create one list of all question tokens
full_text = []

for text in X_train:
    full_text += [w for w in nltk.word_tokenize(text) if w not in stopwords]

Wall time: 1min 42s


In [48]:
len(full_text)

7101029

In [49]:
if 'would' in stopwords:
    print(True)

In [50]:
%%time
# create bigram vocabulary
bigram_measures = collocations.BigramAssocMeasures()

finder = nltk.BigramCollocationFinder.from_words(full_text)
# scored = finder.score_ngrams( bigram_measures.likelihood_ratio  )
bigram_vocab = finder.nbest(bigram_measures.likelihood_ratio, 80)
print(bigram_vocab)

[('-pron-', '-pron-'), ('-pron-', 'get'), ('united', 'states'), ('year', 'old'), ('good', 'way'), ('donald', 'trump'), ('-pron-', 'think'), ('would', '-pron-'), ('-pron-', 'want'), ('-pron-', 'possible'), ('computer', 'science'), ('-pron-', 'find'), ('even', 'though'), ('north', 'korea'), ('high', 'school'), ('social', 'medium'), ('-pron-', 'feel'), ('-pron-', 'know'), ('would', 'happen'), ('get', 'rid'), ('major', 'accomplishment'), ('jee', 'mains'), ('look', 'like'), ('pro', 'con'), ('-pron-', 'ever'), ('-pron-', 'take'), ('tell', '-pron-'), ('new', 'york'), ('-pron-', 'need'), ('feel', 'like'), ('would', 'win'), ('tv', 'show'), ('harry', 'potter'), ('real', 'estate'), ('ssc', 'cgl'), ('saudi', 'arabia'), ('good', '-pron-'), ('star', 'wars'), ('mechanical', 'engineering'), ('good', 'place'), ('programming', 'language'), ('elon', 'musk'), ('hillary', 'clinton'), ('credit', 'card'), ('-pron-', 'favorite'), ('hong', 'kong'), ('mutual', 'fund'), ('tamil', 'nadu'), ('-pron-', 'true'), ('v

In [51]:
%%time
# create bigram vocabulary
bigram_measures = collocations.BigramAssocMeasures()


finder3 = nltk.BigramCollocationFinder.from_words(full_text)
finder3.apply_freq_filter(10)
finder3.apply_word_filter(lambda x: x in stopwords)
best_pmi = finder3.nbest(bigram_measures.pmi, 200)
print(best_pmi)

[('pepto', 'bismol'), ('muhoozi', 'kainerugaba'), ('avada', 'kedavra'), ('michio', 'kaku'), ('neman', 'ashraf'), ('roald', 'dahl'), ('aam', 'aadmi'), ('buenos', 'aires'), ('jaggi', 'vasudev'), ('disha', 'patani'), ('ronda', 'rousey'), ('deng', 'xiaoping'), ('abercrombie', 'fitch'), ('zaira', 'wasim'), ('endoplasmic', 'reticulum'), ('nathuram', 'godse'), ('sushma', 'swaraj'), ('jiang', 'zemin'), ('vande', 'mataram'), ('pakatan', 'harapan'), ('asim', 'qureshi'), ('lata', 'mangeshkar'), ('sylvia', 'plath'), ('kalpit', 'veerwal'), ('sindhu', 'satish'), ('meryl', 'streep'), ('looney', 'tunes'), ('pradhan', 'mantri'), ('aldous', 'huxley'), ('dima', 'vorobiev'), ('ulcerative', 'colitis'), ('narsee', 'monjee'), ('gauri', 'lankesh'), ('mosin', 'nagant'), ('sonu', 'nigam'), ('jiu', 'jitsu'), ('shel', 'silverstein'), ('mitt', 'romney'), ('khaled', 'hosseini'), ('petyr', 'baelish'), ('sourav', 'ganguly'), ('satoshi', 'nakamoto'), ('ballon', "d'or"), ('satya', 'nadella'), ('agatha', 'christie'), ('

In [52]:
%%time
# create trigram vocabulary
trigram_measures = collocations.TrigramAssocMeasures()
finder = nltk.TrigramCollocationFinder.from_words(full_text)
trigram_vocab = finder.nbest(trigram_measures.likelihood_ratio, 20)
print(trigram_vocab)

[('-pron-', '-pron-', '-pron-'), ('-pron-', '-pron-', 'get'), ('-pron-', '-pron-', 'think'), ('would', '-pron-', '-pron-'), ('-pron-', '-pron-', 'want'), ('-pron-', '-pron-', 'find'), ('-pron-', '-pron-', 'possible'), ('would', '-pron-', 'get'), ('-pron-', '-pron-', 'feel'), ('-pron-', '-pron-', 'know'), ('tell', '-pron-', '-pron-'), ('good', '-pron-', '-pron-'), ('-pron-', '-pron-', 'ever'), ('-pron-', '-pron-', 'take'), ('-pron-', '-pron-', 'need'), ('-pron-', 'get', '-pron-'), ('-pron-', '-pron-', 'see'), ('-pron-', 'get', 'rid'), ('-pron-', '-pron-', 'favorite'), ('-pron-', '-pron-', 'true')]
Wall time: 7min 39s


In [53]:
%%time
# create text with bigram replacement
train['bigram_question_lkhd'] = ngram_to_corpus(clean_questions, bigram_vocab, 2)

Wall time: 2min 21s


In [54]:
%%time
# create text with both tri and bigram in text, by applying trigram first
train['trigram_question_lkhd'] = ngram_to_corpus(clean_questions, trigram_vocab, 3)
train['trigram_question_lkhd'] = ngram_to_corpus(train['trigram_question_lkhd'], bigram_vocab, 2)

Wall time: 4min 15s


In [55]:
train['trigram_question_lkhd'][20:25].values

array(['-pron-_know whether girl sex sex -pron-',
       '-pron- become fast learner -pron- professional career -pron- personal life',
       'united_states become large dictatorship world',
       'strange phenomenon -pron-_know witness generate area electronic explanation term modern physics',
       '-pron- leave -pron- friend find new one'], dtype=object)

In [56]:
X = train[['bigram_question_lkhd','trigram_question_lkhd']]
y = train.target

In [57]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=495, test_size=0.2)

In [58]:
len(X_train)

1044897

In [59]:
X_train.head()

Unnamed: 0,bigram_question_lkhd,trigram_question_lkhd
1077331,procedure officially change name india,procedure officially change name india
334276,ancient egypt polytheism,ancient egypt polytheism
620299,whenever -pron- put blood pressure monitor -pr...,whenever -pron- put blood pressure monitor -pr...
1098236,ego react suicide,ego react suicide
548923,-pron- join tcs fresher -pron- miss campus hiring,-pron- join tcs fresher -pron- miss campus hiring


#### Bigram Model

In [60]:
# model using bigram text
X_train_bi, X_test_bi =  vect_trans(CountVectorizer(max_df=1.0,  min_df=1, ngram_range=(1,1), stop_words=stopwords),
                                   X_train.bigram_question_lkhd, X_test.bigram_question_lkhd,)



In [61]:
model = MultinomialNB()
model_label = 'Bigram_best_NB'

model_score(model, X_train_bi, X_test_bi, y_train, y_test, score_df, model_label)
cv_score(model, X_train_bi, y_train, model_label)
score_df

Train Accuracy : 0.936412871316503
Test Accuracy: 0.9361737965355537
Test F1 score: 0.5445157765332604
Test Accuracy Mean: 0.9316650352730174
Test Accuracy STD: 0.0003564139612948782
Test F1: 0.5184665706076068


Unnamed: 0,Test_Accuracy,Test_F1_score,CV_Accuracy,CV_Acc_STD,CV_F1_score
Raw_token_NB,0.934414,0.564609,0.932161,0.000523,0.548922
Token_NB,0.936343,0.544367,0.93146,0.000543,0.516456
Bigram_NB,0.946103,0.423607,0.937814,0.000174,0.450729
Trigram_NB,0.944244,0.281701,0.939364,0.000305,0.461217
Tfidf_t_NB,0.941136,0.130075,0.939863,0.0001,0.098482
Bigram_best_NB,0.936174,0.544516,0.931665,0.000356,0.518467


#### Trigram Model

In [62]:
# model using bigram text
X_train_tri, X_test_tri =  vect_trans(CountVectorizer(max_df=1.0,  min_df=1, ngram_range=(1,1), stop_words=stopwords),
                                   X_train.trigram_question_lkhd, X_test.trigram_question_lkhd)



In [63]:
model = MultinomialNB()
model_label = 'Trigram_best_NB'

model_score(model, X_train_tri, X_test_tri, y_train, y_test, score_df, model_label)
cv_score(model, X_train_tri, y_train, model_label)
score_df

Train Accuracy : 0.9364157424128885
Test Accuracy: 0.9361852808881233
Test F1 score: 0.5446848027968972
Test Accuracy Mean: 0.9316525939460216
Test Accuracy STD: 0.0003618392672776735
Test F1: 0.5184340670776231


Unnamed: 0,Test_Accuracy,Test_F1_score,CV_Accuracy,CV_Acc_STD,CV_F1_score
Raw_token_NB,0.934414,0.564609,0.932161,0.000523,0.548922
Token_NB,0.936343,0.544367,0.93146,0.000543,0.516456
Bigram_NB,0.946103,0.423607,0.937814,0.000174,0.450729
Trigram_NB,0.944244,0.281701,0.939364,0.000305,0.461217
Tfidf_t_NB,0.941136,0.130075,0.939863,0.0001,0.098482
Bigram_best_NB,0.936174,0.544516,0.931665,0.000356,0.518467
Trigram_best_NB,0.936185,0.544685,0.931653,0.000362,0.518434


# Regenerate bi/trigrams after replacing previous ngrams

In [64]:
%%time
full_text_ngrams = []

for text in X_train.trigram_question_lkhd:
    full_text_ngrams += [w for w in nltk.word_tokenize(text) if w not in stopwords]

Wall time: 1min 47s


In [65]:
%%time
# create bigram vocabulary
bigram_measures = collocations.BigramAssocMeasures()

finder = nltk.BigramCollocationFinder.from_words(full_text_ngrams)
# scored = finder.score_ngrams( bigram_measures.likelihood_ratio  )
bigram_vocab = finder.nbest(bigram_measures.likelihood_ratio, 40)
print(bigram_vocab)

[('-pron-', '-pron-'), ('long', '-pron-_take'), ('good', '-pron-'), ('-pron-_feel', 'like'), ('-pron-', 'start'), ('-pron-', 'stop'), ('-pron-', 'mean'), ('south', 'korea'), ('-pron-', 'go'), ('would_win', 'fight'), ('good', 'book'), ('lose', 'weight'), ('trump', 'supporter'), ('-pron-', 'use'), ('civil', 'engineering'), ('kim', 'jong'), ('real', 'life'), ('advantage', 'disadvantage'), ('hotel', 'short_term'), ('fall', 'love'), ('-pron-', 'buy'), ('different', 'type'), ('useful_tip', 'someone'), ('police', 'officer'), ('mechanical', 'engineer'), ('commit', 'suicide'), ('international', 'student'), ('advice', 'would_-pron-'), ('much', 'money'), ('-pron-', 'get'), ('global', 'warming'), ('south', 'africa'), ('-pron-', 'prepare'), ('-pron-', 'life'), ('jong', 'un'), ('-pron-', 'opinion'), ('chance', 'get'), ('world', 'war'), ('personality', 'disorder'), ('silicon', 'valley')]
Wall time: 40.2 s


In [66]:
%%time
# create trigram vocabulary
trigram_measures = collocations.TrigramAssocMeasures()
finder = nltk.TrigramCollocationFinder.from_words(full_text_ngrams)
finder.apply_freq_filter(100)
trigram_vocab = finder.nbest(trigram_measures.pmi, 20)
print(trigram_vocab)

[('kim', 'jong', 'un'), ('borderline', 'personality', 'disorder'), ('controversial', 'event', 'mention'), ('short_term', 'business', 'traveler'), ('rbi', 'grade', 'b'), ('consideration', 'write', 'biography'), ('hotel', 'short_term', 'business'), ('fifa', 'world', 'cup'), ('writing', 'style', 'structure'), ('tip', 'write', 'summary'), ('manufacturing', 'process', 'improve'), ('-pron-_take', 'consideration', 'write'), ('character', 'change', 'throughout'), ('good', 'hotel', 'short_term'), ('world', 'war', 'ii'), ('download', 'test', 'bank'), ('useful_tip', 'someone', 'start'), ('-pron-', 'wan', 'na'), ('student', 'organization', 'join'), ('less', 'know', 'fact')]
Wall time: 35.7 s


In [67]:
%%time
# create text with bigram replacement
train['bigram_question_2'] = ngram_to_corpus(clean_questions, bigram_vocab, 2)

Wall time: 2min 8s


In [68]:
%%time
# create text with both tri and bigram in text, by applying trigram first
train['trigram_question_2'] = ngram_to_corpus(clean_questions, trigram_vocab, 3)
train['trigram_question_2'] = ngram_to_corpus(train['trigram_question_2'], bigram_vocab, 2)

Wall time: 4min 6s


In [69]:
X = train[['bigram_question_2','trigram_question_2']]
y = train.target

In [70]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=495, test_size=0.2)

In [71]:
len(X_train)

1044897

#### Bigram Model

In [72]:
# model using bigram text
X_train_bi, X_test_bi =  vect_trans(CountVectorizer(max_df=1.0,  min_df=1, ngram_range=(1,1), stop_words=stopwords),
                                   X_train.bigram_question_2, X_test.bigram_question_2,)



In [73]:
model = MultinomialNB()
model_label = 'Bigram_best_NB2'

model_score(model, X_train_bi, X_test_bi, y_train, y_test, score_df, model_label)
cv_score(model, X_train_bi, y_train, model_label)
score_df

Train Accuracy : 0.9361803125092713
Test Accuracy: 0.9359441094841612
Test F1 score: 0.5431761718856644
Test Accuracy Mean: 0.9314449178971914
Test Accuracy STD: 0.00025738958358852273
Test F1: 0.5185952596975845


Unnamed: 0,Test_Accuracy,Test_F1_score,CV_Accuracy,CV_Acc_STD,CV_F1_score
Raw_token_NB,0.934414,0.564609,0.932161,0.000523,0.548922
Token_NB,0.936343,0.544367,0.93146,0.000543,0.516456
Bigram_NB,0.946103,0.423607,0.937814,0.000174,0.450729
Trigram_NB,0.944244,0.281701,0.939364,0.000305,0.461217
Tfidf_t_NB,0.941136,0.130075,0.939863,0.0001,0.098482
Bigram_best_NB,0.936174,0.544516,0.931665,0.000356,0.518467
Trigram_best_NB,0.936185,0.544685,0.931653,0.000362,0.518434
Bigram_best_NB2,0.935944,0.543176,0.931445,0.000257,0.518595


#### Trigram Model

In [74]:
# model using bigram text
X_train_tri, X_test_tri =  vect_trans(CountVectorizer(max_df=1.0,  min_df=1, ngram_range=(1,1), stop_words=stopwords),
                                   X_train.trigram_question_2, X_test.trigram_question_2)



In [75]:
model = MultinomialNB()
model_label = 'Trigram_best_NB2'

model_score(model, X_train_tri, X_test_tri, y_train, y_test, score_df, model_label)
cv_score(model, X_train_tri, y_train, model_label)
score_df

Train Accuracy : 0.9362128516016411
Test Accuracy: 0.9359747344243469
Test F1 score: 0.5432698872170185
Test Accuracy Mean: 0.9314937264459875
Test Accuracy STD: 0.00023703741512209992
Test F1: 0.5186372571655607


Unnamed: 0,Test_Accuracy,Test_F1_score,CV_Accuracy,CV_Acc_STD,CV_F1_score
Raw_token_NB,0.934414,0.564609,0.932161,0.000523,0.548922
Token_NB,0.936343,0.544367,0.93146,0.000543,0.516456
Bigram_NB,0.946103,0.423607,0.937814,0.000174,0.450729
Trigram_NB,0.944244,0.281701,0.939364,0.000305,0.461217
Tfidf_t_NB,0.941136,0.130075,0.939863,0.0001,0.098482
Bigram_best_NB,0.936174,0.544516,0.931665,0.000356,0.518467
Trigram_best_NB,0.936185,0.544685,0.931653,0.000362,0.518434
Bigram_best_NB2,0.935944,0.543176,0.931445,0.000257,0.518595
Trigram_best_NB2,0.935975,0.54327,0.931494,0.000237,0.518637
