# Import packages

In [1]:
import pandas as pd
import numpy as np
import os

# fastext
import fasttext

from text_cleaning_utils import TextCleaningUtils
from sklearn.model_selection import train_test_split
from fast_text import FastTextUtils

from tabulate import tabulate

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [2]:
df = pd.read_csv('./data/train.csv')

In [3]:
df.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [4]:
df[df['target'] == 1].head()

Unnamed: 0,qid,question_text,target
22,0000e91571b60c2fb487,Has the United States become the largest dicta...,1
30,00013ceca3f624b09f42,Which babies are more sweeter to their parents...,1
110,0004a7fcb2bf73076489,If blacks support school choice and mandatory ...,1
114,00052793eaa287aff1e1,I am gay boy and I love my cousin (boy). He is...,1
115,000537213b01fd77b58a,Which races have the smallest penis?,1


In [5]:
df.shape

(1306122, 3)

# Text Cleaning

In [6]:
def clean_data(df,col_to_clean):
  df[col_to_clean] = df[col_to_clean].apply(TextCleaningUtils.replace_contractions)
#   Remove Smiles and special chars
#   df[col_to_clean] = df[col_to_clean].apply(TextCleaningUtils.transform_emojis)
  df[col_to_clean] = df[col_to_clean].apply(TextCleaningUtils.remove_special_chars)
  df[col_to_clean] = df[col_to_clean].apply(TextCleaningUtils.remove_redundant_spaces)
  df[col_to_clean] = df[col_to_clean].apply(TextCleaningUtils.remove_punctuations)
  df[col_to_clean] = df[col_to_clean].apply(TextCleaningUtils.remove_exaggerated_words)
  df[col_to_clean] = df[col_to_clean].apply(TextCleaningUtils.remove_redundant_newlines)
  df[col_to_clean] = df[col_to_clean].apply(TextCleaningUtils.remove_twitter_handles)
  df[col_to_clean] = df[col_to_clean].apply(TextCleaningUtils.remove_web_links)
#   df[col_to_clean] = df[col_to_clean].apply(TextCleaningUtils.replace_sign)
  df[col_to_clean] = df[col_to_clean].astype(str)
  df[col_to_clean] = df[col_to_clean].str.lower()
  return df

In [7]:
clean_df = clean_data(df,'question_text')

In [8]:
clean_df.to_csv('./data/train_clean.csv',index=False,encoding='utf-8')

In [9]:
clean_df.shape

(1306122, 3)

In [10]:
clean_df.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,how did quebec nationalists see their province...,0
1,000032939017120e6e44,do you have an adopted dog how would you enco...,0
2,0000412ca6e4628ce2cf,why does velocity affect time does velocity a...,0
3,000042bf85aa498cd78e,how did otto von guericke used the magdeburg h...,0
4,0000455dfa3e01eae3af,can i convert montra helicon d to a mountain b...,0


In [11]:
clean_df[clean_df['target']== 1].head()

Unnamed: 0,qid,question_text,target
22,0000e91571b60c2fb487,has the united states become the largest dicta...,1
30,00013ceca3f624b09f42,which babies are more sweeter to their parents...,1
110,0004a7fcb2bf73076489,if blacks support school choice and mandatory ...,1
114,00052793eaa287aff1e1,i am gay boy and i love my cousin boy he is s...,1
115,000537213b01fd77b58a,which races have the smallest penis,1


In [12]:
map_={0:'__label__0',
     1:'__label__1'}
clean_df['label'] = clean_df['target'].map(map_)

In [13]:
# train test split

In [14]:
clean_df.head()

Unnamed: 0,qid,question_text,target,label
0,00002165364db923c7e6,how did quebec nationalists see their province...,0,__label__0
1,000032939017120e6e44,do you have an adopted dog how would you enco...,0,__label__0
2,0000412ca6e4628ce2cf,why does velocity affect time does velocity a...,0,__label__0
3,000042bf85aa498cd78e,how did otto von guericke used the magdeburg h...,0,__label__0
4,0000455dfa3e01eae3af,can i convert montra helicon d to a mountain b...,0,__label__0


In [15]:
train_df, test_df = train_test_split(clean_df, random_state=50, stratify=clean_df['label'], test_size=0.1)

In [16]:
train_df.shape

(1175509, 4)

In [17]:
test_df.shape

(130613, 4)

In [18]:
# train_df.to_csv()
# test_df.to_csv()

In [19]:
# analysis

In [20]:
# convert csv to fwf format

In [21]:
config={
    'data_folder':'/Users/shishirkumar/shishirkmr/Git/data_science/Quora Insincere Questions Classification/Quora-Insincere-Questions-Classification/data/',
    'training_csv_file':'train.csv',
    'training_fwf_file':'train_fwf.train',
    'model_folder': r'/Users/shishirkumar/jupyter_notebook/NLP/pretrained_models/',
    'pretrained_model': 'wiki-news-300d-1M.vec',
    'model_version': 'm1'
}


In [22]:
def to_fwf(df, fname):
    content = tabulate(df.values.tolist(), list(df.columns), tablefmt="plain")
    content = content[(content.find('\n') + 1):]
    open(fname, "w").write(content)
pd.DataFrame.to_fwf = to_fwf

In [23]:
train_df=train_df[['label','question_text']]

In [24]:
train_df.head()

Unnamed: 0,label,question_text
1278191,__label__0,what are the two electrodes in a conductivity ...
916255,__label__0,how do i figure out my marriage
1144328,__label__0,how do web developers collaborate between othe...
12107,__label__0,what rock tirumala venkateswara temple is built
431956,__label__0,is knowing too many people a good thing or a b...


In [25]:
config['data_folder']+config['training_fwf_file']

'/Users/shishirkumar/shishirkmr/Git/data_science/Quora Insincere Questions Classification/Quora-Insincere-Questions-Classification/data/train_fwf.train'

In [26]:
train_df.to_fwf(config['data_folder']+config['training_fwf_file'])

In [27]:
train_fn = os.path.join(config['data_folder'], config['training_fwf_file'])

In [28]:
pretrainedvec_fn = os.path.join(config['model_folder'], config['pretrained_model'])
model_fn = os.path.join(config['data_folder'], 'models','{}_{}.bin'.format('qoura_question',config['model_version']))

In [29]:
print('train_fn:{} \n model_fn:{} \n pretrainedvec_fn:{}'.format(train_fn,model_fn,pretrainedvec_fn))

train_fn:/Users/shishirkumar/shishirkmr/Git/data_science/Quora Insincere Questions Classification/Quora-Insincere-Questions-Classification/data/train_fwf.train 
 model_fn:/Users/shishirkumar/shishirkmr/Git/data_science/Quora Insincere Questions Classification/Quora-Insincere-Questions-Classification/data/models/qoura_question_m1.bin 
 pretrainedvec_fn:/Users/shishirkumar/jupyter_notebook/NLP/pretrained_models/wiki-news-300d-1M.vec


In [30]:
%%time
model = fasttext.train_supervised(input=train_fn,
                                  pretrainedVectors=pretrainedvec_fn,
                                  dim=300, 
                                  wordNgrams=2, 
                                  minCount=3, 
                                  epoch=20)

CPU times: user 10min 31s, sys: 1min 38s, total: 12min 10s
Wall time: 9min 39s


In [31]:
model_fn

'/Users/shishirkumar/shishirkmr/Git/data_science/Quora Insincere Questions Classification/Quora-Insincere-Questions-Classification/data/models/qoura_question_m1.bin'

In [32]:
model.save_model(model_fn)

In [33]:
def get_proba(x):
    if x[0][0] == '__label__0':
        return 1 - x[1][0] if x[1][0]<= 1 else 0
    elif x[0][0] == '__label__1':
        return x[1][0] if x[1][0]<= 1 else 1

train_df['threshold'] = train_df['question_text'].apply(lambda x: get_proba(model.predict(x,k=2)))
test_df['threshold'] = test_df['question_text'].apply(lambda x: get_proba(model.predict(x,k=2)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [34]:
train_df['predict'] = train_df['question_text'].apply(lambda x: model.predict(x,k=1)[0][0])
train_df['predict'] = train_df['predict'].apply(lambda x: int(x.replace('__label__','')))

train_df['target'] = train_df['label'].apply(lambda x: int(x.replace('__label__','')))

train_df['predict_proba'] = train_df['question_text'].apply(lambda x: model.predict(x,k=1)[1][0])

In [35]:
train_df.head()

Unnamed: 0,label,question_text,threshold,predict,target,predict_proba
1278191,__label__0,what are the two electrodes in a conductivity ...,0.0,0,0,1.00001
916255,__label__0,how do i figure out my marriage,0.0,0,0,1.00001
1144328,__label__0,how do web developers collaborate between othe...,0.0,0,0,1.00001
12107,__label__0,what rock tirumala venkateswara temple is built,0.0,0,0,1.00001
431956,__label__0,is knowing too many people a good thing or a b...,9.5e-05,0,0,0.999905


In [36]:
train_df[train_df['label']=='__label__1'].head()

Unnamed: 0,label,question_text,threshold,predict,target,predict_proba
110399,__label__1,i am 13 and in love with a gay 30 year old ent...,0.982347,1,1,0.982347
489651,__label__1,what do white women who have black biracial ch...,0.998977,1,1,0.998977
932411,__label__1,if your direct superiors are cunts and everyon...,1.0,1,1,1.000004
543656,__label__1,why do indonesian bureaucratics pretend as if ...,0.999535,1,1,0.999535
614054,__label__1,do you honestly feel bad that fadi al batsh is...,0.908907,1,1,0.908907


In [37]:
# from sklearn.metrics import confusion_matrix
confusion_matrix(train_df['target'],train_df['predict'])

array([[1102577,     203],
       [    305,   72424]])

In [38]:
print(classification_report(train_df['target'],train_df['predict']))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1102780
           1       1.00      1.00      1.00     72729

    accuracy                           1.00   1175509
   macro avg       1.00      1.00      1.00   1175509
weighted avg       1.00      1.00      1.00   1175509



In [39]:
test_df['predict'] = test_df['question_text'].apply(lambda x: model.predict(x,k=1)[0][0])
test_df['predict'] = test_df['predict'].apply(lambda x: int(x.replace('__label__','')))
test_df['predict_proba'] = test_df['question_text'].apply(lambda x: model.predict(x,k=1)[1][0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [40]:
test_df['question_text'].apply(lambda x: model.predict(x,k=2))

924580    ((__label__0, __label__1), [1.0000085830688477...
296391    ((__label__0, __label__1), [0.5619279742240906...
91343     ((__label__0, __label__1), [1.0000100135803223...
458059    ((__label__0, __label__1), [1.0000100135803223...
418878    ((__label__0, __label__1), [1.0000079870224, 1...
                                ...                        
749454    ((__label__1, __label__0), [0.8427378535270691...
852935    ((__label__0, __label__1), [1.0000100135803223...
739588    ((__label__1, __label__0), [0.5999968647956848...
518169    ((__label__0, __label__1), [0.9996997117996216...
511456    ((__label__0, __label__1), [1.0000100135803223...
Name: question_text, Length: 130613, dtype: object

In [41]:
test_df.head()

Unnamed: 0,qid,question_text,target,label,threshold,predict,predict_proba
924580,b5319e7bb7604c95d91b,how do i deal with the concept of keeping face...,0,__label__0,0.0,0,1.000009
296391,3a078ec81646dd3d5e42,my favorite pasttime is picking up girls list...,1,__label__1,0.438072,0,0.561928
91343,11e50b883166438d0073,would japan participate in a second korean war,0,__label__0,0.0,0,1.00001
458059,59b8701b429384850ecc,did not both religions stem from babylon,0,__label__0,0.0,0,1.00001
418878,521776d1936d316af387,what would have happened to india if sardar pa...,0,__label__0,0.0,0,1.000008


In [42]:
from sklearn.metrics import confusion_matrix
confusion_matrix(test_df['target'],test_df['predict'])

array([[119852,   2680],
       [  3907,   4174]])

In [43]:
from sklearn.metrics import classification_report
print(classification_report(test_df['target'],test_df['predict']))

              precision    recall  f1-score   support

           0       0.97      0.98      0.97    122532
           1       0.61      0.52      0.56      8081

    accuracy                           0.95    130613
   macro avg       0.79      0.75      0.77    130613
weighted avg       0.95      0.95      0.95    130613



In [44]:
test_df[(test_df['target']==0) & (test_df['predict']==1)]

Unnamed: 0,qid,question_text,target,label,threshold,predict,predict_proba
1216883,ee7e9740af314518a13e,what did japanese soldiers think about black a...,0,__label__0,0.568897,1,0.568897
32990,0676086e8676e60b736c,so who among all this sites is the real deal a...,0,__label__0,0.853455,1,0.853455
451911,5883dc89682b39cbe7f9,what is the lastest innovation that elon musk ...,0,__label__0,0.507689,1,0.507689
952764,bab087a865acf5794b49,despite being under muslim rule for centuries ...,0,__label__0,0.998692,1,0.998692
12805,028590327d4d45fd2e99,why india s left leaning media supports pakist...,0,__label__0,0.999990,1,0.999990
...,...,...,...,...,...,...,...
1058037,cf522500d60d56ccd55c,why can i feel comfortable with enjoying the a...,0,__label__0,0.803214,1,0.803214
799380,9ca285a88333184301c2,why did china occupy tibet why did they occup...,0,__label__0,0.999943,1,0.999943
499618,61d04b186bf5751047d8,how do non punjabis in india feel about overre...,0,__label__0,0.986989,1,0.986989
1228357,f0b8881ac41c41bcc2d7,can a botched circumcision surgery be remedied...,0,__label__0,0.853060,1,0.853060


In [45]:
model.predict('how to get admission?',k=2)

(('__label__0', '__label__1'), array([1.00001001e+00, 1.00000034e-05]))

In [46]:
model.predict('why did men always think about sex',k=2)

(('__label__0', '__label__1'), array([9.99358594e-01, 6.61406666e-04]))

In [47]:
model.predict('why did china occupy tibet',k=2)

(('__label__0', '__label__1'), array([1.00000989e+00, 1.00648331e-05]))

In [48]:
# threshold optimization

In [49]:
cutoff_df = pd.DataFrame( columns = ['prob','accuracy','sensi','speci'])
from sklearn.metrics import confusion_matrix

# confusion = confusion_matrix(test_df['target'],test_df['predict'])

# TP = confusion[1,1] # true positive 
# TN = confusion[0,0] # true negatives
# FP = confusion[0,1] # false positives
# FN = confusion[1,0] # false negatives

def threshold_optimization(y,num):
    #pass the prediction_probability series and threshold probabilty cutoff
    y_bool= y.apply(lambda x: 1 if x>=num else 0)
    return y_bool

num = np.arange(0, 1, 0.02)
for i in num:
    cm1 = confusion_matrix(test_df['target'],threshold_optimization(test_df['threshold'],i))
    total1=sum(sum(cm1))
    accuracy = (cm1[0,0]+cm1[1,1])/total1
    
    speci = cm1[0,0]/(cm1[0,0]+cm1[0,1])
    sensi = cm1[1,1]/(cm1[1,0]+cm1[1,1])
    cutoff_df.loc[i] =[ i ,accuracy,sensi,speci]
print(cutoff_df.head())

      prob  accuracy     sensi     speci
0.00  0.00  0.061870  1.000000  0.000000
0.02  0.02  0.932847  0.710679  0.947499
0.04  0.04  0.937801  0.677639  0.954959
0.06  0.06  0.940389  0.656231  0.959129
0.08  0.08  0.942088  0.639649  0.962034


In [50]:
print(cutoff_df.head(10))

      prob  accuracy     sensi     speci
0.00  0.00  0.061870  1.000000  0.000000
0.02  0.02  0.932847  0.710679  0.947499
0.04  0.04  0.937801  0.677639  0.954959
0.06  0.06  0.940389  0.656231  0.959129
0.08  0.08  0.942088  0.639649  0.962034
0.10  0.10  0.943214  0.627769  0.964018
0.12  0.12  0.943964  0.617003  0.965527
0.14  0.14  0.944776  0.608217  0.966972
0.16  0.16  0.945457  0.600668  0.968196
0.18  0.18  0.946108  0.593862  0.969339


In [51]:
cutoff_df.tail()

Unnamed: 0,prob,accuracy,sensi,speci
0.9,0.9,0.951138,0.401683,0.987375
0.92,0.92,0.95087,0.388318,0.98797
0.94,0.94,0.950732,0.372231,0.988885
0.96,0.96,0.950411,0.348843,0.990084
0.98,0.98,0.949722,0.315555,0.991545


In [52]:
import matplotlib.pyplot as plt
cutoff_df.plot.line(x='prob', y=['accuracy','sensi','speci'])
plt.show()

<Figure size 640x480 with 1 Axes>

In [53]:
train_df['predict_threshold'] = train_df['threshold'].apply(lambda x: 0 if x <=0.02 else 1 )
test_df['predict_threshold'] = test_df['threshold'].apply(lambda x: 0 if x <=0.02 else 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [54]:
confusion_matrix(train_df['target'],train_df['predict_threshold'])

array([[1084847,   17933],
       [     48,   72681]])

In [55]:
print(classification_report(train_df['target'],train_df['predict_threshold']))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99   1102780
           1       0.80      1.00      0.89     72729

    accuracy                           0.98   1175509
   macro avg       0.90      0.99      0.94   1175509
weighted avg       0.99      0.98      0.99   1175509



In [56]:
print(classification_report(test_df['target'],test_df['predict_threshold']))

              precision    recall  f1-score   support

           0       0.98      0.95      0.96    122532
           1       0.47      0.71      0.57      8081

    accuracy                           0.93    130613
   macro avg       0.73      0.83      0.77    130613
weighted avg       0.95      0.93      0.94    130613



In [57]:
# f1_score(y_test_label, y_pred_temp)

In [58]:
# from sklearn.preprocessing import Binarizer
# from sklearn.metrics import f1_score
# for label in [1,0]:
#     y_test_label = train_df['target']
#     thresholds = np.arange(0.02, 1, 0.02)
#     t_opt = 0
#     score_max = 0
#     for t in thresholds:
#         y_pred_temp = Binarizer(t).fit_transform(train_df[label].values.reshape(-1,1)) 
#         score = f1_score(y_test_label, y_pred_temp)  # f1 score
#         if (score > score_max) & (confusion_matrix(y_test_label, y_pred_temp)[0, 0] > 0) & (confusion_matrix(y_test_label, y_pred_temp)[1, 1] > 0):
#             score_max = score
#             t_opt = t
#     print("Class: ", label, "\t Threshold: ", t_opt)

In [59]:
# import numpy as np
# np.arange(0.02, 1, 0.02)