# Import packages

In [39]:
import pandas as pd
import numpy as np
import os

# fastext
import fasttext

from text_cleaning_utils import TextCleaningUtils
from sklearn.model_selection import train_test_split
from fast_text import FastTextUtils

from tabulate import tabulate

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [2]:
df = pd.read_csv('./data/train.csv')

In [3]:
df.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [4]:
df[df['target'] == 1].head()

Unnamed: 0,qid,question_text,target
22,0000e91571b60c2fb487,Has the United States become the largest dicta...,1
30,00013ceca3f624b09f42,Which babies are more sweeter to their parents...,1
110,0004a7fcb2bf73076489,If blacks support school choice and mandatory ...,1
114,00052793eaa287aff1e1,I am gay boy and I love my cousin (boy). He is...,1
115,000537213b01fd77b58a,Which races have the smallest penis?,1


In [5]:
df.shape

(1306122, 3)

# Text Cleaning

In [6]:
def clean_data(df,col_to_clean):
  df[col_to_clean] = df[col_to_clean].apply(TextCleaningUtils.replace_contractions)
#   Remove Smiles and special chars
#   df[col_to_clean] = df[col_to_clean].apply(TextCleaningUtils.transform_emojis)
  df[col_to_clean] = df[col_to_clean].apply(TextCleaningUtils.remove_special_chars)
  df[col_to_clean] = df[col_to_clean].apply(TextCleaningUtils.remove_redundant_spaces)
  df[col_to_clean] = df[col_to_clean].apply(TextCleaningUtils.remove_punctuations)
  df[col_to_clean] = df[col_to_clean].apply(TextCleaningUtils.remove_exaggerated_words)
  df[col_to_clean] = df[col_to_clean].apply(TextCleaningUtils.remove_redundant_newlines)
  df[col_to_clean] = df[col_to_clean].apply(TextCleaningUtils.remove_twitter_handles)
  df[col_to_clean] = df[col_to_clean].apply(TextCleaningUtils.remove_web_links)
#   df[col_to_clean] = df[col_to_clean].apply(TextCleaningUtils.replace_sign)
  df[col_to_clean] = df[col_to_clean].astype(str)
  df[col_to_clean] = df[col_to_clean].str.lower()
  return df

In [7]:
clean_df = clean_data(df,'question_text')

In [8]:
clean_df.to_csv('./data/train_clean.csv',index=False,encoding='utf-8')

In [9]:
clean_df.shape

(1306122, 3)

In [10]:
clean_df.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,how did quebec nationalists see their province...,0
1,000032939017120e6e44,do you have an adopted dog how would you enco...,0
2,0000412ca6e4628ce2cf,why does velocity affect time does velocity a...,0
3,000042bf85aa498cd78e,how did otto von guericke used the magdeburg h...,0
4,0000455dfa3e01eae3af,can i convert montra helicon d to a mountain b...,0


In [11]:
clean_df[clean_df['target']== 1].head()

Unnamed: 0,qid,question_text,target
22,0000e91571b60c2fb487,has the united states become the largest dicta...,1
30,00013ceca3f624b09f42,which babies are more sweeter to their parents...,1
110,0004a7fcb2bf73076489,if blacks support school choice and mandatory ...,1
114,00052793eaa287aff1e1,i am gay boy and i love my cousin boy he is s...,1
115,000537213b01fd77b58a,which races have the smallest penis,1


In [12]:
map_={0:'__label__0',
     1:'__label__1'}
clean_df['label'] = clean_df['target'].map(map_)

In [13]:
# train test split

In [16]:
clean_df.head()

Unnamed: 0,qid,question_text,target,label
0,00002165364db923c7e6,how did quebec nationalists see their province...,0,__label__0
1,000032939017120e6e44,do you have an adopted dog how would you enco...,0,__label__0
2,0000412ca6e4628ce2cf,why does velocity affect time does velocity a...,0,__label__0
3,000042bf85aa498cd78e,how did otto von guericke used the magdeburg h...,0,__label__0
4,0000455dfa3e01eae3af,can i convert montra helicon d to a mountain b...,0,__label__0


In [17]:
train_df, test_df = train_test_split(clean_df, random_state=50, stratify=clean_df['label'], test_size=0.1)

In [18]:
train_df.shape

(1175509, 4)

In [19]:
test_df.shape

(130613, 4)

In [20]:
# train_df.to_csv()
# test_df.to_csv()

In [21]:
# analysis

In [22]:
# convert csv to fwf format

In [23]:
config={
    'data_folder':'/Users/shishirkumar/shishirkmr/Git/data_science/Quora Insincere Questions Classification/Quora-Insincere-Questions-Classification/data/',
    'training_csv_file':'train.csv',
    'training_fwf_file':'train_fwf.train',
    'model_folder': r'/Users/shishirkumar/jupyter_notebook/NLP/pretrained_models/',
    'pretrained_model': 'wiki-news-300d-1M.vec',
    'model_version': 'm1'
}


In [24]:
def to_fwf(df, fname):
    content = tabulate(df.values.tolist(), list(df.columns), tablefmt="plain")
    content = content[(content.find('\n') + 1):]
    open(fname, "w").write(content)
pd.DataFrame.to_fwf = to_fwf

In [25]:
train_df=train_df[['label','question_text']]

In [26]:
train_df.head()

Unnamed: 0,label,question_text
1278191,__label__0,what are the two electrodes in a conductivity ...
916255,__label__0,how do i figure out my marriage
1144328,__label__0,how do web developers collaborate between othe...
12107,__label__0,what rock tirumala venkateswara temple is built
431956,__label__0,is knowing too many people a good thing or a b...


In [27]:
config['data_folder']+config['training_fwf_file']

'/Users/shishirkumar/shishirkmr/Git/data_science/Quora Insincere Questions Classification/Quora-Insincere-Questions-Classification/data/train_fwf.train'

In [28]:
train_df.to_fwf(config['data_folder']+config['training_fwf_file'])

In [29]:
train_fn = os.path.join(config['data_folder'], config['training_fwf_file'])

In [30]:
pretrainedvec_fn = os.path.join(config['model_folder'], config['pretrained_model'])
model_fn = os.path.join(config['data_folder'], 'models','{}_{}.bin'.format('qoura_question',config['model_version']))

In [31]:
print('train_fn:{} \n model_fn:{} \n pretrainedvec_fn:{}'.format(train_fn,model_fn,pretrainedvec_fn))

train_fn:/Users/shishirkumar/shishirkmr/Git/data_science/Quora Insincere Questions Classification/Quora-Insincere-Questions-Classification/data/train_fwf.train 
 model_fn:/Users/shishirkumar/shishirkmr/Git/data_science/Quora Insincere Questions Classification/Quora-Insincere-Questions-Classification/data/models/qoura_question_m1.bin 
 pretrainedvec_fn:/Users/shishirkumar/jupyter_notebook/NLP/pretrained_models/wiki-news-300d-1M.vec


In [32]:
%%time
model = fasttext.train_supervised(input=train_fn,
                                  pretrainedVectors=pretrainedvec_fn,
                                  dim=300, 
                                  wordNgrams=2, 
                                  minCount=3, 
                                  epoch=20)

CPU times: user 10min 18s, sys: 45.8 s, total: 11min 4s
Wall time: 8min 10s


In [33]:
model_fn

'/Users/shishirkumar/shishirkmr/Git/data_science/Quora Insincere Questions Classification/Quora-Insincere-Questions-Classification/data/models/qoura_question_m1.bin'

In [34]:
model.save_model(model_fn)

In [35]:
train_df['predict'] = train_df['question_text'].apply(lambda x: model.predict(x,k=1)[0][0])
train_df['predict'] = train_df['predict'].apply(lambda x: int(x.replace('__label__','')))

train_df['target'] = train_df['label'].apply(lambda x: int(x.replace('__label__','')))

train_df['predict_proba'] = train_df['question_text'].apply(lambda x: model.predict(x,k=1)[1][0])

In [36]:
train_df.head()

Unnamed: 0,label,question_text,predict,target,predict_proba
1278191,__label__0,what are the two electrodes in a conductivity ...,0,0,1.00001
916255,__label__0,how do i figure out my marriage,0,0,1.00001
1144328,__label__0,how do web developers collaborate between othe...,0,0,1.00001
12107,__label__0,what rock tirumala venkateswara temple is built,0,0,1.00001
431956,__label__0,is knowing too many people a good thing or a b...,0,0,1.00001


In [37]:
train_df[train_df['label']=='__label__1'].head()

Unnamed: 0,label,question_text,predict,target,predict_proba
110399,__label__1,i am 13 and in love with a gay 30 year old ent...,1,1,0.979046
489651,__label__1,what do white women who have black biracial ch...,1,1,0.998841
932411,__label__1,if your direct superiors are cunts and everyon...,1,1,1.00001
543656,__label__1,why do indonesian bureaucratics pretend as if ...,1,1,1.00001
614054,__label__1,do you honestly feel bad that fadi al batsh is...,1,1,0.957922


In [40]:
# from sklearn.metrics import confusion_matrix
confusion_matrix(train_df['target'],train_df['predict'])

array([[1102580,     200],
       [    326,   72403]])

In [41]:
print(classification_report(train_df['target'],train_df['predict']))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1102780
           1       1.00      1.00      1.00     72729

    accuracy                           1.00   1175509
   macro avg       1.00      1.00      1.00   1175509
weighted avg       1.00      1.00      1.00   1175509



In [42]:
test_df['predict'] = test_df['question_text'].apply(lambda x: model.predict(x,k=1)[0][0])
test_df['predict'] = test_df['predict'].apply(lambda x: int(x.replace('__label__','')))
test_df['predict_proba'] = test_df['question_text'].apply(lambda x: model.predict(x,k=1)[1][0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [43]:
test_df.head()

Unnamed: 0,qid,question_text,target,label,predict,predict_proba
924580,b5319e7bb7604c95d91b,how do i deal with the concept of keeping face...,0,__label__0,0,1.00001
296391,3a078ec81646dd3d5e42,my favorite pasttime is picking up girls list...,1,__label__1,0,0.808077
91343,11e50b883166438d0073,would japan participate in a second korean war,0,__label__0,0,1.00001
458059,59b8701b429384850ecc,did not both religions stem from babylon,0,__label__0,0,1.00001
418878,521776d1936d316af387,what would have happened to india if sardar pa...,0,__label__0,0,1.00001


In [44]:
from sklearn.metrics import confusion_matrix
confusion_matrix(test_df['target'],test_df['predict'])

array([[119836,   2696],
       [  3919,   4162]])

In [45]:
from sklearn.metrics import classification_report
print(classification_report(test_df['target'],test_df['predict']))

              precision    recall  f1-score   support

           0       0.97      0.98      0.97    122532
           1       0.61      0.52      0.56      8081

    accuracy                           0.95    130613
   macro avg       0.79      0.75      0.77    130613
weighted avg       0.95      0.95      0.95    130613



In [None]:
# threshold optimization

In [None]:
# cutoff_df = pd.DataFrame( columns = ['prob','accuracy','sensi','speci'])
# from sklearn.metrics import confusion_matrix

# confusion=confusion_matrix(test_df['target'],test_df['predict'])

# # TP = confusion[1,1] # true positive 
# # TN = confusion[0,0] # true negatives
# # FP = confusion[0,1] # false positives
# # FN = confusion[1,0] # false negatives

# def threshold_optimization(y,num):
#     #pass the prediction_probability series and threshold probabilty cutoff
#     y_bool= y.apply(lambda x: 1 if x>=num else 0)
#     return y_bool

# num = np.arange(0.8, 1, 0.002)
# for i in num:
#     cm1 = confusion_matrix(train_df['target'],threshold_optimization(train_df['predict_proba'],i))
#     total1=sum(sum(cm1))
#     accuracy = (cm1[0,0]+cm1[1,1])/total1
    
#     speci = cm1[0,0]/(cm1[0,0]+cm1[0,1])
#     sensi = cm1[1,1]/(cm1[1,0]+cm1[1,1])
#     cutoff_df.loc[i] =[ i ,accuracy,sensi,speci]
# print(cutoff_df.head())

In [None]:
# cutoff_df.tail()

In [None]:
# import matplotlib.pyplot as plt
# cutoff_df.plot.line(x='prob', y=['accuracy','sensi','speci'])
# plt.show()

In [None]:
# f1_score(y_test_label, y_pred_temp)

In [None]:
# from sklearn.preprocessing import Binarizer
# from sklearn.metrics import f1_score
# for label in [1,0]:
#     y_test_label = train_df['target']
#     thresholds = np.arange(0.02, 1, 0.02)
#     t_opt = 0
#     score_max = 0
#     for t in thresholds:
#         y_pred_temp = Binarizer(t).fit_transform(train_df[label].values.reshape(-1,1)) 
#         score = f1_score(y_test_label, y_pred_temp)  # f1 score
#         if (score > score_max) & (confusion_matrix(y_test_label, y_pred_temp)[0, 0] > 0) & (confusion_matrix(y_test_label, y_pred_temp)[1, 1] > 0):
#             score_max = score
#             t_opt = t
#     print("Class: ", label, "\t Threshold: ", t_opt)

In [None]:
# import numpy as np
# np.arange(0.02, 1, 0.02)