# Import packages

In [57]:
import pandas as pd
import os

# fastext
import fasttext

from text_cleaning_utils import TextCleaningUtils
from sklearn.model_selection import train_test_split
from fast_text import FastTextUtils

from tabulate import tabulate

In [58]:
df = pd.read_csv('./data/train.csv')

In [59]:
df.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [60]:
df[df['target'] == 1].head()

Unnamed: 0,qid,question_text,target
22,0000e91571b60c2fb487,Has the United States become the largest dicta...,1
30,00013ceca3f624b09f42,Which babies are more sweeter to their parents...,1
110,0004a7fcb2bf73076489,If blacks support school choice and mandatory ...,1
114,00052793eaa287aff1e1,I am gay boy and I love my cousin (boy). He is...,1
115,000537213b01fd77b58a,Which races have the smallest penis?,1


In [61]:
df.shape

(1306122, 3)

# Text Cleaning

In [40]:
def clean_data(df,col_to_clean):
  df[col_to_clean] = df[col_to_clean].apply(TextCleaningUtils.replace_contractions)
#   Remove Smiles and special chars
#   df[col_to_clean] = df[col_to_clean].apply(TextCleaningUtils.transform_emojis)
  df[col_to_clean] = df[col_to_clean].apply(TextCleaningUtils.remove_special_chars)
  df[col_to_clean] = df[col_to_clean].apply(TextCleaningUtils.remove_redundant_spaces)
  df[col_to_clean] = df[col_to_clean].apply(TextCleaningUtils.remove_punctuations)
  df[col_to_clean] = df[col_to_clean].apply(TextCleaningUtils.remove_exaggerated_words)
  df[col_to_clean] = df[col_to_clean].apply(TextCleaningUtils.remove_redundant_newlines)
  df[col_to_clean] = df[col_to_clean].apply(TextCleaningUtils.remove_twitter_handles)
  df[col_to_clean] = df[col_to_clean].apply(TextCleaningUtils.remove_web_links)
#   df[col_to_clean] = df[col_to_clean].apply(TextCleaningUtils.replace_sign)
  df[col_to_clean] = df[col_to_clean].astype(str)
  df[col_to_clean] = df[col_to_clean].str.lower()
  return df

In [10]:
clean_df = clean_data(df,'question_text')

In [13]:
clean_df.to_csv('./data/train_clean.csv',index=False,encoding='utf-8')

In [11]:
clean_df.shape

(1306122, 3)

In [54]:
clean_df.head()

Unnamed: 0,qid,question_text,target,label
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0,__label__0
1,000032939017120e6e44,Do you have an adopted dog how would you enco...,0,__label__0
2,0000412ca6e4628ce2cf,Why does velocity affect time Does velocity a...,0,__label__0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0,__label__0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0,__label__0


In [56]:
clean_df[clean_df['target']== 1].head()

Unnamed: 0,qid,question_text,target,label
22,0000e91571b60c2fb487,Has the United States become the largest dicta...,1,__label__1
30,00013ceca3f624b09f42,Which babies are more sweeter to their parents...,1,__label__1
110,0004a7fcb2bf73076489,If blacks support school choice and mandatory ...,1,__label__1
114,00052793eaa287aff1e1,I am gay boy and I love my cousin boy He is s...,1,__label__1
115,000537213b01fd77b58a,Which races have the smallest penis,1,__label__1


In [53]:
map_={0:'__label__0',
     1:'__label__1'}
clean_df['label'] = clean_df['target'].map(map_)

In [13]:
# train test split

In [75]:
train_df, test_df = train_test_split(clean_df, random_state=50, stratify='label', test_size=0.1)

In [76]:
train_df.shape

(1110203, 4)

In [77]:
test_df.shape

(195919, 4)

In [78]:
# train_df.to_csv()
# test_df.to_csv()

In [79]:
# analysis

In [81]:
# convert csv to fwf format

In [82]:
config={
    'data_folder':'/Users/shishirkumar/shishirkmr/Git/data_science/Quora Insincere Questions Classification/Quora-Insincere-Questions-Classification/data/',
    'training_csv_file':'train.csv',
    'training_fwf_file':'train_fwf.train',
    'model_folder': r'/Users/shishirkumar/jupyter_notebook/NLP/pretrained_models/',
    'pretrained_model': 'wiki-news-300d-1M.vec',
    'model_version': 'm1'
}


In [83]:
def to_fwf(df, fname):
    content = tabulate(df.values.tolist(), list(df.columns), tablefmt="plain")
    content = content[(content.find('\n') + 1):]
    open(fname, "w").write(content)
pd.DataFrame.to_fwf = to_fwf

In [84]:
train_df=train_df[['label','question_text']]

In [85]:
train_df.head()

Unnamed: 0,label,question_text
1075204,__label__0,What were the major accomplishments by Larry F...
407385,__label__0,Why should have Hitler waited to start the war
944423,__label__0,How does blood provide oxygen to the body
271094,__label__0,Are the Microsoft Virtual Academy certificates...
1156712,__label__0,Why I always postpone important work to anothe...


In [86]:
config['data_folder']+config['training_fwf_file']

'/Users/shishirkumar/shishirkmr/Git/data_science/Quora Insincere Questions Classification/Quora-Insincere-Questions-Classification/data/train_fwf.train'

In [87]:
train_df.to_fwf(config['data_folder']+config['training_fwf_file'])

In [88]:
train_fn = os.path.join(config['data_folder'], config['training_fwf_file'])

In [89]:
pretrainedvec_fn = os.path.join(config['model_folder'], config['pretrained_model'])
model_fn = os.path.join(config['data_folder'], 'models','{}_{}.bin'.format('qoura_question',config['model_version']))

In [90]:
print('train_fn:{} \n model_fn:{} \n pretrainedvec_fn:{}'.format(train_fn,model_fn,pretrainedvec_fn))

train_fn:/Users/shishirkumar/shishirkmr/Git/data_science/Quora Insincere Questions Classification/Quora-Insincere-Questions-Classification/data/train_fwf.train 
 model_fn:/Users/shishirkumar/shishirkmr/Git/data_science/Quora Insincere Questions Classification/Quora-Insincere-Questions-Classification/data/models/qoura_question_m1.bin 
 pretrainedvec_fn:/Users/shishirkumar/jupyter_notebook/NLP/pretrained_models/wiki-news-300d-1M.vec


In [91]:
%%time
model = fasttext.train_supervised(input=train_fn,
                                  pretrainedVectors=pretrainedvec_fn,
                                  dim=300, # dont change
                                  wordNgrams=2, # dont change
                                  minCount=3, #frequency dont change
                                  epoch=20,
                                  loss='ova')

CPU times: user 9min 23s, sys: 26.4 s, total: 9min 50s
Wall time: 6min 11s


In [92]:
model_fn

'/Users/shishirkumar/shishirkmr/Git/data_science/Quora Insincere Questions Classification/Quora-Insincere-Questions-Classification/data/models/qoura_question_m1.bin'

In [94]:
model.save_model(model_fn)

In [110]:
train_df['predict'] = train_df['question_text'].apply(lambda x: model.predict(x,k=1)[0][0])
train_df['predict'] = train_df['predict'].apply(lambda x: int(x.replace('__label__','')))

train_df['target'] = train_df['label'].apply(lambda x: int(x.replace('__label__','')))

train_df['predict_proba'] = train_df['question_text'].apply(lambda x: model.predict(x,k=1)[1][0])

In [111]:
train_df.head()

Unnamed: 0,label,question_text,predict,predict_proba,target
1075204,__label__0,What were the major accomplishments by Larry F...,0,1.00001,0
407385,__label__0,Why should have Hitler waited to start the war,0,1.00001,0
944423,__label__0,How does blood provide oxygen to the body,0,1.00001,0
271094,__label__0,Are the Microsoft Virtual Academy certificates...,0,1.00001,0
1156712,__label__0,Why I always postpone important work to anothe...,0,1.00001,0


In [112]:
# from sklearn.metrics import confusion_matrix
confusion_matrix(train_df['target'],train_df['predict'])

array([[1041482,     113],
       [    162,   68446]])

In [113]:
test_df['predict'] = test_df['question_text'].apply(lambda x: model.predict(x,k=1)[0][0])
test_df['predict'] = test_df['predict'].apply(lambda x: int(x.replace('__label__','')))
test_df['predict_proba'] = test_df['question_text'].apply(lambda x: model.predict(x,k=1)[1][0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [114]:
test_df.head()

Unnamed: 0,qid,question_text,target,label,result,predict,predict_proba
793006,9b62f614df89ffd929e7,What is the best website to browse and order b...,0,__label__0,{'__label__0': 1.0000100135803223},0,1.00001
967175,bd7d802f7d005efb9546,Is it possible for an Android Xiaomi phone to ...,0,__label__0,{'__label__0': 1.0000100135803223},0,1.00001
133728,1a2bdd15d1c9b1ea7af1,Where can I get my Moto E4 plus rooted in Mumbai,0,__label__0,{'__label__0': 1.0000100135803223},0,1.00001
981324,c03dd81b3e6f0d806649,How do you deal with someone who is making you...,0,__label__0,{'__label__0': 0.9991540908813477},0,0.999154
223722,2bc04bcabae6e449822c,Does Tinder work in Aurangabad,0,__label__0,{'__label__0': 1.0000100135803223},0,1.00001


In [115]:
from sklearn.metrics import confusion_matrix
confusion_matrix(test_df['target'],test_df['predict'])

array([[179867,   3850],
       [  5952,   6250]])