In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings
warnings.simplefilter('ignore')

In [None]:
!unzip /kaggle/input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip
!unzip /kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip
!unzip /kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip
!unzip /kaggle/input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip

In [None]:
!ls /kaggle/working/

In [None]:
import seaborn as sns

import fasttext

In [None]:
train_df = pd.read_csv('/kaggle/working/train.csv')
test_df = pd.read_csv('/kaggle/working/test.csv')
sample_submission_df = pd.read_csv('/kaggle/working/sample_submission.csv')
test_labels_df = pd.read_csv('/kaggle/working/test_labels.csv')

In [None]:
train_df.head()

In [None]:
sample_submission_df.head()

In [None]:
test_df.head()

In [None]:
# Correlate only the numeric columns: train_df also holds string columns
# (e.g. the comment text), and DataFrame.corr() no longer drops those silently
# on recent pandas versions -- it raises instead.
sns.heatmap(train_df.select_dtypes(include='number').corr(),cmap='YlGnBu',annot=True)

In [None]:
# Text Cleaning

In [None]:
import re
import logging
import itertools
import unicodedata
# import contractions

from bs4 import BeautifulSoup

class TextCleaningUtils:
    '''
        Collection of stateless text cleaning operations (Static Methods).

        The regex based helpers look their pattern up in ``cleaning_regex_map``,
        replace every match with a single space and strip the result.
    '''

    # Default patterns, keyed by the kind of noise they match.
    cleaning_regex_map = {
        'web_links': r'(?i)(?:(?:http(?:s)?:)|(?:www\.))\S+',
        'special_chars': r'[^a-zA-Z0-9\s\.,!?;:]+',
        'redundant_spaces': r'\s\s+',
        # A character class needs no alternation: the previous
        # '[\r|\n|\r\n]+' also matched (and removed) a literal '|'.
        'redundant_newlines': r'[\r\n]+',
        'twitter_handles': r'[#@]\S+',
        'punctuations': r'[\.,!?;:]+'
    }

    @staticmethod
    def clean_text_from_regex(text, text_clean_regex):
        '''
            Replace every match of the compiled pattern ``text_clean_regex``
            in ``text`` with a single space and strip leading/trailing
            whitespace from the result.
        '''

        return text_clean_regex.sub(' ', text).strip()

    @staticmethod
    def _apply_default_pattern(text, pattern_name):
        '''
            Clean ``text`` with the pattern stored under ``pattern_name`` in
            ``cleaning_regex_map``. The map is read at call time, so patterns
            replaced on the class are picked up; ``re`` caches compilation.
        '''

        pattern = re.compile(TextCleaningUtils.cleaning_regex_map[pattern_name])
        return TextCleaningUtils.clean_text_from_regex(text, pattern)

    @staticmethod
    def strip_html(text):
        '''
            Parse ``text`` as HTML and return only its text content.
        '''

        soup = BeautifulSoup(text, "html.parser")
        return soup.get_text()

    @staticmethod
    def remove_special_chars(text):
        '''
            Replace every run of characters other than ASCII letters, digits,
            whitespace and ``. , ! ? ; :`` with a space.
        '''

        return TextCleaningUtils._apply_default_pattern(text, 'special_chars')

    @staticmethod
    def remove_redundant_spaces(text):
        '''
            Collapse every run of two or more whitespace characters
            into a single space.
        '''

        return TextCleaningUtils._apply_default_pattern(text, 'redundant_spaces')

    @staticmethod
    def remove_web_links(text):
        '''
            Replace every token starting with ``http:``, ``https:`` or ``www.``
            (case-insensitive) with a space.
        '''

        return TextCleaningUtils._apply_default_pattern(text, 'web_links')

    @staticmethod
    def remove_twitter_handles(text):
        '''
            Replace every token starting with ``#`` or ``@`` with a space.
        '''

        return TextCleaningUtils._apply_default_pattern(text, 'twitter_handles')

    @staticmethod
    def remove_redundant_newlines(text):
        '''
            Replace every run of carriage returns / line feeds with a space.
        '''

        return TextCleaningUtils._apply_default_pattern(text, 'redundant_newlines')

    @staticmethod
    def remove_punctuations(text):
        '''
            Replace every run of ``. , ! ? ; :`` with a space.
        '''

        return TextCleaningUtils._apply_default_pattern(text, 'punctuations')

    @staticmethod
    def remove_exaggerated_words(text):
        '''
            Shorten every run of the same character to at most two
            occurrences (e.g. ``sooooo`` -> ``soo``).

            NOTE: this applies to every character, so runs of digits and
            whitespace are shortened as well.
        '''

        return ''.join(''.join(run)[:2] for _, run in itertools.groupby(text))

    @staticmethod
    def replace_multiple_chars(text):
        '''
            Collapse repeated occurrences of the listed symbols to a single
            one (``!!!`` -> ``!``) and normalise whitespace.

            The cleaned text is always returned; previously an input that
            contained none of the symbols produced an empty string.
        '''

        char_list = ['.', '?', '!', '#', '$', '/', '@', '*', '(', ')', '+']
        for char in char_list:
            if char in text:
                # re.escape keeps regex metacharacters such as '.' or '*' literal.
                text = re.sub(re.escape(char) + '{2,}', char, text)
        return ' '.join(text.split())

    @staticmethod
    def replace_sign(text):
        '''
            Replace signs with words (``&`` -> ``and``, ``/`` -> ``or``),
            turn non-breaking spaces into plain spaces and normalise
            whitespace.

            The cleaned text is always returned; previously an input that
            contained none of the signs produced an empty string.
        '''

        sign_list = {'&': ' and ', '/': ' or ', '\xa0': ' '}
        for sign, replacement in sign_list.items():
            # Plain substring replacement: the signs are literals, not patterns.
            text = text.replace(sign, replacement)
        return ' '.join(text.split())

    @staticmethod
    def remove_accented_char(text):
        '''
            Strip accents by decomposing characters (NFD) and dropping
            everything that is not ASCII.
        '''

        ascii_bytes = unicodedata.normalize('NFD', text).encode('ascii', 'ignore')
        return ascii_bytes.decode('ascii')

    @staticmethod
    def replace_characters(text, replace_map):
        '''
            Apply the user supplied ``replace_map`` ({substring: replacement})
            to ``text``, one entry after another in the map's iteration order.
        '''

        for char, replace_val in replace_map.items():
            text = text.replace(char, replace_val)
        return text

In [None]:
def clean_data(df,col_to_clean):
  '''
      Clean the text column ``col_to_clean`` of ``df`` in place and return ``df``.

      Order matters:
        * links and #/@ handles are removed first, while the characters that
          identify them ('/', '#', '@', '.') are still present -- stripping
          special characters or punctuation beforehand would leave those two
          steps with nothing to match;
        * whitespace is collapsed last, because every earlier step replaces
          its matches with spaces.

      Missing values are turned into empty strings up front so the regex
      helpers never receive a non-string value.
  '''
  cleaning_steps = (
      TextCleaningUtils.remove_web_links,
      TextCleaningUtils.remove_twitter_handles,
      TextCleaningUtils.remove_special_chars,
      TextCleaningUtils.remove_punctuations,
      TextCleaningUtils.remove_redundant_newlines,
      TextCleaningUtils.remove_exaggerated_words,
      TextCleaningUtils.remove_redundant_spaces,
  )

  def _clean(text):
    # Single pass over the column instead of one .apply per cleaning step.
    for step in cleaning_steps:
      text = step(text)
    return text.lower()

  df[col_to_clean] = df[col_to_clean].fillna('').astype(str).apply(_clean)
  return df

In [None]:
train_df = clean_data(train_df,'comment_text')

In [None]:
test_df = clean_data(test_df,'comment_text')

In [None]:
def list_labels(row):
    """Return the names of the toxicity columns whose value in ``row`` equals 1.

    The names keep the fixed column order: toxic, severe_toxic, obscene,
    threat, insult, identity_hate.
    """
    label_columns = (
        'toxic',
        'severe_toxic',
        'obscene',
        'threat',
        'insult',
        'identity_hate',
    )
    return [column for column in label_columns if row[column] == 1]
train_df['labels'] = train_df.apply(lambda x:list_labels(x) , axis=1)

In [None]:
# fwf converter

In [None]:
import logging
import pandas as pd
from ast import literal_eval
from tabulate import tabulate
import re


class FwfConvertor:
    '''
        Writes a labelled text data set as a plain text file with one sample
        per line: the space separated labels followed by the text.
    '''

    def __init__(self):
        pass

    @staticmethod
    def save_fwf_tr_data(data, text_col, labels_col, label_mapping=None,
                         output_path='Model Training data.train'):
        '''
            Write ``data`` to ``output_path`` as "<labels> <text>" lines.

            data          : DataFrame holding the samples; it is NOT modified,
                            so the call can be repeated on the same frame.
            text_col      : name of the column holding the text.
            labels_col    : name of the column holding an iterable of label
                            names per row.
            label_mapping : optional {label name: label written to the file};
                            every label name must be a key of the mapping.
            output_path   : file to write (defaults to the historical name,
                            relative to the current working directory).

            A failure to write the file is logged, not raised.
        '''
        labels = data[labels_col]
        if label_mapping is not None:
            labels = labels.apply(FwfConvertor.label_map_func, label_mapping=label_mapping)
        # One space separated label string per row.
        labels = labels.apply(FwfConvertor.preprocess_fwf_func)
        # Only two columns go into the file, labels first; built as a new
        # frame so the caller's label column keeps its original values.
        fwf_frame = pd.DataFrame({labels_col: labels, text_col: data[text_col]},
                                 columns=[labels_col, text_col])
        content = FwfConvertor.to_fwf(fwf_frame)
        try:
            # Explicit encoding; the context manager closes the handle even on error.
            with open(output_path, 'w', encoding='utf-8') as out_file:
                out_file.write(content)
            logging.info("FWF file saved")
        except Exception as exp:
            # The message needs a placeholder for the exception argument.
            logging.error("File saving failed: %s", exp)

    @staticmethod
    def label_map_func(element, label_mapping):
        '''
            Translate every label in ``element`` through ``label_mapping``.
            Raises KeyError for a label that is missing from the mapping.
        '''
        return [label_mapping[tag] for tag in element]

    @staticmethod
    def preprocess_fwf_func(element):
        '''
            Join the labels in ``element`` into one space separated string.
        '''
        return " ".join(element)

    @staticmethod
    def to_fwf(df):
        '''
            Render ``df`` as text, one row per line, without the header line
            and with every run of spaces collapsed to a single space.
            An empty frame yields an empty string.
        '''
        if df.empty:
            # Without rows the rendered table is just the header, which has
            # no trailing newline to cut at.
            return ''
        content = tabulate(list(df.values), list(df.columns), tablefmt="plain")
        # Drop the header line.
        content = content[(content.find('\n') + 1):]
        content = re.sub(' +', ' ', content)
        return content

In [None]:
label_mapping = {'toxic': '__label__0',
                     'severe_toxic': '__label__1',
                     'obscene': '__label__2',
                     'threat': '__label__3',
                     'insult': '__label__4',
                     'identity_hate': '__label__5'}

In [None]:
%%time
FwfConvertor.save_fwf_tr_data(train_df, 'comment_text', 'labels', label_mapping=label_mapping)

In [None]:
# download wiki file

!wget https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip

In [None]:
!unzip /kaggle/working/wiki-news-300d-1M.vec.zip

In [None]:
train_fn='/kaggle/working/Model Training data.train'
pretrainedvec_fn = '/kaggle/working/wiki-news-300d-1M.vec'

In [None]:
%%time
# Train a supervised fastText classifier on the file at train_fn, starting
# from the pretrained word vectors at pretrainedvec_fn.
# NOTE(review): train_fn is an absolute /kaggle/working path while the training
# file is written relative to the current directory -- confirm they coincide.
model = fasttext.train_supervised(input=train_fn,
                                  pretrainedVectors=pretrainedvec_fn,
                                  # dim has to equal the dimensionality of the pretrained
                                  # vectors (the file name says 300d).
                                  dim=300, 
                                  wordNgrams=2, 
                                  minCount=3, 
                                  epoch=20, 
                                  # 'ova' = one-vs-all: presumably chosen because a comment
                                  # can carry several labels at once -- confirm.
                                  loss='ova')

In [None]:
train_df['debug_threshold'] = train_df['comment_text'].apply(lambda x: model.predict(x,k=-1))

In [None]:
train_df.head()

In [None]:
model.predict("i recommend this article",k=-1)

In [None]:
def get_proba(x,col):
    """Return the score predicted for the label that ``col`` maps to.

    ``x`` is a (labels, scores) pair of equally ordered sequences; the label
    searched for is ``label_mapping[col]``. The score of the first matching
    label is returned, or None when the label is not among the predictions.
    """
    return next(
        (x[1][position]
         for position, predicted_label in enumerate(x[0])
         if label_mapping[col] == predicted_label),
        None,
    )

In [None]:
# Run the model once per comment and reuse the (labels, scores) result for
# every label, instead of predicting again for each of the labels.
train_predictions = train_df['comment_text'].apply(lambda text: model.predict(text, k=-1))
for label in label_mapping.keys():
    train_df[label+'_predict'] = train_predictions.apply(lambda pred: get_proba(pred, label))
    # Percentile position (0-100) of every score within the training set;
    # tied scores share the highest rank of their group.
    sz = train_df[label+'_predict'].size-1
    train_df[label+'_sz'] = 100.0*(train_df[label+'_predict'].rank(method='max')-1)/sz

In [None]:
for label in label_mapping.keys():
    print(label,round(train_df[train_df[label]==1].shape[0]*100/train_df.shape[0],2),'%')

In [None]:
train_df.columns

In [None]:
train_df.head(10)

In [None]:
train_df.describe()

In [None]:
train_df[train_df['toxic'] == 1].describe()

In [None]:
test_df.head()

In [None]:
train_df[train_df['insult']==0].describe([0.2,0.25,0.5,0.7,0.75,0.8,0.9,0.95,0.99])

In [None]:
train_df[train_df['insult']==1].describe([0.05,0.1,0.2,0.25,0.5,0.7,0.75,0.8,0.9])

In [None]:
# threshold = {
#     'toxic_predict' : 0.970,
#     'severe_toxic_predict' : 0.27,
#     'obscene_predict': 0.69,
#     'threat_predict': 0.16,
#     'insult_predict':0.7,
#     'identity_hate_predict': 0.38
# }

threshold = {
    'toxic_predict' : 0.98,
    'severe_toxic_predict' : 0.34,
    'obscene_predict': 0.8,
    'threat_predict': 0.36,
    'insult_predict':0.76,
    'identity_hate_predict': 0.6
}

In [None]:
def rescale_prob(x,label,thresholds=None):
    """Map a raw score onto [0, 1] so that the label's threshold lands on 0.5.

    The mapping is piecewise linear and continuous:
      * scores in [0, t) are stretched onto [0, 0.5)
      * scores in [t, 1] are stretched onto [0.5, 1]
    where t is the threshold stored under ``label``.

    The upper branch used to be ``0.5 + 0.5 * x``, which ignored the threshold
    and made the result jump from just under 0.5 to ``0.5 + t/2`` at x == t.

    x          : raw score, expected in [0, 1].
    label      : key into the threshold table (e.g. 'toxic_predict').
    thresholds : optional {label: threshold} table; defaults to the
                 module-level ``threshold`` dictionary.
    """
    if thresholds is None:
        thresholds = threshold
    cutoff = thresholds[label]
    if x < cutoff:
        return 0.5 * x / cutoff
    if cutoff >= 1:
        # Degenerate table entry: nothing lies above the threshold, so a score
        # that reaches it is treated as maximally confident.
        return 1.0
    return 0.5 + 0.5 * (x - cutoff) / (1 - cutoff)

In [None]:
# x = 0.5
# min = 0
# max = 0.5
# 0.5 + (0.5 * (x - min))/(0.9 - min)

In [None]:
# Predict once per comment; every label reads its score from the same result.
train_predictions = train_df['comment_text'].apply(lambda text: model.predict(text, k=-1))
for label in label_mapping.keys():
    predict_col = label+'_predict'
    train_df[predict_col] = train_predictions.apply(lambda pred: get_proba(pred, label))
    # Rescaled score: the label's threshold is mapped to 0.5.
    train_df[label+'_prob'] = train_df[predict_col].apply(lambda x: rescale_prob(x,predict_col))
    # Hard 0/1 decision: positive when the raw score reaches the threshold.
    train_df[label+'_pred'] = (train_df[predict_col] >= threshold[predict_col]).astype(int)

In [None]:
# rescale_prob(0.5,'severe_toxic_predict')

In [None]:
train_df[train_df['toxic']==1][['toxic','toxic_predict','toxic_prob']].head(20)

In [None]:
train_df[train_df['severe_toxic']==1][['severe_toxic','severe_toxic_predict','severe_toxic_prob']].head(20)

In [None]:
train_df[train_df['obscene']==1][['obscene','obscene_predict','obscene_prob']].head(20)

In [None]:
train_df[train_df['toxic']==1][['toxic','toxic_predict','toxic_prob']].head(20)

In [None]:
# x=0.8
# min = 0
# max = 0.5
# (0.5 * (x - min))/(0.9 - min)

In [None]:
# x=0.8
# min = 0.5
# max = 1
# (0.5 * (x - min))/(0.8 - min)

In [None]:
# Predict once per test comment and reuse the result for every label,
# instead of running the model again for each of the labels.
test_predictions = test_df['comment_text'].apply(lambda text: model.predict(text, k=-1))
for label in label_mapping.keys():
    test_df[label+'_predict'] = test_predictions.apply(lambda pred: get_proba(pred, label))
    # The column named after the label holds the rescaled score.
    test_df[label] = test_df[label+'_predict'].apply(lambda x: rescale_prob(x,label+'_predict'))

In [None]:
test_df[sample_submission_df.columns].to_csv('submission.csv',index=False,encoding='utf-8')
test_df.to_csv('all_cols_submission.csv', index=False,encoding='utf-8')

In [None]:
test_df.head()

In [None]:
test_df.head()

In [None]:
# Confusion Matrix

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt 
from sklearn.metrics import confusion_matrix
def plot_cm(df,human_tag,pred_tag,tag,l1,l2):
  """Draw the confusion matrix of two columns of ``df`` as an annotated heatmap.

  df        : frame holding both columns.
  human_tag : column passed to confusion_matrix as the true labels.
  pred_tag  : column passed to confusion_matrix as the predicted labels.
  tag       : figure title.
  l1, l2    : x-axis and y-axis captions.
  """
  matrix_frame = pd.DataFrame(confusion_matrix(df[human_tag], df[pred_tag]))
  _, axis = plt.subplots(figsize=(6,5))
  axis.set_title(tag)
  sns.heatmap(matrix_frame, annot=True, cmap="Blues", fmt="d", cbar=False, ax=axis)
  axis.set_xlabel(l1)
  axis.set_ylabel(l2)
  plt.show()

In [None]:
for tag in threshold.keys():
  actual_tag=tag.replace('_predict','')
  pred_tag=tag.replace('_predict','')+"_pred"
  plot_cm(train_df,actual_tag,pred_tag,actual_tag+' label - actual vs predict',tag,'Actual')

### Threshold Optimization for single class classification

In [None]:
from sklearn.preprocessing import Binarizer
from sklearn.metrics import f1_score
# For every label, scan a grid of thresholds and keep the one with the best F1.
# NOTE(review): the search runs on the data the model was trained on, so the
# thresholds are likely optimistic -- consider a held-out split.
for label in threshold.keys():
    actual_tag=label.replace('_predict','')
    y_test_label = np.where(train_df[actual_tag]==1, 1, 0)
    # Column vector of raw scores, built once per label.
    scores = train_df[label].values.reshape(-1,1)
    # Rounded to strip floating point noise such as 0.06000000000000001.
    thresholds = np.round(np.arange(0.02, 1, 0.02), 2)
    t_opt = 0
    score_max = 0
    for t in thresholds:
        # threshold has to be passed by keyword; values strictly above t become 1.
        y_pred_temp = Binarizer(threshold=t).fit_transform(scores).ravel()
        score = f1_score(y_test_label, y_pred_temp)  # f1 score
        if score <= score_max:
            continue
        # labels=[0, 1] keeps the matrix 2x2 even when a class is absent.
        cm = confusion_matrix(y_test_label, y_pred_temp, labels=[0, 1])
        # Accept only thresholds with at least one true negative and one true positive.
        if cm[0, 0] > 0 and cm[1, 1] > 0:
            score_max = score
            t_opt = t
    print("Class: ", label, "\t Threshold: ", t_opt)

In [None]:
# threshold = {
#     'toxic_predict' : 0.98,
#     'severe_toxic_predict' : 0.34,
#     'obscene_predict': 0.8,
#     'threat_predict': 0.36,
#     'insult_predict':0.76,
#     'identity_hate_predict': 0.6
# }



In [None]:

# train_df[label+'_pred'] = train_df[label+'_predict'].apply(lambda x: 1 if x >= threshold[label+'_predict'] else 0)
# for tag in threshold.keys():
#   actual_tag=tag.replace('_predict','')
#   pred_tag=tag.replace('_predict','')+"_pred"
#   plot_cm(train_df,actual_tag,pred_tag,actual_tag+' label - actual vs predict',tag,'Actual')

In [None]:
# train_df.head()

In [None]:
# def list_actual(x):
#     actual_list = []
#     for label in label_mapping.keys(): 
#         actual_tag = label.replace('_predict','')
#         actual_list.append(x[actual_tag])
#     return actual_list
# train_df['actual_label'] = train_df.apply(list_actual,axis=1)

In [None]:
# train_df.head()

In [None]:
# temp_df = train_df[['toxic_predict','severe_toxic_predict','obscene_predict','threat_predict','insult_predict','identity_hate_predict','actual_label']]

In [None]:
# temp_df = temp_df.head(500)

In [None]:
# temp_df.head(10)

In [None]:
# def get_pred_thres_arr(row,label,t):
#     bin_arr=[]
#     temp_thres=threshold.copy()
#     temp_thres[label]=t
#     for tag in threshold.keys():
#         res=0
#         if row[tag]>=temp_thres[tag]:
#             res=1
#         bin_arr.append(res)
#     return bin_arr

In [None]:
# threshold

In [None]:
# %%time

# from sklearn.preprocessing import Binarizer
# from sklearn.preprocessing import MultiLabelBinarizer

# from sklearn.metrics import f1_score
# for label in threshold.keys(): 
#     actual_tag = label.replace('_predict','')
#     y_test_label = np.array(list(temp_df['actual_label'].values))
#     thresholds = np.arange(0.02, 1, 0.02)
#     t_opt = 0
#     score_max = 0
#     for t in thresholds:
#         temp_df['pred_thres_bin']=temp_df.apply(lambda x: get_pred_thres_arr(x,actual_tag,t),axis=1)
#         y_pred_temp= np.array(list(temp_df['pred_thres_bin'].values))
#         score = f1_score(y_test_label, y_pred_temp, average='weighted')  # f1 score
#         if (score > score_max):
#             score_max = score
#             t_opt = t
#     print("Class: ", label, "\t Threshold: ", t_opt)