In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the input directory.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
print(os.listdir("/kaggle/input/quora-insincere-questions-classification/"))

In [None]:
# Extract only the wiki-news vectors referenced by `config` further down.
# -n never overwrites existing files, so re-running the cell does not stop at an
# interactive "replace?" prompt; -d makes the target match config['model_folder'].
!unzip -n /kaggle/input/quora-insincere-questions-classification/embeddings.zip 'wiki-news-300d-1M/*' -d /kaggle/working/

In [None]:
print(os.listdir("/kaggle/working/"))

In [None]:
# !pip install contractions

In [None]:
from __future__ import absolute_import

import re
import logging
import itertools
import unicodedata
# import contractions

from bs4 import BeautifulSoup

class TextCleaningUtils:
    '''
        Stateless text cleaning helpers, all exposed as static methods.

        The regex based ``remove_*`` helpers look their pattern up in
        ``cleaning_regex_map``, replace every match with a single space and
        strip leading/trailing whitespace from the result.
    '''

    # Regular expressions used by the ``remove_*`` helpers, keyed by purpose.
    cleaning_regex_map = {
        'web_links': r'(?i)(?:(?:http(?:s)?:)|(?:www\.))\S+',
        'special_chars': r'[^a-zA-Z0-9\s\.,!?;:]+',
        'redundant_spaces': r'\s\s+',
        # A character class needs no alternation: the former '|' inside the
        # brackets made this pattern strip literal pipe characters as well.
        'redundant_newlines': r'[\r\n]+',
        'twitter_handles': r'[#@]\S+',
        'punctuations': r'[\.,!?;:]+'
    }

    @staticmethod
    def clean_text_from_regex(text, text_clean_regex):
        '''
            Replace every match of the compiled pattern ``text_clean_regex``
            in ``text`` with a single space and return the stripped result.
        '''

        return text_clean_regex.sub(' ', text).strip()

    @staticmethod
    def _clean_with(text, pattern_key):
        '''
            Apply the pattern stored under ``pattern_key`` in
            ``cleaning_regex_map`` to ``text`` via ``clean_text_from_regex``.
        '''

        # The pattern is looked up on every call so that edits to
        # cleaning_regex_map take effect immediately.
        pattern = re.compile(TextCleaningUtils.cleaning_regex_map[pattern_key])
        return TextCleaningUtils.clean_text_from_regex(text, pattern)

    # Disabled because the `contractions` import is commented out. The
    # decorator is commented out as well; left active it would stack on top of
    # strip_html's own @staticmethod.
#     @staticmethod
#     def replace_contractions(text):
#         '''
#             Replace contractions in string of text
#         '''

#         return contractions.fix(text)

    @staticmethod
    def strip_html(text):
        '''
            Parse ``text`` with BeautifulSoup's "html.parser" and return the
            result of ``get_text()``.
        '''

        soup = BeautifulSoup(text, "html.parser")
        return soup.get_text()

    @staticmethod
    def remove_special_chars(text):
        '''
            Replace every run of characters other than ASCII letters, digits,
            whitespace and ``. , ! ? ; :`` with a space.
        '''

        return TextCleaningUtils._clean_with(text, 'special_chars')

    @staticmethod
    def remove_redundant_spaces(text):
        '''
            Replace every run of two or more whitespace characters
            with a single space.
        '''

        return TextCleaningUtils._clean_with(text, 'redundant_spaces')

    @staticmethod
    def remove_web_links(text):
        '''
            Replace every token starting with ``http:``, ``https:`` or ``www.``
            (case-insensitive) with a space.
        '''

        return TextCleaningUtils._clean_with(text, 'web_links')

    @staticmethod
    def remove_twitter_handles(text):
        '''
            Replace every ``#`` or ``@`` that is followed by non-whitespace,
            together with that non-whitespace run, with a space.
        '''

        return TextCleaningUtils._clean_with(text, 'twitter_handles')

    @staticmethod
    def remove_redundant_newlines(text):
        '''
            Replace every run of carriage returns / line feeds with a space.
        '''

        return TextCleaningUtils._clean_with(text, 'redundant_newlines')

    @staticmethod
    def remove_punctuations(text):
        '''
            Replace every run of ``. , ! ? ; :`` with a space.
        '''

        return TextCleaningUtils._clean_with(text, 'punctuations')

    @staticmethod
    def remove_exaggerated_words(text):
        '''
            Shorten every run of identical consecutive characters to at most
            two, e.g. "soooo" becomes "soo". This applies to any character,
            digits and whitespace included.
        '''

        return ''.join(''.join(run)[:2] for _, run in itertools.groupby(text))

    @staticmethod
    def replace_multiple_chars(text):
        '''
            Collapse runs of two or more of the same listed symbol into a
            single occurrence and normalise whitespace to single spaces.
        '''

        char_list = ['.', '?', '!', '#', '$', '/', '@', '*', '(', ')', '+']
        for char in char_list:
            if char in text:
                text = re.sub(re.escape(char) + '{2,}', char, text)
        # Normalise unconditionally: previously a text containing none of the
        # listed symbols came back as an empty string.
        return ' '.join(text.split())

    @staticmethod
    def replace_sign(text):
        '''
            Replace '&' with ' and ', '/' with ' or ' and the non-breaking
            space with a regular space, then normalise whitespace.
        '''

        sign_list = {'&': ' and ', '/': ' or ', '\xa0': ' '}
        for sign, replacement in sign_list.items():
            text = text.replace(sign, replacement)
        # Normalise unconditionally: previously a text containing none of the
        # signs came back as an empty string.
        return ' '.join(text.split())

    @staticmethod
    def remove_accented_char(text):
        '''
            Decompose ``text`` to NFD and drop every character that cannot be
            encoded as ASCII, which strips accents from decomposable letters
            (and removes any other non-ASCII character outright).
        '''

        text = unicodedata.normalize('NFD', text) \
            .encode('ascii', 'ignore') \
            .decode("utf-8")
        return str(text)

    @staticmethod
    def replace_characters(text, replace_map):
        '''
            Apply every ``{old: new}`` pair of ``replace_map`` to ``text`` with
            ``str.replace``, in the mapping's iteration order.
        '''

        for char, replace_val in replace_map.items():
            text = text.replace(char, replace_val)
        return text


# class TextCleaningRecipes:
#     """
#         This class contains the recipes for a set of standard text cleaning operations

#     """

#     DEFAULT_OPERATIONS = ['replace_contractions', 'remove_web_links', 'remove_special_chars',
#                           'remove_redundant_newlines', 'remove_redundant_spaces']

#     OPERATIONS_MAP = {
#         'replace_contractions': TextCleaningUtils.replace_contractions,
#         'remove_web_links': TextCleaningUtils.remove_web_links,
#         'remove_twitter_handles': TextCleaningUtils.remove_twitter_handles,
#         'replace_characters': TextCleaningUtils.replace_characters,
#         'remove_special_chars': TextCleaningUtils.remove_special_chars,
#         'remove_punctuations': TextCleaningUtils.remove_punctuations,
#         'remove_redundant_newlines': TextCleaningUtils.remove_redundant_newlines,
#         'remove_redundant_spaces': TextCleaningUtils.remove_redundant_spaces
#     }

#     OPERATIONS_ORDER = ['replace_contractions', 'remove_web_links', 'remove_twitter_handles',
#                         'replace_characters',
#                         'remove_special_chars', 'remove_punctuations',
#                         'remove_redundant_newlines', 'remove_redundant_spaces']

#     @staticmethod
#     def exec_cleaning(text_values, config):
#         '''
#             This method executes various cleaning techniques together
#         '''

#         operations = TextCleaningRecipes.get_operations(config)
#         logging.info("Executing %s Operations", ', '.join(operations))

#         cleaning_ops = []
#         for _op in operations:
#             op_func = TextCleaningRecipes.OPERATIONS_MAP[_op]
#             cleaning_ops.append(op_func)

#         c_text_values = []
#         for text in text_values:

#             if text is None:
#                 c_text = ''
#             else:
#                 c_text = str(text)

#             c_text = c_text.replace(')', ') ')

#             for _op in cleaning_ops:
#                 if 'replace_characters' in _op.__name__:

#                     c_text = _op(c_text, config['replace_characters'])
#                 else:
#                     c_text = _op(c_text)

#             c_text_values.append(c_text)

#         return c_text_values

#     @staticmethod
#     def get_operations(config):

#         operations = []
#         if len(config) == 0:
#             operations = TextCleaningRecipes.DEFAULT_OPERATIONS
#             return operations

#         for _op in TextCleaningRecipes.OPERATIONS_ORDER:
#             if _op in config and config[_op]:
#                 operations.append(_op)

#         if not operations:
#             operations = TextCleaningRecipes.DEFAULT_OPERATIONS

#         logging.info("Operations: %s", operations)
#         return operations

In [None]:
import pandas as pd
import numpy as np
import os

# fastText
import fasttext

from sklearn.model_selection import train_test_split

from tabulate import tabulate

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [None]:
df = pd.read_csv('/kaggle/input/quora-insincere-questions-classification/train.csv')

In [None]:
submission_df = pd.read_csv('/kaggle/input/quora-insincere-questions-classification/test.csv')

In [None]:
df.head()

In [None]:
df.target.value_counts()

In [None]:
df.shape

In [None]:
def clean_data(df, col_to_clean):
    '''
        Return a copy of ``df`` in which the text column ``col_to_clean`` has
        been cleaned and lower-cased. The frame passed in is left untouched.

        Web links and #/@ handles are removed first: their patterns rely on
        '/', ':', '#' and '@', which the special-character and punctuation
        steps strip, so running them afterwards (as before) matched nothing.
        Whitespace is collapsed last because the earlier steps replace their
        matches with spaces.

        The optional steps that were commented out in this pipeline
        (replace_contractions, transform_emojis, replace_sign) remain disabled.
    '''
    cleaning_steps = (
        TextCleaningUtils.remove_web_links,
        TextCleaningUtils.remove_twitter_handles,
        TextCleaningUtils.remove_special_chars,
        TextCleaningUtils.remove_punctuations,
        TextCleaningUtils.remove_exaggerated_words,
        TextCleaningUtils.remove_redundant_newlines,
        TextCleaningUtils.remove_redundant_spaces,
    )

    cleaned = df.copy()
    # Coerce to str up front so that a missing value cannot reach the regex
    # helpers as a float and raise a TypeError.
    text = cleaned[col_to_clean].fillna('').astype(str)
    for step in cleaning_steps:
        text = text.apply(step)
    cleaned[col_to_clean] = text.str.lower()
    return cleaned

In [None]:
clean_df = clean_data(df,'question_text')
submission_df = clean_data(submission_df,'question_text')

In [None]:
clean_df.shape

In [None]:
clean_df.head()

In [None]:
# Translate the raw target values (integers, or their float-like string form)
# into label tokens; values missing from the mapping become NaN.
map_ = dict([
    (0, '__label__0'),
    (1, '__label__1'),
    ('0.0', '__label__0'),
    ('1.0', '__label__1'),
])
clean_df['label'] = clean_df.target.map(map_)

In [None]:
clean_df.head()

In [None]:
train_df, test_df = train_test_split(clean_df, random_state=50, stratify=clean_df['label'], test_size=0.1)

In [None]:
train_df.shape

In [None]:
test_df.shape

In [None]:
def to_fwf(df, fname):
    """Write ``df`` to ``fname`` as plain column-aligned text without a header row.

    Args:
        df: DataFrame whose rows are rendered with tabulate's "plain" format.
        fname: Path of the text file to create or overwrite.
    """
    content = tabulate(df.values.tolist(), list(df.columns), tablefmt="plain")
    # tabulate puts the column names on the first line; keep only what follows.
    # partition() leaves an empty body when there is no newline (no data rows),
    # whereas slicing from find() + 1 wrote the header itself in that case.
    _header, _newline, body = content.partition('\n')
    # The context manager closes (and flushes) the file deterministically.
    with open(fname, "w", encoding="utf-8") as out_file:
        out_file.write(body)
# Expose the helper as a DataFrame method: df.to_fwf(path).
pd.DataFrame.to_fwf = to_fwf

In [None]:
train_df=train_df[['label','question_text']]

In [None]:
train_df.head()

In [None]:
# Paths and names used by the training cells.
config = dict(
    data_folder='/kaggle/working/',
    training_csv_file='/kaggle/input/quora-insincere-questions-classification/train.csv',
    training_fwf_file='train_fwf.train',
    model_folder='/kaggle/working/wiki-news-300d-1M/',
    pretrained_model='wiki-news-300d-1M.vec',
    model_version='m1',
)

In [None]:
# NOTE(review): this cell re-defines the to_fwf helper that an earlier cell
# already defines; whichever definition runs last is the one in effect.
def to_fwf(df, fname):
    """Write ``df`` to ``fname`` as plain column-aligned text without a header row.

    Args:
        df: DataFrame whose rows are rendered with tabulate's "plain" format.
        fname: Path of the text file to create or overwrite.
    """
    content = tabulate(df.values.tolist(), list(df.columns), tablefmt="plain")
    # tabulate puts the column names on the first line; keep only what follows.
    # partition() leaves an empty body when there is no newline (no data rows),
    # whereas slicing from find() + 1 wrote the header itself in that case.
    _header, _newline, body = content.partition('\n')
    # The context manager closes (and flushes) the file deterministically.
    with open(fname, "w", encoding="utf-8") as out_file:
        out_file.write(body)
# Expose the helper as a DataFrame method: df.to_fwf(path).
pd.DataFrame.to_fwf = to_fwf

In [None]:
train_df=train_df[['label','question_text']]

In [None]:
config['data_folder']+config['training_fwf_file']

In [None]:
%%time
train_df.to_fwf(config['data_folder']+config['training_fwf_file'])

In [None]:
train_fn = os.path.join(config['data_folder'], config['training_fwf_file'])

In [None]:
pretrainedvec_fn = os.path.join(config['model_folder'], config['pretrained_model'])
model_fn = os.path.join(config['data_folder'], 'models','{}_{}.bin'.format('qoura_question',config['model_version']))

In [None]:
print('train_fn:{} \n model_fn:{} \n pretrainedvec_fn:{}'.format(train_fn,model_fn,pretrainedvec_fn))

In [None]:
%%time
# Train a supervised fastText classifier on the labelled text file, starting
# from the pre-trained word vectors.
# NOTE(review): dim=300 presumably mirrors the "300d" in the vector file name - confirm.
model = fasttext.train_supervised(
    input=train_fn,
    epoch=20,
    minCount=3,
    wordNgrams=2,
    dim=300,
    pretrainedVectors=pretrainedvec_fn,
)

In [None]:
# !ls /kaggle/working/

In [None]:
def get_proba(x):
    """Turn a ``(labels, probabilities)`` prediction into a score for '__label__1'.

    Args:
        x: Pair of parallel sequences ``(labels, probabilities)``; in this
            notebook it is the value returned by ``model.predict(text, k=2)``.

    Returns:
        The top probability when the top label is '__label__1', or one minus
        the top probability when it is '__label__0'. A top probability above 1
        is capped, giving 1 and 0 respectively. ``None`` when the prediction
        is empty or the top label is neither of the two.
    """
    labels, probs = x[0], x[1]
    if len(labels) == 0 or len(probs) == 0:
        return None

    # The sanity check needs both probabilities; with a single prediction there
    # is nothing to add up (indexing probs[1] used to raise IndexError here).
    if len(probs) >= 2 and probs[0] + probs[1] < 1:
        print("Not a classification")

    top_label, top_prob = labels[0], probs[0]
    if top_label == '__label__0':
        return 1 - top_prob if top_prob <= 1 else 0
    if top_label == '__label__1':
        return top_prob if top_prob <= 1 else 1
    # Unknown label: explicit rather than an implicit fall-through.
    return None

# Run the classifier once per question and reuse the raw (labels, probabilities)
# output for both columns, instead of predicting every row twice.
train_raw_pred = train_df['question_text'].apply(lambda x: model.predict(x, k=2))
test_raw_pred = test_df['question_text'].apply(lambda x: model.predict(x, k=2))

train_df['threshold'] = train_raw_pred.apply(get_proba)
test_df['threshold'] = test_raw_pred.apply(get_proba)
# Raw predictions are kept next to the derived score for inspection.
train_df['debug_threshold'] = train_raw_pred
test_df['debug_threshold'] = test_raw_pred

In [None]:
train_df.head(10)

In [None]:
train_df['target'] = train_df['label'].apply(lambda x: int(x.replace('__label__','')))

In [None]:
# Scores at or below 0.06 become class 0; every other score becomes class 1.
train_df['predict_threshold'] = [0 if score <= 0.06 else 1 for score in train_df['threshold']]
test_df['predict_threshold'] = [0 if score <= 0.06 else 1 for score in test_df['threshold']]

In [None]:
confusion_matrix(train_df['target'],train_df['predict_threshold'])

In [None]:
confusion_matrix(test_df['target'],test_df['predict_threshold'])

In [None]:
print(classification_report(train_df['target'],train_df['predict_threshold']))

In [None]:
print(classification_report(test_df['target'],test_df['predict_threshold']))

In [None]:
# submission file

In [None]:
submission_sample = pd.read_csv('/kaggle/input/quora-insincere-questions-classification/sample_submission.csv')

In [None]:
submission_sample.head()

In [None]:
submission_df.head()

In [None]:
submission_df['threshold'] = submission_df['question_text'].apply(lambda x: get_proba(model.predict(x,k=2)))

In [None]:
# Scores at or below 0.06 become class 0; every other score becomes class 1.
submission_df['prediction'] = [0 if score <= 0.06 else 1 for score in submission_df['threshold']]

In [None]:
submission_df.shape

In [None]:
# 375806 - 315271-22660

In [None]:
submission_df['prediction'] = submission_df['prediction'].replace([np.inf, -np.inf, np.nan], 2)

In [None]:
submission_df['prediction'] = submission_df['prediction'].fillna(2)

In [None]:
submission_df[submission_df['prediction'] == 2]

In [None]:
submission_df['debug_threshold'] = submission_df['question_text'].apply(lambda x: model.predict(x,k=2))

In [None]:
submission_df['prediction'].value_counts()

In [None]:
submission_df['prediction'].value_counts()

In [None]:
submission_df['prediction'] = submission_df['prediction'].astype(int)

In [None]:
submission_df[submission_sample.columns]

In [None]:
submission_df[submission_sample.columns].to_csv('submission.csv', index=False)