# 1. Import & Load

In [None]:
import numpy as np
import pandas as pd
pd.set_option("max_colwidth", 90)

from string import punctuation
from bs4 import BeautifulSoup
import re
import calendar
import textwrap

from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [None]:
# Stop-word list: NLTK's English stop words plus a few corpus-specific extras
# (a contraction, lowercase month names and abbreviations, and wiki boilerplate).
stop_words = [
    *stopwords.words('english'),
    "can't",
    *(name.lower() for name in calendar.month_name[1:]),  # index 0 is an empty string
    *(abbr.lower() for abbr in calendar.month_abbr[1:]),
    'utc',
    'wikipedia',
    'wiki',
]


def text_preprocessor(text: str, max_len: int = 600) -> str:
    """Truncate a raw comment and reduce it to plain lowercase words.

    Processing order:
      1. Collapse whitespace and cut the text to at most ``max_len``
         characters on a word boundary.
      2. Lowercase everything.
      3. Remove the standalone media markers ``image``, ``file``, ``jpg``
         and ``jpeg``, dotted-quad number groups (IP-like), and URLs.
      4. Drop HTML markup with BeautifulSoup (``lxml`` parser).
      5. Strip punctuation from both ends of every token, then discard
         tokens that are empty or consist only of digits.

    Args:
        text: Raw comment text.
        max_len: Maximum number of characters kept before cleaning.

    Returns:
        The cleaned text, tokens joined by single spaces (may be empty).
    """
    # textwrap.shorten already strips both ends and collapses every run of
    # whitespace (newlines included) into a single space before truncating,
    # so no separate strip()/replace('\n', ' ') pass is needed.
    text = textwrap.shorten(text, width=max_len, placeholder='').lower()

    # Word boundaries keep ordinary words intact ("profile" must not turn
    # into "pro") while still catching "image:foo.jpg"-style markers.
    text = re.sub(r'\b(?:image|file|jpg|jpeg)\b', '', text)
    text = re.sub(r'\d{1,4}\.\d{1,4}\.\d{1,4}\.\d{1,4}', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    text = BeautifulSoup(text, 'lxml').get_text()

    # Strip punctuation first, then filter: this drops punctuation-only
    # tokens (which would otherwise leave double spaces) and numbers that
    # carried punctuation such as "123," or "(2020)".
    stripped = (token.strip(punctuation) for token in text.split())
    return " ".join(word for word in stripped if word and not word.isdigit())


def toxic_preprocessor(string: str) -> float:
    """Return the overall toxicity of a text as the sum of all model scores.

    Calls the module-level ``model`` (created in a later notebook cell) and
    adds up every value of the mapping returned by ``model.predict``.
    """
    scores = model.predict(string)
    total = 0
    for value in scores.values():
        total += value
    return total

In [None]:
# Load the validation pairs from the CSV below and display the frame's shape.
# Later cells read the 'less_toxic', 'more_toxic' and 'worker' columns.
validation_data_path = "../input/jigsaw-toxic-severity-rating/validation_data.csv"
validation_data = pd.read_csv(validation_data_path)
validation_data.shape

In [None]:
%%capture
# Install the detoxify package; the %%capture cell magic hides the installer output.
!pip install detoxify

In [None]:
from detoxify import Detoxify

# Load the 'original' Detoxify checkpoint once; toxic_preprocessor reads
# this module-level `model` on every call.
model = Detoxify('original')

In [None]:
# model.predict("this article sucks woo woo wooooooo")
# {'toxicity': 0.9875552,
#  'severe_toxicity': 0.05354331,
#  'obscene': 0.924013,
#  'threat': 0.0024955713,
#  'insult': 0.18830173,
#  'identity_attack': 0.0021805398}
# sum of the six scores above (the value toxic_preprocessor returns): 2.1580893732607365

In [None]:
# model.predict("what wher is your sexy pic gone from your main page put it back")
# {'toxicity': 0.75664175,
#  'severe_toxicity': 0.001743382,
#  'obscene': 0.12342469,
#  'threat': 0.001059845,
#  'insult': 0.06852302,
#  'identity_attack': 0.0021121758}
# sum of the six scores above (the value toxic_preprocessor returns): 0.9535048543475568

# 2. Get score

In [None]:
# Preview the shape of an every-100th-row subsample (handy for faster debugging).
# NOTE: this only displays the shape; validation_data itself is not shortened.
validation_data.loc[::100].shape

In [None]:
# Working copy that the cleaning and scoring cells operate on, so that
# validation_data stays untouched. The full data set is used here.
check_data = validation_data.copy()
check_data

In [None]:
%%time
# Cleaned version of both text columns, built on a fresh copy of check_data.
clean_data = check_data.assign(
    less_toxic=check_data['less_toxic'].apply(text_preprocessor),
    more_toxic=check_data['more_toxic'].apply(text_preprocessor),
)

In [None]:
# Display the cleaned text pairs.
clean_data

In [None]:
%%time
# Score the raw texts (deliberately NOT passed through text_preprocessor):
# each text column is replaced by its summed toxicity score.
toxic_data = check_data.assign(
    less_toxic=check_data['less_toxic'].apply(toxic_preprocessor),
    more_toxic=check_data['more_toxic'].apply(toxic_preprocessor),
)

In [None]:
# Display the frame in which both text columns now hold toxicity scores.
toxic_data

In [None]:
# Share of pairs where the 'more_toxic' text scores strictly higher.
# Previously recorded value (presumably on the full validation_data): 0.6912780656303973
(toxic_data['less_toxic'] < toxic_data['more_toxic']).mean()

# 3. Check result

In [None]:
# Boolean mask: True where the 'more_toxic' text got the strictly higher score
# (ties count as incorrect).
correct_predict = toxic_data['less_toxic'] < toxic_data['more_toxic']

In [None]:
# Misranked pairs only, with the score gap (less_toxic minus more_toxic) added.
diff_toxic_data = toxic_data.loc[~correct_predict].copy()
diff_toxic_data['diff'] = diff_toxic_data['less_toxic'] - diff_toxic_data['more_toxic']
diff_toxic_data

In [None]:
# Histogram of the score gap (less_toxic - more_toxic) over the misranked pairs.
# The trailing semicolon suppresses the notebook's text output for the plot object.
diff_toxic_data['diff'].hist(bins=100, figsize=(12,6));

In [None]:
# Cleaned texts of the pairs that were ranked incorrectly, for manual inspection.
clean_data.loc[~correct_predict]

In [None]:
# The indices of incorrect predictions: histogram of the original row index of
# each misranked pair (presumably to check how they are spread over the data set).
clean_data.loc[~correct_predict].reset_index()['index'].hist(bins=100, figsize=(12,6));

In [None]:
# Per-`worker` breakdown of pair counts: all pairs, correctly ranked pairs and
# incorrectly ranked pairs. Workers absent from a column get 0 via fillna, and
# rows are sorted so the workers with the most incorrect predictions come first.
pd.DataFrame({'all': clean_data.loc[:, 'worker'].value_counts(),
              'correct': clean_data.loc[correct_predict, 'worker'].value_counts(),
              'incorrect': clean_data.loc[~correct_predict, 'worker'].value_counts()}) \
                    .fillna(0).sort_values(by='incorrect', ascending=False) \
                    .rename_axis(index='worker', columns='predict')