## Clarification

To debug the code I use the **CUT_DATA** / **CUT_TEXT** options, which allow me to reduce the running time and check that the code works.

## Versions

**Version 2**: I did not use the accelerator and took only 50% of the data (CUT_DATA option).

**Version 1**: I used the GPU accelerator, which reduced the running time by a factor of 10-12.

* predict_score_data - - - - Wall time: **2min 13s**
* predict_less_toxic_data - Wall time: **8min 55s**
* predict_train_data - - - - Wall time: **46min 29s**

# 1. Import & Def & Load & Set

In [None]:
%%capture
# Install the detoxify package, which provides the Detoxify model imported below.
!pip install detoxify

In [None]:
import os
import pandas as pd

from detoxify import Detoxify

import toxic_comments_utilities as tc

In [None]:
def get_predict_data(data: pd.Series) -> pd.DataFrame:
    """Score every text in ``data`` with the notebook-global ``model``.

    Each element of ``data`` is passed to ``model.predict`` on its own, and
    the values of the returned mapping become one row of the result.

    Args:
        data: Series holding the texts to score.

    Returns:
        A DataFrame that shares ``data``'s index and has one column per
        entry of ``model.class_names``.
    """
    label_names = model.class_names

    # One predict call per text; values are taken in the order the
    # prediction mapping yields them.
    score_rows = [list(model.predict(comment).values())
                  for comment in data.values]

    return pd.DataFrame.from_records(score_rows,
                                     index=data.index,
                                     columns=label_names)

In [None]:
# Locations of the input files; the train set is read from the working directory.
path_comments_to_score = "../input/jigsaw-toxic-severity-rating/comments_to_score.csv"
path_validation_data = "../input/jigsaw-toxic-severity-rating/validation_data.csv"
path_train_data = "./train.csv"  # *.zip Toxic Comment Classification Challenge

# Unpack the zipped train set only when train.csv is not already present.
if not os.path.isfile(path_train_data):
    !unzip ../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip
    
# Load the three frames, renaming 'comment_id' -> 'id' and
# 'comment_text' -> 'text' so later cells can address the text as 'text'.
raw_score_data = pd.read_csv(path_comments_to_score).rename(
                                                columns={'comment_id': 'id'})
raw_valid_data = pd.read_csv(path_validation_data)
raw_train_data = pd.read_csv(path_train_data).rename(
                                                columns={'comment_text': 'text'})

In [None]:
# Run-time switches for the whole notebook.
# USE_GPU: when truthy the model is created with device='cuda'.
# CUT_DATA: when truthy it is passed to tc.get_samples to reduce each data set.
# CUT_TEXT: when truthy it is passed as max_len to tc.shorten_text for every text.
USE_GPU = False  # with/without device='cuda'
CUT_DATA = 0.5   # tc.get_samples(raw_score_data, CUT_DATA)
CUT_TEXT = None  # .apply(tc.shorten_text, max_len=CUT_TEXT)

# When truthy, the prediction frames are written to CSV files.
SAVE_RESULT = False

# Shapes of the raw frames as loaded, before any sampling.
print(raw_score_data.shape)
print(raw_valid_data.shape)
print(raw_train_data.shape)

# 2. Get predicts

In [None]:
%%capture

# Build the 'original' Detoxify model. The device argument accepts any
# torch.device input; 'cpu' is also what Detoxify uses when none is given.
model = Detoxify('original', device='cuda' if USE_GPU else 'cpu')

In [None]:
# Smoke-test the model on a single sentence; predict() returns a mapping of values.
predicts_dict = model.predict("I'll tell you about toxicity labels.")
# Show the mapping as a one-column table: keys as the index, values under 'predict'.
pd.DataFrame.from_dict(predicts_dict, orient='index', columns=['predict'])

## 2.1. Create predict_score_data

In [None]:
# Use a CUT_DATA-sized sample when the option is set, otherwise a full copy
# so that the raw frame is never modified.
score_data = (
    tc.get_samples(raw_score_data, CUT_DATA) if CUT_DATA
    else raw_score_data.copy()
)

# Optionally pass every text through tc.shorten_text with max_len=CUT_TEXT.
if CUT_TEXT:
    score_data['text'] = score_data['text'].apply(tc.shorten_text, max_len=CUT_TEXT)

# Sanity check: longest text length, then the frame shape (one per line).
print(score_data['text'].str.len().max(), score_data.shape, sep='\n')

In [None]:
# Profile the texts to be scored (presumably summary statistics; see tc.get_data_profile).
tc.get_data_profile(score_data["text"])

In [None]:
%%time
# Score every text and append the prediction columns, matched on the index.
predict_score_data = score_data.join(score_data['text'].pipe(get_predict_data))

In [None]:
# Display the scored frame: original columns plus the prediction columns.
predict_score_data

In [None]:
# Write the predictions to CSV only when SAVE_RESULT is enabled; the frame
# index is stored under the column label 'index'.
if SAVE_RESULT:
    predict_score_data.to_csv('predict_score_data.csv', index_label='index')
    
# Echo the flag so the cell output shows whether a file was written.
print(SAVE_RESULT)

## 2.2. Create predict_[ less|more ]_toxic_data

In [None]:
# Use a CUT_DATA-sized sample when the option is set, otherwise a full copy
# so that the raw frame is never modified.
valid_data = (
    tc.get_samples(raw_valid_data, CUT_DATA) if CUT_DATA
    else raw_valid_data.copy()
)

# Optionally pass both text columns through tc.shorten_text with max_len=CUT_TEXT.
if CUT_TEXT:
    valid_data['less_toxic'] = valid_data['less_toxic'].apply(tc.shorten_text, max_len=CUT_TEXT)
    valid_data['more_toxic'] = valid_data['more_toxic'].apply(tc.shorten_text, max_len=CUT_TEXT)

# Sanity check: longest 'less_toxic' text length, then the frame shape.
print(valid_data['less_toxic'].str.len().max(), valid_data.shape, sep='\n')

In [None]:
# Mark rows whose (less_toxic, more_toxic) pair repeats an earlier row.
is_duplicate = valid_data.duplicated(subset=['less_toxic', 'more_toxic'])

# Split each side of the pair into its own frame with the columns
# 'worker', 'text' and 'double' (the duplicate flag as 0/1).
less_toxic_data = (
    valid_data[['worker', 'less_toxic']]
    .rename(columns={'less_toxic': 'text'})
    .assign(double=is_duplicate.astype(int))
)

more_toxic_data = (
    valid_data[['worker', 'more_toxic']]
    .rename(columns={'more_toxic': 'text'})
    .assign(double=is_duplicate.astype(int))
)

In [None]:
# Profile the 'less toxic' texts (presumably summary statistics; see tc.get_data_profile).
tc.get_data_profile(less_toxic_data["text"])

In [None]:
# Profile the 'more toxic' texts (presumably summary statistics; see tc.get_data_profile).
tc.get_data_profile(more_toxic_data["text"])

In [None]:
%%time
# Score every 'less toxic' text and append the prediction columns, matched on the index.
predict_less_toxic_data = less_toxic_data.join(
    less_toxic_data['text'].pipe(get_predict_data))

In [None]:
# Display the 'less toxic' frame: worker, text, double plus the prediction columns.
predict_less_toxic_data

In [None]:
%%time
# Score every 'more toxic' text and append the prediction columns, matched on the index.
predict_more_toxic_data = more_toxic_data.join(
    more_toxic_data['text'].pipe(get_predict_data))

In [None]:
# Display the 'more toxic' frame: worker, text, double plus the prediction columns.
predict_more_toxic_data

In [None]:
# Write both prediction frames to CSV only when SAVE_RESULT is enabled; the
# frame index is stored under the column label 'index'.
if SAVE_RESULT:
    predict_less_toxic_data.to_csv('predict_less_toxic_data.csv', index_label='index')
    predict_more_toxic_data.to_csv('predict_more_toxic_data.csv', index_label='index')
    
# Echo the flag so the cell output shows whether files were written.
print(SAVE_RESULT)

## 2.3. Create predict_train_data

In [None]:
# Existing label columns are renamed with surrounding dashes -- presumably so
# they do not clash with identically named prediction columns joined later.
cols_dict = {'obscene': '-obscene-', 'threat': '-threat-', 'insult': '-insult-'}

# Use a CUT_DATA-sized sample when the option is set, otherwise a full copy,
# then apply the column renaming.
train_data = (
    tc.get_samples(raw_train_data, CUT_DATA) if CUT_DATA
    else raw_train_data.copy()
).rename(columns=cols_dict)

# Optionally pass every text through tc.shorten_text with max_len=CUT_TEXT.
if CUT_TEXT:
    train_data['text'] = train_data['text'].apply(tc.shorten_text, max_len=CUT_TEXT)

# Sanity check: longest text length, then the frame shape (one per line).
print(train_data['text'].str.len().max(), train_data.shape, sep='\n')

In [None]:
# Profile the training texts (presumably summary statistics; see tc.get_data_profile).
tc.get_data_profile(train_data["text"])

In [None]:
%%time
# Score every training text and append the prediction columns, matched on the index.
predict_train_data = train_data.join(train_data['text'].pipe(get_predict_data))

In [None]:
# Display the training frame: original (renamed) columns plus the prediction columns.
predict_train_data

In [None]:
# Write the predictions to CSV only when SAVE_RESULT is enabled; the frame
# index is stored under the column label 'index'.
if SAVE_RESULT:
    predict_train_data.to_csv('predict_train_data.csv', index_label='index')
    
# Echo the flag so the cell output shows whether a file was written.
print(SAVE_RESULT)