This simple notebook shows how the *detoxify* package can be used in offline mode.

**Changelog:**
* V3 - specified a device to use
* V2 - max_len can now be set to a desired value

In [None]:
# Offline install of detoxify from an attached input directory (no package index access needed):
# 1. copy the package sources into the working directory
#    (presumably because the input directory is read-only — TODO confirm),
!cp -r ../input/detoxify/detoxify-master detoxify
# 2. install quietly (-q) from that local copy,
!pip install -q ./detoxify
# 3. delete the copied sources once the install has run.
!rm -rf ./detoxify

In [None]:
from transformers import AutoTokenizer
from detoxify import Detoxify
import pandas as pd
from tqdm import tqdm

In [None]:
# Local directory holding the Hugging Face config/tokenizer files, and the
# maximum sequence length handed to the tokenizer below.
huggingface_config_path = '../input/bert-base-uncased'
max_len = 300

# Build the 'original' Detoxify model from a local checkpoint file, on CPU.
detox = Detoxify(
    model_type='original',
    checkpoint='../input/detoxify-models/toxic_original-c1212f89.ckpt',
    device='cpu',
    huggingface_config_path=huggingface_config_path,
)

# Detoxify takes no max-length argument here, so the tokenizer it created is
# replaced by one loaded from the same local files with model_max_length=max_len.
detox.tokenizer = AutoTokenizer.from_pretrained(
    huggingface_config_path,
    local_files_only=True,
    model_max_length=max_len,
)

# Quick smoke test on a single sentence.
results = detox.predict('I am not toxic, sorry!')
print(results)

In [None]:
# Load the two competition CSV files from the Kaggle input directory.
# `valid` is not referenced elsewhere in the visible part of this notebook.
valid = pd.read_csv('/kaggle/input/jigsaw-toxic-severity-rating/validation_data.csv')
# `comments` holds the rows to score; later cells read its 'text' and 'comment_id' columns.
comments = pd.read_csv('/kaggle/input/jigsaw-toxic-severity-rating/comments_to_score.csv')

In [None]:
# Aesthetics
import warnings
import sklearn.exceptions

# General
from tqdm.auto import tqdm
from bs4 import BeautifulSoup
from collections import defaultdict
import pandas as pd
import numpy as np
import os
import re
import random
import gc
import glob
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)
# Silence NumPy floating-point warnings for divide-by-zero and invalid operations.
np.seterr(divide='ignore', invalid='ignore')
# Make sure automatic garbage collection is switched on.
gc.enable()



# Patterns are compiled once here instead of on every call to text_cleaning.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
_EMOJI_PATTERN = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U00002702-\U000027B0"
                            # NOTE: very broad range; it also spans many non-emoji
                            # code points between U+24C2 and U+1F251 (e.g. CJK).
                            u"\U000024C2-\U0001F251"
                            "]+", flags=re.UNICODE)
# A run of anything that is not an ASCII letter or a digit.
_NON_ALNUM_PATTERN = re.compile(r"[^a-zA-Z\d]+")


def text_cleaning(text):
    '''
    Cleans text into a basic form for NLP. Steps, in the order applied:
    1. Removes embedded URL links (http://, https:// and www. prefixes)
    2. Removes HTML tags (parsed with BeautifulSoup / lxml)
    3. Removes emojis and other characters in the emoji pattern's ranges
    4. Replaces every run of characters other than ASCII letters and digits
       with a single space (this also collapses extra spaces)
    5. Strips leading and trailing spaces

    text - Text piece to be cleaned. Anything that is not a str (for example
           None, or the NaN pandas uses for a missing value) is treated as
           empty text.

    Returns the cleaned string; '' for non-string input.
    '''
    # Missing values would otherwise raise TypeError inside re.sub.
    if not isinstance(text, str):
        return ''

    text = _URL_PATTERN.sub('', text)  # drop links before HTML parsing
    text = BeautifulSoup(text, 'lxml').get_text()  # keep only the text nodes
    # Emojis are deleted outright (no space inserted), so the words on either
    # side of one end up joined.
    text = _EMOJI_PATTERN.sub('', text)
    # One pass: equivalent to replacing each special character with a space
    # and then collapsing repeated spaces.
    text = _NON_ALNUM_PATTERN.sub(' ', text)
    return text.strip()
# Register tqdm's pandas integration, which provides `progress_apply`.
tqdm.pandas()
# Clean every comment with a progress bar; this overwrites the original 'text' column.
comments['text'] = comments['text'].progress_apply(text_cleaning)

In [None]:
# Fixed weights applied to the six Detoxify scores in make_pred
# (how these values were obtained is not shown in this notebook).
a, b, c, d, e, f = (
    0.02680548,  # a -> toxicity
    0.96467853,  # b -> severe_toxicity
    0.13336812,  # c -> obscene
    0.35485912,  # d -> threat
    0.10866255,  # e -> insult
    0.31102502,  # f -> identity_attack
)


def make_pred(row):
    """Return the weighted sum of the Detoxify scores for one text.

    row - the text handed to detox.predict.

    The result is a*toxicity + b*severe_toxicity + c*obscene + d*threat
    + e*insult + f*identity_attack, accumulated in that order.
    """
    scores = detox.predict(row)
    total = a * scores['toxicity']
    total = total + b * scores['severe_toxicity']
    total = total + c * scores['obscene']
    total = total + d * scores['threat']
    total = total + e * scores['insult']
    total = total + f * scores['identity_attack']
    return total

# Assemble the submission: one row per comment with its id and weighted score.
sub = pd.DataFrame({
    "comment_id": comments["comment_id"],
    "score": comments['text'].apply(make_pred),  # one model call per comment
})
# Write without the index column, then display the frame as the cell output.
sub.to_csv('submission.csv', index=False)
sub