In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from detoxify import Detoxify

def chunks(lst, n):
    """Lazily split ``lst`` into consecutive slices of at most ``n`` items.

    The slices are produced in order; the final slice is shorter when
    ``len(lst)`` is not a multiple of ``n``. ``lst`` must support ``len()``
    and slicing.
    """
    starts = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in starts)


def get_pretrained_detoxify_model(model_type, device):
    """Build a ``Detoxify`` model for ``model_type`` placed on ``device``.

    Both arguments are forwarded unchanged to the ``Detoxify`` constructor
    as its ``model_type`` and ``device`` keyword arguments.
    """
    return Detoxify(model_type=model_type, device=device)


def detoxify_predict(model_type, texts, batch_size, device):
    """Score ``texts`` with one pretrained Detoxify model, batch by batch.

    Parameters
    ----------
    model_type : str
        Name of the Detoxify checkpoint to load (this notebook uses
        "original", "unbiased", "multilingual", "original-small" and
        "unbiased-small").
    texts : sequence of str
        Texts to score. Must support ``len()`` and slicing.
    batch_size : int
        Maximum number of texts passed to a single ``model.predict`` call.
    device : str
        Device handed to the model constructor (e.g. "cuda").

    Returns
    -------
    pandas.DataFrame
        One row per input text, in input order, with a fresh 0..n-1 index.
        Each column is one of the model's outputs, renamed to
        ``f"{model_type}_{output_name}"``. An empty DataFrame is returned
        when ``texts`` is empty.
    """
    # Nothing to score: skip loading the model entirely.
    if len(texts) == 0:
        return pd.DataFrame()

    model = get_pretrained_detoxify_model(model_type, device)

    # Collect the per-batch frames and concatenate once at the end.
    # DataFrame.append no longer exists in pandas >= 2.0, and growing a frame
    # inside the loop re-copies every earlier row on each iteration.
    batch_frames = [
        pd.DataFrame(model.predict(chunk))
        for chunk in chunks(texts, batch_size)
    ]
    results = pd.concat(batch_frames, ignore_index=True)

    # Prefix with the model name so outputs of different models can sit
    # side by side in one frame without colliding.
    results.columns = [f"{model_type}_{c}" for c in results.columns]

    return results

def detoxify_fe(df, batch_size, device, text_column):
    """Append Detoxify score columns from five pretrained models to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the texts to score. It is not modified.
    batch_size : int
        Number of texts scored per prediction call.
    device : str
        Device the models are loaded on (e.g. "cuda").
    text_column : str
        Name of the column in ``df`` that contains the texts.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns of ``df`` followed by the prefixed score
        columns of the "original", "unbiased", "multilingual",
        "original-small" and "unbiased-small" models, in that order.
    """
    texts = list(df[text_column].values)

    # Listed in the order their columns appear in the returned frame.
    model_types = (
        "original",
        "unbiased",
        "multilingual",
        "original-small",
        "unbiased-small",
    )

    feature_frames = []
    for model_type in model_types:
        # Forward the caller's settings rather than hard-coded values.
        scores = detoxify_predict(
            model_type, texts, batch_size=batch_size, device=device
        )
        # detoxify_predict returns rows in input order under a 0..n-1 index.
        # concat(axis=1) aligns on index labels, so adopt df's index to keep
        # every score row attached to the text it was computed from.
        scores.index = df.index
        feature_frames.append(scores)

    return pd.concat([df, *feature_frames], axis=1)

### VAL Dataset

In [3]:
%%time

# Load the validation texts, then add the Detoxify score columns.
df_val_text = pd.read_csv("../processed/validation_texts.csv")
df_val_text = detoxify_fe(
    df_val_text,
    batch_size=64,
    device="cuda",
    text_column="text",
)

# Save the enriched frame and preview its first rows as the cell output.
df_val_text.to_csv("../processed/valid_text_detoxify_fe.csv", index=False)
df_val_text.head(5)


Downloading: "https://github.com/unitaryai/detoxify/releases/download/v0.1.2/original-albert-0e1d6498.ckpt" to /root/.cache/torch/hub/checkpoints/original-albert-0e1d6498.ckpt


  0%|          | 0.00/44.6M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/684 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/742k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.25M [00:00<?, ?B/s]

Downloading: "https://github.com/unitaryai/detoxify/releases/download/v0.1.2/unbiased-albert-c8519128.ckpt" to /root/.cache/torch/hub/checkpoints/unbiased-albert-c8519128.ckpt


  0%|          | 0.00/44.6M [00:00<?, ?B/s]

Downloading: "https://github.com/unitaryai/detoxify/releases/download/v0.1-alpha/toxic_original-c1212f89.ckpt" to /root/.cache/torch/hub/checkpoints/toxic_original-c1212f89.ckpt


  0%|          | 0.00/418M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading: "https://github.com/unitaryai/detoxify/releases/download/v0.3-alpha/toxic_debiased-c7548aa0.ckpt" to /root/.cache/torch/hub/checkpoints/toxic_debiased-c7548aa0.ckpt


  0%|          | 0.00/476M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading: "https://github.com/unitaryai/detoxify/releases/download/v0.4-alpha/multilingual_debiased-0b549669.ckpt" to /root/.cache/torch/hub/checkpoints/multilingual_debiased-0b549669.ckpt


  0%|          | 0.00/1.04G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/512 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/8.68M [00:00<?, ?B/s]

CPU times: user 23min 9s, sys: 12.8 s, total: 23min 22s
Wall time: 26min 18s


Unnamed: 0,text,original_toxicity,original_severe_toxicity,original_obscene,original_threat,original_insult,original_identity_attack,unbiased_toxicity,unbiased_severe_toxicity,unbiased_obscene,...,original-small_threat,original-small_insult,original-small_identity_attack,unbiased-small_toxicity,unbiased-small_severe_toxicity,unbiased-small_obscene,unbiased-small_identity_attack,unbiased-small_insult,unbiased-small_threat,unbiased-small_sexual_explicit
0,Whatever gave you the impression that I have t...,0.504896,0.001695,0.140739,0.000415,0.201055,0.003484,0.888242,0.000921,0.270766,...,0.002317,0.086861,0.001698,0.640444,1.5e-05,0.005329,0.000749,0.608529,0.000178,0.002871
1,Jesus god you're fucking retarded \n\nwhat a ...,0.998353,0.336975,0.984182,0.004737,0.973352,0.052704,0.996909,0.317745,0.986546,...,0.037964,0.945297,0.056213,0.99798,0.007335,0.968334,0.04188,0.987184,0.026711,0.067629
2,"January 4, 2007 \nIf you continue to damage a...",0.00331,8.8e-05,0.000234,0.000134,0.000264,0.000176,0.007568,2.2e-05,0.000669,...,9.3e-05,0.00031,0.000193,0.000753,2e-06,2.8e-05,4.8e-05,0.000336,3.7e-05,1.9e-05
3,"""\n\nDC101 IS HIS FUCKING EMPLOYER!!! IF THAT ...",0.996157,0.194925,0.973184,0.052918,0.810583,0.009025,0.9954,0.157432,0.975952,...,0.001769,0.484343,0.005027,0.944791,0.000214,0.738079,0.003072,0.707245,0.002005,0.006922
4,"Don't see any attack, unlike the one YOU made ...",0.011751,9.9e-05,0.000452,0.000241,0.000484,0.000244,0.027602,2.3e-05,0.000997,...,0.000227,0.001613,0.000477,0.000457,2e-06,2e-05,4.3e-05,0.000208,2.3e-05,1.3e-05


### RUD Dataset

In [4]:
# Load the RUD texts, then add the Detoxify score columns.
df_rud_text = pd.read_csv("../processed/rud_text.csv")
df_rud_text = detoxify_fe(
    df_rud_text,
    batch_size=64,
    device="cuda",
    text_column="text",
)

# Save the enriched frame and preview its first rows as the cell output.
df_rud_text.to_csv("../processed/rud_text_detoxify_fe.csv", index=False)
df_rud_text.head(5)


Unnamed: 0,comment_id,text,offensiveness_score,cluster,kfold,original_toxicity,original_severe_toxicity,original_obscene,original_threat,original_insult,...,original-small_threat,original-small_insult,original-small_identity_attack,unbiased-small_toxicity,unbiased-small_severe_toxicity,unbiased-small_obscene,unbiased-small_identity_attack,unbiased-small_insult,unbiased-small_threat,unbiased-small_sexual_explicit
0,cza1q49,> The difference in average earnings between m...,-0.083,69,0,0.000635,0.000118,0.000174,0.000114,0.000179,...,7.3e-05,0.000148,0.000108,0.002585,0.000157,0.000236,0.001717,0.000655,0.000296,0.000482
1,cza1wdh,"The myth is that the ""gap"" is entirely based o...",-0.022,69,1,0.001334,0.000106,0.000213,9.7e-05,0.000194,...,0.00069,0.004645,0.000501,0.014242,1.2e-05,0.000144,0.003541,0.000565,8.8e-05,0.009595
2,cza2bw8,The assertion is that women get paid less for ...,-0.146,69,2,0.335826,0.002243,0.010125,0.001167,0.021552,...,0.000328,0.001609,0.000415,0.008118,9.7e-05,0.000103,0.007975,0.001486,0.000383,0.000506
3,cza2iji,You said in the OP that's not what they're mea...,-0.083,69,1,0.011095,0.000144,0.000612,0.000111,0.000726,...,0.000412,0.002396,0.000585,0.002061,4.9e-05,7.1e-05,0.00113,0.000623,0.00015,0.000318
4,cza2jj3,>Men and women are not payed less for the same...,-0.042,69,4,0.002125,9.9e-05,0.00023,9.4e-05,0.00022,...,0.000282,0.001396,0.000439,0.005752,0.000282,0.000388,0.006308,0.001258,0.000498,0.000839
