In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from detoxify import Detoxify

from scipy.stats import rankdata

### Feature extraction

In [3]:
def chunks(lst, n):
    """Return an iterator over successive n-sized chunks of lst.

    Args:
        lst: Any object supporting ``len()`` and slicing (list, str, ...).
        n: Chunk size; must be at least 1.

    Returns:
        A generator producing ``lst[i:i + n]`` for ``i = 0, n, 2n, ...``.
        The last chunk may be shorter than ``n``.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    # A negative step would make range() empty and silently drop every item,
    # so reject bad sizes up front (n == 0 already raised ValueError in range()).
    if n < 1:
        raise ValueError(f"chunk size n must be >= 1, got {n!r}")
    return (lst[i:i + n] for i in range(0, len(lst), n))


def get_pretrained_detoxify_model(model_type, device):
    """Build a Detoxify model.

    Args:
        model_type: Forwarded to ``Detoxify`` as ``model_type``.
        device: Forwarded to ``Detoxify`` as ``device``.

    Returns:
        The constructed ``Detoxify`` instance.
    """
    return Detoxify(model_type=model_type, device=device)


def detoxify_predict(model_type, texts, batch_size, device):
    """Run a Detoxify model over texts in batches and collect the scores.

    Args:
        model_type: Model name handed to ``get_pretrained_detoxify_model``;
            it is also used as the prefix of every output column.
        texts: Sequence of inputs supporting ``len()`` and slicing.
        batch_size: Number of texts passed to ``model.predict`` per call;
            must be at least 1.
        device: Device handed to ``get_pretrained_detoxify_model``.

    Returns:
        A DataFrame built from the per-batch ``model.predict`` outputs, with
        columns renamed to ``f"{model_type}_{column}"`` and a fresh 0..n-1
        index. An empty DataFrame (no columns) is returned when ``texts`` is
        empty.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # Validate before paying for model construction.
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size!r}")

    model = get_pretrained_detoxify_model(model_type, device)

    # Collect one frame per batch and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and growing a frame inside a loop is quadratic.
    frames = [pd.DataFrame(model.predict(chunk)) for chunk in chunks(texts, batch_size)]
    if not frames:
        # No batches means no predictions, hence no known column names.
        return pd.DataFrame()

    results = pd.concat(frames, ignore_index=True)
    results.columns = [f"{model_type}_{c}" for c in results.columns]
    return results

### Load Test Data

In [4]:
# Read the comments to be scored and pull out the raw text.
df_submisison = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")
texts = list(df_submisison["text"].values)

# extract detoxify features
# Models are run in this order; every run uses the same device and batch size.
detox_run_order = ("original-small", "unbiased-small", "original", "unbiased", "multilingual")
detox_features = {
    model_name: detoxify_predict(model_name, texts, device="cuda", batch_size=64)
    for model_name in detox_run_order
}

# Keep the per-model frames available under their individual names.
original_small = detox_features["original-small"]
unbiased_small = detox_features["unbiased-small"]
original = detox_features["original"]
unbiased = detox_features["unbiased"]
multilingual = detox_features["multilingual"]

# Column order of the combined frame: the three large models first, then the small ones.
detox_concat_order = ("original", "unbiased", "multilingual", "original-small", "unbiased-small")
df_submisison = pd.concat(
    [df_submisison] + [detox_features[model_name] for model_name in detox_concat_order],
    axis=1,
)


df_submisison.head()


Unnamed: 0,comment_id,text,original_toxicity,original_severe_toxicity,original_obscene,original_threat,original_insult,original_identity_attack,unbiased_toxicity,unbiased_severe_toxicity,...,original-small_threat,original-small_insult,original-small_identity_attack,unbiased-small_toxicity,unbiased-small_severe_toxicity,unbiased-small_obscene,unbiased-small_identity_attack,unbiased-small_insult,unbiased-small_threat,unbiased-small_sexual_explicit
0,114890,"""\n \n\nGjalexei, you asked about whether ther...",0.000581,0.000126,0.000186,0.000124,0.000171,0.000145,0.01576,1.6e-05,...,7.1e-05,0.000141,0.000103,0.001927,3e-06,6.2e-05,5.4e-05,0.001348,2.1e-05,2.1e-05
1,732895,"Looks like be have an abuser , can you please ...",0.003561,8.3e-05,0.000253,9.1e-05,0.000342,0.00016,0.025412,4e-06,...,8.1e-05,0.000251,0.00015,0.040602,6e-06,0.000226,0.001129,0.020714,0.001549,0.000331
2,1139051,I confess to having complete (and apparently b...,0.003551,9.2e-05,0.000293,0.000101,0.000279,0.00018,0.074255,8.3e-05,...,0.000127,0.000439,0.000216,0.225546,0.000128,0.002621,0.002816,0.052784,0.000886,0.102682
3,1434512,"""\n\nFreud's ideas are certainly much discusse...",0.002331,9.7e-05,0.000258,0.000108,0.000251,0.000178,0.047354,2.7e-05,...,0.000246,0.001083,0.000297,0.232061,4e-06,0.000514,0.00018,0.214475,9.6e-05,9.9e-05
4,2084821,It is not just you. This is a laundry list of ...,0.349564,0.000476,0.03314,0.000432,0.03785,0.000623,0.971578,3.2e-05,...,0.001656,0.058189,0.000985,0.986299,5.6e-05,0.001208,0.000562,0.976723,0.000397,0.000366


### Predict

In [5]:
# selected features: the Detoxify output columns that feed the rank ensemble
cols = [
    'original_toxicity',
    'original_severe_toxicity',
    'original_identity_attack',
    'unbiased_toxicity',
    'unbiased_identity_attack',
    'unbiased_insult',
    'multilingual_toxicity',
    'multilingual_sexual_explicit',
    'original-small_toxicity',
    'unbiased-small_severe_toxicity',
]


# Rank each selected feature on its own (ordinal ranks), add the ranks per
# comment, then rank those totals to obtain the final score.
feature_ranks = np.array(
    [rankdata(df_submisison[name].values, method='ordinal') for name in cols],
    dtype=float,
)
pred = feature_ranks.sum(axis=0)

df_submisison['score'] = rankdata(pred, method='ordinal')

# Only the id and the score go into the submission file.
df_submisison[['comment_id', 'score']].to_csv("submission.csv", index=False)

df_submisison.head()

Unnamed: 0,comment_id,text,original_toxicity,original_severe_toxicity,original_obscene,original_threat,original_insult,original_identity_attack,unbiased_toxicity,unbiased_severe_toxicity,...,original-small_insult,original-small_identity_attack,unbiased-small_toxicity,unbiased-small_severe_toxicity,unbiased-small_obscene,unbiased-small_identity_attack,unbiased-small_insult,unbiased-small_threat,unbiased-small_sexual_explicit,score
0,114890,"""\n \n\nGjalexei, you asked about whether ther...",0.000581,0.000126,0.000186,0.000124,0.000171,0.000145,0.01576,1.6e-05,...,0.000141,0.000103,0.001927,3e-06,6.2e-05,5.4e-05,0.001348,2.1e-05,2.1e-05,197
1,732895,"Looks like be have an abuser , can you please ...",0.003561,8.3e-05,0.000253,9.1e-05,0.000342,0.00016,0.025412,4e-06,...,0.000251,0.00015,0.040602,6e-06,0.000226,0.001129,0.020714,0.001549,0.000331,817
2,1139051,I confess to having complete (and apparently b...,0.003551,9.2e-05,0.000293,0.000101,0.000279,0.00018,0.074255,8.3e-05,...,0.000439,0.000216,0.225546,0.000128,0.002621,0.002816,0.052784,0.000886,0.102682,2200
3,1434512,"""\n\nFreud's ideas are certainly much discusse...",0.002331,9.7e-05,0.000258,0.000108,0.000251,0.000178,0.047354,2.7e-05,...,0.001083,0.000297,0.232061,4e-06,0.000514,0.00018,0.214475,9.6e-05,9.9e-05,710
4,2084821,It is not just you. This is a laundry list of ...,0.349564,0.000476,0.03314,0.000432,0.03785,0.000623,0.971578,3.2e-05,...,0.058189,0.000985,0.986299,5.6e-05,0.001208,0.000562,0.976723,0.000397,0.000366,4022
