In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from detoxify import Detoxify

from scipy.stats import rankdata

import pickle

### Feature extraction

In [3]:
def chunks(lst, n):
    """Lazily split ``lst`` into consecutive slices of at most ``n`` items each."""
    # Start offsets of every slice: 0, n, 2n, ... below len(lst).
    starts = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in starts)


def get_pretrained_detoxify_model(model_type, device):
    """Build a ``Detoxify`` instance for ``model_type`` on ``device``.

    Both arguments are forwarded unchanged, as keywords, to the ``Detoxify``
    constructor; the constructed object is returned as-is.
    """
    return Detoxify(model_type=model_type, device=device)


def detoxify_predict(model_type, texts, batch_size, device):
    """Score ``texts`` with a Detoxify model, ``batch_size`` texts at a time.

    Parameters
    ----------
    model_type : str
        Model name passed to ``get_pretrained_detoxify_model``; it is also
        used as the prefix of every output column.
    texts : sequence
        Inputs handed to ``model.predict`` in consecutive slices.
    batch_size : int
        Maximum number of texts per ``model.predict`` call.
    device : str
        Device passed through to ``get_pretrained_detoxify_model``.

    Returns
    -------
    pandas.DataFrame
        The per-batch prediction frames stacked in input order with a fresh
        ``0..n-1`` index. Every column of the frame built from
        ``model.predict``'s output is renamed to ``f"{model_type}_{column}"``.
        An empty frame (no rows, no columns) is returned for empty ``texts``.
    """
    # Nothing to score: skip loading the model. Previously this case crashed
    # because ``results`` was never bound when the batch loop did not run.
    if len(texts) == 0:
        return pd.DataFrame()

    model = get_pretrained_detoxify_model(model_type, device)

    # Build one frame per batch and concatenate once. ``DataFrame.append`` was
    # removed in pandas 2.0 and re-copied all accumulated rows on every call.
    frames = [pd.DataFrame(model.predict(chunk)) for chunk in chunks(texts, batch_size)]
    results = pd.concat(frames, ignore_index=True)

    results.columns = [f"{model_type}_{c}" for c in results.columns]

    return results

In [4]:
def tf_idf_feature(texts, feature_name, path, folds=5):
    """Average column 1 of ``predict_proba`` over per-fold pickled models.

    For each fold ``k`` in ``range(folds)`` this loads
    ``{path}/{feature_name}_{k}.pkl`` (an object exposing ``predict_proba``)
    and ``{path}/{feature_name}_vec_{k}.pkl`` (an object exposing
    ``transform``), transforms ``texts`` and accumulates column 1 of the
    predicted probabilities (presumably the positive class -- confirm against
    the training code).

    Parameters
    ----------
    texts : sequence
        Raw inputs passed to each fold's ``transform``.
    feature_name : str
        Stem of the pickle file names.
    path : str
        Directory containing the pickled models and vectorizers.
    folds : int, default 5
        Number of fold artifacts to load and average.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(texts),)`` holding the mean over folds.

    Raises
    ------
    ValueError
        If ``folds`` is smaller than 1 (the average would otherwise be a
        silent division by zero yielding NaNs).
    """
    if folds < 1:
        raise ValueError(f"folds must be >= 1, got {folds}")

    pred = np.zeros((len(texts),))
    for fold in range(folds):
        # NOTE(security): pickle.load can execute arbitrary code; only point
        # ``path`` at artifacts from a trusted source.
        # The context managers close each file handle as soon as it is read
        # instead of leaving it open until garbage collection.
        with open(f"{path}/{feature_name}_{fold}.pkl", 'rb') as model_file:
            model = pickle.load(model_file)
        with open(f"{path}/{feature_name}_vec_{fold}.pkl", 'rb') as vec_file:
            vec = pickle.load(vec_file)
        X = vec.transform(texts)
        pred += model.predict_proba(X)[:, 1]

    return pred / folds

### Load Test Data

In [5]:
# Comments that need a severity score; the raw text feeds every feature below.
df_submisison = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")
texts = list(df_submisison["text"].values)

# Detoxify features

# Every Detoxify model is run with the same device and batch size.
detoxify_options = {"device": "cuda", "batch_size": 64}

original_small = detoxify_predict("original-small", texts, **detoxify_options)
unbiased_small = detoxify_predict("unbiased-small", texts, **detoxify_options)
original = detoxify_predict("original", texts, **detoxify_options)
unbiased = detoxify_predict("unbiased", texts, **detoxify_options)
multilingual = detoxify_predict("multilingual", texts, **detoxify_options)

# Attach the Detoxify columns to the right of the raw submission columns.
detoxify_frames = [original, unbiased, multilingual, original_small, unbiased_small]
df_submisison = pd.concat([df_submisison, *detoxify_frames], axis=1)

# tf-idf features (jc)

path = "../models/"
features = ["toxic","severe_toxic","obscene","threat","insult","identity_hate"]
for feat in features:
    feat_name = f"jc_tfidf_{feat}"
    print(feat_name)  # progress marker: one line per label
    # Fold-averaged score of the 5 pickled models stored under `path`.
    df_submisison[feat_name] = tf_idf_feature(texts, feat_name, path=path, folds=5)

df_submisison.head()


jc_tfidf_toxic
jc_tfidf_severe_toxic
jc_tfidf_obscene
jc_tfidf_threat
jc_tfidf_insult
jc_tfidf_identity_hate


Unnamed: 0,comment_id,text,original_toxicity,original_severe_toxicity,original_obscene,original_threat,original_insult,original_identity_attack,unbiased_toxicity,unbiased_severe_toxicity,...,unbiased-small_identity_attack,unbiased-small_insult,unbiased-small_threat,unbiased-small_sexual_explicit,jc_tfidf_toxic,jc_tfidf_severe_toxic,jc_tfidf_obscene,jc_tfidf_threat,jc_tfidf_insult,jc_tfidf_identity_hate
0,114890,"""\n \n\nGjalexei, you asked about whether ther...",0.000581,0.000126,0.000186,0.000124,0.000171,0.000145,0.01576,1.6e-05,...,5.4e-05,0.001348,2.1e-05,2.1e-05,0.026839,0.038405,0.051513,0.082425,0.0395,0.05015
1,732895,"Looks like be have an abuser , can you please ...",0.003561,8.3e-05,0.000253,9.1e-05,0.000342,0.00016,0.025412,4e-06,...,0.001129,0.020714,0.001549,0.000331,0.047661,0.092012,0.059723,0.138585,0.08932,0.111
2,1139051,I confess to having complete (and apparently b...,0.003551,9.2e-05,0.000293,0.000101,0.000279,0.00018,0.074255,8.3e-05,...,0.002816,0.052784,0.000886,0.102682,0.097493,0.03862,0.068228,0.073889,0.063623,0.070715
3,1434512,"""\n\nFreud's ideas are certainly much discusse...",0.002331,9.7e-05,0.000258,0.000108,0.000251,0.000178,0.047354,2.7e-05,...,0.00018,0.214475,9.6e-05,9.9e-05,0.072167,0.017072,0.038173,0.046741,0.032561,0.045894
4,2084821,It is not just you. This is a laundry list of ...,0.349564,0.000476,0.03314,0.000432,0.03785,0.000623,0.971578,3.2e-05,...,0.000562,0.976723,0.000397,0.000366,0.756514,0.164859,0.432535,0.199114,0.486088,0.297186


### Predict

In [6]:
# Feature columns whose per-comment ranks are blended into the final score.
cols = [
    'original_toxicity', 'original_severe_toxicity',
    'unbiased_toxicity', 'unbiased_severe_toxicity', 'unbiased_identity_attack', 'unbiased_insult',
    'multilingual_toxicity', 'multilingual_sexual_explicit',
    'original-small_toxicity',
    'unbiased-small_severe_toxicity', 'unbiased-small_obscene',
    'jc_tfidf_toxic',
]

# Rank each selected column independently (one rank column per feature),
# then add the ranks up per comment. The sum is kept as float.
rank_matrix = np.column_stack(
    [rankdata(df_submisison[col].values, method='ordinal') for col in cols]
)
pred = rank_matrix.sum(axis=1).astype(float)

# Re-rank the summed ranks to obtain the submitted score.
df_submisison['score'] = rankdata(pred, method='ordinal')

# Only the id and the score go into the submission file.
submission_columns = ['comment_id', 'score']
df_submisison[submission_columns].to_csv("submission.csv", index=False)

df_submisison.head()

Unnamed: 0,comment_id,text,original_toxicity,original_severe_toxicity,original_obscene,original_threat,original_insult,original_identity_attack,unbiased_toxicity,unbiased_severe_toxicity,...,unbiased-small_insult,unbiased-small_threat,unbiased-small_sexual_explicit,jc_tfidf_toxic,jc_tfidf_severe_toxic,jc_tfidf_obscene,jc_tfidf_threat,jc_tfidf_insult,jc_tfidf_identity_hate,score
0,114890,"""\n \n\nGjalexei, you asked about whether ther...",0.000581,0.000126,0.000186,0.000124,0.000171,0.000145,0.01576,1.6e-05,...,0.001348,2.1e-05,2.1e-05,0.026839,0.038405,0.051513,0.082425,0.0395,0.05015,132
1,732895,"Looks like be have an abuser , can you please ...",0.003561,8.3e-05,0.000253,9.1e-05,0.000342,0.00016,0.025412,4e-06,...,0.020714,0.001549,0.000331,0.047661,0.092012,0.059723,0.138585,0.08932,0.111,655
2,1139051,I confess to having complete (and apparently b...,0.003551,9.2e-05,0.000293,0.000101,0.000279,0.00018,0.074255,8.3e-05,...,0.052784,0.000886,0.102682,0.097493,0.03862,0.068228,0.073889,0.063623,0.070715,2375
3,1434512,"""\n\nFreud's ideas are certainly much discusse...",0.002331,9.7e-05,0.000258,0.000108,0.000251,0.000178,0.047354,2.7e-05,...,0.214475,9.6e-05,9.9e-05,0.072167,0.017072,0.038173,0.046741,0.032561,0.045894,732
4,2084821,It is not just you. This is a laundry list of ...,0.349564,0.000476,0.03314,0.000432,0.03785,0.000623,0.971578,3.2e-05,...,0.976723,0.000397,0.000366,0.756514,0.164859,0.432535,0.199114,0.486088,0.297186,4087
