In [None]:
# Create the cache directories that the model files are copied into below.
!mkdir -p  /root/.cache/torch/hub/checkpoints
!mkdir -p  /root/.cache/huggingface/transformers
# Copy the detoxify package source out of the input dataset, install it from
# that local copy, then remove the copy again.
!mkdir -p  ./detoxify
!cp -r ../input/detoxify-sourcemodels/detoxify .
!pip install -q ./detoxify
!rm -r ./detoxify
# Copy the checkpoint and transformers files from the dataset into the caches.
!cp -r ../input/detoxify-sourcemodels/torch/hub/checkpoints /root/.cache/torch/hub
!cp -r ../input/detoxify-sourcemodels/huggingface/transformers /root/.cache/huggingface

import os
# Set here, i.e. before the later cell that imports detoxify is executed.
# NOTE(review): presumably makes transformers rely on the local cache only
# (no downloads) -- confirm against the transformers documentation.
os.environ["TRANSFORMERS_OFFLINE"] = "1"

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt

from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
import lightgbm as lgb
import xgboost as xgb

from detoxify import Detoxify

from scipy.stats import rankdata

import pickle

In [None]:
def chunks(lst, n):
    """Lazily split ``lst`` into consecutive slices holding at most ``n`` items.

    Slices are produced in order; the last one is shorter whenever
    ``len(lst)`` is not a multiple of ``n``.
    """
    total = len(lst)
    yield from (lst[start:start + n] for start in range(0, total, n))


def get_pretrained_detoxify_model(model_type, device):
    """Build a ``Detoxify`` instance for ``model_type`` placed on ``device``."""
    return Detoxify(model_type=model_type, device=device)


def detoxify_predict (model_type, texts, batch_size, device):
    """Score ``texts`` with a pretrained Detoxify model, one batch at a time.

    Parameters
    ----------
    model_type : str
        Name handed to ``get_pretrained_detoxify_model``; it is also used as
        the prefix of every output column.
    texts : sequence
        Items to score. They are sliced into batches with ``chunks``.
    batch_size : int
        Maximum number of items passed to ``model.predict`` per call.
    device : str
        Device name handed to ``get_pretrained_detoxify_model``.

    Returns
    -------
    pandas.DataFrame
        The per-batch ``model.predict`` outputs stacked in input order with a
        fresh ``0..n-1`` index. Columns are renamed to
        ``f"{model_type}_{original_column}"``. When ``texts`` is empty an
        empty DataFrame (no rows, no columns) is returned.
    """
    model = get_pretrained_detoxify_model(model_type, device)

    # Build every batch frame first and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and copied the accumulated frame on every call.
    frames = [pd.DataFrame(model.predict(chunk)) for chunk in chunks(texts, batch_size)]

    # No batches means no predictions; pd.concat would raise on an empty list.
    if not frames:
        return pd.DataFrame()

    results = pd.concat(frames, ignore_index=True)
    results.columns = [f"{model_type}_{c}" for c in results.columns]

    return results

### Feature extraction

In [None]:
def tf_idf_feature (texts, feature_name, path, folds=5):
    """Average column 1 of ``predict_proba`` over per-fold pickled models.

    For every fold ``k`` in ``range(folds)`` two pickles are read from
    ``path``: ``{feature_name}_{k}.pkl`` (an object exposing
    ``predict_proba``) and ``{feature_name}_vec_{k}.pkl`` (an object exposing
    ``transform``). ``texts`` is transformed with the fold's vectorizer and
    scored with the fold's model.

    Parameters
    ----------
    texts : sequence
        Inputs passed unchanged to each vectorizer's ``transform``.
    feature_name : str
        Stem of the pickle file names.
    path : str
        Directory containing the pickles.
    folds : int, default 5
        Number of fold files to load and average over.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(texts),)`` holding the mean over folds of
        ``predict_proba(X)[:, 1]`` (presumably the positive-class
        probability -- confirm against how the models were trained).
    """
    pred = np.zeros((len(texts),))
    for fold in range(folds):
        # SECURITY: pickle.load can execute arbitrary code; only point `path`
        # at files from a trusted source.
        # Context managers close each file as soon as it has been read; the
        # handles were previously left to the garbage collector.
        with open(f"{path}/{feature_name}_{fold}.pkl", 'rb') as model_file:
            model = pickle.load(model_file)
        with open(f"{path}/{feature_name}_vec_{fold}.pkl", 'rb') as vec_file:
            vec = pickle.load(vec_file)

        X = vec.transform(texts)
        pred += model.predict_proba(X)[:, 1]

    return pred / folds

In [None]:
# Load the comments to be scored and pull their raw text out as a plain list.
df_submisison = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")
texts = list(df_submisison["text"].values)

# Detoxify features: run every pretrained variant over all texts, the two
# "small" variants first, each on the GPU in batches of 64.
original_small, unbiased_small, original, unbiased, multilingual = [
    detoxify_predict(variant, texts, device="cuda", batch_size=64)
    for variant in ("original-small", "unbiased-small", "original", "unbiased", "multilingual")
]

# Place the per-model score columns next to the original comment columns.
df_submisison = pd.concat(
    [df_submisison, original, unbiased, multilingual, original_small, unbiased_small],
    axis=1,
)

# Disabled: the TF-IDF features below sit inside a string literal, so they
# are never executed.
"""
# tf-idf features (jc)

features = ["toxic","severe_toxic","obscene","threat","insult","identity_hate"]
path = "../input/jrsotc-jc-tf-idf-fe/"
for feat in features:
    feat_name = f"jc_tfidf_{feat}"
    print(feat_name)
    df_submisison[feat_name] = tf_idf_feature(texts, feat_name, path=path, folds=5)
"""

df_submisison.head()


### Predict

In [None]:
def predict (df, model_path, model_name, cols, folds=5):
    """Sum the predictions of per-fold pickled models over the rows of ``df``.

    For every fold ``k`` in ``range(folds)`` the file
    ``{model_path}/{model_name}_{k}.pkl`` is unpickled and its ``predict``
    method is called on ``df[cols].values``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns.
    model_path : str
        Directory containing the pickled models.
    model_name : str
        Stem of the pickle file names.
    cols : list of str
        Columns of ``df`` used as the feature matrix, in this order.
    folds : int, default 5
        Number of fold files to load.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(df.shape[0],)`` with the SUM (not the mean) of the
        fold predictions. The scale therefore grows with ``folds``; the
        ordering of rows is the same as for the mean.
    """
    X = df[cols].values
    pred = np.zeros((df.shape[0]))

    for fold in range(folds):
        # SECURITY: pickle.load can execute arbitrary code; only load model
        # files from a trusted source.
        # The context manager closes the file right after it has been read;
        # the handle was previously left to the garbage collector.
        with open(f"{model_path}/{model_name}_{fold}.pkl", 'rb') as model_file:
            model = pickle.load(model_file)

        pred += model.predict(X)

    return pred

In [None]:
# Detoxify score columns that take part in the rank ensemble.
cols = [
    'original_toxicity',
    'original_severe_toxicity',
    'original_identity_attack',
    'unbiased_toxicity',
    'unbiased_identity_attack',
    'unbiased_insult',
    'multilingual_toxicity',
    'multilingual_sexual_explicit',
    'original-small_toxicity',
    'unbiased-small_severe_toxicity',
]

# Rank each selected column on its own and accumulate the ranks per row.
pred = np.zeros(len(df_submisison))
for col in cols:
    pred += rankdata(df_submisison[col].to_numpy(), method='ordinal')

# 'rf_pred' stores the summed ranks; 'score' is the ordinal rank of that sum.
df_submisison['rf_pred'] = pred
df_submisison['score'] = rankdata(pred, method='ordinal')

# Only the comment id and the final score go into the submission file.
df_submisison[['comment_id', 'score']].to_csv("submission.csv", index=False)

df_submisison.head()