# Because the notebook is not allowed to have internet access, I had to add this (copied from the detoxify library):

In [None]:
import torch
import transformers

# Maps a model type name to the URL of its released checkpoint.
# load_checkpoint() hands this URL to torch.hub.load_state_dict_from_url();
# a later cell copies a file with the same name
# (toxic_original-c1212f89.ckpt) into /root/.cache/torch/hub/checkpoints,
# presumably so that torch.hub finds it there instead of downloading it.
MODEL_URLS = {
    "original": "https://github.com/unitaryai/detoxify/releases/download/v0.1-alpha/toxic_original-c1212f89.ckpt"
}

# Default value of Detoxify's `checkpoint` argument; None makes
# load_checkpoint() resolve the checkpoint through MODEL_URLS.
PRETRAINED_MODEL = None


def get_model_and_tokenizer(
    model_type,
    model_name,
    tokenizer_name,
    num_classes,
    state_dict,
    pretrained_path="../input/get-huggingface-models/bert-base-uncased/",
):
    """Build the classifier and its tokenizer from local files only.

    Args:
        model_type: Accepted so the checkpoint's architecture arguments can
            be unpacked straight into this function; not used in the body.
        model_name: Name of the model class to look up on ``transformers``.
        tokenizer_name: Name of the tokenizer class to look up on
            ``transformers``.
        num_classes: Forwarded to ``from_pretrained`` as ``num_labels``.
        state_dict: Weights forwarded to the model's ``from_pretrained``.
        pretrained_path: Local directory holding the Hugging Face model
            files (cloned via another online notebook). It is used for both
            the model and the tokenizer; the default is the directory this
            notebook has always used.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    model_class = getattr(transformers, model_name)
    model = model_class.from_pretrained(
        pretrained_model_name_or_path=pretrained_path,
        num_labels=num_classes,
        state_dict=state_dict,
        # The notebook has no internet access, so never try to reach the hub.
        local_files_only=True,
    )

    tokenizer_class = getattr(transformers, tokenizer_name)
    tokenizer = tokenizer_class.from_pretrained(
        pretrained_path,
        local_files_only=True,
        model_max_length=512,
    )

    return model, tokenizer


def load_checkpoint(model_type="original", checkpoint=None, device='cpu'):
    """Load a checkpoint and build the model, tokenizer and class names.

    Args:
        model_type: Key into ``MODEL_URLS``; only consulted when
            ``checkpoint`` is None.
        checkpoint: Optional path to a local checkpoint file. When None, the
            checkpoint is fetched through ``torch.hub`` from the URL
            registered for ``model_type``.
        device: Device the checkpoint tensors are mapped to while loading.

    Returns:
        A ``(model, tokenizer, class_names)`` tuple, where ``class_names``
        has the legacy label names replaced by their standardised forms.

    Raises:
        ValueError: If the loaded checkpoint lacks the ``"config"`` or
            ``"state_dict"`` entry.
    """
    if checkpoint is None:
        checkpoint_path = MODEL_URLS[model_type]
        loaded = torch.hub.load_state_dict_from_url(checkpoint_path, map_location=device)
    else:
        # Honour `device` here as well, so a checkpoint saved from a GPU can
        # still be opened on a CPU-only machine.
        loaded = torch.load(checkpoint, map_location=device)

    # Validate both sources: everything below indexes into these two entries.
    if "config" not in loaded or "state_dict" not in loaded:
        raise ValueError(
            "Checkpoint needs to contain the config it was trained "
            "with as well as the state dict"
        )

    class_names = loaded["config"]["dataset"]["args"]["classes"]
    # standardise class names between models
    change_names = {
        "toxic": "toxicity",
        "identity_hate": "identity_attack",
        "severe_toxic": "severe_toxicity",
    }
    class_names = [change_names.get(cl, cl) for cl in class_names]
    model, tokenizer = get_model_and_tokenizer(
        **loaded["config"]["arch"]["args"], state_dict=loaded["state_dict"]
    )

    return model, tokenizer, class_names


def load_model(model_type, checkpoint=None):
    """Return only the model produced by ``load_checkpoint``.

    Args:
        model_type: Forwarded to ``load_checkpoint`` when no checkpoint path
            is given.
        checkpoint: Optional checkpoint path; when given it is forwarded
            instead of ``model_type``.

    Returns:
        The model; the tokenizer and class names are discarded.
    """
    if checkpoint is None:
        loader_kwargs = {"model_type": model_type}
    else:
        loader_kwargs = {"checkpoint": checkpoint}
    model, _tokenizer, _class_names = load_checkpoint(**loader_kwargs)
    return model


class Detoxify:
    """Wrapper that loads a checkpoint once and scores text with it.

    Args:
        model_type: Passed through to ``load_checkpoint``.
        checkpoint: Passed through to ``load_checkpoint``; defaults to
            ``PRETRAINED_MODEL``.
        device: Device the model is moved to after loading.
    """

    def __init__(self, model_type="original", checkpoint=PRETRAINED_MODEL, device="cpu"):
        super(Detoxify, self).__init__()
        loaded_parts = load_checkpoint(
            model_type=model_type, checkpoint=checkpoint, device=device
        )
        self.model, self.tokenizer, self.class_names = loaded_parts
        self.device = device
        self.model.to(self.device)

    @torch.no_grad()
    def predict(self, text):
        """Score ``text`` for every class name of the loaded model.

        Args:
            text: A single string, or a collection of strings accepted by
                the tokenizer.

        Returns:
            A dict keyed by class name. For a single string each value is
            that class's score for the text; otherwise each value is a list
            with one score per input.
        """
        self.model.eval()
        encoded = self.tokenizer(
            text, return_tensors="pt", truncation=True, padding=True
        )
        encoded = encoded.to(self.model.device)
        logits = self.model(**encoded)[0]
        scores = torch.sigmoid(logits).cpu().detach().numpy()

        if isinstance(text, str):
            # Single input: report the first (only) row, one entry per class.
            return {
                name: scores[0][col]
                for col, name in enumerate(self.class_names)
            }
        # Several inputs: one list per class, ordered like the inputs.
        return {
            name: [row[col].tolist() for row in scores]
            for col, name in enumerate(self.class_names)
        }

In [None]:
# Install detoxify from the copy of its GitHub repo attached as a dataset, so
# that the dependencies it requires are satisfied; pip's standard output is
# discarded.
!pip install ../input/detoxifygithub18112021/ > /dev/null # include github repo of detoxify into the data

In [None]:
# load checkpoint of a required model
!mkdir /root/.cache/torch
!mkdir /root/.cache/torch/hub
!mkdir /root/.cache/torch/hub/checkpoints
!cp ../input/detoxifyoriginal/toxic_original-c1212f89.ckpt /root/.cache/torch/hub/checkpoints/toxic_original-c1212f89.ckpt

# Now, actual code begins

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

In [None]:
# test
# Use the GPU when one is attached, otherwise fall back to the CPU instead of
# failing on a hard-coded 'cuda' device.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = Detoxify('original', device=device)
model.predict('I hate you')['severe_toxicity']

In [None]:
# Read the three competition tables in one pass, in the order of the names
# on the left-hand side.
comments_to_score, sample_submission, validation_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/jigsaw-toxic-severity-rating/comments_to_score.csv",
        "../input/jigsaw-toxic-severity-rating/sample_submission.csv",
        "../input/jigsaw-toxic-severity-rating/validation_data.csv",
    )
)

In [None]:
# create new column "score" with the scores from the pre-trained model
def _severe_toxicity(comment):
    """Return the pre-trained model's 'severe_toxicity' score for one comment."""
    return model.predict(comment)['severe_toxicity']


comments_to_score['score'] = comments_to_score['text'].apply(_severe_toxicity).values

In [None]:
# Distribution of the predicted scores. `distplot` is deprecated in seaborn;
# this is the histogram-plus-KDE equivalent given in its migration guidance.
ax = sns.histplot(
    comments_to_score['score'], kde=True, stat="density", kde_kws=dict(cut=3)
)

In [None]:
# Write only the id and score columns to the submission file.
submission_columns = ['comment_id', 'score']
comments_to_score[submission_columns].to_csv("submission.csv", index=False)

# Validation

In [None]:
# Score both sides of every validation pair with the pre-trained model.
for text_column, score_column in (
    ('less_toxic', 'score_less'),
    ('more_toxic', 'score_more'),
):
    validation_data[score_column] = validation_data[text_column].apply(
        lambda x: model.predict(x)['severe_toxicity']
    ).values

In [None]:
# Average Agreement with Annotators -- score by organizers
# A pair counts as correct only when the "less toxic" comment received a
# strictly lower score than the "more toxic" one.
less_scores = np.array(validation_data['score_less'].values)
more_scores = np.array(validation_data['score_more'].values)
ranking_results = np.less(less_scores, more_scores)
ranking_results.mean()