In [None]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "true"

from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
import torch

from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [None]:
# Pick the compute device: the first CUDA GPU when torch can see one, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Checkpoint location; both the tokenizer and the model are loaded from this path.
MODEL_NAME = '../input/huggingface-toxic-bert/toxic-bert'

In [None]:
# Load the tokenizer stored at the checkpoint path configured in MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=MODEL_NAME,
)

In [None]:
# Load the sequence-classification model from the checkpoint, then move it onto `device`.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
model = model.to(device)

In [None]:
# Read the comments to be scored; later cells use its `text` and `comment_id` columns.
comments_to_score = pd.read_csv(
    filepath_or_buffer='../input/jigsaw-toxic-severity-rating/comments_to_score.csv',
)
comments_to_score

In [None]:
from datasets import Dataset

# Wrap the DataFrame in a Hugging Face Dataset so later cells can process it with `map`.
ds = Dataset.from_pandas(df=comments_to_score)
ds

In [None]:
def tokenize(sequences):
    """Encode the ``'text'`` entry of a batch with the notebook-level ``tokenizer``.

    Args:
        sequences: Mapping that contains a ``'text'`` entry (a list of strings
            when invoked through ``Dataset.map(..., batched=True)``).

    Returns:
        Whatever the tokenizer returns for those texts, requested with
        ``padding=True`` and ``truncation=True``.
    """
    texts = sequences['text']
    encoding = tokenizer(texts, truncation=True, padding=True)
    return encoding

In [None]:
# Tokenize every comment in batches; the tokenizer outputs are added as new columns.
ds = ds.map(tokenize, batched=True)

# Expose as torch tensors exactly the inputs this tokenizer's model accepts.
# The list is derived from `tokenizer.model_input_names` (the same filter `predict` uses)
# instead of hard-coding "token_type_ids", which not every tokenizer produces.
model_input_columns = [
    name for name in tokenizer.model_input_names if name in ds.column_names
]
ds.set_format("torch", columns=model_input_columns)
ds

In [None]:
def predict(batch):
    """Run the model on one batch and return per-class sigmoid probabilities.

    Args:
        batch: Mapping of column name to tensor. Only entries whose name is in
            ``tokenizer.model_input_names`` are forwarded to the model; each
            of those is moved to ``device`` first.

    Returns:
        ``{'proba': ndarray}`` holding the sigmoid of the model's ``'logits'``
        output, copied back to the CPU as a NumPy array.
    """
    accepted_names = tokenizer.model_input_names
    inputs = {}
    for name, tensor in batch.items():
        if name in accepted_names:
            inputs[name] = tensor.to(device)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        logits = model(**inputs)['logits']
        probabilities = logits.sigmoid()

    return {'proba': probabilities.cpu().detach().numpy()}

In [None]:
# Score the whole dataset, feeding `predict` 8 rows at a time; its output adds a `proba` column.
ds = ds.map(
    predict,
    batched=True,
    batch_size=8,
)

In [None]:
# Class-index -> label-name mapping taken from the model config (displayed for inspection).
model_config = model.config
id2label = model_config.id2label
id2label

In [None]:
# Per-label weights used to combine class probabilities into one score.
# Copied from: https://www.kaggle.com/vitaleey/tfidf-ridge
label2score = {
    'obscene': 0.16,
    'toxic': 0.32,
    'threat': 1.5,
    'insult': 0.64,
    'severe_toxic': 1.5,
    'identity_hate': 1.5,
}

In [None]:
# Row vector of per-class weights, shape (1, num_labels), so it broadcasts across rows of `proba`.
# Class ids are walked in sorted order so that column i always carries the weight of
# class i, regardless of the order in which the config dict happens to list them.
# (The variable keeps its original spelling because a later cell reads it by this name.)
weigths = torch.tensor([[label2score[id2label[idx]] for idx in sorted(id2label)]])
weigths


In [None]:
# Weight each class probability, then sum across classes: one score per comment.
weighted_proba = ds['proba'] * weigths
score = torch.sum(weighted_proba, dim=-1)

In [None]:
# Build the submission table: one score per comment id.
submission = pd.DataFrame({
    'comment_id': ds['comment_id'],
    # Give pandas a plain NumPy array instead of a torch tensor so the column gets a
    # numeric dtype and the CSV holds bare floats, however the installed pandas
    # version would otherwise coerce tensor objects.
    'score': np.asarray(score),
})
submission.to_csv('submission.csv', index=False)
submission