In [None]:
import gc
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import sys
import numpy as np
import pandas as pd
import pickle

!cp -r ../input/detoxify-master detoxify
!pip install -q ./detoxify
!rm -rf ./detoxify
from detoxify import Detoxify

import datasets
import pytorch_lightning as pl
import torch
import transformers
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torch.utils.data.dataset import Dataset

# Worker processes used by the DataLoader that feeds the fine-tuned torch models.
NUM_WORKERS = 2
# Passed as ``gpus=`` to ``pl.Trainer`` when those models are run.
GPUS = 1
# Batch size of that same DataLoader.
BATCH_SIZE = 4

# Validation pairs; the ``less_toxic`` and ``more_toxic`` columns are read further down.
df_val = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")
# Comments to be scored; the ``text`` and ``comment_id`` columns are read further down.
df_sub = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")

class JigsawDataOriginal(Dataset):
    """Map-style dataset that yields ``(text, meta)`` pairs from a DataFrame.

    The frame is converted to a ``datasets.Dataset``. Each item exposes the
    row's ``comment_text`` and, when the row carries a ``score`` field, a
    one-element float tensor under ``meta["target"]``.
    """

    def __init__(self, df, train=True):
        """Wrap ``df``; ``train`` is stored on the instance but not read by this class."""
        self.train = train
        self.data = datasets.Dataset.from_pandas(df)

    def __len__(self):
        """Return the number of rows in the wrapped dataset."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(comment_text, meta)`` for the row at ``index``.

        ``meta`` is an empty dict when the row has no ``score`` entry.
        """
        row = self.data[index]
        comment = row["comment_text"]
        if 'score' in row:
            extras = {"target": torch.tensor([row["score"]], dtype=torch.float)}
        else:
            extras = {}
        return comment, extras

class ToxicClassifier(pl.LightningModule):
    """Lightning module bundling a single-logit sequence classifier with its tokenizer.

    The backbone and tokenizer are loaded from local directories under
    ``../input``. Raw text goes straight into ``forward``, which tokenizes it
    on the fly. Training and validation use binary cross-entropy on the logits;
    the test loop appends sigmoid scores to ``self.preds``.
    """

    # model_type -> (transformers model class name, transformers tokenizer class
    # name, local weights directory, extra tokenizer keyword arguments).
    # Class names are resolved lazily so only the requested backbone is touched.
    _BACKBONES = {
        'bert-base-uncased': (
            "BertForSequenceClassification",
            "BertTokenizer",
            "../input/bert-base-uncased",
            {},
        ),
        'roberta-base': (
            "RobertaForSequenceClassification",
            "RobertaTokenizer",
            "../input/roberta-base",
            {},
        ),
        'xlnet-base-cased': (
            "XLNetForSequenceClassification",
            "XLNetTokenizer",
            "../input/xlnet-base-cased",
            {"model_max_length": 512},
        ),
    }

    def __init__(self, model_type):
        """Load the backbone named by ``model_type``.

        Prints a message and calls ``sys.exit(1)`` when ``model_type`` is not
        one of the supported backbone names.
        """
        super().__init__()
        spec = self._BACKBONES.get(model_type)
        if spec is None:
            print('Unsupported model requested')
            sys.exit(1)
        model_cls_name, tokenizer_cls_name, weights_dir, tokenizer_kwargs = spec
        model_cls = getattr(transformers, model_cls_name)
        tokenizer_cls = getattr(transformers, tokenizer_cls_name)
        self.model = model_cls.from_pretrained(weights_dir, num_labels=1)
        self.tokenizer = tokenizer_cls.from_pretrained(weights_dir, **tokenizer_kwargs)
        # Scores accumulated by test_step across batches.
        self.preds = []

    def forward(self, x):
        """Tokenize ``x`` and return the first element of the model output."""
        encoded = self.tokenizer(x, return_tensors="pt", truncation=True, padding=True)
        encoded = encoded.to(self.model.device)
        return self.model(**encoded)[0]

    def _bce_loss(self, batch):
        """Return binary cross-entropy between the logits and ``meta["target"]``."""
        texts, meta = batch
        logits = self.forward(texts)
        target = meta["target"].to(logits.device).float()
        return F.binary_cross_entropy_with_logits(logits, target)

    def training_step(self, batch, batch_idx):
        """Compute and log ``train_loss`` for one batch."""
        batch_loss = self._bce_loss(batch)
        self.log("train_loss", batch_loss)
        return {"loss": batch_loss}

    def validation_step(self, batch, batch_idx):
        """Compute and log ``val_loss`` for one batch."""
        batch_loss = self._bce_loss(batch)
        self.log("val_loss", batch_loss)
        return {"loss": batch_loss}

    def test_step(self, batch, batch_idx):
        """Append the sigmoid of each logit in the batch to ``self.preds``."""
        texts, _ = batch
        logits = self.forward(texts)
        scores = torch.sigmoid(logits).detach().cpu().numpy().reshape(-1)
        self.preds.extend(scores.tolist())

    def configure_optimizers(self):
        """Return an AMSGrad Adam optimizer over all parameters."""
        optimizer = torch.optim.Adam(
            self.parameters(),
            lr=3e-5,
            weight_decay=3e-6,
            amsgrad=True,
        )
        return optimizer

# Detoxify checkpoint path -> (Detoxify model type, local Hugging Face
# directory, extra keyword arguments for the tokenizer).
_DETOXIFY_CHECKPOINTS = {
    '../input/detoxify-models/toxic_original-c1212f89.ckpt':
        ('original', '../input/bert-base-uncased', {'use_fast': False}),
    '../input/detoxify-models/toxic_debiased-c7548aa0.ckpt':
        ('unbiased', '../input/roberta-base', {'use_fast': False}),
    '../input/detoxify-models/multilingual_debiased-0b549669.ckpt':
        ('multilingual', '../input/xlm-roberta-base', {}),
}

# Backbone names recognised inside a torch checkpoint path, tested in this order.
_TORCH_BACKBONES = ('bert-base-uncased', 'roberta-base', 'xlnet-base-cased')


def _predict_linear(path, texts):
    """Unpickle the pipeline stored at ``path`` and return its predictions for ``texts``."""
    # NOTE(review): pickle.load can execute arbitrary code contained in the
    # file; only point this at model files from a trusted source.
    with open(path, 'rb') as model_file:  # close the handle once loaded
        pipeline = pickle.load(model_file)
    return pipeline.predict(texts)


def _detoxify_score(model_type, res):
    """Collapse one Detoxify result dict into a single score for ``model_type``."""
    if model_type in ('original', 'multilingual'):
        return res['toxicity'] + res['severe_toxicity']
    # Remaining type ('unbiased'): sum every listed head except plain 'toxicity'.
    return (res['severe_toxicity'] + res['obscene'] + res['identity_attack']
            + res['insult'] + res['threat'] + res['sexual_explicit'])


def _predict_detoxify(path, texts):
    """Score ``texts`` one at a time with the Detoxify checkpoint at ``path``."""
    spec = _DETOXIFY_CHECKPOINTS.get(path)
    if spec is None:
        print('Invalid detoxify model type requested')
        sys.exit(1)
    model_type, hf_dir, tokenizer_kwargs = spec
    model = Detoxify(model_type, checkpoint=path, device='cuda',
                     huggingface_config_path=hf_dir)
    # Replace the tokenizer with one loaded from the local directory only.
    model.tokenizer = transformers.AutoTokenizer.from_pretrained(
        hf_dir, local_files_only=True, **tokenizer_kwargs)
    preds = [_detoxify_score(model_type, model.predict(text)) for text in texts]
    return np.array(preds)


def _predict_torch(path, texts):
    """Score ``texts`` with the fine-tuned ``ToxicClassifier`` state dict at ``path``."""
    df = pd.DataFrame({'comment_text': texts})
    dataset = JigsawDataOriginal(df, train=False)
    dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, shuffle=False)

    # First backbone name that occurs in the path decides the architecture.
    backbone = next((name for name in _TORCH_BACKBONES if name in path), None)
    if backbone is None:
        print('Unsupported torch model requested')
        sys.exit(1)
    model = ToxicClassifier(backbone)

    model.load_state_dict(torch.load(path))
    trainer = pl.Trainer(gpus=GPUS)

    # test_step appends its scores to model.preds as a side effect.
    trainer.test(model, dataloaders=dataloader)
    return np.array(model.preds)


def load_and_predict(path, texts):
    """Load the model stored at ``path`` and return one score per entry of ``texts``.

    The model family is chosen from substrings of ``path``, tested in this order:

    * ``'linear'`` - a pickled object with a ``predict`` method;
    * ``'detoxify-models'`` - one of the known Detoxify checkpoints;
    * ``'torch'`` - a ``ToxicClassifier`` state dict.

    Args:
        path: Location of the serialized model.
        texts: Iterable of comment strings to score.

    Returns:
        Whatever the pickled pipeline's ``predict`` returns for the linear
        family, otherwise a NumPy array of scores.

    Raises:
        SystemExit: With code 1, after printing a message, when ``path``
            matches no supported family or names an unknown checkpoint/backbone.
    """
    if 'linear' in path:
        return _predict_linear(path, texts)
    if 'detoxify-models' in path:
        return _predict_detoxify(path, texts)
    if 'torch' in path:
        return _predict_torch(path, texts)
    print("invalid model format in arguments")
    sys.exit(1)

In [None]:
# 0.7117709578849475 = (1) [0.24 0.27 0.15 0.26 0.08] model_linear_ruddit.csv model_linear_unintended.csv
# model_detoxify_unbiased.csv model_detoxify_multilingual.csv model_torch_xlnet_unbiased.csv

def get_all(path, check_val):
    """Run the model at ``path`` on the validation pairs and the submission texts.

    Args:
        path: Model location, forwarded to ``load_and_predict``.
        check_val: When falsy, the validation columns are not scored and the
            first two returned items are ``None``.

    Returns:
        A tuple ``(less_toxic_preds, more_toxic_preds, submission_preds)``.
    """
    print(f'getting predictions from {path}')
    less_toxic_preds = None
    more_toxic_preds = None
    if check_val:
        # The model is loaded anew by each load_and_predict call.
        less_toxic_preds = load_and_predict(path, df_val['less_toxic'])
        more_toxic_preds = load_and_predict(path, df_val['more_toxic'])
    submission_preds = load_and_predict(path, df_sub['text'])
    return less_toxic_preds, more_toxic_preds, submission_preds

# Blend weights, one per model, in the same order as model_paths below.
weights = np.array([0.24, 0.27, 0.15, 0.26, 0.08])

# Set to True to also score the validation pairs and print the pairwise accuracy.
check_val = False

model_paths = [
    '../input/jigsaw-models-20220202/model_linear_ruddit.pkl',
    '../input/jigsaw-models-20220202/model_linear_unintended.pkl',
    '../input/detoxify-models/toxic_debiased-c7548aa0.ckpt',
    '../input/detoxify-models/multilingual_debiased-0b549669.ckpt',
    '../input/jigsaw-models-20220202/xlnet-base-cased/model_torch_xlnet_unbiased.ckpt',
]

# Run every model in turn, echoing its three outputs as they become available.
ensemble_outputs = []
for model_path in model_paths:
    model_outputs = get_all(model_path, check_val)
    for predictions in model_outputs:
        print(predictions)
    ensemble_outputs.append(model_outputs)

# Keep the per-model names available at module level.
(
    (p1_lt, p1_mt, p1_sub),
    (p2_lt, p2_mt, p2_sub),
    (p3_lt, p3_mt, p3_sub),
    (p4_lt, p4_mt, p4_sub),
    (p5_lt, p5_mt, p5_sub),
) = ensemble_outputs

if check_val:
    # A pair counts as correct when the blended less-toxic score is strictly lower.
    lt_stack = np.stack([p1_lt, p2_lt, p3_lt, p4_lt, p5_lt])
    mt_stack = np.stack([p1_mt, p2_mt, p3_mt, p4_mt, p5_mt])
    lt = np.dot(weights, lt_stack)
    mt = np.dot(weights, mt_stack)
    print(f'Validation Accuracy is {(lt < mt).mean()}')

sub_stack = np.stack([p1_sub, p2_sub, p3_sub, p4_sub, p5_sub])
df_sub['score'] = np.dot(weights, sub_stack)
print(df_sub['score'])
# Replace the blended scores by their rank; ties are broken by row order.
df_sub['score'] = df_sub['score'].rank(method='first')
submission_columns = ['comment_id', 'score']
df_sub[submission_columns].to_csv("submission.csv", index=False)
print(df_sub)