# ☣️ Jigsaw - HuggingFace Hub Baselines

In this notebook I explore, without any fine-tuning, several toxicity models from the Hugging Face Hub.

I have uploaded them to Kaggle as datasets:

* [toxic-bert](https://www.kaggle.com/julian3833/toxic-bert)
* [roberta-base-toxicity](https://www.kaggle.com/julian3833/roberta-base-toxicity)
* [roberta-toxicity-classifier](https://www.kaggle.com/julian3833/roberta-toxicity-classifier)





|Version | Model | Validation (first 5000 samples) | LB |
|---| ---   | ---: | --- |
|[V1](https://www.kaggle.com/julian3833/jigsaw-huggingface-hub-baselines?scriptVersionId=79545636) | [toxic-bert](https://www.kaggle.com/julian3833/toxic-bert) | `0.71` | `0.758` |
|[V2](https://www.kaggle.com/julian3833/jigsaw-huggingface-hub-baselines?scriptVersionId=79547125) | [roberta-base-toxicity](https://www.kaggle.com/julian3833/roberta-base-toxicity) | `0.66` | `0.751` |
|[V3](https://www.kaggle.com/julian3833/jigsaw-huggingface-hub-baselines?scriptVersionId=79547879) | [roberta-toxicity-classifier](https://www.kaggle.com/julian3833/roberta-toxicity-classifier) | `0.69` | `0.768` |
|[V4](https://www.kaggle.com/julian3833/jigsaw-huggingface-hub-baselines) | Ensemble of the previous 3 | `--` | `0.782` |


# Please, _DO_ upvote if you find this useful or interesting!

# Imports

In [None]:
import os; os.environ['TOKENIZERS_PARALLELISM'] = 'false'
import torch
import pandas as pd
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Test and Validation Dataset

In [None]:
class Dataset:
    """
    Map-style dataset over a sequence of comments (one comment per item),
    as used for comments_to_score.csv (the submission file).

    Each item is tokenized on access and returned as a dict holding the
    "input_ids" and "attention_mask" tensors (dtype torch.long).
    """
    def __init__(self, text, tokenizer, max_len):
        self.text, self.tokenizer, self.max_len = text, tokenizer, max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, item):
        # str() guards against non-string entries in the text sequence.
        encoded = self.tokenizer(
            str(self.text[item]),
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
        )

        # Only these two tokenizer outputs are forwarded, in this order.
        return {
            field: torch.tensor(encoded[field], dtype=torch.long)
            for field in ("input_ids", "attention_mask")
        }
    
    
class ValidationDataset:
    """
    Map-style dataset over validation_data.csv: every item holds the tokenized
    `less_toxic` and `more_toxic` comments of one row.

    Inspired by: https://www.kaggle.com/debarshichanda/pytorch-w-b-jigsaw-starter
    """
    def __init__(self, df, tokenizer, max_len):
        self.df, self.tokenizer, self.max_len = df, tokenizer, max_len

    def __len__(self):
        return len(self.df)

    def tokenize(self, text):
        """Tokenize one comment, padded/truncated to `max_len` tokens."""
        return self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
        )

    @staticmethod
    def _long(values):
        """Wrap a list of ints into a torch.long tensor."""
        return torch.tensor(values, dtype=torch.long)

    def __getitem__(self, i):
        # Positional (iloc) lookup, so the frame's index labels do not matter.
        more_text = self.df['more_toxic'].iloc[i]
        less_text = self.df['less_toxic'].iloc[i]

        less_enc, more_enc = self.tokenize(less_text), self.tokenize(more_text)

        return {
            "less_input_ids": self._long(less_enc["input_ids"]),
            "less_attention_mask": self._long(less_enc["attention_mask"]),
            "more_input_ids": self._long(more_enc["input_ids"]),
            "more_attention_mask": self._long(more_enc["attention_mask"]),
        }

# Validation


In [None]:
def validate(model_path, max_len, is_multioutput, validation_size=5000):
    """
    Measure pairwise ranking accuracy of a hub model on validation_data.csv.

    Both comments of every validation pair are scored by the model; a pair is
    a hit when the `more_toxic` comment gets a strictly higher score than the
    `less_toxic` one.

    Args:
        model_path: Location passed to `from_pretrained` for both the model
            and its tokenizer.
        max_len: Token length each comment is padded/truncated to.
        is_multioutput: If True, a comment's score is the sum of all of its
            logits; otherwise it is the logit at index 1.
        validation_size: Number of rows read from validation_data.csv
            (forwarded to `pd.read_csv(nrows=...)`; `None` reads every row).
            Defaults to 5000, the sample size quoted in the results table.

    Returns:
        The fraction of pairs ranked correctly (also printed).
    """
    # Fall back to CPU so the function still runs without a GPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    model.to(device)
    model.eval()

    # The row limit is an explicit parameter rather than an undefined global.
    df = pd.read_csv(
        "../input/jigsaw-toxic-severity-rating/validation_data.csv",
        nrows=validation_size,
    )

    dataset = ValidationDataset(df=df, tokenizer=tokenizer, max_len=max_len)
    data_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=64,
        num_workers=2,
        pin_memory=(device == "cuda"),  # pinning only helps host->GPU copies
        shuffle=False,
    )

    def score(logits):
        # One scalar per comment: sum over the label logits for multi-output
        # models, or the index-1 logit for two-class classifiers.
        return logits.sum(dim=1) if is_multioutput else logits[:, 1]

    n_samples = len(dataset)
    hits = 0

    with torch.no_grad():
        for batch in data_loader:
            batch = {key: value.to(device) for key, value in batch.items()}

            less_logits = model(
                input_ids=batch["less_input_ids"],
                attention_mask=batch["less_attention_mask"],
            ).logits
            more_logits = model(
                input_ids=batch["more_input_ids"],
                attention_mask=batch["more_attention_mask"],
            ).logits

            hits += (score(less_logits) < score(more_logits)).sum().item()

    accuracy = hits / n_samples
    print(f"Validation Accuracy: {accuracy:4.2f}")

    torch.cuda.empty_cache()
    return accuracy

In [None]:
# Example (uncomment to run): validate("../input/toxic-bert", max_len=192, is_multioutput=True)

# Prediction

Adapted from [AutoNLP for toxic ratings ;)](https://www.kaggle.com/abhishek/autonlp-for-toxic-ratings) by Abhishek.

In [None]:
def generate_predictions(model_path, max_len, is_multioutput):
    """
    Score every comment of comments_to_score.csv with a hub model.

    Args:
        model_path: Location passed to `from_pretrained` for both the model
            and its tokenizer.
        max_len: Token length each comment is padded/truncated to.
        is_multioutput: If True, a comment's score is the sum of all of its
            logits (one strategy out of various possible); otherwise it is
            the logit at index 1.

    Returns:
        A 1-D numpy array with one score per row of the CSV, in file order.
    """
    # Fall back to CPU so the function still runs without a GPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    model.to(device)
    model.eval()

    df = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")

    dataset = Dataset(text=df.text.values, tokenizer=tokenizer, max_len=max_len)
    data_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=32,
        num_workers=2,
        pin_memory=(device == "cuda"),  # pinning only helps host->GPU copies
        shuffle=False,  # keep predictions aligned with the CSV rows
    )

    final_output = []

    with torch.no_grad():
        for batch in data_loader:
            batch = {key: value.to(device) for key, value in batch.items()}
            logits = model(**batch).logits

            if is_multioutput:
                # Sum the logits for all the labels
                scores = logits.sum(dim=1)
            else:
                # Classifier: take the logit at index 1
                scores = logits[:, 1]

            # No .detach() needed: gradients are disabled in this context.
            final_output.extend(scores.cpu().numpy().tolist())

    torch.cuda.empty_cache()
    return np.array(final_output)

In [None]:
# Raw (unscaled) scores from each of the three hub models, all tokenized to 192 tokens.
preds1 = generate_predictions(
    model_path="../input/toxic-bert", max_len=192, is_multioutput=True
)
preds2 = generate_predictions(
    model_path="../input/roberta-base-toxicity", max_len=192, is_multioutput=False
)
preds3 = generate_predictions(
    model_path="../input/roberta-toxicity-classifier", max_len=192, is_multioutput=False
)

# Ensemble

Linear ensemble of the three models.


Since the three models produce scores on different scales, I first min-max scale each model's predictions to the [0, 1] range, and then sum the scaled scores.

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Attach one raw-score column per model to the submission frame.
df_sub = pd.read_csv(
    "../input/jigsaw-toxic-severity-rating/comments_to_score.csv"
).assign(score_bert=preds1, score_rob1=preds2, score_rob2=preds3)

# MinMaxScaler rescales every column independently, i.e. per model.
df_sub[["score_bert", "score_rob1", "score_rob2"]] = MinMaxScaler().fit_transform(
    df_sub[["score_bert", "score_rob1", "score_rob2"]]
)

# Ensemble score: row-wise sum of the three scaled scores.
df_sub["score"] = df_sub.loc[:, ["score_bert", "score_rob1", "score_rob2"]].sum(axis="columns")
df_sub.head()

# View some results

In [None]:
# Show up to 500 characters per cell so the comment text is readable.
pd.options.display.max_colwidth = 500

In [None]:
# The three comments with the lowest ensemble score.
df_sub.sort_values(by="score")[["score", "text"]].head(3)

In [None]:
# The three comments with the highest ensemble score.
df_sub.sort_values(by="score")[["score", "text"]].tail(3)


# Submit

In [None]:
 # Tie-break, if any
# rank(method="first") gives every row a distinct rank: equal scores are
# ordered by their position in the frame.
df_sub = df_sub.assign(score=df_sub["score"].rank(method="first"))[
    ["comment_id", "score"]
]

df_sub.to_csv("submission.csv", index=False)
df_sub.head()

# Please, _DO_ upvote if you find this useful or interesting!