# ☣️ Jigsaw - HuggingFace Hub Baselines

In this notebook I explore, without any fine-tuning, various models from the Hugging Face Hub.
Along the way, I am uploading them to Kaggle as datasets.

Several of these models were already trained for earlier Jigsaw competitions.



|Version | Model | Validation (first 5000 samples) | LB |
|---| ---   | ---: | --- |
|V1 | [toxic-bert](https://www.kaggle.com/julian3833/toxic-bert) | `0.71` | __ |


# Imports

In [None]:
import torch
import pandas as pd
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer

In [None]:
# Location of the pretrained model/tokenizer files handed to `from_pretrained`.
MODEL_PATH = "../input/toxic-bert"
# Token length passed as `max_len` to the datasets (padding/truncation target).
MAX_LENGTH = 192

# When True, the validation cell calls `validate`.
DO_VALIDATE = False
# Number of rows read from validation_data.csv (`nrows`) by `validate`.
VALIDATION_SIZE = 5000

# Test and Validation Dataset

In [None]:
class Dataset:
    """
    Single-comment dataset, used to score comments_to_score.csv (the submission).

    Each item is one text run through the tokenizer with padding/truncation
    to ``max_len`` and returned as a dict of ``torch.long`` tensors.
    """
    def __init__(self, text, tokenizer, max_len):
        self.text = text
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, item):
        # Coerce the entry to str before handing it to the tokenizer.
        encoded = self.tokenizer(
            str(self.text[item]),
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
        )

        # Only these two fields are forwarded to the model.
        return {
            field: torch.tensor(encoded[field], dtype=torch.long)
            for field in ("input_ids", "attention_mask")
        }
    
    
class ValidationDataset:
    """
    Pairwise dataset over validation_data.csv.

    Every row yields the tokenized ``less_toxic`` and ``more_toxic`` comments
    together, as four ``torch.long`` tensors.

    Inspired by: https://www.kaggle.com/debarshichanda/pytorch-w-b-jigsaw-starter
    """
    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def tokenize(self, text):
        """Run the tokenizer on one comment with padding/truncation to ``max_len``."""
        options = dict(max_length=self.max_len, padding="max_length", truncation=True)
        return self.tokenizer(text, **options)

    def __getitem__(self, i):
        def as_long(values):
            return torch.tensor(values, dtype=torch.long)

        # Positional lookup: `i` is the row number, not the DataFrame index label.
        more_text = self.df['more_toxic'].iloc[i]
        less_text = self.df['less_toxic'].iloc[i]

        less_encoded = self.tokenize(less_text)
        more_encoded = self.tokenize(more_text)

        return {
            "less_input_ids": as_long(less_encoded["input_ids"]),
            "less_attention_mask": as_long(less_encoded["attention_mask"]),
            "more_input_ids": as_long(more_encoded["input_ids"]),
            "more_attention_mask": as_long(more_encoded["attention_mask"]),
        }

# Validation


In [None]:
def validate(model_path, max_len):
    """
    Measure pairwise ranking accuracy on the first VALIDATION_SIZE rows of validation_data.csv.

    A pair counts as a hit when the summed logits of the ``less_toxic`` comment
    are strictly lower than those of the ``more_toxic`` comment.

    Args:
        model_path: Location passed to ``from_pretrained`` for both the model
            and the tokenizer.
        max_len: Token length each comment is padded/truncated to.

    Returns:
        The fraction of pairs ranked correctly (hits / number of rows read).

    Raises:
        ZeroDivisionError: If no rows were read from the validation file.
    """
    # Fall back to CPU so the notebook still runs when no GPU is attached.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    model.to(device)
    model.eval()

    df = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv", nrows=VALIDATION_SIZE)

    dataset = ValidationDataset(df=df, tokenizer=tokenizer, max_len=max_len)
    data_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=16,
        num_workers=4,
        # Pinned host memory only helps host-to-GPU copies.
        pin_memory=device.type == "cuda",
        shuffle=False,
    )

    n_samples = len(dataset)
    hits = 0

    # Inference only: no gradients are needed for any batch.
    with torch.no_grad():
        for batch in data_loader:
            batch = {key: value.to(device) for key, value in batch.items()}

            less_output = model(input_ids=batch['less_input_ids'],
                                attention_mask=batch['less_attention_mask'])
            more_output = model(input_ids=batch['more_input_ids'],
                                attention_mask=batch['more_attention_mask'])

            # Sum the per-label logits into one toxicity score per comment.
            less_score = less_output.logits.sum(dim=1)
            more_score = more_output.logits.sum(dim=1)

            hits += (less_score < more_score).sum().item()

    accuracy = hits / n_samples
    print(f"Validation Accuracy: {accuracy:4.2f}")

    if device.type == "cuda":
        torch.cuda.empty_cache()
    return accuracy

In [None]:
# Validation is optional: it only runs when the DO_VALIDATE flag is set.
if DO_VALIDATE:
    validate(MODEL_PATH, max_len=MAX_LENGTH)

# Prediction

Adapted from [AutoNLP for toxic ratings ;)](https://www.kaggle.com/abhishek/autonlp-for-toxic-ratings) by Abhishek.

In [None]:
def generate_predictions(model_path, max_len):
    """
    Score every comment in comments_to_score.csv with a pretrained classifier.

    The score of a comment is the sum of the model's logits over the label
    dimension; no fine-tuning is involved.

    Args:
        model_path: Location passed to ``from_pretrained`` for both the model
            and the tokenizer.
        max_len: Token length each comment is padded/truncated to.

    Returns:
        A 1-D ``np.ndarray`` with one score per row of the CSV, in file order
        (the DataLoader does not shuffle).
    """
    # Fall back to CPU so the notebook still runs when no GPU is attached.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    model.to(device)
    model.eval()

    df = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")

    dataset = Dataset(text=df.text.values, tokenizer=tokenizer, max_len=max_len)
    data_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=16,
        num_workers=4,
        # Pinned host memory only helps host-to-GPU copies.
        pin_memory=device.type == "cuda",
        shuffle=False,
    )

    final_output = []

    # Inference only: no gradients are needed for any batch.
    with torch.no_grad():
        for batch in data_loader:
            batch = {key: value.to(device) for key, value in batch.items()}
            logits = model(**batch).logits
            # Collapse the label dimension into a single score per comment.
            final_output.extend(logits.sum(dim=1).cpu().tolist())

    if device.type == "cuda":
        torch.cuda.empty_cache()
    return np.array(final_output)

# Submit

In [None]:
# Score every comment in comments_to_score.csv with the model at MODEL_PATH.
preds = generate_predictions(MODEL_PATH, max_len=MAX_LENGTH)

# Re-read the same CSV to recover comment_id; predictions are in file order
# because the DataLoader in generate_predictions uses shuffle=False.
sub = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")
sub["score"] = preds
# Keep only the two columns written to the submission file.
sub = sub[["comment_id", "score"]]
sub.to_csv("submission.csv", index=False)
sub.head()  # last expression of the cell: previews the first rows

# Validation