In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

In [None]:
# Load the three competition CSVs.
# Note: despite its name, `train` is read from comments_to_score.csv.
train = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")
valid = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")
score = pd.read_csv("../input/jigsaw-toxic-severity-rating/sample_submission.csv")
train.head(n=5)

In [None]:
# Preview the first rows of the validation pairs.
valid.head(n=5)

In [None]:
# Print pandas' summary of the `train` frame (loaded from comments_to_score.csv).
train.info()

In [None]:
# Report the element count and dimensions of `train`, then show summary statistics.
for caption, value in (("Size: ", train.size), ("Shape: ", train.shape)):
    print(caption, value)
train.describe()

In [None]:
# Turn the two text columns of `valid` into one long frame, tagging every
# row with a label that records which column it came from.
less_toxic = pd.DataFrame(
    {"text": valid["less_toxic"].tolist(), "label": "Less Toxic"}
)
more_toxic = pd.DataFrame(
    {"text": valid["more_toxic"].tolist(), "label": "More Toxic"}
)

# ignore_index=True gives the stacked frame a fresh 0..n-1 index.
toxic = pd.concat(objs=[less_toxic, more_toxic], ignore_index=True)
toxic.head(n=15)

In [None]:
# Preview the rows labelled "More Toxic".
more_toxic.head(n=15)

In [None]:
# Number of non-null labels in the stacked frame.
print(toxic['label'].count())

# Bar chart of rows per label. seaborn/matplotlib are imported in the
# notebook's top import cell; an explicit Axes is used so the figure can be
# titled and labelled without relying on pyplot's implicit "current figure".
fig, ax = plt.subplots(figsize=(18, 5))
sns.countplot(x='label', data=toxic, ax=ax)
# Trailing ';' suppresses the repr of the returned list in the cell output.
ax.set(title="Rows per label", xlabel="label", ylabel="count");

In [None]:
class Dataset:
    """Map-style dataset that tokenizes one text per index.

    Args:
        text: Indexable, sized collection of texts; each entry is passed
            through ``str()`` before tokenization.
        tokenizer: Callable invoked as
            ``tokenizer(text, max_length=..., padding="max_length", truncation=True)``
            that returns a mapping containing ``"input_ids"`` and
            ``"attention_mask"``.
        max_len: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, text, tokenizer, max_len):
        self.text, self.tokenizer, self.max_len = text, tokenizer, max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.text)

    def __getitem__(self, item):
        """Tokenize the text at position ``item`` and return long tensors."""
        encoded = self.tokenizer(
            str(self.text[item]),
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
        )
        # Only these two fields are kept; anything else the tokenizer
        # returns is dropped.
        return {
            field: torch.tensor(encoded[field], dtype=torch.long)
            for field in ("input_ids", "attention_mask")
        }

In [None]:
def generate_predictions(model_path, max_len, batch_size=32, device=None):
    """Score every comment in ``comments_to_score.csv`` with a sequence classifier.

    Args:
        model_path: Location passed to ``from_pretrained`` for both the model
            and its tokenizer.
        max_len: Token length each comment is padded/truncated to.
        batch_size: Number of comments per forward pass (default 32).
        device: Device to run inference on. When ``None`` (default), CUDA is
            used if available, otherwise the CPU, so the function no longer
            crashes on machines without a GPU.

    Returns:
        1-D ``np.ndarray`` holding, for each comment in file order, the logit
        at class index 1.
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    use_cuda = torch.device(device).type == "cuda"

    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    model.to(device)
    model.eval()

    df = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")

    dataset = Dataset(text=df.text.values, tokenizer=tokenizer, max_len=max_len)
    data_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        num_workers=4,
        # Pinned host memory only helps host-to-GPU copies.
        pin_memory=use_cuda,
        # Keep file order so predictions line up with the CSV rows.
        shuffle=False,
    )

    final_output = []

    # Inference only: disable autograd once for the whole loop.
    with torch.no_grad():
        for data in data_loader:
            data = {key: value.to(device) for key, value in data.items()}
            output = model(**data)
            # Going through .tolist() keeps the returned array at NumPy's
            # default float dtype, exactly as before.
            final_output.extend(
                output.logits.detach().cpu().numpy()[:, 1].ravel().tolist()
            )

    if use_cuda:
        torch.cuda.empty_cache()
    return np.array(final_output)

In [None]:
# Run inference with the model stored under ../input/autonlp-toxic-1/.
preds = generate_predictions(
    model_path="../input/autonlp-toxic-1/",
    max_len=192,
)

In [None]:
# Attach the predictions to the comments file, keep only the id and score
# columns, and write the result without the index column.
sub = (
    pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")
    .assign(score=preds)
    [["comment_id", "score"]]
)
sub.to_csv("submission.csv", index=False)

In [None]:
# Preview the first rows of the submission frame.
sub.head(n=5)