# [Training Notebook : https://www.kaggle.com/adldotori/huggingface-distilbertclassification-starter](https://www.kaggle.com/adldotori/huggingface-distilbertclassification-starter)

# Load

## Tokenizer

In [None]:
from transformers import DistilBertTokenizerFast
# Local directory containing the saved DistilBERT fast-tokenizer files.
TOKENIZER_PATH = '/kaggle/input/distilberttokenizerfast-tokenizer/'
tokenizer = DistilBertTokenizerFast.from_pretrained(TOKENIZER_PATH)

## Model

In [None]:
import torch
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

# Directory with the fine-tuned classification checkpoint to load.
MODEL_PATH = '/kaggle/input/huggingface-distilbertclassification-starter/checkpoints/'

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Load the sequence-classification weights, then move them to the chosen device.
model = DistilBertForSequenceClassification.from_pretrained(MODEL_PATH)
model = model.to(device)

# Validation

In [None]:
import pandas as pd
import os.path as osp
from tqdm import tqdm

In [None]:
# Directory holding the competition CSV files.
INPUT_PATH = '/kaggle/input/jigsaw-toxic-severity-rating/'

sample_submission = pd.read_csv(osp.join(INPUT_PATH, 'sample_submission.csv'))
validation_data = pd.read_csv(osp.join(INPUT_PATH, 'validation_data.csv'))
comments_to_score = pd.read_csv(osp.join(INPUT_PATH, 'comments_to_score.csv'))

# Float placeholder for the model probabilities filled in during inference.
# Starting from an int 0 would give the column an integer dtype and force a
# dtype upcast on the first float written into it.
comments_to_score['score'] = 0.0
comments_to_score.head()

In [None]:
# Inspect the validation pairs (the less_toxic / more_toxic columns are compared below).
validation_data

In [None]:
# Maximum number of tokens fed to the model per comment.
MAX_TOKENS = 512


def _toxic_probability(text):
    """Return the model's softmax probability of class index 1 for ``text``.

    The notebook treats this value as the comment's toxicity score.
    Truncation is done by the tokenizer (not by slicing the tensor) so the
    trailing special token is kept on long comments, and the forward pass
    runs under ``torch.no_grad()`` because no gradients are needed here.
    """
    token_ids = tokenizer.encode(
        text, return_tensors="pt", truncation=True, max_length=MAX_TOKENS
    ).to(device)
    with torch.no_grad():
        logits = model(token_ids)[0]
    return torch.softmax(logits, dim=1)[0][1].item()


# A pair counts as correct (1) when the comment labelled more toxic gets the
# strictly higher score; otherwise it counts as incorrect (0).
correct_flags = []
for less_toxic, more_toxic in tqdm(
    zip(validation_data['less_toxic'], validation_data['more_toxic']),
    total=len(validation_data),
):
    prediction_less = _toxic_probability(less_toxic)
    prediction_more = _toxic_probability(more_toxic)
    correct_flags.append(int(prediction_more > prediction_less))

# Assign the whole column at once: positional, like the reads above, and
# independent of the DataFrame's index labels.
validation_data['correct'] = correct_flags

In [None]:
# Share of validation pairs ranked correctly, rounded to three decimals.
valid_score = round(validation_data['correct'].mean(), 3)
print('Valid Score: ', valid_score)

# Inference

In [None]:
# Score every comment with the model's softmax probability of class index 1,
# which the notebook uses as the toxicity score.
scores = []
with torch.no_grad():  # inference only: skip building the autograd graph
    for text in tqdm(comments_to_score['text']):
        # Let the tokenizer truncate to 512 tokens so the trailing special
        # token is kept, instead of slicing it off the encoded tensor.
        token_ids = tokenizer.encode(
            text, return_tensors="pt", truncation=True, max_length=512
        ).to(device)
        logits = model(token_ids)[0]
        scores.append(torch.softmax(logits, dim=1)[0][1].item())

# One column assignment (in row order) instead of per-row .loc writes; this
# also gives the column a float dtype regardless of how it was initialised.
comments_to_score['score'] = scores

In [None]:
# Replace the raw scores with their ascending rank; method='first' breaks
# ties by order of appearance, so every comment receives a distinct rank.
score_ranks = comments_to_score['score'].rank(method='first')
comments_to_score['score'] = score_ranks

In [None]:
# Show the comments ordered by ascending score.
comments_to_score.sort_values('score')

Rows at the top of the table (lowest score) are the least toxic comments; rows at the bottom (highest score) are the most toxic.

In [None]:
# Match scores to submission rows by comment_id. Column assignment aligns on
# the index, so sorting comments_to_score beforehand has no effect on which
# row receives which score; an explicit lookup by comment_id does.
# NOTE(review): assumes sample_submission has a 'comment_id' column and that
# comment_id values in comments_to_score are unique — confirm.
score_by_comment_id = comments_to_score.set_index('comment_id')['score']
sample_submission['score'] = sample_submission['comment_id'].map(score_by_comment_id)

In [None]:
# Preview the submission table before it is written to disk.
sample_submission

In [None]:
# Write the submission to submission.csv, leaving out the DataFrame index column.
sample_submission.to_csv('submission.csv', index=False)