In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install -q transformers

In [None]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, Dataset

from sklearn.metrics import roc_auc_score

import re


from tqdm.notebook import tqdm

from typing import *
import string

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')

In [None]:
from transformers import DistilBertTokenizer, AdamW
from transformers import DistilBertModel, DistilBertConfig, DistilBertForSequenceClassification

In [None]:
os.getcwd()

In [None]:
os.listdir()

# Constants

In [None]:
SEED = 42  # shared random seed: used by set_seed(), train_test_split() and DataFrame.sample()
EPOCHS = 2  # number of passes in the (currently commented-out) training loops
SEQ_SIZE = 150  # max_len handed to the tokenizer for every comment
BATCH_SIZE = 32  # batch size passed to create_data_loader()
PRE_TRAINED_MODEL_NAME = "distilbert-base-uncased"  # checkpoint id used to load the tokenizer and the model config

# Load data

In [None]:
# All competition files are read directly from the zipped CSVs in the Kaggle input directory.
# Labelled training comments: `comment_text` plus the label columns listed in `y_label`.
src = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip')
# Submission template; its label columns are overwritten with predictions at the end of the notebook.
ss = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip')
# Label file for the test comments; later concatenated column-wise with `test_src`.
test_labels = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip')
# Test comments that the final model scores for the submission.
test_src = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/test.csv.zip')

In [None]:
src.head(3)

In [None]:
ss.head(3)

In [None]:
test_labels.head(3)

In [None]:
test_src.head(3)

In [None]:
# The id column is not a model input. Re-binding the result (instead of
# inplace=True) together with errors='ignore' keeps this cell re-runnable:
# a second execution no longer raises KeyError because 'id' is already gone.
src = src.drop(columns='id', errors='ignore')
# Label columns, in the order used for the model targets and the submission file.
y_label = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Preprocessing

In [None]:
def cleanString(comment: str) -> str:
    """Reduce a raw comment to ASCII letters separated by single spaces.

    Steps, in order:
      1. Expand a few English contractions (n't -> not, 'm -> am,
         've -> have, 's -> is).
      2. Replace every character that is not an ASCII letter or '%' with a
         space (this covers digits, punctuation and newlines).
      3. Delete '%' outright, without leaving a space behind.
      4. Collapse runs of spaces and strip both ends.

    Args:
        comment: raw comment text.

    Returns:
        The cleaned text; an empty string if nothing but non-letters was given.

    Note:
        Earlier revisions also called ``str.replace`` with regex patterns
        (repeated punctuation, ``[0-9]``). ``str.replace`` matches literally,
        so those calls never touched real text and have been removed. They are
        deliberately NOT turned into ``re.sub``: that would change the output
        for tokens such as 'abc123def' (currently 'abc def'), and the saved
        checkpoints were presumably trained on the current output.
    """
    # Contractions first, while the apostrophes are still present.
    comment = re.sub(r"n't", ' not', comment)
    comment = re.sub(r"'m", ' am', comment)
    comment = re.sub(r"'ve", ' have', comment)
    comment = re.sub(r"'s", ' is', comment)

    # Everything except letters and '%' becomes a separator...
    comment = re.sub(r'[^a-zA-Z%]', ' ', comment)
    # ...while '%' is removed without splitting the surrounding letters.
    comment = comment.replace('%', '')

    # Only letters and spaces remain, so one collapse pass is sufficient.
    comment = re.sub(r' +', ' ', comment)
    return comment.strip()

In [None]:
# Normalise every training comment before any tokenization or splitting.
src['comment_text'] = src['comment_text'].map(cleanString)

In [None]:
src.sample(5, random_state=SEED)

# Data exploration

In [None]:
tokenizer = DistilBertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [None]:
print((tokenizer.pad_token, tokenizer.pad_token_id), (tokenizer.sep_token, tokenizer.sep_token_id), 
      (tokenizer.cls_token, tokenizer.cls_token_id), (tokenizer.unk_token, tokenizer.unk_token_id))

In [None]:
# Number of tokens per cleaned comment, capped at 512.
# truncation=True makes the cap explicit; passing max_length alone only
# triggers a warning and an implicit fallback truncation strategy.
token_lens = [
    len(tokenizer.encode(txt, max_length=512, truncation=True))
    for txt in tqdm(src.comment_text)
]

In [None]:
# Distribution of token counts, zoomed to the 0-256 range.
# NOTE(review): distplot is deprecated in recent seaborn releases; consider
# histplot/displot once the environment's seaborn version is confirmed.
token_ax = sns.distplot(token_lens)
token_ax.set_xlim([0, 256])
token_ax.set_xlabel('Token count')
plt.show()

# Train test split

In [None]:
# 85 % of the data is used for training; the remaining 15 % hold-out is then
# halved into a validation part and a test part.
df_train, df_holdout = train_test_split(src, test_size=0.15, random_state=SEED)
df_val, df_test = train_test_split(df_holdout, test_size=0.5, random_state=SEED)

for caption, frame in (('train shape:', df_train),
                       ('valid shape:', df_val),
                       ('test shape: ', df_test)):
    print(caption, frame.shape)

# Dataset class

In [None]:
class CommentDataset(Dataset):
    """Map-style dataset that tokenizes one comment per item.

    Args:
        comments: integer-indexable sequence of comment texts.
        targets: integer-indexable sequence of label vectors, one per comment.
        tokenizer: object exposing ``encode_plus`` (a Hugging Face tokenizer).
        max_len: every comment is truncated or padded to exactly this many tokens.

    Raises:
        ValueError: if ``comments`` and ``targets`` differ in length.
    """

    def __init__(self, comments, targets, tokenizer, max_len):
        # Explicit exception instead of assert: asserts vanish under `python -O`.
        if len(comments) != len(targets):
            raise ValueError(
                f'comments and targets differ in length: {len(comments)} != {len(targets)}'
            )
        self.comments = comments
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.comments)

    def __getitem__(self, item):
        """Return the raw text, token ids, attention mask and targets of one comment."""
        comment = str(self.comments[item])
        target = self.targets[item]

        # padding='max_length' replaces the deprecated pad_to_max_length=True;
        # truncation=True states explicitly that longer comments are cut, which
        # was previously only an implicit fallback accompanied by a warning.
        encoding = self.tokenizer.encode_plus(comment,
                                              add_special_tokens=True,
                                              max_length=self.max_len,
                                              truncation=True,
                                              padding='max_length',
                                              return_token_type_ids=False,
                                              return_attention_mask=True,
                                              return_tensors='pt',
                                             )
        # The tokenizer returns tensors with a leading batch axis of size 1;
        # flatten() removes it so that the DataLoader can stack items itself.
        return {'review_text': comment,
                'input_ids': encoding['input_ids'].flatten(),
                'attention_mask': encoding['attention_mask'].flatten(),
                'targets': torch.tensor(target, dtype=torch.long)}

In [None]:
def create_data_loader(df: pd.DataFrame, tokenizer, max_len: int, batch_size: int,
                       shuffle: bool = False):
    """Wrap a comment DataFrame in a ``CommentDataset`` and a ``DataLoader``.

    Args:
        df: frame with a ``comment_text`` column and the columns listed in the
            module-level ``y_label``.
        tokenizer: tokenizer forwarded to ``CommentDataset``.
        max_len: token length every comment is padded/truncated to.
        batch_size: number of comments per batch.
        shuffle: reshuffle the data every epoch. Defaults to ``False`` (the
            previous, order-preserving behaviour), which evaluation relies on
            to align predictions with the rows of ``df``; pass ``True`` for
            training loaders.

    Returns:
        A ``DataLoader`` yielding the dict batches produced by ``CommentDataset``.
    """
    # to_numpy() drops the pandas index so the dataset can use 0..n-1 positions.
    ds = CommentDataset(comments=df.comment_text.to_numpy(),
                        targets=df[y_label].to_numpy(),
                        tokenizer=tokenizer,
                        max_len=max_len)

    return DataLoader(ds, batch_size=batch_size, shuffle=shuffle)

# Train

In [None]:
def set_seed(seed):
    """Seed every random number generator this notebook may draw from.

    Covers Python's ``random``, NumPy's legacy global generator and PyTorch
    (CPU and all CUDA devices), and switches cuDNN to deterministic mode.

    Args:
        seed: integer seed applied to all generators.
    """
    import random  # local import: the stdlib module is not imported at file level

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Explicit for clarity; silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    # Trade cuDNN auto-tuning speed for reproducible kernels.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [None]:
set_seed(SEED)

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# get_device_name() raises when no GPU is present, so only query it when the
# CUDA device was actually selected above.
print(torch.cuda.get_device_name(0) if device.type == "cuda" else "CPU")

In [None]:
config = DistilBertConfig.from_pretrained(PRE_TRAINED_MODEL_NAME)
config.num_labels = len(y_label)
# Multi-label setup: each label is scored independently and the model's
# built-in loss becomes a per-label binary cross-entropy on the logits.
config.problem_type = "multi_label_classification"
# DistilBERT's classification head reads `seq_classif_dropout`; the previously
# set `classifier_dropout` is not a DistilBertConfig field and was ignored.
config.seq_classif_dropout = 0.2
config.return_dict = True

In [None]:
# Start from the pre-trained DistilBERT weights. Building the model from the
# config alone, as before, yields a randomly initialised network and trains
# from scratch. Only the classification head is newly initialised here.
model = DistilBertForSequenceClassification.from_pretrained(PRE_TRAINED_MODEL_NAME, config=config)
model = model.to(device)

In [None]:
train_dataloader = create_data_loader(df=df_train, tokenizer=tokenizer, max_len=SEQ_SIZE, batch_size=BATCH_SIZE)
# Evaluation runs no backward pass, so it can use the training batch size;
# batch_size=1 meant one forward pass per comment.
val_dataloader = create_data_loader(df=df_val, tokenizer=tokenizer, max_len=SEQ_SIZE, batch_size=BATCH_SIZE)
test_dataloader = create_data_loader(df=df_test, tokenizer=tokenizer, max_len=SEQ_SIZE, batch_size=BATCH_SIZE)

In [None]:
def train_epoch_for_hf(model, data_loader: DataLoader, device: torch.device, optimizer):
    """Run one optimisation pass over ``data_loader`` (hf = Hugging Face).

    The model is put into training mode and, for every batch, computes its own
    loss from the float-cast targets; that loss is back-propagated and one
    optimizer step is taken. Nothing is returned.

    Args:
        model: model accepting ``input_ids``, ``attention_mask`` and ``labels``
            and returning an object with a ``loss`` attribute.
        data_loader: yields dict batches with the keys ``input_ids``,
            ``attention_mask`` and ``targets``.
        device: device every batch tensor is moved to.
        optimizer: optimizer bound to the model's parameters.
    """
    model.train()

    for batch in tqdm(data_loader):
        optimizer.zero_grad()

        features = {key: batch[key].to(device) for key in ("input_ids", "attention_mask")}
        labels = batch["targets"].float().to(device)

        # Force gradient tracking even if the caller disabled it globally.
        with torch.enable_grad():
            loss = model(**features, labels=labels).loss
            loss.backward()
            optimizer.step()

In [None]:
def evaluate_for_hf(model, data_loader: DataLoader, device: torch.device):
    """Score every batch of ``data_loader`` without updating the model.

    Args:
        model: model accepting ``input_ids``, ``attention_mask`` and ``labels``
            and returning an object with ``logits`` and ``loss`` attributes.
        data_loader: yields dict batches with the keys ``input_ids``,
            ``attention_mask`` and ``targets``.
        device: device every batch tensor is moved to.

    Returns:
        ``(logits, mean_loss)``. ``logits`` is a CPU tensor with one row per
        sample in loader order (``None`` for an empty loader). ``mean_loss`` is
        the per-sample mean of the model loss (``nan`` for an empty loader).
    """
    model.eval()
    logits_per_batch = []
    loss_sum = 0.0
    n_samples = 0

    with torch.no_grad():
        for batch in tqdm(data_loader):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            targets = batch["targets"].float().to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=targets)

            # Collect and concatenate once at the end: torch.cat inside the
            # loop re-copies everything gathered so far on every step.
            logits_per_batch.append(outputs.logits.cpu())

            # Weight each batch loss by its size so a smaller final batch does
            # not skew the average; identical to the plain mean for equal sizes.
            batch_len = input_ids.size(0)
            loss_sum += outputs.loss.item() * batch_len
            n_samples += batch_len

    if n_samples == 0:
        return None, float('nan')
    return torch.cat(logits_per_batch), loss_sum / n_samples

In [None]:
# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are spelled out because the two implementations default
# differently (transformers: eps=1e-6, weight_decay=0.0), so the update rule
# stays the same as before.
optimizer = optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

In [None]:
# best_val_loss = 9999.
# print('====START TRAINING====')
# for epoch in tqdm(range(EPOCHS)):
#     print('-' * 10)
#     train_epoch_for_hf(model=model, data_loader=train_dataloader, optimizer=optimizer, device=device)
#     _, tr_loss = evaluate_for_hf(model=model, data_loader=train_dataloader, device=device)
#     val_pred, val_loss = evaluate_for_hf(model=model, data_loader=val_dataloader, device=device)
#     y_pred_np = val_pred.numpy()
#     val_auc = roc_auc_score(df_val[y_label].to_numpy(), y_pred_np)
#     if val_loss < best_val_loss:
#         best_val_loss = val_loss
#         torch.save(model.state_dict(), 'distill_bert.pt')
#     print(f'Epoch {epoch + 1}/{EPOCHS}', f'train loss: {tr_loss:.4},', f'val loss: {val_loss:.4},', f'val auc: {val_auc:.4}')

In [None]:
# ====START TRAINING====
# 2/2 [1:16:49<00:00, 2305.56s/it]
# ----------
# Epoch 1/2 train loss: 0.04469, val loss: 0.04791, val auc: 0.979
# ----------
# Epoch 2/2 train loss: 0.03882, val loss: 0.04586, val auc: 0.9838

# Testing

In [None]:
# Random initialisation is fine here: every weight is overwritten by the checkpoint.
model = DistilBertForSequenceClassification(config)
# map_location lets a checkpoint saved on GPU load in a CPU-only session.
model.load_state_dict(torch.load('../input/jigsav-distill-bert/distill_bert.pt', map_location=device))
model = model.to(device)

In [None]:
# Score the held-out split. ROC AUC depends only on the ranking of the scores,
# so the raw logits can be passed without applying a sigmoid first.
test_pred, test_loss = evaluate_for_hf(model=model, data_loader=test_dataloader, device=device)
y_pred_np = test_pred.numpy()
test_auc = roc_auc_score(df_test[y_label].to_numpy(), y_pred_np)

print('====TEST RESULT====',
      f'Log loss: {test_loss:.5}',
      f'ROC AUC: {test_auc:.5}',
      sep='\n')

In [None]:
# ====TEST RESULT====
# Log loss: 0.045231
# ROC AUC: 0.98233

# Final Training

In [None]:
# Load the final checkpoint into its own model instance. Previously the state
# dict was loaded into `model` and `final_model` was re-bound to it, which
# discarded the fresh instance and silently overwrote the weights of `model`.
final_model = DistilBertForSequenceClassification(config)
# map_location lets a checkpoint saved on GPU load in a CPU-only session.
final_model.load_state_dict(
    torch.load('../input/jigsav-final-distill-bert/final_distill_bert.pt', map_location=device)
)
final_model = final_model.to(device)

In [None]:
# src_dataloader = create_data_loader(df=src, tokenizer=tokenizer, max_len=SEQ_SIZE, batch_size=BATCH_SIZE)

In [None]:
# optimizer = AdamW(final_model.parameters(), lr=2e-5)
# print('====START TRAINING====')
# for epoch in tqdm(range(EPOCHS)):
#     print('-' * 10)
#     train_epoch_for_hf(model=final_model, data_loader=src_dataloader, optimizer=optimizer, device=device)
#     _, tr_loss = evaluate_for_hf(model=final_model, data_loader=src_dataloader, device=device)
#     torch.save(final_model.state_dict(), 'final_distill_bert.pt')
#     print(f'Epoch {epoch + 1}/{EPOCHS}', f'train loss: {tr_loss:.4}')

In [None]:
# ====START TRAINING====
# 2/2 [1:25:17<00:00, 2557.58s/it]
# ----------
# Epoch 1/2 train loss: 0.03488
# ----------
# Epoch 2/2 train loss: 0.03053

# Predict and save

In [None]:
# The guard makes the cell re-runnable: once 'id' has been dropped the frame
# is already prepared, and a second pass would raise or duplicate label columns.
if 'id' in test_src.columns:
    test_src_id = test_src['id']
    # Built without inplace mutation; test_labels itself is left untouched.
    test_src = pd.concat((test_src.drop(columns='id'),
                          test_labels.drop(columns='id', errors='ignore')), axis=1)
    # Training comments are passed through cleanString() before the split, so
    # the test comments need the same normalisation to match the model's input.
    test_src['comment_text'] = test_src['comment_text'].map(cleanString)

In [None]:
# Inference runs no backward pass, so it can use the training batch size;
# batch_size=1 meant one forward pass per test comment.
test_src_dataloader = create_data_loader(df=test_src, tokenizer=tokenizer, max_len=SEQ_SIZE, batch_size=BATCH_SIZE)
# The returned loss is ignored, so it gets a descriptive name instead of `_`,
# which would shadow IPython's last-output variable.
prediction, _unused_loss = evaluate_for_hf(model=final_model, data_loader=test_src_dataloader, device=device)
# Convert the raw logits into independent per-label probabilities.
prediction = torch.sigmoid(prediction).numpy()

In [None]:
ss[y_label] = prediction

In [None]:
ss.head()

In [None]:
ss.to_csv('submission.csv', index=False)