---
## [Jigsaw Rate Severity of Toxic Comments][1]
---
**Comments1**: 'Internet' is required for this Notebook.

**Comments2**: Thanks to previous great Notebooks.

1. [☣️ Jigsaw - Incredibly Simple Naive Bayes [0.768]][2]
2. [AutoNLP for toxic ratings ;)][3]


[1]: https://www.kaggle.com/c/jigsaw-toxic-severity-rating/overview
[2]: https://www.kaggle.com/julian3833/jigsaw-incredibly-simple-naive-bayes-0-768
[3]: https://www.kaggle.com/abhishek/autonlp-for-toxic-ratings

# 0. Settings

In [None]:
# Import dependencies 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
%matplotlib inline

import os
import pathlib
import gc
import sys
import math 
import time 
import tqdm 
from tqdm import tqdm 
import random

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import KFold 
from sklearn.model_selection import StratifiedKFold 

import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.layers.experimental import preprocessing

import transformers 
import datasets 

In [None]:
# Global config shared by the cells below.
# NOTE: this literal used to list 'batch_size' twice (32, then 8). Python keeps
# the last value of a duplicated key in a dict display, so 8 was always the
# effective batch size; the dead first entry is removed to make that explicit.
config = {
    'nfolds': 10,
    'batch_size': 8,
    'learning_rate': 1e-4,
    'num_epochs': 3,
}

# Short alias for the tf.data autotune constant.
AUTOTUNE = tf.data.experimental.AUTOTUNE


def seed_all(s):
    """Seed the Python, NumPy and TensorFlow RNGs with ``s``.

    Also writes ``TF_CUDNN_DETERMINISTIC='1'`` and ``PYTHONHASHSEED=str(s)``
    into ``os.environ``.
    """
    # Same seed for every random number generator used in this notebook.
    for seed_rng in (random.seed, np.random.seed, tf.random.set_seed):
        seed_rng(s)
    os.environ['TF_CUDNN_DETERMINISTIC'] = '1'
    os.environ['PYTHONHASHSEED'] = str(s)


# Single seed reused for sampling and fold splitting further down.
global_seed = 42
seed_all(global_seed)

# 1. Data Preprocessing

### 1.1 Create train data

For training data, I used the [Toxic Comment Classification Challenge][1] dataset.

[1]: https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/data

I turn it into a binary toxic / non-toxic classification task.

In [None]:
# Load the Toxic Comment Classification Challenge training file.
df = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/train.csv')

# Binary target: y = 1 when the row-wise sum of the six label columns is
# positive, otherwise y = 0.
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
label_total = df[label_columns].sum(axis=1)
df['y'] = label_total.gt(0).astype(int)

# Keep only the comment text (renamed to 'text') and the binary target.
df = df.rename(columns={'comment_text': 'text'})[['text', 'y']]
df.head(5)

### 1.2 Undersampling

The dataset is very unbalanced. Here we undersample the majority class. Other strategies might work better.

In [None]:
# Share of each class as a fraction of all rows (normalize=True).
class_share = df['y'].value_counts(normalize=True)
class_share

In [None]:
# Row masks for the two classes.
positive_mask = df['y'] == 1
negative_mask = df['y'] == 0

# Draw as many y == 0 rows as there are y == 1 rows (seeded for repeatability).
min_len = positive_mask.sum()
df_y0_undersample = df[negative_mask].sample(n=min_len, random_state=global_seed)

# All y == 1 rows followed by the sampled y == 0 rows, re-indexed from 0.
train_df = pd.concat([df[positive_mask], df_y0_undersample]).reset_index(drop=True)
train_df['y'].value_counts()

In [None]:
# Preview the first rows of the balanced training frame.
train_df.head(n=5)

### 1.3 k-fold

In [None]:
# Take the fold count from the shared config instead of repeating the literal.
n_folds = config['nfolds']

# Create the column up front as integers. Filling a brand-new column through
# partial .loc assignments leaves NaN placeholders for the rows not yet written,
# which typically turns the fold ids into floats (0.0, 1.0, ...).
train_df['fold'] = -1

skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=global_seed)
for nfold, (train_index, val_index) in enumerate(skf.split(X=train_df.index,
                                                           y=train_df['y'])):
    # Tag the rows that form the validation part of this split with its number.
    train_df.loc[val_index, 'fold'] = nfold

# Rows per (fold, class) pair, to check the stratification.
print(train_df.groupby(['fold', 'y']).size())

In [None]:
# Fold held out for validation; every other fold is used for training.
p_fold = 0
in_valid_fold = train_df['fold'] == p_fold
p_train = train_df[~in_valid_fold].reset_index(drop=True)
p_valid = train_df[in_valid_fold].reset_index(drop=True)

# Row counts: training split first, validation split second.
for _split_frame in (p_train, p_valid):
    print(len(_split_frame))

# Preview the first rows of the training split.
p_train.head(n=5)

# 2. DataSet

In [None]:
# Name of the pretrained checkpoint; the same name is reused when the model is loaded.
checkpoint = "bert-base-uncased"
# Tokenizer matching `checkpoint` (presumably downloaded on first use, hence
# the "Internet is required" note at the top of the notebook).
tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint)

In [None]:
# Wrap both pandas splits as `datasets.Dataset` objects.
train_ds, valid_ds = (
    datasets.Dataset.from_pandas(frame) for frame in (p_train, p_valid)
)

# Print each dataset's summary: training first, validation second.
for _dataset in (train_ds, valid_ds):
    print(_dataset)

In [None]:
def tokenize_function(example):
    """Tokenize ``example["text"]`` with the module-level ``tokenizer``.

    Truncation is switched on; no padding is requested at this stage.
    Returns whatever the tokenizer call returns.
    """
    texts = example["text"]
    return tokenizer(texts, truncation=True)

# Apply the tokenizer to both splits; batched=True passes rows to
# `tokenize_function` in batches rather than one at a time.
tokenized_train_ds, tokenized_valid_ds = (
    dataset.map(tokenize_function, batched=True) for dataset in (train_ds, valid_ds)
)

# Print each tokenized dataset's summary: training first, validation second.
for _tokenized in (tokenized_train_ds, tokenized_valid_ds):
    print(_tokenized)

In [None]:
# Collator built from the tokenizer; used as `collate_fn` for every batch.
data_collator = transformers.DataCollatorWithPadding(tokenizer=tokenizer)


def _to_tf_dataset(tokenized_ds, shuffle):
    """Convert a tokenized dataset into a batched tf.data pipeline with labels.

    The train and validation pipelines differ only in `shuffle`.
    """
    return tokenized_ds.to_tf_dataset(
        columns=["attention_mask", "input_ids", "token_type_ids"],
        label_cols=["y"],
        shuffle=shuffle,
        collate_fn=data_collator,
        batch_size=config['batch_size'],
    )


# Only the training pipeline is shuffled.
tf_train_ds = _to_tf_dataset(tokenized_train_ds, shuffle=True)
tf_valid_ds = _to_tf_dataset(tokenized_valid_ds, shuffle=False)

# Number of batches in each pipeline: training first, validation second.
for _pipeline in (tf_train_ds, tf_valid_ds):
    print(len(_pipeline))

# 3. Model Training

In [None]:
from transformers import TFAutoModelForSequenceClassification

# Load the pretrained `checkpoint` with a sequence-classification head that
# has two output labels (matching the binary target y).
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

In [None]:
# NOTE(review): config['num_epochs'] (3) and config['learning_rate'] (1e-4) are
# not read here; the literals below (2 epochs, 5e-5) are what drive training.
num_epochs = 2
# Total number of batches seen across all epochs; used as the decay horizon.
num_train_steps = num_epochs * len(tf_train_ds)

# Learning rate starts at 5e-5 and decays to 0 over `num_train_steps` steps.
lr_scheduler = tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=5e-5,
    end_learning_rate=0.0,
    decay_steps=num_train_steps,
)

# The loss takes integer class labels and raw logits (from_logits=True).
optimizer = tf.keras.optimizers.Adam(learning_rate=lr_scheduler)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

model.summary()

In [None]:
# Train for `num_epochs` epochs, evaluating on the held-out fold as validation
# data; the value returned by fit() is kept for later inspection.
fit_history = model.fit(
    tf_train_ds,
    validation_data=tf_valid_ds,
    epochs=num_epochs,
    verbose=1,
)

# 4. Prediction & Submit

In [None]:
# Comments to be scored for the submission.
# NOTE(review): `tokenize_function` reads a "text" column, so this file is
# presumably expected to provide one — confirm against the CSV.
test_df = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")
test_ds = datasets.Dataset.from_pandas(test_df)
tokenized_test_ds = test_ds.map(tokenize_function, batched=True)

# Same feature columns, collator and batch size as the training pipelines,
# but without labels and without shuffling.
test_pipeline_kwargs = {
    "columns": ["attention_mask", "input_ids", "token_type_ids"],
    "shuffle": False,
    "collate_fn": data_collator,
    "batch_size": config['batch_size'],
}
tf_test_ds = tokenized_test_ds.to_tf_dataset(**test_pipeline_kwargs)

In [None]:
# Run the fine-tuned classifier over the comments to score.
raw_result = model.predict(tf_test_ds)

# The model emits two logits per comment and was trained with sparse
# categorical cross-entropy on y (0 = non-toxic, 1 = toxic), so the toxicity
# score is the softmax probability of class 1.
# Bug fix: the previous code applied an element-wise sigmoid and took column 0,
# i.e. it scored comments by the *non-toxic* logit, which inverts the ranking.
result = tf.nn.softmax(raw_result.logits, axis=-1)

test_df['score'] = result.numpy()[:, 1]
submission_df = test_df[['comment_id', 'score']]

# Write the submission file, then display the frame as the cell output.
submission_df.to_csv("submission.csv", index=False)
submission_df