In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
import zipfile
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
%matplotlib inline

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Dense, Input, Dropout, Activation
from keras.layers import Bidirectional, LSTM, Embedding, GlobalMaxPool1D
from keras import initializers, regularizers, constraints, optimizers, layers
from keras import backend as K
from keras import callbacks
from sklearn.model_selection import train_test_split
import transformers
from transformers import TFBertModel, BertTokenizer

## Using BERT as a strong baseline
1. Prepare the data
2. Compare three sentence representations: [CLS], Avg(last_hidden_state), Glo(last_hidden_state)
3. Conclusion and future work

### Prepare data

In [None]:
# Using zipfile to extract the data
import zipfile


samplesub_zip = '../input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip'
test_zip = '../input/jigsaw-toxic-comment-classification-challenge/test.csv.zip'
test_labels_zip = '../input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip'
train_zip = '../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip'

# Directory that receives the extracted files; defined once and reused below.
base_dir = './jigsawtoxic/'

# Extract every archive into base_dir. The context manager closes each archive
# even if extraction raises, which a manual open/close pair does not guarantee.
for file_dir in [samplesub_zip, test_zip, test_labels_zip, train_zip]:
    with zipfile.ZipFile(file_dir, 'r') as zip_ref:
        zip_ref.extractall(base_dir)

# Show what was extracted (cell output).
os.listdir(base_dir)

In [None]:
# get df datatype


# Load the four extracted CSV files from base_dir, one DataFrame per file.
train, test, test_labels, sample_submission = (
    pd.read_csv(base_dir + csv_name)
    for csv_name in ('train.csv', 'test.csv', 'test_labels.csv', 'sample_submission.csv')
)

In [None]:
# See the data

train.head()

In [None]:
test.head()

In [None]:
print(train.shape, test.shape, test_labels.shape)

In [None]:
test_labels

In [None]:
# see the classification distribution

# Print the value counts of each label column in the training frame.
label_names = ('toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate')
for label_name in label_names:
    counts = train[label_name].value_counts()
    print(counts)

### Model building


In [None]:
# Model

class MultiLabelBert(keras.Model):
    """BERT encoder followed by a sigmoid dense head for multi-label output.

    Args:
        num_labels: Number of units of the final sigmoid Dense layer.
        token_used: Which BERT output feeds the classifier.
            'CLS' uses ``pooler_output``; 'AVG' applies GlobalAveragePooling1D
            to ``last_hidden_state``; 'GLO' applies GlobalMaxPool1D to
            ``last_hidden_state``.
        add_tfidf: When True, the ``'tfidf'`` entry of the input dict is
            concatenated (last axis) to the BERT representation before the
            classifier.
    """

    # Entries of the input dict that are forwarded to BERT. Anything else
    # (e.g. 'tfidf') is consumed by this class and kept out of the encoder call.
    _BERT_INPUT_KEYS = ('input_ids', 'token_type_ids', 'attention_mask')

    def __init__(self, num_labels, token_used='CLS', add_tfidf=False):
        # Validate before loading the pretrained weights so a typo fails fast.
        assert token_used in ['CLS', 'AVG', 'GLO'], (
            "token_used must be one of 'CLS', 'AVG', 'GLO'; got %r" % (token_used,))
        super(MultiLabelBert, self).__init__()
        self.bert = TFBertModel.from_pretrained('bert-base-uncased')
        self.classifier = keras.layers.Dense(units=num_labels, activation='sigmoid')
        self.add_tfidf = bool(add_tfidf)
        if self.add_tfidf:
            self.concate_layer = keras.layers.Concatenate(axis=-1)
        self.token_used = token_used
        self.avg_pooling = keras.layers.GlobalAveragePooling1D()
        self.glo_pooling = keras.layers.GlobalMaxPool1D()

    def call(self, x):
        """Return the sigmoid outputs of the classifier for a batch ``x``.

        ``x`` is normally a dict holding the BERT inputs (and ``'tfidf'`` when
        ``add_tfidf`` is set); a non-dict input is passed to BERT unchanged.
        """
        if isinstance(x, dict):
            bert_inputs = {key: x[key] for key in self._BERT_INPUT_KEYS if key in x}
        else:
            bert_inputs = x
        bert_outputs = self.bert(bert_inputs)

        # NOTE(review): no mask is passed to the pooling layers here, so padded
        # positions of last_hidden_state appear to take part in AVG/GLO pooling
        # -- confirm this is intended.
        if self.token_used == 'CLS':
            bert_embedding = bert_outputs['pooler_output']
        elif self.token_used == 'AVG':
            bert_embedding = self.avg_pooling(bert_outputs['last_hidden_state'])
        else:
            bert_embedding = self.glo_pooling(bert_outputs['last_hidden_state'])

        if self.add_tfidf:
            tfidf_embedding = x['tfidf']
            all_embedding = self.concate_layer([bert_embedding, tfidf_embedding])
            return self.classifier(all_embedding)
        return self.classifier(bert_embedding)

In [None]:
def convert_example_to_feature(tokenizer, review, max_length=128):
    """Encode one text into BERT inputs (a dict) with the given tokenizer.

    Args:
        tokenizer: Object exposing ``encode_plus`` (a BERT tokenizer).
        review: Text to encode.
        max_length: Length every encoded sequence is padded/truncated to.

    Returns:
        Whatever ``tokenizer.encode_plus`` returns for these arguments.
    """
    return tokenizer.encode_plus(
        review,
        add_special_tokens=True,
        max_length=max_length,
        # `pad_to_max_length` is deprecated; request padding and truncation
        # explicitly so every sequence comes out exactly `max_length` long.
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
    )


def map_example_to_dict(input_ids, attention_masks, token_type_ids, label, tf_idf=None):
    """Pack per-example inputs into the feature dict expected by the model.

    Args:
        input_ids: Token ids.
        attention_masks: Attention mask.
        token_type_ids: Segment ids.
        label: Target for the example; returned unchanged.
        tf_idf: Optional extra feature vector, stored under ``"tfidf"``.

    Returns:
        A ``(features, label)`` tuple where ``features`` is a dict.
    """
    features = {
        "input_ids": input_ids,
        "token_type_ids": token_type_ids,
        "attention_mask": attention_masks,
    }
    # Compare against None: truth-testing a tensor/array is ambiguous or an error.
    if tf_idf is not None:
        # Key is "tfidf" to match the key MultiLabelBert.call reads.
        features["tfidf"] = tf_idf
    return features, label


def encode_examples(ds, limit=-1, add_tfidf=False):
    """Tokenize a labelled DataFrame and wrap it in a ``tf.data.Dataset``.

    Reads the ``comment_text`` and ``y`` columns of each row (plus ``tfidf``
    when ``add_tfidf`` is set), encodes the text with the module-level
    ``tokenizer`` and maps every example through ``map_example_to_dict``.

    Args:
        ds: DataFrame with the columns described above.
        limit: When positive, only the first ``limit`` rows are encoded.
        add_tfidf: Whether to carry the ``tfidf`` column along.

    Returns:
        An unbatched dataset of ``(features_dict, label)`` pairs.
    """
    if limit > 0:
        # DataFrame.take expects positional indices, not a row count;
        # head(limit) is what selects the first `limit` rows.
        ds = ds.head(limit)

    input_ids_list = []
    token_type_ids_list = []
    attention_mask_list = []
    label_list = []
    tfidf_list = []

    for _, row in ds.iterrows():
        bert_input = convert_example_to_feature(tokenizer, row["comment_text"])
        input_ids_list.append(bert_input['input_ids'])
        token_type_ids_list.append(bert_input['token_type_ids'])
        attention_mask_list.append(bert_input['attention_mask'])
        label_list.append(row["y"])
        if add_tfidf:
            tfidf_list.append(row["tfidf"])

    # Tuple order follows the positional parameters of map_example_to_dict.
    tensors = (input_ids_list, attention_mask_list, token_type_ids_list, label_list)
    if add_tfidf:
        tensors = tensors + (tfidf_list,)
    return tf.data.Dataset.from_tensor_slices(tensors).map(map_example_to_dict)

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
num_labels = 6
glo_bert = MultiLabelBert(num_labels, 'GLO')

In [None]:
# Multi-label task: pack the label columns of every row into one list stored in 'y'.
# The label columns are everything from position 2 onwards, excluding 'y' itself
# so that re-running the cell gives the same result.
label_columns = [col for col in train.columns[2:] if col != 'y']

# One vectorized conversion instead of a per-cell .iloc loop over every row and label.
train['y'] = train[label_columns].values.tolist()

train.head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

print(train.shape)
# Fixed seed so the shuffle, and therefore the train/validation split, is reproducible.
train = shuffle(train, random_state=42)

# First 90% of the shuffled rows for training, the remaining 10% for validation.
nums = int(len(train) * 0.9)

train_set = train[:nums][['comment_text', 'y']]
val_set = train[nums:][['comment_text', 'y']]

In [None]:
batch_size = 64

# Encode the training and validation frames and batch both with the same batch size.
ds_train_encoded, ds_val_encoded = (
    encode_examples(split_frame).batch(batch_size)
    for split_frame in (train_set, val_set)
)

In [None]:
def convert_sentence_to_features(tokenizer, text, max_length=128):
    """Encode one text into BERT inputs (a dict) with the given tokenizer.

    Args:
        tokenizer: Object exposing ``encode_plus`` (a BERT tokenizer).
        text: Text to encode.
        max_length: Length every encoded sequence is padded/truncated to.

    Returns:
        Whatever ``tokenizer.encode_plus`` returns for these arguments.
    """
    return tokenizer.encode_plus(
        text,
        max_length=max_length,
        add_special_tokens=True,
        # `pad_to_max_length` is deprecated; request padding and truncation
        # explicitly so every sequence comes out exactly `max_length` long.
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
    )


def map_datarow_to_dict(input_ids, token_type_ids, attention_mask, tfidf=None):
    """Pack per-example inputs (no label) into the feature dict used for prediction.

    Args:
        input_ids: Token ids.
        token_type_ids: Segment ids.
        attention_mask: Attention mask.
        tfidf: Optional extra feature vector, stored under ``"tfidf"``.

    Returns:
        A dict keyed by ``input_ids``, ``token_type_ids``, ``attention_mask``
        and, when given, ``tfidf``.
    """
    features = {
        "input_ids": input_ids,
        "token_type_ids": token_type_ids,
        "attention_mask": attention_mask,
    }
    # Compare against None: truth-testing a tensor/array is ambiguous or an error.
    if tfidf is not None:
        features["tfidf"] = tfidf
    return features


def encode_examples_test(ds, limit=-1, add_tfidf=False):
    """Tokenize an unlabelled DataFrame and wrap it in a ``tf.data.Dataset``.

    Reads the ``comment_text`` column of each row (plus ``tfidf`` when
    ``add_tfidf`` is set), encodes the text with the module-level ``tokenizer``
    and maps every example through ``map_datarow_to_dict``.

    Args:
        ds: DataFrame with the columns described above.
        limit: When positive, only the first ``limit`` rows are encoded.
        add_tfidf: Whether to carry the ``tfidf`` column along.

    Returns:
        An unbatched dataset of feature dicts.
    """
    if limit > 0:
        # DataFrame.take expects positional indices, not a row count;
        # head(limit) is what selects the first `limit` rows.
        ds = ds.head(limit)

    input_ids_list, token_type_ids_list, attention_mask_list = [], [], []
    tfidf_list = []

    for _, row in ds.iterrows():
        bert_input = convert_sentence_to_features(tokenizer, row['comment_text'])
        input_ids_list.append(bert_input['input_ids'])
        token_type_ids_list.append(bert_input['token_type_ids'])
        attention_mask_list.append(bert_input['attention_mask'])
        if add_tfidf:
            tfidf_list.append(row['tfidf'])

    # The tuple order must follow map_datarow_to_dict's positional parameters
    # (input_ids, token_type_ids, attention_mask[, tfidf]). Passing the mask
    # before the segment ids would store each under the other's key.
    tensors = (input_ids_list, token_type_ids_list, attention_mask_list)
    if add_tfidf:
        tensors = tensors + (tfidf_list,)
    return tf.data.Dataset.from_tensor_slices(tensors).map(map_datarow_to_dict)

In [None]:
batch_size = 64
ds_test_encoded = encode_examples_test(test).batch(batch_size)

In [None]:
for x in ds_val_encoded.take(1):
    print(x)

In [None]:
num_labels = 6
learning_rate = 2e-5

# Compile and fine-tune glo_bert for three epochs on the encoded training set,
# validating on the encoded validation set after each epoch.
optimizer = tf.keras.optimizers.Adam(
    learning_rate=learning_rate,
    epsilon=1e-08,
    clipnorm=1,
)
glo_bert.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
glo_bert.fit(
    ds_train_encoded,
    validation_data=ds_val_encoded,
    epochs=3,
)

In [None]:
num_labels = 6
learning_rate = 2e-5

# Build the 'AVG' variant, then compile and fine-tune it for three epochs,
# validating on the encoded validation set after each epoch.
avg_bert = MultiLabelBert(num_labels, 'AVG')

optimizer = tf.keras.optimizers.Adam(
    learning_rate=learning_rate,
    epsilon=1e-08,
    clipnorm=1,
)
avg_bert.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
avg_bert.fit(
    ds_train_encoded,
    validation_data=ds_val_encoded,
    epochs=3,
)

In [None]:
num_labels = 6
learning_rate = 2e-5

# Build the 'CLS' variant, then compile and fine-tune it for three epochs,
# validating on the encoded validation set after each epoch.
cls_bert = MultiLabelBert(num_labels, 'CLS')

optimizer = tf.keras.optimizers.Adam(
    learning_rate=learning_rate,
    epsilon=1e-08,
    clipnorm=1,
)
cls_bert.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
cls_bert.fit(
    ds_train_encoded,
    validation_data=ds_val_encoded,
    epochs=3,
)

In [None]:
# Submission columns that receive the predictions. `data_labels` was not defined
# anywhere in the notebook, so the cell failed on a fresh kernel.
# NOTE(review): assumes the model's output units follow this column order -- confirm
# against the label columns of train.
data_labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# ds_test_encoded is already batched, so batch_size is not passed to predict().
y_test = cls_bert.predict(ds_test_encoded)
sample_submission[data_labels] = y_test
sample_submission.to_csv('cls_bert_submission.csv', index=False)
sample_submission.head()

In [None]:
# Submission columns that receive the predictions. `data_labels` was not defined
# anywhere in the notebook, so the cell failed on a fresh kernel.
# NOTE(review): assumes the model's output units follow this column order -- confirm
# against the label columns of train.
data_labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# ds_test_encoded is already batched, so batch_size is not passed to predict().
y_test = avg_bert.predict(ds_test_encoded)
sample_submission[data_labels] = y_test
sample_submission.to_csv('avg_bert_submission.csv', index=False)
sample_submission.head()

In [None]:
# Submission columns that receive the predictions. `data_labels` was not defined
# anywhere in the notebook, so the cell failed on a fresh kernel.
# NOTE(review): assumes the model's output units follow this column order -- confirm
# against the label columns of train.
data_labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# ds_test_encoded is already batched, so batch_size is not passed to predict().
y_test = glo_bert.predict(ds_test_encoded)
sample_submission[data_labels] = y_test
sample_submission.to_csv('glo_bert_submission.csv', index=False)
sample_submission.head()

## Conclusion
- It is an interesting experiment to compare [CLS], Average [last_hidden_state], and Global [last_hidden_state].
- BERT is a strong starting method, but it needs a lot of computation power to train (compared to a simple CNN or RNN model).
- I hope this work helps others get started with a transformer-based model for the competition.

---

## Future work
- data part
    - data preprocessing
        - There is a lot of noise in the comments, such as ==, @@.
    - max_length:
        - The tokenizer's max_length is a parameter that needs tuning, because truncating to it may drop necessary tokens.
    - split method:
        - The current split method is simple, and the data class labels are obviously unbalanced.
    - data augmentation:
        - We could use data augmentation to make the model more robust. There are many methods in NLP, such as WordNet (probably not suitable in this case) or a [pre-trained NLP aug model](https://arxiv.org/abs/2003.02245).
- model part
    - BERT
        - BERT is powerful, but nowadays there are many BERT-based models that go further in some aspects, such as speed ([DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)), hyperparameters ([RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)), or other language models ([XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)).
        - optimizer: The optimizer, learning rate, and other hyperparameters are all adjustable, but the model is too big to run several experiments.
        