# Preface

# Import 
## Import Library
First, import the libraries needed to solve the problem.

In [None]:
import os
import zipfile
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
%matplotlib inline

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Dense, Input, Dropout, Activation
from keras.layers import Bidirectional, LSTM, Embedding, GlobalMaxPool1D
from keras import initializers, regularizers, constraints, optimizers, layers
from keras import backend as K
from keras import callbacks
from sklearn.model_selection import train_test_split

## Import Dataset

In [None]:
# Locations of the zipped competition files (Kaggle input directory).
samplesub_zip = '../input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip'
test_zip = '../input/jigsaw-toxic-comment-classification-challenge/test.csv.zip'
test_labels_zip = '../input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip'
train_zip = '../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip'

# Working directory that receives the extracted CSV files.
base_dir = './jigsawtoxic/'

# Extract every archive into base_dir. The context manager closes each archive
# even when extraction raises, so no file handle is leaked.
for file_dir in [samplesub_zip, test_zip, test_labels_zip, train_zip]:
    with zipfile.ZipFile(file_dir, 'r') as zip_ref:
        zip_ref.extractall(base_dir)

# Show what was extracted.
os.listdir(base_dir)

# EDA (Exploratory Data Analysis)
We take a look inside the dataset (the train set and the test set) and at what the submission file should look like.

In [None]:
# Load the training set from the extraction directory (base_dir) instead of
# repeating the hard-coded path.
train = pd.read_csv(os.path.join(base_dir, 'train.csv'))
train

In [None]:
# Load the test set from the extraction directory (base_dir) instead of
# repeating the hard-coded path.
test = pd.read_csv(os.path.join(base_dir, 'test.csv'))
test

In [None]:
# Load the test labels from the extraction directory (base_dir) instead of
# repeating the hard-coded path.
test_labels = pd.read_csv(os.path.join(base_dir, 'test_labels.csv'))
test_labels

In [None]:
# Load the submission template from the extraction directory (base_dir)
# instead of repeating the hard-coded path.
sample_submission = pd.read_csv(os.path.join(base_dir, 'sample_submission.csv'))
sample_submission

In [None]:
# Print how often each value occurs in every label column of the training set.
for column_name in ('toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'):
    counts = train[column_name].value_counts()
    print(counts)

In [None]:
# Print how often each value occurs in every label column of the test labels.
for column_name in ('toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'):
    counts = test_labels[column_name].value_counts()
    print(counts)

According to the dataset page description, a value of -1 indicates that the row was not used for scoring. (Note: this file was added after the competition closed, and this notebook was written after the competition was over.)

Drop the columns that are not needed for training.

In [None]:
# errors="ignore" keeps this cell idempotent: re-running it after "id" has
# already been dropped no longer raises a KeyError.
train = train.drop(columns=["id"], errors="ignore")

In [None]:
# Per-column flag: True when the column contains at least one null value.
train_null_flags = train.isnull().any()
test_labels_null_flags = test_labels.isnull().any()

print(train_null_flags, "\n")
print(test_labels_null_flags)

**Note:** `False` for a column means that the column contains no null values.

# Tokenization

In [None]:
# Names of the six binary target columns.
data_labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Raw comment text for the train and test sets.
data_train = train['comment_text']
data_test = test['comment_text']

# Target matrix with one column per label in data_labels.
y_train = train[data_labels].to_numpy()

In [None]:
# Hyper-parameters of the text representation.
embed_size = 50         # size of each word vector
max_features = 20000    # number of unique words to use (rows of the embedding matrix)
maxlen = 200            # maximum number of words kept per comment

# Build the vocabulary from the training comments only.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(data_train.tolist())

# Convert each comment into a sequence of word indices.
list_tokenized_train, list_tokenized_test = (
    tokenizer.texts_to_sequences(texts) for texts in (data_train, data_test)
)

# Pad or truncate every sequence to exactly maxlen entries.
X_train, X_test = (
    pad_sequences(sequences, maxlen=maxlen)
    for sequences in (list_tokenized_train, list_tokenized_test)
)

We are going to initialise the embedding layer with pre-trained word vectors, so that training is faster and the model does not have to learn word representations for the Jigsaw Toxic Comment data from scratch.

GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.

In this notebook, we use the 50-dimensional vectors from the GloVe 6B release (`glove.6B.50d.txt`).

In [None]:
# Path to the pre-trained GloVe vectors.
glove_embedding = '../input/glove6b50dtxt/glove.6B.50d.txt'


def get_coefs(word, *arr):
    """Return ``(word, vector)`` where ``vector`` is ``arr`` as a float32 array.

    ``word`` is passed through unchanged; the remaining positional arguments
    are the vector components and are converted to a NumPy float32 array.
    """
    vector = np.asarray(arr, dtype=np.float32)
    return word, vector
# Map every word in the GloVe file to its vector.
# The context manager closes the file once it has been read, the explicit
# encoding avoids depending on the platform default, and blank lines are
# skipped so get_coefs is never called without a word.
with open(glove_embedding, encoding='utf-8') as glove_file:
    embeddings_index = dict(
        get_coefs(*line.strip().split())
        for line in glove_file
        if line.strip()
    )

In [None]:
# np.stack needs a real sequence of arrays; passing the dict view directly is
# rejected by current NumPy releases, so materialise it as a list first.
all_embs = np.stack(list(embeddings_index.values()))

# Mean and standard deviation over all pre-trained vector components.
emb_mean, emb_std = all_embs.mean(), all_embs.std()
emb_mean, emb_std

In [None]:
word_index = tokenizer.word_index

# Tokenizer word indices start at 1 (index 0 is reserved for padding), so a
# vocabulary of V words needs V + 1 rows. Without the "+ 1" a vocabulary
# smaller than max_features would index past the end of the matrix.
nb_words = min(max_features, len(word_index) + 1)

# Rows for words missing from GloVe keep a random vector drawn with the same
# mean/std as the pre-trained vectors. A seeded generator makes the matrix
# reproducible across kernel restarts.
rng = np.random.RandomState(42)
embedding_matrix = rng.normal(emb_mean, emb_std, (nb_words, embed_size))

for word, i in word_index.items():
    # Skip words whose index falls outside the matrix.
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
# Number of tokens in each tokenized training comment.
total_num_words = list(map(len, list_tokenized_train))

In [None]:
# Distribution of comment lengths (in tokens) in the training set.
# Bin edges run from 0 to 490 in steps of 10; longer comments fall outside the
# bins and are not drawn.
fig, ax = plt.subplots()
ax.hist(total_num_words, bins=np.arange(0, 500, 10))
ax.set(title='Comment length distribution',
       xlabel='Tokens per comment',
       ylabel='Number of comments')
plt.show()

In [None]:
# Bidirectional LSTM classifier on top of the pre-trained GloVe embeddings.
model = tf.keras.Sequential([
    # Explicit input layer so the model is built immediately (needed for
    # model.summary()) without the deprecated `input_length` argument.
    tf.keras.Input(shape=(maxlen,), dtype='int32'),
    # input_dim is taken from the matrix itself so the layer can never disagree
    # with the weights it is initialised from. The Constant initializer replaces
    # the `weights=[...]` constructor argument.
    tf.keras.layers.Embedding(
        input_dim=embedding_matrix.shape[0],
        output_dim=embed_size,
        embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(60, return_sequences=True,
                                                       dropout=0.1, recurrent_dropout=0.1)),
    tf.keras.layers.GlobalMaxPool1D(),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(50, activation='relu'),
    tf.keras.layers.Dropout(0.1),
    # One independent sigmoid output per label: each label is either 0 or 1.
    tf.keras.layers.Dense(6, activation='sigmoid')])

# Binary cross-entropy because every label is an independent 0/1 target.
# Accuracy alone is uninformative when positive labels are rare, so per-label
# ROC AUC is tracked as well; 'accuracy' is kept for the plots further down.
model.compile(loss='binary_crossentropy', optimizer='adam',
              metrics=['accuracy',
                       tf.keras.metrics.AUC(name='auc', multi_label=True, num_labels=6)])

model.summary()

We use the `EarlyStopping` callback with `val_loss` as the monitored quantity, so that training stops once the validation loss has stopped decreasing: with `patience=5`, training ends after 5 consecutive epochs without improvement.

In [None]:
# Stop once val_loss has not improved for 5 consecutive epochs and roll the
# model back to the best epoch; otherwise the final weights would be the ones
# from 5 epochs after the best validation loss.
# tf.keras.callbacks is used so the callback comes from the same Keras
# implementation as the tf.keras model it is attached to.
early_stopping_cb = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=5, restore_best_weights=True)

In [None]:
# Training configuration.
batch_size = 2048
epochs = 100    # upper bound; early stopping normally ends training sooner

# validation_split=0.2 holds out 20% of the training data for validation.
history = model.fit(
    X_train,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.2,
    callbacks=[early_stopping_cb],
    verbose=1,
)

In [None]:
# Training vs. validation loss per epoch. The second curve comes from the
# validation split, not from the test set, so it is labelled accordingly.
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='train')
ax.plot(history.history['val_loss'], label='validation')
ax.set(title='Model Loss', xlabel='Epochs', ylabel='Loss')
ax.legend()
plt.show()

In [None]:
# Training vs. validation accuracy per epoch. The second curve comes from the
# validation split, not from the test set, so it is labelled accordingly.
fig, ax = plt.subplots()
ax.plot(history.history['accuracy'], label='train')
ax.plot(history.history['val_accuracy'], label='validation')
ax.set(title='Model Accuracy', xlabel='Epochs', ylabel='Accuracy')
ax.legend()
plt.show()

Make predictions for the test set with the trained model and write the submission file.

In [None]:
# Predicted probabilities for the test set, one column per label.
# The model has a single input, so the array is passed directly rather than
# wrapped in a list.
y_test = model.predict(X_test, batch_size=512, verbose=1)

In [None]:
# Fill a copy of the template so the loaded sample_submission frame stays
# untouched and this cell can be re-run safely.
submission = sample_submission.copy()

# Fail early with a clear message if predictions and template do not line up.
if len(submission) != len(y_test):
    raise ValueError(
        f"Expected {len(submission)} prediction rows, got {len(y_test)}")

# NOTE(review): assumes the template rows are in the same order as test.csv - confirm.
submission[data_labels] = y_test
submission.to_csv('submission.csv', index=False)
submission

# Reference:
* [GloVe Web](https://nlp.stanford.edu/projects/glove/)