In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt #visualization

from keras.preprocessing.text import Tokenizer # tokenize
from keras.preprocessing.sequence import pad_sequences # padding
from keras.layers import Dense, Input, GlobalMaxPooling1D, Conv1D, MaxPooling1D, Embedding # model layers 
from keras.models import Model # model
from sklearn.metrics import roc_auc_score # model evaluation score metrics


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#in this notebook, pretrained GloVe is used as the word vector for the model

#word vector link
word_vec_link = 'http://nlp.standford.edu/data/glove.6B.zip' #specify link for the GloVe
!wget http://nlp.stanford.edu/data/glove.6B.zip #download the word vector
!unzip glove*.zip #unzip the word vector

#unzip dataset
!unzip ../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip

In [None]:
#default configuration shared by the cells below
MAX_SEQUENCE_LENGTH = 100  #length every tokenized comment is padded/truncated to
MAX_VOCAB_SIZE = 20_000  #vocabulary cap given to the tokenizer and the embedding matrix
EMBEDDING_DIM = 100  #selects glove.6B.<dim>d.txt and sets the embedding width
VALIDATION_SPLIT = 0.2  #fraction handed to model.fit as validation_split
BATCH_SIZE, EPOCHS = 128, 10  #training batch size and number of epochs

In [None]:
#load glove vector into a {word: float32 vector} lookup

print('loading word vectors...')
word2vec = {}
#each line of the file is "<word> <v1> <v2> ... <vN>" separated by whitespace
#the encoding is given explicitly so the read does not depend on the platform's default locale
#NOTE(review): GloVe files are distributed as UTF-8 text - confirm if another vector file is swapped in
with open('./glove.6B.%sd.txt' % EMBEDDING_DIM, encoding = 'utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]  #first field is the token itself
        vec = np.asarray(values[1:], dtype = 'float32')  #remaining fields are the vector components
        word2vec[word] = vec
print('Found %s word vectors.' % len(word2vec))


In [None]:
#load dataset
train = pd.read_csv('./train.csv')
#take only the comment text and replace missing comments with a placeholder string
sentences = train['comment_text'].fillna('DUMMY_VALUE').values
#the label columns; each one becomes one output of the model
possible_labels = ['toxic', 'severe_toxic', 'obscene','threat', 'insult', 'identity_hate']
#target matrix with one row per comment and one column per label
targets = train[possible_labels].values

#len(s) counts characters, not tokens, so this number is NOT comparable to MAX_SEQUENCE_LENGTH;
#the label says so explicitly to avoid that confusion
print('max comment length (characters):', max(len(s) for s in sentences))

In [None]:
#turn every comment into a list of integer word ids

#num_words limits how many words are used when texts are converted
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
#build the word -> index dictionary from the whole dataset
tokenizer.fit_on_texts(sentences)
#replace each word of each comment with its index from that dictionary
sequences = tokenizer.texts_to_sequences(sentences)


In [None]:
#show the first comment before and after tokenization, separated by a blank line
print(
    'actual sentence:',
    sentences[0],
    '',
    'after convert using tokenizer:',
    sequences[0],
    sep='\n',
)

In [None]:
#word -> index dictionary built by the tokenizer; report how many words it holds
word2idx = tokenizer.word_index
n_unique_tokens = len(word2idx)
print('Found %s unique tokens.' % n_unique_tokens)

#this count can be larger than the 'num_words' given to the tokenizer because:
#1) the tokenizer indexes every unique word that exists in the dataset
#2) 'num_words' only sets how many of those indexed words are actually used

In [None]:
#bring every tokenized comment to the same length (MAX_SEQUENCE_LENGTH)
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
padded_shape = data.shape
print('Shape of data tensor: ', padded_shape)

In [None]:
#show the first comment after padding
first_padded = data[0]
print(first_padded)

In [None]:
#build the embedding matrix: row i holds the pretrained GloVe vector of the word whose tokenizer index is i
#rows for words that GloVe does not know stay all-zero

print('Filling pre_trained embedding....')
#number of rows: the vocabulary cap, or (unique words + 1) when the dataset has fewer words than the cap
num_words = min(MAX_VOCAB_SIZE, len(word2idx) +1)
#matrix of shape (num_words, EMBEDDING_DIM), zero-initialised
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for token, idx in word2idx.items():
    #words whose index is at or beyond the vocabulary cap have no row
    if idx >= MAX_VOCAB_SIZE:
        continue
    glove_vec = word2vec.get(token)
    #no pretrained vector for this word: leave its row as zeros
    if glove_vec is None:
        continue
    embedding_matrix[idx] = glove_vec
print('Done!!')

In [None]:
#example: how the word 'the' looks as a tokenizer index and as a GloVe vector

print(
    'word "the"',
    '',
    '"the" in transform tokenizer',
    word2idx['the'],
    '',
    '"the" in transform GloVe vector',
    word2vec.get('the'),
    sep='\n',
)

In [None]:
#Embedding layer initialised with the pretrained matrix and kept frozen during training

embedding_layer = Embedding(
    input_dim=num_words,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

In [None]:
#build the model: a 1D convnet with global max pooling on top of the embedding layer
print('Building model...')

input_ = Input(shape=(MAX_SEQUENCE_LENGTH,))
x = embedding_layer(input_)

#two identical convolution + max-pooling stages
for _stage in range(2):
    x = Conv1D(128, 3, activation='relu')(x)
    x = MaxPooling1D(3)(x)

#final convolution, then global max pooling over the sequence axis
x = Conv1D(128, 3, activation='relu')(x)
x = GlobalMaxPooling1D()(x)

#one sigmoid output per label column
output = Dense(len(possible_labels), activation='sigmoid')(x)

model = Model(input_, output)
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

model.summary()

In [None]:
#fit the model; the returned history object 'r' holds the per-epoch metrics
print('Training model....')
r = model.fit(
    x=data,
    y=targets,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=VALIDATION_SPLIT,
)


In [None]:
#plot training vs validation loss per epoch
plt.plot(r.history['loss'], label = 'loss')
plt.plot(r.history['val_loss'], label = 'val_loss')
plt.title('Loss per epoch')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend()
plt.show()


#plot training vs validation accuracy per epoch
plt.plot(r.history['accuracy'], label = 'acc')
plt.plot(r.history['val_accuracy'], label = 'val_acc')
plt.title('Accuracy per epoch')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend()
plt.show()

#print the mean ROC AUC over the label columns
#NOTE: this first number is computed on ALL rows, including the rows the model was trained on
p = model.predict(data)
n_labels = targets.shape[1]  #derived from the data instead of a hard-coded 6
aucs = []
for j in range(n_labels):
    auc = roc_auc_score(targets[:,j], p[:,j])
    aucs.append(auc)
print(np.mean(aucs))

#same metric restricted to the rows held out by validation_split
#NOTE(review): per the Keras docs, validation_split takes the LAST fraction of the rows before
#shuffling; the slice below mirrors that - confirm if the fit call changes
split_at = int(len(data) * (1 - VALIDATION_SPLIT))
val_aucs = [
    roc_auc_score(targets[split_at:, j], p[split_at:, j])
    for j in range(n_labels)
]
print('validation mean AUC:', np.mean(val_aucs))