In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file beneath the Kaggle input directory and print its full path.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import tensorflow as tf
import numpy as np
import random
from sklearn.model_selection import train_test_split

# Single seed shared by every random number generator used in this notebook.
SEED = 0

# Seed Python's, NumPy's and TensorFlow's generators in turn.
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [None]:
!unzip ../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip
!unzip ../input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip
!unzip ../input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip
!unzip ../input/jigsaw-toxic-comment-classification-challenge/test.csv.zip

In [None]:
# Training data: read train.csv from the working directory into a DataFrame.
df = pd.read_csv("train.csv")

In [None]:
# Test-set labels: read test_labels.csv from the working directory into a DataFrame.
df_tst_label = pd.read_csv("test_labels.csv")

In [None]:
# Preview the first 10 rows of the test labels.
df_tst_label.head(10)

In [None]:
# Test data: read test.csv from the working directory into a DataFrame.
df_test = pd.read_csv('test.csv')

In [None]:
# Preview the first rows of the test data.
df_test.head()

In [None]:
# Targets are every column of the training frame from the third one onward.
train_y = df.iloc[:, 2:].values

# Raw comment strings for the training and test splits.
text_column = "comment_text"
train_X = df[text_column].values
test_X = df_test[text_column].values

In [None]:
from keras.preprocessing import text, sequence
# Vocabulary cap handed to the tokenizer.
max_num_words = 30000
tokenizer = text.Tokenizer(num_words=max_num_words)
# Fit the vocabulary on the training and test comments together.
tokenizer.fit_on_texts([*train_X, *test_X])

# Swap each comment for its sequence of integer word indices.
train_X, test_X = (
    tokenizer.texts_to_sequences(comments) for comments in (train_X, test_X)
)

In [None]:
import seaborn as sns
# Number of tokens in every training comment, plotted as a distribution.
sentence_lengths = list(map(len, train_X))
sns.distplot(sentence_lengths);

# Bring every tokenised comment to the same fixed length.
max_length = 400
train_X, test_X = (
    sequence.pad_sequences(token_ids, maxlen=max_length)
    for token_ids in (train_X, test_X)
)

In [None]:
# Path to the GloVe embeddings file. Replace it with the path to your own copy if needed;
# the file can be downloaded from https://www.kaggle.com/watts2/glove6b50dtxt
EMBEDDING_FILE =  '../input/glove6b50dtxt/glove.6B.50d.txt'   

def get_coefs(word, *arr):
    """Pair a token with its embedding components.

    Args:
        word: the token the vector belongs to.
        *arr: the vector components (numbers or numeric strings).

    Returns:
        A ``(word, vector)`` tuple where ``vector`` is a float32 NumPy array.
    """
    vector = np.asarray(arr, dtype=np.float32)
    return word, vector
# Parse the embeddings file into a {word: vector} dictionary, one entry per line.
# The context manager closes the file handle once parsing finishes, and the
# explicit encoding keeps decoding independent of the platform default.
with open(EMBEDDING_FILE, encoding='utf-8') as embedding_file:
    embeddings_index = dict(
        get_coefs(*line.rstrip().rsplit(' ')) for line in embedding_file
    )

word_index = tokenizer.word_index
max_number_words = 30000
embedding_dimension = 50
# The Keras Tokenizer numbers words starting at 1 (index 0 is never assigned to a word),
# so a vocabulary of N words needs N + 1 rows; without the + 1 the last word would
# index past the end of the matrix whenever the vocabulary is smaller than the cap.
number_words = min(max_number_words, len(word_index) + 1)
# Row i holds the pretrained vector of the word with index i; words missing from
# the embeddings file keep an all-zero row.
embedding_matrix = np.zeros((number_words, embedding_dimension))
for word, i in word_index.items():
    # Guard on the actual row count so every write stays in bounds.
    if i >= number_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
from tensorflow.keras import Model, activations
from tensorflow.keras.layers import Dense, Concatenate, GRU, LSTM, SpatialDropout1D, \
Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D, Embedding

gru_hidden_size = 40
dropout_rate = 0.1

class gru_model(Model):
  """Bidirectional GRU classifier on top of pretrained word embeddings.

  Token ids are embedded, regularised with spatial dropout, passed through a
  bidirectional GRU that returns the full sequence, summarised with global
  average and global max pooling, and mapped to 6 sigmoid outputs.
  """

  def __init__(self):
    super().__init__()
    self.gru = Bidirectional(GRU(units=gru_hidden_size, return_sequences=True))
    # Spatial dropout zeroes entire feature channels (embedding dimensions) across all
    # timesteps rather than individual elements, which regularises better when
    # neighbouring values of a channel are strongly correlated.
    self.spatial_dropout = SpatialDropout1D(dropout_rate)
    self.global_avg_pooling = GlobalAveragePooling1D()
    self.global_max_pooling = GlobalMaxPooling1D()
    self.embedding = Embedding(max_number_words, embedding_dimension, input_length=max_length, weights=[embedding_matrix])
    # Created once here instead of instantiating a new layer on every forward pass.
    self.concatenate = Concatenate(axis=1)
    self.fc_layer = Dense(6, activation="sigmoid")

  def call(self, x, training=None):
    """Forward pass for the network.

    Args:
      x: integer token ids; the embedding layer is applied first, so the input
        is expected in the form (batch, seq length).
      training: whether the call runs in training mode. Left as None so Keras
        supplies the mode itself (training in fit, inference in predict), and a
        plain ``model(x)`` call no longer applies dropout by default.

    Returns:
      The output of the final 6-unit sigmoid layer, one row per input sequence.
    """
    x = self.embedding(x)
    # The dropout layer decides from the flag whether to drop anything, so no
    # Python-level branch on `training` is needed.
    x = self.spatial_dropout(x, training=training)
    x = self.gru(x)
    avg_pool = self.global_avg_pooling(x)
    max_pool = self.global_max_pooling(x)
    x = self.concatenate([avg_pool, max_pool])
    x = self.fc_layer(x)
    return x

In [None]:
from keras.callbacks import Callback
from sklearn.metrics import roc_auc_score
                                          
class ROCAUCEvaluation(Callback):
    """Keras callback that prints the ROC-AUC on a held-out set at the end of epochs.

    Args:
        validation_data: ``(X_val, y_val)`` pair used for scoring. It must contain
            exactly two items; anything else fails on unpacking.
        interval: the score is computed on epochs whose zero-based index is a
            multiple of this value.
    """

    def __init__(self, validation_data=(), interval=1):
        # Run Callback.__init__ itself so the base class sets up its own state;
        # super(Callback, self) would start the lookup *after* Callback and skip it.
        super().__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Score the validation set and print the result for the finished epoch.

        Args:
            epoch: zero-based index of the epoch that just ended.
            logs: metrics dictionary passed by Keras; not used here. Defaults to
                None rather than a shared mutable dict.
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            # Report the epoch one-based.
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

In [None]:
# Training hyperparameters.
batch_size = 32
epochs = 1

# Build the network and compile it with Adam and a binary cross-entropy loss.
model = gru_model()
optimizer = tf.keras.optimizers.Adam()
model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Keep 95% of the data for training; the remainder is the validation split.
tr_X, val_X, tr_y, val_y = train_test_split(
    train_X,
    train_y,
    train_size=0.95,
    random_state=SEED,
)

# Print the validation ROC-AUC after every epoch, then fit.
rocauc = ROCAUCEvaluation(validation_data=(val_X, val_y), interval=1)
hist = model.fit(
    tr_X,
    tr_y,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(val_X, val_y),
    callbacks=[rocauc],
)

In [None]:
# Create the Kaggle submission file: take the sample submission as a template,
# overwrite its label columns with the model's predictions and save it as CSV.
label_columns = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]
df_sample = pd.read_csv("sample_submission.csv")
y_pred = model.predict(test_X, batch_size=1024)
df_sample[label_columns] = y_pred
df_sample.to_csv('submission.csv', index=False)