In [1]:
import warnings
warnings.simplefilter("ignore")
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import numpy as np
import pandas as pd
pd.set_option("display.max_columns", None)
import matplotlib.pyplot as plt
%matplotlib inline
import gensim
import keras.backend as K
from keras.callbacks import Callback, ModelCheckpoint, EarlyStopping
from keras.initializers import Constant
from keras.layers import (Input, Embedding, SpatialDropout1D, Bidirectional,
                          GlobalAveragePooling1D, GlobalMaxPooling1D,
                          LSTM, CuDNNLSTM, GRU, CuDNNGRU, concatenate, Dropout, Dense, Activation,
                          Lambda, Flatten, RepeatVector, Permute, Multiply)
from keras.models import Model
from keras.optimizers import Adam
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

Using TensorFlow backend.


In [2]:
# Vocabulary cap: passed to the Keras Tokenizer as `num_words` and used to
# bound the number of rows in the embedding matrix.
NUM_WORDS = 100000
# Length every tokenised comment is padded/truncated to by `pad_sequences`;
# also the length of the model's input layer.
MAXLEN = 400
# Width of a single word vector (columns of the embedding matrix). The
# pretrained files referenced in `get_embedding` all carry "300" in their
# names, so this is presumably expected to match them.
EMBEDDING_DIM = 300

In [3]:
class RocAucEvaluation(Callback):
    """Keras callback that prints the ROC-AUC on a held-out set after each epoch.

    Parameters
    ----------
    validation_data : tuple
        A ``(X_val, y_val)`` pair. ``X_val`` is fed to ``model.predict`` and
        ``y_val`` is compared against the predictions with ``roc_auc_score``.
    """

    def __init__(self, validation_data):
        # Initialise the base Callback first so the attributes it manages
        # exist before Keras attaches the model to this callback.
        super().__init__()
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the held-out set and print the resulting ROC-AUC.

        ``epoch`` is zero-based, so it is reported as ``epoch + 1``.
        ``logs`` is accepted for API compatibility and is not modified.
        """
        y_pred = self.model.predict(self.X_val, verbose=0)
        score = roc_auc_score(self.y_val, y_pred)
        print("\n ROC-AUC - epoch: {:d} - score: {:.6f}".format(epoch + 1, score))

In [4]:
# Load the preprocessed train/test comment tables.
train = pd.read_csv('data/train_preprocessed.csv')
test = pd.read_csv('data/test_preprocessed.csv')

# Replace missing comments with empty strings. The result is assigned back
# to the column: calling fillna(inplace=True) on a selected column is a
# chained in-place operation that pandas deprecates and that does not update
# the parent frame under Copy-on-Write.
train['comment_text'] = train['comment_text'].fillna("")
test['comment_text'] = test['comment_text'].fillna("")

# The six label columns; their order defines the column order of y_train
# and of the submission written later.
classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Lower-case the raw text; labels become a (n_samples, len(classes)) array.
X_train_raw, y_train = train['comment_text'].str.lower(), train[classes].values
X_test_raw = test['comment_text'].str.lower()

# The tokenizer vocabulary is fitted on the training text only, then applied
# to both splits.
tk = Tokenizer(num_words=NUM_WORDS)
tk.fit_on_texts(X_train_raw)
X_train = tk.texts_to_sequences(X_train_raw)
X_test = tk.texts_to_sequences(X_test_raw)

# Pad/truncate every sequence to exactly MAXLEN token ids.
X_train = pad_sequences(X_train, maxlen=MAXLEN)
X_test = pad_sequences(X_test, maxlen=MAXLEN)

# Number of embedding rows. The +1 presumably accounts for the Tokenizer's
# word indices starting at 1 (index 0 being left for padding).
nb_words = min(NUM_WORDS, len(tk.word_index) + 1)

In [5]:
def get_coefs(word, *arr):
    """Pair a token with its coefficients.

    ``arr`` holds the remaining fields of an embedding-file line; they are
    converted to a float32 numpy array. Returns ``(word, vector)``.
    """
    vector = np.array(arr, dtype='float32')
    return (word, vector)


def _load_text_embeddings(path):
    """Read a space-separated text embedding file into a ``{word: vector}`` dict.

    Every line is stripped, split on single spaces and handed to
    ``get_coefs``. Lines whose vector does not have exactly ``EMBEDDING_DIM``
    entries are skipped; this drops the "<count> <dim>" header line that
    fastText ``.vec`` files start with, which would otherwise be stored as a
    word with a one-element vector and broadcast over a whole embedding row.
    """
    embedding_index = {}
    # `with` guarantees the (multi-gigabyte) file handle is closed; the
    # encoding is explicit so parsing does not depend on the platform locale.
    with open(path, encoding="utf-8") as handle:
        for line in handle:
            word, vector = get_coefs(*line.strip().split(" "))
            if vector.shape[0] != EMBEDDING_DIM:
                continue
            embedding_index[word] = vector
    return embedding_index


def get_embedding(pretrained_word_vectors):
    """Build the embedding matrix for the tokenizer vocabulary.

    Parameters
    ----------
    pretrained_word_vectors : str
        Which pretrained vectors to load: ``"google"`` (word2vec binary,
        loaded through gensim), ``"glove"`` or ``"fasttext"`` (text files).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(nb_words, EMBEDDING_DIM)``. Row ``i`` holds the
        pretrained vector of the word with tokenizer index ``i``; rows of
        words missing from the pretrained vocabulary stay all-zero.

    Raises
    ------
    ValueError
        If ``pretrained_word_vectors`` is not one of the supported names.
    """
    if pretrained_word_vectors == "google":
        embedding_index = gensim.models.KeyedVectors.load_word2vec_format(
            "../embeddings/GoogleNews-vectors-negative300.bin", binary=True)
    elif pretrained_word_vectors == "glove":
        embedding_index = _load_text_embeddings("../embeddings/glove.840B.300d.txt")
    elif pretrained_word_vectors == "fasttext":
        embedding_index = _load_text_embeddings("../embeddings/crawl-300d-2M.vec")
    else:
        # Fail early with a clear message instead of an unbound-variable
        # error further down.
        raise ValueError(
            "Unknown pretrained_word_vectors: {!r} "
            "(expected 'google', 'glove' or 'fasttext')".format(pretrained_word_vectors))

    embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
    for word, i in tk.word_index.items():
        # Indices at or beyond the vocabulary cap have no row in the matrix.
        if i >= NUM_WORDS:
            continue
        if pretrained_word_vectors == "google":
            # KeyedVectors signals an out-of-vocabulary word with KeyError.
            try:
                embedding_vector = embedding_index.get_vector(word)
            except KeyError:
                continue
        else:
            embedding_vector = embedding_index.get(word)
            if embedding_vector is None:
                continue
        embedding_matrix[i] = embedding_vector
    return embedding_matrix

In [6]:
def build_model(embedding_matrix, rnn_units=64):
    """Build and compile the two-layer bidirectional GRU classifier.

    Parameters
    ----------
    embedding_matrix : numpy.ndarray
        Initial weights of the embedding layer, expected to have shape
        ``(nb_words, EMBEDDING_DIM)``. The layer is not trainable.
    rnn_units : int, optional
        Units per direction in each GRU layer. The default (64) reproduces
        the original fixed architecture.

    Returns
    -------
    keras.models.Model
        Compiled model mapping ``(batch, MAXLEN)`` token ids to six sigmoid
        outputs, trained with binary cross-entropy.
    """
    inputs = Input(shape=(MAXLEN,))
    embedded = Embedding(nb_words, EMBEDDING_DIM,
                         embeddings_initializer=Constant(embedding_matrix),
                         trainable=False)(inputs)
    embedded = SpatialDropout1D(0.25)(embedded)

    # Two stacked bidirectional GRUs; both outputs are kept and concatenated
    # along the feature axis.
    rnn_1 = Bidirectional(CuDNNGRU(rnn_units, return_sequences=True))(embedded)
    rnn_2 = Bidirectional(CuDNNGRU(rnn_units, return_sequences=True))(rnn_1)
    sequence = concatenate([rnn_1, rnn_2])
    # Each bidirectional layer emits 2 * rnn_units features and two layers
    # are concatenated, so the per-timestep width is 4 * rnn_units
    # (256 for the default). Previously this was a hard-coded 256.
    feature_dim = 4 * rnn_units

    # Three fixed summaries of the sequence: max over time, mean over time,
    # and the last timestep.
    pooled_max = GlobalMaxPooling1D()(sequence)
    pooled_avg = GlobalAveragePooling1D()(sequence)
    last_step = Lambda(lambda t: t[:, -1])(sequence)

    # Attention summary: one score per timestep, softmax-normalised over
    # time, repeated across the feature axis so it can weight `sequence`
    # elementwise, then summed over time.
    attention = Dense(1)(sequence)
    attention = Flatten()(attention)
    attention = Activation("softmax")(attention)
    attention = RepeatVector(feature_dim)(attention)
    attention = Permute((2, 1))(attention)
    attended = Multiply()([sequence, attention])
    attended = Lambda(lambda t: K.sum(t, axis=1))(attended)

    hidden = concatenate([pooled_max, pooled_avg, last_step, attended])
    hidden = Dropout(0.5)(hidden)
    hidden = Dense(128, activation="relu")(hidden)
    outputs = Dense(6, activation="sigmoid")(hidden)

    model = Model(inputs=inputs, outputs=outputs)
    model.compile(loss='binary_crossentropy', optimizer=Adam(decay=1e-6, clipvalue=5), metrics=['accuracy'])
    return model

In [7]:
# Build the embedding matrix from the fastText vectors (the "fasttext"
# branch of get_embedding); it is passed to build_model for every fold.
embedding_matrix = get_embedding("fasttext")

In [8]:
# K-fold cross-validation: train one model per fold, collect out-of-fold
# predictions for the training set and average the test-set predictions.
n_splits = 10
n_classes = len(classes)
# KFold uses the same n_splits that later divides the summed test
# predictions; previously the fold count was hard-coded here, so changing
# n_splits alone would have produced a wrong average.
kf = KFold(n_splits=n_splits, shuffle=True, random_state=0)
train_pred = np.zeros((X_train.shape[0], n_classes))
pred = np.zeros((X_test.shape[0], n_classes))
# NOTE: `test_index` is the held-out (validation) part of the training data
# for the current fold, not the competition test set.
for train_index, test_index in kf.split(X_train, y_train):
    # Drop the previous fold's graph before building a new model so backend
    # state does not accumulate across folds. Done at the start of the
    # iteration so the last fold's model stays usable after the loop.
    K.clear_session()
    model = build_model(embedding_matrix=embedding_matrix)
    roc_eval = RocAucEvaluation(validation_data=(X_train[test_index], y_train[test_index]))
    # Stops after 5 epochs without improvement of the monitored quantity
    # and rolls back to the best weights seen.
    early_stopping = EarlyStopping(patience=5, restore_best_weights=True)
    model.fit(X_train[train_index], y_train[train_index], batch_size=128, epochs=50,
              validation_data=(X_train[test_index], y_train[test_index]),
              callbacks=[roc_eval, early_stopping])
    # Out-of-fold predictions: each training row is predicted exactly once,
    # by the model that did not see it during training.
    train_pred[test_index] = model.predict(X_train[test_index], batch_size=1024, verbose=1)
    pred += model.predict(X_test, batch_size=1024, verbose=1)
print("oof auc :", roc_auc_score(y_train, train_pred))
# Average the summed per-fold test predictions.
pred /= n_splits
submission = pd.read_csv("data/sample_submission.csv")
submission[classes] = pred
# Make sure the output directory exists so a long training run is not lost
# to a missing folder at the very last step.
os.makedirs("submission", exist_ok=True)
submission.to_csv("submission/v5_GRU_fasttext_preprocess_submission.csv.gz", compression="gzip", index=False)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 143613 samples, validate on 15958 samples
Epoch 1/50

 ROC-AUC - epoch: 1 - score: 0.984796
Epoch 2/50

 ROC-AUC - epoch: 2 - score: 0.989236
Epoch 3/50

 ROC-AUC - epoch: 3 - score: 0.990152
Epoch 4/50

 ROC-AUC - epoch: 4 - score: 0.989737
Epoch 5/50

 ROC-AUC - epoch: 5 - score: 0.990696
Epoch 6/50

 ROC-AUC - epoch: 6 - score: 0.990599
Epoch 7/50

 ROC-AUC - epoch: 7 - score: 0.990932
Epoch 8/50

 ROC-AUC - epoch: 8 - score: 0.990456
Epoch 9/50

 ROC-AUC - epoch: 9 - score: 0.990556
Epoch 10/50

 ROC-AUC - epoch: 10 - score: 0.990316
Epoch 11/50

 ROC-AUC - epoch: 11 - score: 0.990010
Epoch 12/50

 ROC-AUC - epoch: 12 - score: 0.989711
Epoch 13/50

 ROC-AUC - epoch:


 ROC-AUC - epoch: 9 - score: 0.991599
Epoch 10/50

 ROC-AUC - epoch: 10 - score: 0.991593
Epoch 11/50

 ROC-AUC - epoch: 11 - score: 0.991025
Epoch 12/50

 ROC-AUC - epoch: 12 - score: 0.990891
Epoch 13/50

 ROC-AUC - epoch: 13 - score: 0.990383
Train on 143614 samples, validate on 15957 samples
Epoch 1/50

 ROC-AUC - epoch: 1 - score: 0.985710
Epoch 2/50

 ROC-AUC - epoch: 2 - score: 0.988269
Epoch 3/50

 ROC-AUC - epoch: 3 - score: 0.989362
Epoch 4/50

 ROC-AUC - epoch: 4 - score: 0.989625
Epoch 5/50

 ROC-AUC - epoch: 5 - score: 0.989043
Epoch 6/50

 ROC-AUC - epoch: 6 - score: 0.989434
Epoch 7/50

 ROC-AUC - epoch: 7 - score: 0.988811
Epoch 8/50

 ROC-AUC - epoch: 8 - score: 0.988712
Epoch 9/50

 ROC-AUC - epoch: 9 - score: 0.988384
Epoch 10/50

 ROC-AUC - epoch: 10 - score: 0.987872
Train on 143614 samples, validate on 15957 samples
Epoch 1/50

 ROC-AUC - epoch: 1 - score: 0.985553
Epoch 2/50

 ROC-AUC - epoch: 2 - score: 0.987721
Epoch 3/50

 ROC-AUC - epoch: 3 - score: 0.987701


 ROC-AUC - epoch: 4 - score: 0.990946
Epoch 5/50

 ROC-AUC - epoch: 5 - score: 0.991412
Epoch 6/50

 ROC-AUC - epoch: 6 - score: 0.991152
Epoch 7/50

 ROC-AUC - epoch: 7 - score: 0.991213
Epoch 8/50

 ROC-AUC - epoch: 8 - score: 0.991324
Epoch 9/50

 ROC-AUC - epoch: 9 - score: 0.991306
Epoch 10/50

 ROC-AUC - epoch: 10 - score: 0.991336
Train on 143614 samples, validate on 15957 samples
Epoch 1/50

 ROC-AUC - epoch: 1 - score: 0.982929
Epoch 2/50

 ROC-AUC - epoch: 2 - score: 0.986398
Epoch 3/50

 ROC-AUC - epoch: 3 - score: 0.988839
Epoch 4/50

 ROC-AUC - epoch: 4 - score: 0.989133
Epoch 5/50

 ROC-AUC - epoch: 5 - score: 0.987778
Epoch 6/50

 ROC-AUC - epoch: 6 - score: 0.990269
Epoch 7/50

 ROC-AUC - epoch: 7 - score: 0.989947
Epoch 8/50

 ROC-AUC - epoch: 8 - score: 0.991127
Epoch 9/50

 ROC-AUC - epoch: 9 - score: 0.990387
Epoch 10/50

 ROC-AUC - epoch: 10 - score: 0.990879
Epoch 11/50

 ROC-AUC - epoch: 11 - score: 0.990420
Epoch 12/50

 ROC-AUC - epoch: 12 - score: 0.990454
Tr