In [1]:
import warnings
warnings.simplefilter("ignore")
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
import numpy as np
import pandas as pd
pd.set_option("display.max_columns", None)
import matplotlib.pyplot as plt
%matplotlib inline
import gensim
from keras.callbacks import Callback, ModelCheckpoint, EarlyStopping
from keras.initializers import Constant
from keras.layers import (Input, Embedding, SpatialDropout1D, Bidirectional,
                          GlobalAveragePooling1D, GlobalMaxPooling1D, add,
                          LSTM, CuDNNLSTM, GRU, CuDNNGRU, concatenate, Dropout, Dense, Activation)
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [2]:
# Vocabulary cap handed to the Keras Tokenizer (num_words) and used to bound
# the number of rows in the embedding matrix.
NUM_WORDS = 50000
# Fixed length every token sequence is brought to by pad_sequences; also the
# length of the model's Input.
MAXLEN = 250
# Width of each row of the embedding matrix / output size of the Embedding layer.
EMBEDDING_DIM = 300

In [3]:
class RocAucEvaluation(Callback):
    """Keras callback that prints the validation ROC-AUC after every epoch.

    Parameters
    ----------
    validation_data : tuple
        ``(X_val, y_val)`` pair. ``X_val`` is passed to ``model.predict`` and
        ``y_val`` is compared with the predictions via ``roc_auc_score``.
    """

    def __init__(self, validation_data):
        # Initialise the base Callback first so the attributes Keras expects
        # on every callback exist before we add our own.
        super().__init__()
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the held-out inputs and print the ROC-AUC.

        ``epoch`` is zero-based as supplied by Keras; it is printed one-based.
        """
        y_pred = self.model.predict(self.X_val, verbose=0)
        score = roc_auc_score(self.y_val, y_pred)
        print("\n ROC-AUC - epoch: {:d} - score: {:.6f}".format(epoch + 1, score))

In [4]:
# --- Load the preprocessed train / test frames -----------------------------
train = pd.read_csv('data/train_preprocessed.csv')
test = pd.read_csv('data/test_preprocessed.csv')

# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: the in-place form operates on an intermediate object and,
# under pandas copy-on-write, leaves the frame untouched.
train['comment_text'] = train['comment_text'].fillna("")
test['comment_text'] = test['comment_text'].fillna("")

# Label columns, in the order they are written to the submission file.
classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
X_train_raw, y_train = train['comment_text'].str.lower(), train[classes].values
X_test_raw = test['comment_text'].str.lower()

# --- Text -> integer sequences ----------------------------------------------
# The tokenizer vocabulary is fitted on the training comments only.
tk = Tokenizer(num_words=NUM_WORDS)
tk.fit_on_texts(X_train_raw)
X_train = tk.texts_to_sequences(X_train_raw)
X_test = tk.texts_to_sequences(X_test_raw)

# --- Hold out 10% for validation, then bring every sequence to MAXLEN -------
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size=0.1, random_state=0)
X_train = pad_sequences(X_train, maxlen=MAXLEN)
X_valid = pad_sequences(X_valid, maxlen=MAXLEN)
X_test = pad_sequences(X_test, maxlen=MAXLEN)

# Number of embedding rows: word_index is 1-based, so +1 leaves room for the
# padding index 0; capped at NUM_WORDS.
nb_words = min(NUM_WORDS, len(tk.word_index) + 1)

In [5]:
def get_coefs(word, *arr):
    """Pair a token with its embedding components.

    Parameters
    ----------
    word : str
        The token (first field of an embedding-file line).
    *arr
        The remaining fields; converted to a float32 array.

    Returns
    -------
    tuple
        ``(word, vector)`` where ``vector`` is a 1-D ``float32`` array.
    """
    vector = np.array(arr, dtype='float32')
    return (word, vector)


def _read_text_vectors(path, dim):
    """Parse a space-delimited text embedding file into ``{word: vector}``."""
    vectors = {}
    # Context manager + explicit encoding: the handle is closed as soon as
    # parsing ends and decoding does not depend on the platform locale.
    with open(path, encoding="utf-8") as handle:
        for line in handle:
            word, vector = get_coefs(*line.strip().split(" "))
            # Keep only full-width vectors. This drops the "<count> <dim>"
            # header line that .vec files start with, which would otherwise
            # be stored as a one-element "vector" and broadcast over a whole
            # matrix row if that token appeared in the vocabulary.
            if vector.shape[0] == dim:
                vectors[word] = vector
    return vectors


def get_embedding(pretrained_word_vectors):
    """Build the embedding matrix for the fitted tokenizer vocabulary.

    Parameters
    ----------
    pretrained_word_vectors : str
        One of ``"google"`` (binary word2vec loaded through gensim),
        ``"glove"`` or ``"fasttext"`` (space-delimited text files).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(nb_words, EMBEDDING_DIM)``. Row ``i`` holds the
        pretrained vector of the word with tokenizer index ``i``; rows for
        words missing from the pretrained vocabulary (and row 0) stay zero.

    Raises
    ------
    ValueError
        If ``pretrained_word_vectors`` is not one of the supported names.
    """
    text_sources = {
        "glove": "../embeddings/glove.840B.300d.txt",
        "fasttext": "../embeddings/crawl-300d-2M.vec",
    }

    if pretrained_word_vectors == "google":
        EMBEDDING_PATH = "../embeddings/GoogleNews-vectors-negative300.bin"
        keyed_vectors = gensim.models.KeyedVectors.load_word2vec_format(
            EMBEDDING_PATH, binary=True)

        def lookup(word):
            # gensim signals an out-of-vocabulary word with KeyError.
            try:
                return keyed_vectors.get_vector(word)
            except KeyError:
                return None
    elif pretrained_word_vectors in text_sources:
        EMBEDDING_PATH = text_sources[pretrained_word_vectors]
        lookup = _read_text_vectors(EMBEDDING_PATH, EMBEDDING_DIM).get
    else:
        # Fail early with a clear message instead of an UnboundLocalError
        # further down.
        raise ValueError(
            "Unknown pretrained_word_vectors {!r}; expected one of "
            "'google', 'glove', 'fasttext'".format(pretrained_word_vectors))

    embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
    for word, i in tk.word_index.items():
        # Indices at or beyond NUM_WORDS are never emitted by the tokenizer.
        if i >= NUM_WORDS:
            continue
        embedding_vector = lookup(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

In [6]:
def build_model(embedding_matrix, n_classes=6):
    """Build and compile the two-layer bidirectional GRU classifier.

    Parameters
    ----------
    embedding_matrix : numpy.ndarray
        2-D array of pretrained vectors, one row per token index. Its shape
        determines the Embedding layer's vocabulary size and output width, so
        the layer always matches the weights it is initialised with.
    n_classes : int, optional
        Number of sigmoid output units (default 6).

    Returns
    -------
    keras.models.Model
        Model compiled with binary cross-entropy loss and the Adam optimizer.
    """
    vocab_size, embedding_dim = embedding_matrix.shape

    inputs = Input(shape=(MAXLEN,))
    # Frozen embedding layer initialised from the pretrained vectors.
    x = Embedding(vocab_size, embedding_dim,
                  embeddings_initializer=Constant(embedding_matrix),
                  trainable=False)(inputs)
    x = SpatialDropout1D(0.2)(x)
    x = Bidirectional(CuDNNGRU(128, return_sequences=True))(x)
    x = Bidirectional(CuDNNGRU(128, return_sequences=True))(x)
    # Summarise the sequence with both max- and average-pooling over time.
    hidden = concatenate([GlobalMaxPooling1D()(x), GlobalAveragePooling1D()(x)])
    outputs = Dense(n_classes, activation="sigmoid")(hidden)

    model = Model(inputs=inputs, outputs=outputs)
    model.compile(loss='binary_crossentropy', optimizer="adam", metrics=['accuracy'])
    return model

In [8]:
# Train an ensemble of identically configured GRU models on GloVe vectors and
# average their test-set predictions.
embedding_matrix = get_embedding("glove")
n_models = 5
# One prediction column per label instead of a hard-coded width.
pred = np.zeros((X_test.shape[0], len(classes)))
for _ in range(n_models):
    # Fresh callback objects for every fit so that no callback state is
    # carried over from the previously trained model.
    roc_eval = RocAucEvaluation(validation_data=(X_valid, y_valid))
    early_stopping = EarlyStopping(patience=3, restore_best_weights=True)
    model = build_model(embedding_matrix=embedding_matrix)
    model.fit(X_train, y_train, batch_size=128, epochs=10,
              validation_data=(X_valid, y_valid),
              callbacks=[roc_eval, early_stopping])
    pred += model.predict(X_test, batch_size=1024, verbose=1)
pred /= n_models

submission = pd.read_csv("data/sample_submission.csv")
submission[classes] = pred
# Create the output directory up front; to_csv does not create it.
os.makedirs("submission", exist_ok=True)
submission.to_csv("submission/v5_GRU_glove_preprocess_submission.csv.gz", compression="gzip", index=False)

Train on 143613 samples, validate on 15958 samples
Epoch 1/10

 ROC-AUC - epoch: 1 - score: 0.986448
Epoch 2/10

 ROC-AUC - epoch: 2 - score: 0.989951
Epoch 3/10

 ROC-AUC - epoch: 3 - score: 0.990815
Epoch 4/10

 ROC-AUC - epoch: 4 - score: 0.990653
Epoch 5/10

 ROC-AUC - epoch: 5 - score: 0.990885
Epoch 6/10

 ROC-AUC - epoch: 6 - score: 0.990913
Epoch 7/10

 ROC-AUC - epoch: 7 - score: 0.990401
Train on 143613 samples, validate on 15958 samples
Epoch 1/10

 ROC-AUC - epoch: 1 - score: 0.986578
Epoch 2/10

 ROC-AUC - epoch: 2 - score: 0.989711
Epoch 3/10

 ROC-AUC - epoch: 3 - score: 0.990187
Epoch 4/10

 ROC-AUC - epoch: 4 - score: 0.990834
Epoch 5/10

 ROC-AUC - epoch: 5 - score: 0.990897
Epoch 6/10

 ROC-AUC - epoch: 6 - score: 0.990565
Epoch 7/10

 ROC-AUC - epoch: 7 - score: 0.990306
Train on 143613 samples, validate on 15958 samples
Epoch 1/10

 ROC-AUC - epoch: 1 - score: 0.987225
Epoch 2/10

 ROC-AUC - epoch: 2 - score: 0.989082
Epoch 3/10

 ROC-AUC - epoch: 3 - score: 0.9905

In [7]:
# Train an ensemble of identically configured GRU models on fastText vectors
# and average their test-set predictions.
embedding_matrix = get_embedding("fasttext")
n_models = 5
# One prediction column per label instead of a hard-coded width.
pred = np.zeros((X_test.shape[0], len(classes)))
for _ in range(n_models):
    # Fresh callback objects for every fit so that no callback state is
    # carried over from the previously trained model.
    roc_eval = RocAucEvaluation(validation_data=(X_valid, y_valid))
    early_stopping = EarlyStopping(patience=3, restore_best_weights=True)
    model = build_model(embedding_matrix=embedding_matrix)
    model.fit(X_train, y_train, batch_size=128, epochs=10,
              validation_data=(X_valid, y_valid),
              callbacks=[roc_eval, early_stopping])
    pred += model.predict(X_test, batch_size=1024, verbose=1)
pred /= n_models

submission = pd.read_csv("data/sample_submission.csv")
submission[classes] = pred
# Create the output directory up front; to_csv does not create it.
os.makedirs("submission", exist_ok=True)
submission.to_csv("submission/v5_GRU_fasttext_preprocess_submission.csv.gz", compression="gzip", index=False)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 143613 samples, validate on 15958 samples
Epoch 1/10

 ROC-AUC - epoch: 1 - score: 0.987621
Epoch 2/10

 ROC-AUC - epoch: 2 - score: 0.989989
Epoch 3/10

 ROC-AUC - epoch: 3 - score: 0.990684
Epoch 4/10

 ROC-AUC - epoch: 4 - score: 0.990988
Epoch 5/10

 ROC-AUC - epoch: 5 - score: 0.990862
Epoch 6/10

 ROC-AUC - epoch: 6 - score: 0.990784
Epoch 7/10

 ROC-AUC - epoch: 7 - score: 0.990488
Train on 143613 samples, validate on 15958 samples
Epoch 1/10

 ROC-AUC - epoch: 1 - score: 0.986727
Epoch 2/10

 ROC-AUC - epoch: 2 - score: 0.989490
Epoch 3/10

 ROC-AUC - epoch: 3 - score: 0.990581
Epoch 4/10

 ROC-AUC - epoch: 4 - score: 0.990188
Epoch 5/10

 ROC-AUC - epoch: 5 - s