In [1]:
import warnings
warnings.simplefilter("ignore")
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import numpy as np
import pandas as pd
pd.set_option("display.max_columns", None)
import matplotlib.pyplot as plt
%matplotlib inline
import gensim
from keras.callbacks import Callback, ModelCheckpoint, EarlyStopping
from keras.initializers import Constant
from keras.layers import Embedding, LSTM, CuDNNLSTM, GRU, CuDNNGRU, Dense, Activation
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [2]:
# Vocabulary cap: passed to the Tokenizer as num_words and used as the upper
# bound on the number of rows in the embedding matrix.
NUM_WORDS = 50000
# Every comment is padded/truncated to this many token ids (pad_sequences maxlen
# and the Embedding layer's input_length).
MAXLEN = 250
# Width of each word vector; the pretrained files referenced further down are
# all 300-dimensional according to their file names.
EMBEDDING_DIM = 300

In [3]:
class RocAucEvaluation(Callback):
    """Keras callback that prints the ROC-AUC on a held-out set after each epoch.

    Parameters
    ----------
    validation_data : tuple
        ``(X_val, y_val)`` pair; ``X_val`` is fed to ``model.predict`` and the
        result is scored against ``y_val`` with ``roc_auc_score``.
    """

    def __init__(self, validation_data):
        # Run the base-class initialiser first so every attribute the framework
        # sets up on a Callback exists on this instance as well.
        super(RocAucEvaluation, self).__init__()
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Score the current model on the stored validation set and print it.

        ``epoch`` is zero-based, so it is shifted by one for display.
        """
        y_pred = self.model.predict(self.X_val, verbose=0)
        score = roc_auc_score(self.y_val, y_pred)
        print("\n ROC-AUC - epoch: {:d} - score: {:.6f}".format(epoch + 1, score))

In [4]:
# Load the raw CSVs and pull out the comment text plus the six label columns.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
# Missing comments are replaced by "" before lower-casing: NaN would pass through
# .str.lower() unchanged and reach the tokenizer as a float instead of a string.
X_train_raw, y_train = train['comment_text'].fillna("").str.lower(), train[classes].values
X_test_raw = test['comment_text'].fillna("").str.lower()
# Fit the vocabulary on the training comments and map every text to token ids.
tk = Tokenizer(num_words=NUM_WORDS)
tk.fit_on_texts(X_train_raw)
X_train = tk.texts_to_sequences(X_train_raw)
X_test = tk.texts_to_sequences(X_test_raw)
# Hold out 10% of the training rows for validation (fixed seed for repeatability).
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.1, random_state=0)
# Pad/truncate every sequence to MAXLEN ids.
X_train = pad_sequences(X_train, maxlen=MAXLEN)
X_valid = pad_sequences(X_valid, maxlen=MAXLEN)
X_test = pad_sequences(X_test, maxlen=MAXLEN)
# Number of embedding rows: word_index ids start at 1, hence the +1 for index 0.
nb_words = min(NUM_WORDS, len(tk.word_index) + 1)

In [5]:
def get_coefs(word, *arr):
    """Turn one parsed embedding line into a ``(word, vector)`` pair.

    ``word`` is returned untouched; the remaining positional values are packed
    into a float32 NumPy array.
    """
    vector = np.asarray(arr, dtype=np.float32)
    return (word, vector)


def get_embedding(pretrained_word_vectors):
    """Build the embedding matrix for the fitted tokenizer from pretrained vectors.

    Parameters
    ----------
    pretrained_word_vectors : str
        Which vector set to load: ``"google"`` (binary word2vec, read through
        gensim), ``"glove"`` or ``"fasttext"`` (space-separated text files).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(nb_words, EMBEDDING_DIM)``. Row ``i`` holds the vector
        of the word with tokenizer index ``i``; rows of words that have no
        usable pretrained vector stay all-zero.

    Raises
    ------
    ValueError
        If ``pretrained_word_vectors`` is not one of the supported names.
    """
    text_vector_paths = {
        "glove": "../embeddings/glove.840B.300d.txt",
        "fasttext": "../embeddings/crawl-300d-2M.vec",
    }
    if pretrained_word_vectors == "google":
        EMBEDDING_PATH = "../embeddings/GoogleNews-vectors-negative300.bin"
        embedding_index = gensim.models.KeyedVectors.load_word2vec_format(EMBEDDING_PATH, binary=True)
    elif pretrained_word_vectors in text_vector_paths:
        EMBEDDING_PATH = text_vector_paths[pretrained_word_vectors]
        # Context manager closes the (multi-GB) file; explicit encoding keeps the
        # parse independent of the platform's default locale.
        with open(EMBEDDING_PATH, encoding="utf-8") as vector_file:
            embedding_index = dict(get_coefs(*line.strip().split(" ")) for line in vector_file)
    else:
        # Fail early with a clear message instead of hitting an unbound
        # ``embedding_index`` further down.
        raise ValueError(
            "Unknown pretrained_word_vectors {!r}; expected 'google', 'glove' or 'fasttext'".format(
                pretrained_word_vectors))

    embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
    for word, i in tk.word_index.items():
        # Indices beyond the matrix belong to words outside the vocabulary cap.
        if i >= nb_words:
            continue
        if pretrained_word_vectors == "google":
            try:
                embedding_vector = embedding_index.get_vector(word)
            except KeyError:
                # Word is absent from the pretrained vocabulary.
                continue
        else:
            embedding_vector = embedding_index.get(word)
            if embedding_vector is None:
                continue
        # Skip entries of the wrong length (e.g. a "count dim" header line of a
        # word2vec-style text file parsed as a word): assigning a length-1 array
        # would silently broadcast one value across the whole row.
        if np.shape(embedding_vector) != (EMBEDDING_DIM,):
            continue
        embedding_matrix[i] = embedding_vector
    return embedding_matrix

In [6]:
def build_model_LSTM(embedding_matrix):
    """Assemble and compile the Embedding -> CuDNNLSTM(128) -> Dense(6) classifier.

    When ``embedding_matrix`` is ``None`` the embedding layer is created with
    default settings; otherwise it is initialised from the given matrix and
    frozen (``trainable=False``).

    Returns the compiled ``Sequential`` model (binary cross-entropy, Adam,
    accuracy metric).
    """
    embedding_options = {"input_length": MAXLEN}
    if embedding_matrix is not None:
        # Pretrained vectors: use them as fixed initial weights.
        embedding_options["embeddings_initializer"] = Constant(embedding_matrix)
        embedding_options["trainable"] = False

    network = Sequential([
        Embedding(nb_words, EMBEDDING_DIM, **embedding_options),
        CuDNNLSTM(128),
        Dense(6, activation='sigmoid'),
    ])
    network.compile(loss='binary_crossentropy', optimizer="adam", metrics=['accuracy'])
    return network

In [7]:
# LSTM with an embedding layer learned from scratch (no pretrained vectors).
MODEL_PATH = "model/v3_LSTM_best_model.hdf5"
SUBMISSION_PATH = "submission/v3_LSTM_submission.csv.gz"
# Create the output folders up front so a missing directory cannot abort the
# run at checkpoint time or after training, when the submission is written.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
os.makedirs(os.path.dirname(SUBMISSION_PATH), exist_ok=True)
model = build_model_LSTM(embedding_matrix=None)
roc_eval = RocAucEvaluation(validation_data=(X_valid, y_valid))
check_point = ModelCheckpoint(filepath=MODEL_PATH, verbose=1, save_best_only=True)
early_stopping = EarlyStopping(patience=3, restore_best_weights=True)
model.fit(X_train, y_train, batch_size=128, epochs=10,
          validation_data=(X_valid, y_valid),
          callbacks=[roc_eval, check_point, early_stopping])
# Predict with the best checkpoint explicitly: depending on the Keras version,
# EarlyStopping(restore_best_weights=True) may only roll the weights back when
# it actually interrupts training, not when all epochs run to completion.
model.load_weights(MODEL_PATH)
pred = model.predict(X_test, batch_size=1024, verbose=1)
submission = pd.read_csv("data/sample_submission.csv")
submission[classes] = pred
submission.to_csv(SUBMISSION_PATH, compression="gzip", index=False)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Train on 143613 samples, validate on 15958 samples
Epoch 1/10

 ROC-AUC - epoch: 1 - score: 0.974557

Epoch 00001: val_loss improved from inf to 0.04906, saving model to model/v3_LSTM_best_model.hdf5
Epoch 2/10

 ROC-AUC - epoch: 2 - score: 0.979186

Epoch 00002: val_loss improved from 0.04906 to 0.04631, saving model to model/v3_LSTM_best_model.hdf5
Epoch 3/10

 ROC-AUC - epoch: 3 - score: 0.977639

Epoch 00003: val_loss did not improve from 0.04631
Epoch 4/10

 ROC-AUC - epoch: 4 - score: 0.977205

Epoch 00004: val_loss did not improve from 0.04631
Epoch 5/10

 ROC-AUC - epoch: 5 - score: 0.975337

Epoch 00005: val_loss did not improve from 0.04631


In [8]:
# LSTM on frozen GoogleNews word2vec embeddings.
MODEL_PATH = "model/v3_LSTM_google_best_model.hdf5"
SUBMISSION_PATH = "submission/v3_LSTM_google_submission.csv.gz"
# Create the output folders up front so a missing directory cannot abort the
# run at checkpoint time or after training, when the submission is written.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
os.makedirs(os.path.dirname(SUBMISSION_PATH), exist_ok=True)
embedding_matrix = get_embedding("google")
roc_eval = RocAucEvaluation(validation_data=(X_valid, y_valid))
check_point = ModelCheckpoint(filepath=MODEL_PATH, verbose=1, save_best_only=True)
early_stopping = EarlyStopping(patience=3, restore_best_weights=True)
model = build_model_LSTM(embedding_matrix=embedding_matrix)
model.fit(X_train, y_train, batch_size=128, epochs=10,
          validation_data=(X_valid, y_valid),
          callbacks=[roc_eval, check_point, early_stopping])
# Predict with the best checkpoint explicitly: depending on the Keras version,
# EarlyStopping(restore_best_weights=True) may only roll the weights back when
# it actually interrupts training, not when all epochs run to completion.
model.load_weights(MODEL_PATH)
pred = model.predict(X_test, batch_size=1024, verbose=1)
submission = pd.read_csv("data/sample_submission.csv")
submission[classes] = pred
submission.to_csv(SUBMISSION_PATH, compression="gzip", index=False)

Train on 143613 samples, validate on 15958 samples
Epoch 1/10

 ROC-AUC - epoch: 1 - score: 0.970203

Epoch 00001: val_loss improved from inf to 0.05346, saving model to model/v3_LSTM_google_best_model.hdf5
Epoch 2/10

 ROC-AUC - epoch: 2 - score: 0.974772

Epoch 00002: val_loss improved from 0.05346 to 0.05027, saving model to model/v3_LSTM_google_best_model.hdf5
Epoch 3/10

 ROC-AUC - epoch: 3 - score: 0.977348

Epoch 00003: val_loss improved from 0.05027 to 0.04870, saving model to model/v3_LSTM_google_best_model.hdf5
Epoch 4/10

 ROC-AUC - epoch: 4 - score: 0.981651

Epoch 00004: val_loss improved from 0.04870 to 0.04744, saving model to model/v3_LSTM_google_best_model.hdf5
Epoch 5/10

 ROC-AUC - epoch: 5 - score: 0.983273

Epoch 00005: val_loss improved from 0.04744 to 0.04549, saving model to model/v3_LSTM_google_best_model.hdf5
Epoch 6/10

 ROC-AUC - epoch: 6 - score: 0.983657

Epoch 00006: val_loss improved from 0.04549 to 0.04549, saving model to model/v3_LSTM_google_best_mode

In [9]:
# LSTM on frozen GloVe embeddings.
MODEL_PATH = "model/v3_LSTM_glove_best_model.hdf5"
SUBMISSION_PATH = "submission/v3_LSTM_glove_submission.csv.gz"
# Create the output folders up front so a missing directory cannot abort the
# run at checkpoint time or after training, when the submission is written.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
os.makedirs(os.path.dirname(SUBMISSION_PATH), exist_ok=True)
embedding_matrix = get_embedding("glove")
roc_eval = RocAucEvaluation(validation_data=(X_valid, y_valid))
check_point = ModelCheckpoint(filepath=MODEL_PATH, verbose=1, save_best_only=True)
early_stopping = EarlyStopping(patience=3, restore_best_weights=True)
model = build_model_LSTM(embedding_matrix=embedding_matrix)
model.fit(X_train, y_train, batch_size=128, epochs=10,
          validation_data=(X_valid, y_valid),
          callbacks=[roc_eval, check_point, early_stopping])
# Predict with the best checkpoint explicitly: depending on the Keras version,
# EarlyStopping(restore_best_weights=True) may only roll the weights back when
# it actually interrupts training, not when all epochs run to completion.
model.load_weights(MODEL_PATH)
pred = model.predict(X_test, batch_size=1024, verbose=1)
submission = pd.read_csv("data/sample_submission.csv")
submission[classes] = pred
submission.to_csv(SUBMISSION_PATH, compression="gzip", index=False)

Train on 143613 samples, validate on 15958 samples
Epoch 1/10

 ROC-AUC - epoch: 1 - score: 0.975355

Epoch 00001: val_loss improved from inf to 0.05007, saving model to model/v3_LSTM_glove_best_model.hdf5
Epoch 2/10

 ROC-AUC - epoch: 2 - score: 0.984810

Epoch 00002: val_loss improved from 0.05007 to 0.04353, saving model to model/v3_LSTM_glove_best_model.hdf5
Epoch 3/10

 ROC-AUC - epoch: 3 - score: 0.987335

Epoch 00003: val_loss improved from 0.04353 to 0.04210, saving model to model/v3_LSTM_glove_best_model.hdf5
Epoch 4/10

 ROC-AUC - epoch: 4 - score: 0.987871

Epoch 00004: val_loss improved from 0.04210 to 0.04169, saving model to model/v3_LSTM_glove_best_model.hdf5
Epoch 5/10

 ROC-AUC - epoch: 5 - score: 0.988835

Epoch 00005: val_loss improved from 0.04169 to 0.04090, saving model to model/v3_LSTM_glove_best_model.hdf5
Epoch 6/10

 ROC-AUC - epoch: 6 - score: 0.988734

Epoch 00006: val_loss did not improve from 0.04090
Epoch 7/10

 ROC-AUC - epoch: 7 - score: 0.988596

Epoch

In [10]:
# LSTM on frozen fastText embeddings.
MODEL_PATH = "model/v3_LSTM_fasttext_best_model.hdf5"
SUBMISSION_PATH = "submission/v3_LSTM_fasttext_submission.csv.gz"
# Create the output folders up front so a missing directory cannot abort the
# run at checkpoint time or after training, when the submission is written.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
os.makedirs(os.path.dirname(SUBMISSION_PATH), exist_ok=True)
embedding_matrix = get_embedding("fasttext")
roc_eval = RocAucEvaluation(validation_data=(X_valid, y_valid))
check_point = ModelCheckpoint(filepath=MODEL_PATH, verbose=1, save_best_only=True)
early_stopping = EarlyStopping(patience=3, restore_best_weights=True)
model = build_model_LSTM(embedding_matrix=embedding_matrix)
model.fit(X_train, y_train, batch_size=128, epochs=10,
          validation_data=(X_valid, y_valid),
          callbacks=[roc_eval, check_point, early_stopping])
# Predict with the best checkpoint explicitly: depending on the Keras version,
# EarlyStopping(restore_best_weights=True) may only roll the weights back when
# it actually interrupts training, not when all epochs run to completion.
model.load_weights(MODEL_PATH)
pred = model.predict(X_test, batch_size=1024, verbose=1)
submission = pd.read_csv("data/sample_submission.csv")
submission[classes] = pred
submission.to_csv(SUBMISSION_PATH, compression="gzip", index=False)

Train on 143613 samples, validate on 15958 samples
Epoch 1/10

 ROC-AUC - epoch: 1 - score: 0.973722

Epoch 00001: val_loss improved from inf to 0.05008, saving model to model/v3_LSTM_fasttext_best_model.hdf5
Epoch 2/10

 ROC-AUC - epoch: 2 - score: 0.982362

Epoch 00002: val_loss improved from 0.05008 to 0.04507, saving model to model/v3_LSTM_fasttext_best_model.hdf5
Epoch 3/10

 ROC-AUC - epoch: 3 - score: 0.986576

Epoch 00003: val_loss improved from 0.04507 to 0.04280, saving model to model/v3_LSTM_fasttext_best_model.hdf5
Epoch 4/10

 ROC-AUC - epoch: 4 - score: 0.988374

Epoch 00004: val_loss improved from 0.04280 to 0.04084, saving model to model/v3_LSTM_fasttext_best_model.hdf5
Epoch 5/10

 ROC-AUC - epoch: 5 - score: 0.986917

Epoch 00005: val_loss did not improve from 0.04084
Epoch 6/10

 ROC-AUC - epoch: 6 - score: 0.989014

Epoch 00006: val_loss did not improve from 0.04084
Epoch 7/10

 ROC-AUC - epoch: 7 - score: 0.989082

Epoch 00007: val_loss improved from 0.04084 to 0.0

In [6]:
def build_model_GRU(embedding_matrix):
    """Assemble and compile the Embedding -> CuDNNGRU(128) -> Dense(6) classifier.

    When ``embedding_matrix`` is ``None`` the embedding layer is created with
    default settings; otherwise it is initialised from the given matrix and
    frozen (``trainable=False``).

    Returns the compiled ``Sequential`` model (binary cross-entropy, Adam,
    accuracy metric).
    """
    embedding_options = {"input_length": MAXLEN}
    if embedding_matrix is not None:
        # Pretrained vectors: use them as fixed initial weights.
        embedding_options["embeddings_initializer"] = Constant(embedding_matrix)
        embedding_options["trainable"] = False

    network = Sequential([
        Embedding(nb_words, EMBEDDING_DIM, **embedding_options),
        CuDNNGRU(128),
        Dense(6, activation='sigmoid'),
    ])
    network.compile(loss='binary_crossentropy', optimizer="adam", metrics=['accuracy'])
    return network

In [7]:
# GRU with an embedding layer learned from scratch (no pretrained vectors).
MODEL_PATH = "model/v3_GRU_best_model.hdf5"
SUBMISSION_PATH = "submission/v3_GRU_submission.csv.gz"
# Create the output folders up front so a missing directory cannot abort the
# run at checkpoint time or after training, when the submission is written.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
os.makedirs(os.path.dirname(SUBMISSION_PATH), exist_ok=True)
model = build_model_GRU(embedding_matrix=None)
roc_eval = RocAucEvaluation(validation_data=(X_valid, y_valid))
check_point = ModelCheckpoint(filepath=MODEL_PATH, verbose=1, save_best_only=True)
early_stopping = EarlyStopping(patience=3, restore_best_weights=True)
model.fit(X_train, y_train, batch_size=128, epochs=10,
          validation_data=(X_valid, y_valid),
          callbacks=[roc_eval, check_point, early_stopping])
# Predict with the best checkpoint explicitly: depending on the Keras version,
# EarlyStopping(restore_best_weights=True) may only roll the weights back when
# it actually interrupts training, not when all epochs run to completion.
model.load_weights(MODEL_PATH)
pred = model.predict(X_test, batch_size=1024, verbose=1)
submission = pd.read_csv("data/sample_submission.csv")
submission[classes] = pred
submission.to_csv(SUBMISSION_PATH, compression="gzip", index=False)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Train on 143613 samples, validate on 15958 samples
Epoch 1/10

 ROC-AUC - epoch: 1 - score: 0.978172

Epoch 00001: val_loss improved from inf to 0.04624, saving model to model/v3_GRU_best_model.hdf5
Epoch 2/10

 ROC-AUC - epoch: 2 - score: 0.984115

Epoch 00002: val_loss improved from 0.04624 to 0.04363, saving model to model/v3_GRU_best_model.hdf5
Epoch 3/10

 ROC-AUC - epoch: 3 - score: 0.985164

Epoch 00003: val_loss did not improve from 0.04363
Epoch 4/10

 ROC-AUC - epoch: 4 - score: 0.982400

Epoch 00004: val_loss did not improve from 0.04363
Epoch 5/10

 ROC-AUC - epoch: 5 - score: 0.979016

Epoch 00005: val_loss did not improve from 0.04363


In [8]:
# GRU on frozen GoogleNews word2vec embeddings.
MODEL_PATH = "model/v3_GRU_google_best_model.hdf5"
SUBMISSION_PATH = "submission/v3_GRU_google_submission.csv.gz"
# Create the output folders up front so a missing directory cannot abort the
# run at checkpoint time or after training, when the submission is written.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
os.makedirs(os.path.dirname(SUBMISSION_PATH), exist_ok=True)
embedding_matrix = get_embedding("google")
roc_eval = RocAucEvaluation(validation_data=(X_valid, y_valid))
check_point = ModelCheckpoint(filepath=MODEL_PATH, verbose=1, save_best_only=True)
early_stopping = EarlyStopping(patience=3, restore_best_weights=True)
model = build_model_GRU(embedding_matrix=embedding_matrix)
model.fit(X_train, y_train, batch_size=128, epochs=10,
          validation_data=(X_valid, y_valid),
          callbacks=[roc_eval, check_point, early_stopping])
# Predict with the best checkpoint explicitly: depending on the Keras version,
# EarlyStopping(restore_best_weights=True) may only roll the weights back when
# it actually interrupts training, not when all epochs run to completion.
model.load_weights(MODEL_PATH)
pred = model.predict(X_test, batch_size=1024, verbose=1)
submission = pd.read_csv("data/sample_submission.csv")
submission[classes] = pred
submission.to_csv(SUBMISSION_PATH, compression="gzip", index=False)

Train on 143613 samples, validate on 15958 samples
Epoch 1/10

 ROC-AUC - epoch: 1 - score: 0.979618

Epoch 00001: val_loss improved from inf to 0.04888, saving model to model/v3_GRU_google_best_model.hdf5
Epoch 2/10

 ROC-AUC - epoch: 2 - score: 0.983551

Epoch 00002: val_loss improved from 0.04888 to 0.04653, saving model to model/v3_GRU_google_best_model.hdf5
Epoch 3/10

 ROC-AUC - epoch: 3 - score: 0.985003

Epoch 00003: val_loss improved from 0.04653 to 0.04482, saving model to model/v3_GRU_google_best_model.hdf5
Epoch 4/10

 ROC-AUC - epoch: 4 - score: 0.985244

Epoch 00004: val_loss improved from 0.04482 to 0.04480, saving model to model/v3_GRU_google_best_model.hdf5
Epoch 5/10

 ROC-AUC - epoch: 5 - score: 0.985801

Epoch 00005: val_loss improved from 0.04480 to 0.04359, saving model to model/v3_GRU_google_best_model.hdf5
Epoch 6/10

 ROC-AUC - epoch: 6 - score: 0.985876

Epoch 00006: val_loss improved from 0.04359 to 0.04328, saving model to model/v3_GRU_google_best_model.hdf5

In [9]:
# GRU on frozen GloVe embeddings.
MODEL_PATH = "model/v3_GRU_glove_best_model.hdf5"
SUBMISSION_PATH = "submission/v3_GRU_glove_submission.csv.gz"
# Create the output folders up front so a missing directory cannot abort the
# run at checkpoint time or after training, when the submission is written.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
os.makedirs(os.path.dirname(SUBMISSION_PATH), exist_ok=True)
embedding_matrix = get_embedding("glove")
roc_eval = RocAucEvaluation(validation_data=(X_valid, y_valid))
check_point = ModelCheckpoint(filepath=MODEL_PATH, verbose=1, save_best_only=True)
early_stopping = EarlyStopping(patience=3, restore_best_weights=True)
model = build_model_GRU(embedding_matrix=embedding_matrix)
model.fit(X_train, y_train, batch_size=128, epochs=10,
          validation_data=(X_valid, y_valid),
          callbacks=[roc_eval, check_point, early_stopping])
# Predict with the best checkpoint explicitly: depending on the Keras version,
# EarlyStopping(restore_best_weights=True) may only roll the weights back when
# it actually interrupts training, not when all epochs run to completion.
model.load_weights(MODEL_PATH)
pred = model.predict(X_test, batch_size=1024, verbose=1)
submission = pd.read_csv("data/sample_submission.csv")
submission[classes] = pred
submission.to_csv(SUBMISSION_PATH, compression="gzip", index=False)

Train on 143613 samples, validate on 15958 samples
Epoch 1/10

 ROC-AUC - epoch: 1 - score: 0.987657

Epoch 00001: val_loss improved from inf to 0.04357, saving model to model/v3_GRU_glove_best_model.hdf5
Epoch 2/10

 ROC-AUC - epoch: 2 - score: 0.989261

Epoch 00002: val_loss improved from 0.04357 to 0.04129, saving model to model/v3_GRU_glove_best_model.hdf5
Epoch 3/10

 ROC-AUC - epoch: 3 - score: 0.989870

Epoch 00003: val_loss improved from 0.04129 to 0.03990, saving model to model/v3_GRU_glove_best_model.hdf5
Epoch 4/10

 ROC-AUC - epoch: 4 - score: 0.990013

Epoch 00004: val_loss improved from 0.03990 to 0.03984, saving model to model/v3_GRU_glove_best_model.hdf5
Epoch 5/10

 ROC-AUC - epoch: 5 - score: 0.990016

Epoch 00005: val_loss improved from 0.03984 to 0.03964, saving model to model/v3_GRU_glove_best_model.hdf5
Epoch 6/10

 ROC-AUC - epoch: 6 - score: 0.989794

Epoch 00006: val_loss did not improve from 0.03964
Epoch 7/10

 ROC-AUC - epoch: 7 - score: 0.989543

Epoch 0000

In [11]:
# GRU on frozen fastText embeddings.
MODEL_PATH = "model/v3_GRU_fasttext_best_model.hdf5"
SUBMISSION_PATH = "submission/v3_GRU_fasttext_submission.csv.gz"
# Create the output folders up front so a missing directory cannot abort the
# run at checkpoint time or after training, when the submission is written.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
os.makedirs(os.path.dirname(SUBMISSION_PATH), exist_ok=True)
embedding_matrix = get_embedding("fasttext")
roc_eval = RocAucEvaluation(validation_data=(X_valid, y_valid))
check_point = ModelCheckpoint(filepath=MODEL_PATH, verbose=1, save_best_only=True)
early_stopping = EarlyStopping(patience=3, restore_best_weights=True)
model = build_model_GRU(embedding_matrix=embedding_matrix)
model.fit(X_train, y_train, batch_size=128, epochs=10,
          validation_data=(X_valid, y_valid),
          callbacks=[roc_eval, check_point, early_stopping])
# Predict with the best checkpoint explicitly: depending on the Keras version,
# EarlyStopping(restore_best_weights=True) may only roll the weights back when
# it actually interrupts training, not when all epochs run to completion.
model.load_weights(MODEL_PATH)
pred = model.predict(X_test, batch_size=1024, verbose=1)
submission = pd.read_csv("data/sample_submission.csv")
submission[classes] = pred
submission.to_csv(SUBMISSION_PATH, compression="gzip", index=False)

Train on 143613 samples, validate on 15958 samples
Epoch 1/10

 ROC-AUC - epoch: 1 - score: 0.986463

Epoch 00001: val_loss improved from inf to 0.04317, saving model to model/v3_GRU_fasttext_best_model.hdf5
Epoch 2/10

 ROC-AUC - epoch: 2 - score: 0.989123

Epoch 00002: val_loss improved from 0.04317 to 0.04153, saving model to model/v3_GRU_fasttext_best_model.hdf5
Epoch 3/10

 ROC-AUC - epoch: 3 - score: 0.989647

Epoch 00003: val_loss improved from 0.04153 to 0.04034, saving model to model/v3_GRU_fasttext_best_model.hdf5
Epoch 4/10

 ROC-AUC - epoch: 4 - score: 0.990004

Epoch 00004: val_loss improved from 0.04034 to 0.03955, saving model to model/v3_GRU_fasttext_best_model.hdf5
Epoch 5/10

 ROC-AUC - epoch: 5 - score: 0.989995

Epoch 00005: val_loss did not improve from 0.03955
Epoch 6/10

 ROC-AUC - epoch: 6 - score: 0.989620

Epoch 00006: val_loss did not improve from 0.03955
Epoch 7/10

 ROC-AUC - epoch: 7 - score: 0.989951

Epoch 00007: val_loss did not improve from 0.03955
