In [None]:
import warnings
warnings.simplefilter("ignore")
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import gc
import numpy as np
import pandas as pd
pd.set_option("display.max_columns", None)
import matplotlib.pyplot as plt
%matplotlib inline
import gensim
import keras.backend as K
from keras.callbacks import Callback, ModelCheckpoint, EarlyStopping
from keras.initializers import Constant
from keras.layers import (Input, Embedding, SpatialDropout1D, Bidirectional,
                          GlobalAveragePooling1D, GlobalMaxPooling1D,
                          LSTM, CuDNNLSTM, GRU, CuDNNGRU, concatenate, Dropout, Dense, Activation,
                          Lambda, Flatten, RepeatVector, Permute, Multiply)
from keras.models import Model
from keras.optimizers import Adam
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

In [None]:
# Width of the trainable embedding vector learned for each character index
# (passed to the Embedding layer in build_model).
EMBEDDING_DIM = 100
# Fixed sequence length, in characters: every comment is padded or truncated
# to this many tokens by pad_sequences, and it is also the model's input length.
MAXLEN = 1000

In [None]:
class RocAucEvaluation(Callback):
    """Keras callback that prints the ROC-AUC on a held-out set after each epoch.

    Parameters
    ----------
    validation_data : tuple
        A ``(X_val, y_val)`` pair. ``X_val`` is fed to ``model.predict`` and
        ``y_val`` is the ground truth passed to ``roc_auc_score``.
    """

    def __init__(self, validation_data):
        # Initialise the base Callback first so that the attributes it manages
        # exist regardless of the installed Keras version.
        super(RocAucEvaluation, self).__init__()
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the held-out set and print its ROC-AUC.

        ``epoch`` is zero-based, so it is reported as ``epoch + 1``.
        ``logs`` is accepted for API compatibility and is not modified.
        """
        y_pred = self.model.predict(self.X_val, verbose=0)
        score = roc_auc_score(self.y_val, y_pred)
        print("\n ROC-AUC - epoch: {:d} - score: {:.6f}".format(epoch + 1, score))

In [None]:
# Load the raw train / test comment tables.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

# Replace missing comments with empty strings. The result is assigned back
# instead of calling fillna(inplace=True) on the column selection: with pandas
# copy-on-write the in-place form operates on a temporary and leaves the frame
# unchanged, which would let NaN values reach the tokenizer.
train['comment_text'] = train['comment_text'].fillna("")
test['comment_text'] = test['comment_text'].fillna("")

# The six binary target columns predicted by the model (multi-label).
classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
X_train_raw, y_train = train['comment_text'].str.lower(), train[classes].values
X_test_raw = test['comment_text'].str.lower()

# Character-level vocabulary, fitted on the training comments only.
tk = Tokenizer(char_level=True)
tk.fit_on_texts(X_train_raw)

# Convert each comment to a list of character indices, then pad / truncate
# every sequence to exactly MAXLEN entries.
X_train = tk.texts_to_sequences(X_train_raw)
X_test = tk.texts_to_sequences(X_test_raw)
X_train = pad_sequences(X_train, maxlen=MAXLEN)
X_test = pad_sequences(X_test, maxlen=MAXLEN)

# +1 because tokenizer indices start at 1; index 0 is the padding value.
nb_words = len(tk.word_index) + 1

# Release the raw frames and the tokenizer; only the padded arrays, the labels,
# `classes` and `nb_words` are used from here on.
del train, test, X_train_raw, X_test_raw, tk

In [None]:
def build_model(rnn_units=64):
    """Build and compile the character-level two-layer bidirectional GRU classifier.

    The outputs of both stacked bidirectional GRU layers are concatenated along
    the feature axis and summarised in four ways (max pooling, average pooling,
    last time step, softmax-attention weighted sum). The four summaries are
    concatenated and fed to a dense head with six sigmoid outputs.

    Parameters
    ----------
    rnn_units : int, optional
        Number of units per direction in each GRU layer. Defaults to 64, which
        gives a concatenated sequence feature width of ``4 * 64 = 256``.

    Returns
    -------
    keras.models.Model
        Model compiled with binary cross-entropy loss and the Adam optimizer.
    """
    inputs = Input(shape=(MAXLEN,))
    embedded = Embedding(nb_words, EMBEDDING_DIM)(inputs)
    embedded = SpatialDropout1D(0.25)(embedded)

    rnn_1 = Bidirectional(CuDNNGRU(rnn_units, return_sequences=True))(embedded)
    rnn_2 = Bidirectional(CuDNNGRU(rnn_units, return_sequences=True))(rnn_1)
    seq = concatenate([rnn_1, rnn_2])
    # Feature width of `seq`, read from the tensor instead of hard-coding 256
    # so the attention branch stays consistent if `rnn_units` changes.
    n_features = K.int_shape(seq)[-1]

    max_pool = GlobalMaxPooling1D()(seq)
    avg_pool = GlobalAveragePooling1D()(seq)
    last_step = Lambda(lambda t: t[:, -1])(seq)

    # Attention: one score per time step, softmax over the time axis, then the
    # weights are tiled across the feature axis so they can scale `seq`.
    attn = Dense(1)(seq)
    attn = Flatten()(attn)
    attn = Activation("softmax")(attn)
    attn = RepeatVector(n_features)(attn)
    attn = Permute((2, 1))(attn)
    attn = Multiply()([seq, attn])
    # Weighted sum over time -> one vector of width n_features per sample.
    attn = Lambda(lambda t: K.sum(t, axis=1))(attn)

    hidden = concatenate([max_pool, avg_pool, last_step, attn])
    x = Dropout(0.5)(hidden)
    x = Dense(128, activation="relu")(x)
    outputs = Dense(6, activation="sigmoid")(x)

    model = Model(inputs=inputs, outputs=outputs)
    model.compile(loss='binary_crossentropy', optimizer=Adam(decay=1e-6, clipvalue=5), metrics=['accuracy'])
    return model

In [None]:
# K-fold training: one model per fold, each fold writes its own test-set
# predictions so they can be averaged afterwards.
n_splits = 10
kf = KFold(n_splits=n_splits, shuffle=True, random_state=0)

# Create the output directory up front so a missing folder cannot make
# to_csv fail after a full fold has already been trained.
os.makedirs("submission", exist_ok=True)

# The submission template is identical for every fold; read it once and
# overwrite the class columns on each iteration.
submission = pd.read_csv("data/sample_submission.csv")

for i, (train_index, val_index) in enumerate(kf.split(X_train, y_train)):
    # Drop the previous fold's backend graph/session before building a new
    # model; otherwise backend state accumulates across folds.
    K.clear_session()
    gc.collect()

    model = build_model()
    val_data = (X_train[val_index], y_train[val_index])
    roc_eval = RocAucEvaluation(validation_data=val_data)
    # Stops after 5 epochs without improvement of the monitored quantity
    # (EarlyStopping's default) and restores the best weights.
    early_stopping = EarlyStopping(patience=5, restore_best_weights=True)

    model.fit(X_train[train_index], y_train[train_index], batch_size=128, epochs=50,
              validation_data=val_data,
              callbacks=[roc_eval, early_stopping])

    pred = model.predict(X_test, batch_size=1024, verbose=1)
    submission[classes] = pred
    submission.to_csv("submission/v5_GRU_preprocess_char_submission_" + str(i) + ".csv.gz",
                      compression="gzip", index=False)

In [None]:
# Average the per-fold test predictions into a single submission.
n_folds = 10
cols = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Read the fold files from "submission/", the directory the training loop
# writes them to, so the notebook runs top to bottom without moving files.
fold_paths = ["submission/v5_GRU_preprocess_char_submission_" + str(i) + ".csv.gz"
              for i in range(n_folds)]
fold_preds = [pd.read_csv(path) for path in fold_paths]

# Start from a copy of the first fold so non-prediction columns are preserved.
averaged = fold_preds[0].copy()
# Sum the folds in order (fold 0 + fold 1 + ...) and divide by the fold count.
averaged[cols] = sum((fold[cols] for fold in fold_preds[1:]), fold_preds[0][cols]) / len(fold_preds)

# The final averaged file is written to the parent directory, as before.
averaged.to_csv("../v5_GRU_preprocess_char_submission.csv.gz", compression="gzip", index=False)