In [1]:
import numpy as np
np.random.seed(42)
import pandas as pd
import re
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score

from keras.optimizers import Adam
from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D, CuDNNGRU
from keras.preprocessing import text, sequence
from keras.callbacks import Callback, ModelCheckpoint, EarlyStopping
from keras import backend as K

import os
os.environ['OMP_NUM_THREADS'] = '16'

import warnings
warnings.filterwarnings('ignore')

# Path to the pre-trained word-vector text file consumed by the embedding cell.
EMBEDDING_FILE = './data/glove.42B.300d.txt'

# Training frame, test frame and the submission template, read in that order.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/train.csv.zip',
                     './data/test.csv.zip',
                     './data/sample_submission.csv.zip')
)

# Raw comment strings; missing comments are replaced by the literal token "fillna".
X_train, X_test = (
    frame["comment_text"].fillna("fillna").values
    for frame in (train, test)
)

# Target matrix: one column per label, in the order listed here.
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Hyper-parameters shared with the embedding and model cells:
#   max_features - passed to the tokenizer as num_words and used as the Embedding input size
#   maxlen       - length every sequence is padded/truncated to
#   embed_size   - width of one word vector
max_features = 30000
maxlen = 100
embed_size = 300

# One tokenizer is fitted on the train and test comments together.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts([*X_train, *X_test])

# NOTE: X_train / X_test are rebound here from raw strings to lists of token ids,
# so this cell must be run exactly once after the data-loading cell.
X_train, X_test = (tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test))

# Fixed-width integer arrays fed to the network.
x_train, x_test = (sequence.pad_sequences(ids, maxlen=maxlen) for ids in (X_train, X_test))

In [3]:
def get_coefs(word, *arr):
    """Return ``(word, vector)`` with the remaining fields converted to a float32 array."""
    return word, np.asarray(arr, dtype='float32')


# Read the pre-trained vectors into {token: vector}.
# Each line is expected to be "<token> <v1> ... <v_embed_size>" separated by single
# spaces (TODO confirm for other embedding files).
embeddings_index = {}
with open(EMBEDDING_FILE, encoding='utf8') as emb_file:  # closed deterministically
    for line in emb_file:
        # Split from the right so that a token which itself contains whitespace
        # characters is kept intact instead of spilling into the numeric fields.
        fields = line.rstrip().rsplit(' ', embed_size)
        if len(fields) != embed_size + 1:
            continue  # blank or malformed line: nothing usable to parse
        token, vector = get_coefs(*fields)
        embeddings_index[token] = vector

# Global mean/std of all pre-trained values, used to initialise rows of words
# that have no pre-trained vector. np.stack needs a real sequence, not a dict view.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()

word_index = tokenizer.word_index
# The loop below indexes the matrix with the tokenizer's own word indices, and the
# original guard skips i >= max_features. With a vocabulary smaller than max_features
# the highest index is len(word_index), so the matrix needs len(word_index) + 1 rows.
nb_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [9]:
class RocAucEvaluation(Callback):
    """Callback that prints the ROC-AUC on a held-out set at the end of epochs.

    Parameters
    ----------
    validation_data : tuple
        ``(X_val, y_val)`` pair. It is unpacked into exactly two values, so any
        other length (including the empty default) raises ``ValueError``.
    interval : int
        The score is computed on epochs whose zero-based index is a multiple
        of ``interval``.
    """

    def __init__(self, validation_data=(), interval=1):
        # super(Callback, self) would begin the MRO lookup *after* Callback and
        # therefore skip Callback.__init__; name this class so the base
        # initialiser actually runs.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the held-out set and print its ROC-AUC.

        ``logs`` is accepted for signature compatibility and is not read;
        it defaults to ``None`` rather than a shared mutable dict.
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))


def get_model():
    """Build and compile the stacked bidirectional-GRU multi-label classifier.

    Reads the module-level ``maxlen``, ``max_features``, ``embed_size`` and
    ``embedding_matrix``. The network is: embedding initialised from
    ``embedding_matrix`` -> three (SpatialDropout1D, Bidirectional CuDNNGRU)
    stages -> concatenated global average and max pooling -> 6 sigmoid outputs.

    Returns
    -------
    keras.models.Model
        Compiled with binary cross-entropy, Adam and the accuracy metric.
    """
    tokens = Input(shape=(maxlen, ))
    hidden = Embedding(max_features, embed_size, weights=[embedding_matrix])(tokens)

    # (dropout rate applied before the stage, GRU units per direction)
    recurrent_stages = ((0.4, 100), (0.5, 200), (0.4, 100))
    for drop_rate, units in recurrent_stages:
        hidden = SpatialDropout1D(drop_rate)(hidden)
        hidden = Bidirectional(CuDNNGRU(units, return_sequences=True))(hidden)

    # Collapse the time axis two ways and join the results.
    pooled = concatenate([GlobalAveragePooling1D()(hidden),
                          GlobalMaxPooling1D()(hidden)])
    probabilities = Dense(6, activation="sigmoid")(pooled)

    model = Model(inputs=tokens, outputs=probabilities)
    model.compile(loss='binary_crossentropy',
                  optimizer=Adam(lr=0.0001, decay=0.000001),
                  metrics=['accuracy'])
    return model

In [5]:
#1.20

In [None]:
%%time
gru_for_stack = pd.DataFrame.from_dict({'id': train['id'],
                                        "toxic":0,
                                        "severe_toxic":0,
                                        "obscene":0,
                                        "threat":0,
                                        "insult":0,
                                        "identity_hate":0})

batch_size = 1000
epochs = 20
cv = KFold(n_splits=5, random_state=0, shuffle=True)
i = 0

for train_index, test_index in cv.split(x_train, y_train):
    X_tra, X_test = x_train[train_index], x_train[test_index]
    y_tra, y_test = y_train[train_index], y_train[test_index]
    i += 1
    
    model = get_model()
    file_path=f"./additional_matiriels/weights_base_gru_100_0.4_200_0.5_100_0.4_layers_fold_{i}.best.hdf5"
    checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

    early = EarlyStopping(monitor="val_loss", mode="min", patience=3)
    callbacks_list = [checkpoint, early] #early
    #exp_decay = lambda init, fin, steps: (init/fin)**(1/(steps-1)) - 1
    #steps = int(len(X_tra)/batch_size) * epochs
    #lr_init, lr_fin = 0.001, 0.0005
    #lr_decay = exp_decay(lr_init, lr_fin, steps)
    #K.set_value(model.optimizer.lr, lr_init)
    #K.set_value(model.optimizer.decay, lr_decay)

    hist = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, callbacks=callbacks_list, validation_data=(X_test, y_test))

    model.load_weights(file_path)

    y_pred = model.predict(X_test, batch_size=1024)
    gru_for_stack.loc[test_index,["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
gru_for_stack.to_csv('./submits/gru_100_0.4_200_0.5_100_0.4_stack_layer.csv', index=False)

Train on 127656 samples, validate on 31915 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.06112, saving model to ./additional_matiriels/weights_base_gru_100_0.4_200_0.5_100_0.4_layers_fold_1.best.hdf5
Epoch 2/20

Epoch 00002: val_loss improved from 0.06112 to 0.05155, saving model to ./additional_matiriels/weights_base_gru_100_0.4_200_0.5_100_0.4_layers_fold_1.best.hdf5
Epoch 3/20

Epoch 00003: val_loss improved from 0.05155 to 0.04866, saving model to ./additional_matiriels/weights_base_gru_100_0.4_200_0.5_100_0.4_layers_fold_1.best.hdf5
Epoch 4/20

Epoch 00004: val_loss improved from 0.04866 to 0.04780, saving model to ./additional_matiriels/weights_base_gru_100_0.4_200_0.5_100_0.4_layers_fold_1.best.hdf5
Epoch 5/20

Epoch 00005: val_loss improved from 0.04780 to 0.04718, saving model to ./additional_matiriels/weights_base_gru_100_0.4_200_0.5_100_0.4_layers_fold_1.best.hdf5
Epoch 6/20

Epoch 00006: val_loss improved from 0.04718 to 0.04564, saving model to ./additio

In [10]:
%%time
batch_size = 1000
epochs = 20

model = get_model()
X_tra, X_val, y_tra, y_val = train_test_split(x_train, y_train, train_size=0.95, random_state=0)
model = get_model()
file_path=f"./additional_matiriels/weights_base_gru_100_0.4_200_0.5_100_0.4_layers_submit.best.hdf5"
checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

early = EarlyStopping(monitor="val_loss", mode="min", patience=3)
callbacks_list = [checkpoint, early] #earlyist = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, callbacks=callbacks_list, validation_data=(X_test, y_test))
hist = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, callbacks=callbacks_list, validation_data=(X_val, y_val), verbose=2)

model.load_weights(file_path)
y_pred = model.predict(x_test, batch_size=1024)
submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
submission.to_csv('./submits/weights_base_gru_100_0.4_200_0.5_100_0.4_layers_submit.csv', index=False)

Train on 151592 samples, validate on 7979 samples
Epoch 1/20
 - 57s - loss: 0.2082 - acc: 0.9316 - val_loss: 0.1370 - val_acc: 0.9634

Epoch 00001: val_loss improved from inf to 0.13705, saving model to ./additional_matiriels/weights_base_gru_100_0.4_200_0.5_100_0.4_layers_submit.best.hdf5
Epoch 2/20
 - 56s - loss: 0.1263 - acc: 0.9639 - val_loss: 0.0824 - val_acc: 0.9706

Epoch 00002: val_loss improved from 0.13705 to 0.08236, saving model to ./additional_matiriels/weights_base_gru_100_0.4_200_0.5_100_0.4_layers_submit.best.hdf5
Epoch 3/20
 - 56s - loss: 0.0738 - acc: 0.9750 - val_loss: 0.0567 - val_acc: 0.9806

Epoch 00003: val_loss improved from 0.08236 to 0.05666, saving model to ./additional_matiriels/weights_base_gru_100_0.4_200_0.5_100_0.4_layers_submit.best.hdf5
Epoch 4/20
 - 56s - loss: 0.0606 - acc: 0.9789 - val_loss: 0.0515 - val_acc: 0.9816

Epoch 00004: val_loss improved from 0.05666 to 0.05149, saving model to ./additional_matiriels/weights_base_gru_100_0.4_200_0.5_100_0.