In [1]:
import numpy as np
np.random.seed(42)
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate, Dropout
from keras.layers import CuDNNGRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D, GRU, Conv1D, LSTM
from keras.preprocessing import text, sequence
from keras.callbacks import Callback
from keras.optimizers import Adam
from keras.layers import merge
from keras.layers.core import *
from keras.models import *
from keras.utils import plot_model

from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings('ignore')



EMBEDDING_FILE = '../data/crawl-300d-2M.vec'

train = pd.read_csv('../data/train.csv.zip')
test = pd.read_csv('../data/test.csv.zip')
submission = pd.read_csv('../data/sample_submission.csv.zip')

X_train = train["comment_text"].fillna("fillna").values
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values
X_test = test["comment_text"].fillna("fillna").values

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
max_features = 60000
maxlen = 400
embed_size = 300

tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train) + list(X_test))
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
x_train = sequence.pad_sequences(X_train, maxlen=maxlen)
x_test = sequence.pad_sequences(X_test, maxlen=maxlen)


def get_coefs(word, *arr): return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in open(EMBEDDING_FILE))

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.zeros((nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

In [9]:
def attention_3d_block(inputs):
    # inputs.shape = (batch_size, time_steps, input_dim)
    input_dim = int(inputs.shape[2])
    a = Permute((2, 1))(inputs)
    a = Reshape((input_dim, 400))(a) # this line is not useful. It's just to know which dimension is what.
    a = Dense(400, activation='softmax')(a)
    a_probs = Permute((2, 1), name='attention_vec')(a)
    print(inputs)
    print(a_probs)
    output_attention_mul = merge([inputs, a_probs], name='attention_mul', mode='mul')
    return output_attention_mul

In [19]:
class RocAucEvaluation(Callback):
    def __init__(self, validation_data=(), interval=1):
        super(Callback, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))


def get_model(units=80):
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
    x = SpatialDropout1D(0.2)(x)
    x = attention_3d_block(x)
    x = Bidirectional(CuDNNGRU(units, return_sequences=True))(x)
    x= Flatten()(x)
    #avg_pool = GlobalAveragePooling1D()(x)
    #max_pool = GlobalMaxPooling1D()(x)
    #conc = concatenate([avg_pool, max_pool])
    outp = Dense(6, activation="sigmoid")(x)
    
    model = Model(inputs=inp, outputs=outp)
    ad = Adam(lr=0.005)
    model.compile(loss='binary_crossentropy',
                  optimizer= ad,
                  metrics=['accuracy'])

    return model
model = get_model()

Tensor("spatial_dropout1d_8/cond/Merge:0", shape=(?, 400, 300), dtype=float32)
Tensor("attention_vec_4/transpose:0", shape=(?, 400, 300), dtype=float32)


In [None]:
%%time
batch_size = 700
epochs = 20

X_tra, X_val, y_tra, y_val = train_test_split(x_train, y_train, train_size=0.95, random_state=233)
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)
for i in range(1,epochs+1):
    hist = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=1, validation_data=(X_val, y_val),
                     callbacks=[RocAuc], verbose=1)
    model.save(f'../models/vanilla_Polled_gru_with_attention/{i}epoch.h5')

    y_pred = model.predict(x_test, batch_size=1024)
    submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
    submission.to_csv(f'../submits/vanilla_Polled_gru_with_attention/{i}epoch.csv', index=False)

Train on 151592 samples, validate on 7979 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.965598 

Train on 151592 samples, validate on 7979 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.978970 

Train on 151592 samples, validate on 7979 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.978256 

Train on 151592 samples, validate on 7979 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.977298 

Train on 151592 samples, validate on 7979 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.978552 

Train on 151592 samples, validate on 7979 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.974941 

Train on 151592 samples, validate on 7979 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.976094 

Train on 151592 samples, validate on 7979 samples
Epoch 1/1

In [4]:
%%time
gru_for_stack = pd.DataFrame.from_dict({'id': train['id'],
                                        "toxic":0,
                                        "severe_toxic":0,
                                        "obscene":0,
                                        "threat":0,
                                        "insult":0,
                                        "identity_hate":0})

batch_size = 800
epochs = 20
cv = KFold(n_splits=10, random_state=0, shuffle=True)
i = 0

for train_index, test_index in cv.split(x_train, y_train):
    X_tra, X_test = x_train[train_index], x_train[test_index]
    y_tra, y_test = y_train[train_index], y_train[test_index]
    i += 1
    
    model = get_model()
    file_path=f"../models/vanilla_gru_cv_10/fold_{i}.best.hdf5"
    checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

    early = EarlyStopping(monitor="val_loss", mode="min")
    callbacks_list = [checkpoint, early] #early
    #exp_decay = lambda init, fin, steps: (init/fin)**(1/(steps-1)) - 1
    #steps = int(len(X_tra)/batch_size) * epochs
    #lr_init, lr_fin = 0.001, 0.0005
    #lr_decay = exp_decay(lr_init, lr_fin, steps)
    #K.set_value(model.optimizer.lr, lr_init)
    #K.set_value(model.optimizer.decay, lr_decay)

    hist = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, callbacks=callbacks_list, validation_data=(X_test, y_test))

    model.load_weights(file_path)

    y_pred = model.predict(X_test, batch_size=1024)
    gru_for_stack.loc[test_index,["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
gru_for_stack.to_csv('../submits/vanilla_gru_cv_10/fold_{i}.csv', index=False)

Train on 143613 samples, validate on 15958 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.04447, saving model to ../models/vanilla_gru_cv_10/fold_1.best.hdf5
Epoch 2/20

Epoch 00002: val_loss improved from 0.04447 to 0.04133, saving model to ../models/vanilla_gru_cv_10/fold_1.best.hdf5
Epoch 3/20

Epoch 00003: val_loss did not improve
Train on 143614 samples, validate on 15957 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.04534, saving model to ../models/vanilla_gru_cv_10/fold_2.best.hdf5
Epoch 2/20

Epoch 00002: val_loss improved from 0.04534 to 0.04236, saving model to ../models/vanilla_gru_cv_10/fold_2.best.hdf5
Epoch 3/20

Epoch 00003: val_loss did not improve
Train on 143614 samples, validate on 15957 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.04242, saving model to ../models/vanilla_gru_cv_10/fold_3.best.hdf5
Epoch 2/20

Epoch 00002: val_loss improved from 0.04242 to 0.04032, saving model to ../models/vanilla_gru_cv_10/fo

In [6]:
gru_for_stack.head()

Unnamed: 0,id,identity_hate,insult,obscene,severe_toxic,threat,toxic
0,0000997932d777bf,2.7e-05,0.000102,9.7e-05,1.5e-05,3.7e-05,0.000405
1,000103f0d9cfb60f,6.2e-05,8.2e-05,0.000109,1.9e-05,1.5e-05,0.000741
2,000113f07ec002fd,6e-05,0.000136,0.000473,7.6e-05,9e-05,0.001095
3,0001b41b1c6bb37e,1.8e-05,5.9e-05,9.4e-05,2.2e-05,1.8e-05,0.000213
4,0001d958c54c6e35,0.0004,0.004207,0.001973,0.000241,0.000584,0.034808


In [9]:
model.load_weights("../models/vanilla_gru_cv_10/fold_1.best.hdf5")
y_pred = model.predict(x_test, batch_size=1024)

In [10]:
%%time
for i in range(2,11):
    model.load_weights(f"../models/vanilla_gru_cv_10/fold_{i}.best.hdf5")
    y_pred *= model.predict(x_test, batch_size=1024)


CPU times: user 2min, sys: 20.7 s, total: 2min 20s
Wall time: 2min 15s


In [11]:
submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred ** 0.1
submission.to_csv('../submits/vanilla_gru_cv_10/submitiongeomaverage.csv', index=False)

In [None]:
batch_size = 32
epochs = 1

X_tra, X_val, y_tra, y_val = train_test_split(x_train, y_train, train_size=0.95, random_state=233)
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

hist = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),
                 callbacks=[RocAuc], verbose=1)
model.save('../models/vanilla_Polled_GRU_onlypooling/2epoch.h5')

y_pred = model.predict(x_test, batch_size=1024)
submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
submission.to_csv('../submits/vanilla_Polled_GRU_onlypooling/2epoch.csv', index=False)

In [8]:
batch_size = 32
epochs = 1

X_tra, X_val, y_tra, y_val = train_test_split(x_train, y_train, train_size=0.95, random_state=233)
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

hist = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),
                 callbacks=[RocAuc], verbose=1)
model.save('../models/vanilla_Polled_GRU_onlypooling/2epoch.h5')

y_pred = model.predict(x_test, batch_size=1024)
submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
submission.to_csv('../submits/vanilla_Polled_GRU_onlypooling/2epoch.csv', index=False)

Train on 151592 samples, validate on 7979 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.989550 



In [9]:
batch_size = 32
epochs = 1

X_tra, X_val, y_tra, y_val = train_test_split(x_train, y_train, train_size=0.95, random_state=233)
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

hist = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),
                 callbacks=[RocAuc], verbose=1)
model.save('../models/vanilla_Polled_GRU_onlypooling/3epoch.h5')

y_pred = model.predict(x_test, batch_size=1024)
submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
submission.to_csv('../submits/vanilla_Polled_GRU_onlypooling/3epoch.csv', index=False)

Train on 151592 samples, validate on 7979 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.988765 



In [10]:
batch_size = 32
epochs = 1

X_tra, X_val, y_tra, y_val = train_test_split(x_train, y_train, train_size=0.95, random_state=233)
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

hist = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),
                 callbacks=[RocAuc], verbose=1)
model.save('../models/vanilla_Polled_GRU_onlypooling/4epoch.h5')

y_pred = model.predict(x_test, batch_size=1024)
submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
submission.to_csv('../submits/vanilla_Polled_GRU_onlypooling/4epoch.csv', index=False)

Train on 151592 samples, validate on 7979 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.988463 

