In [1]:
import numpy as np
np.random.seed(42)
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate, Dropout
from keras.layers import CuDNNGRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D, GRU, Conv1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback
from keras.optimizers import Adam

import warnings
warnings.filterwarnings('ignore')



EMBEDDING_FILE = '../data/crawl-300d-2M.vec'

train = pd.read_csv('../data/train.csv.zip')
test = pd.read_csv('../data/test.csv.zip')
submission = pd.read_csv('../data/sample_submission.csv.zip')

X_train = train["comment_text"].fillna("fillna").values
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values
X_test = test["comment_text"].fillna("fillna").values

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
max_features = 100000
maxlen = 150
embed_size = 300

tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train) + list(X_test))
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
x_train = sequence.pad_sequences(X_train, maxlen=maxlen)
x_test = sequence.pad_sequences(X_test, maxlen=maxlen)


def get_coefs(word, *arr): return word, np.asarray(arr, dtype='float32')
embeddings_index_crawl = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in open(EMBEDDING_FILE))

In [3]:
word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.zeros((nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index_crawl.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

In [10]:
class RocAucEvaluation(Callback):
    def __init__(self, validation_data=(), interval=1):
        super(Callback, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))


def get_model():
    sequence_input = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size, weights=[embedding_matrix],trainable = False)(sequence_input)
    x = SpatialDropout1D(0.2)(x)
    x = Bidirectional(GRU(128, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
    x = Conv1D(128, kernel_size = 3, padding = "valid", kernel_initializer = "glorot_uniform")(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    x = concatenate([avg_pool, max_pool]) 
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.5)(x)
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.5)(x)
    preds = Dense(6, activation="sigmoid")(x)
    model = Model(sequence_input, preds)
    model.compile(loss='binary_crossentropy',optimizer=Adam(lr=1e-3),metrics=['accuracy'])
    return model

model = get_model()

In [6]:
%%time
batch_size = 300
epochs = 10

X_tra, X_val, y_tra, y_val = train_test_split(x_train, y_train, train_size=0.95, random_state=233)
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)
for i in range(1,epochs+1):
    hist = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=1, validation_data=(X_val, y_val),
                     callbacks=[RocAuc], verbose=1)
    model.save(f'../models/Polled_gru_with_conv/{i}epoch.h5')

    y_pred = model.predict(x_test, batch_size=1024)
    submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
    submission.to_csv(f'../submits/Polled_gru_with_conv/{i}epoch.csv', index=False)

Train on 151592 samples, validate on 7979 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.983590 

Train on 151592 samples, validate on 7979 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.988172 

Train on 151592 samples, validate on 7979 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.989411 

Train on 151592 samples, validate on 7979 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.989388 

Train on 151592 samples, validate on 7979 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.989626 

Train on 151592 samples, validate on 7979 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.989843 

Train on 151592 samples, validate on 7979 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.990204 

Train on 151592 samples, validate on 7979 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.989987 

Train on 151592 samples, validate on 7979 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.990300 

Train on 151592 samples, validate on 7979 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0

In [None]:
%%time
batch_size = 300
epochs = 10

X_tra, X_val, y_tra, y_val = train_test_split(x_train, y_train, train_size=0.95, random_state=233)
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)
for i in range(1,epochs+1):
    hist = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=1, validation_data=(X_val, y_val),
                     callbacks=[RocAuc], verbose=1)
    model.save(f'../models/Polled_gru_with_conv_and_dense/{i}epoch.h5')

    y_pred = model.predict(x_test, batch_size=2048)
    submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
    submission.to_csv(f'../submits/Polled_gru_with_conv_and_dense/{i}epoch.csv', index=False)

Train on 151592 samples, validate on 7979 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.977701 

Train on 151592 samples, validate on 7979 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.983930 

Train on 151592 samples, validate on 7979 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.984834 

Train on 151592 samples, validate on 7979 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.986827 

Train on 151592 samples, validate on 7979 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.986717 

Train on 151592 samples, validate on 7979 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.988229 

Train on 151592 samples, validate on 7979 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.989507 

Train on 151592 samples, validate on 7979 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.989258 

Train on 151592 samples, validate on 7979 samples
Epoch 1/1

In [9]:
batch_size = 128
epochs = 2

X_tra, X_val, y_tra, y_val = train_test_split(x_train, y_train, train_size=0.95, random_state=233)
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

hist = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),
                 callbacks=[RocAuc])

model.save('../models/Polled_gru_double_GRU_0.4/6epoch.h5')

y_pred = model.predict(x_test, batch_size=1024)
submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
submission.to_csv('../submits/Polled_gru_double_GRU_0.4/6epoch.csv', index=False)

Train on 151592 samples, validate on 7979 samples
Epoch 1/2

 ROC-AUC - epoch: 1 - score: 0.987003 

Epoch 2/2

 ROC-AUC - epoch: 2 - score: 0.986401 



In [11]:
batch_size = 256
epochs = 2

X_tra, X_val, y_tra, y_val = train_test_split(x_train, y_train, train_size=0.95, random_state=233)
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

hist = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),
                 callbacks=[RocAuc])

model.save('../models/Polled_gru_double_GRU_0.4/8epoch.h5')

y_pred = model.predict(x_test, batch_size=1024)
submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
submission.to_csv('../submits/Polled_gru_double_GRU_0.4/8epoch.csv', index=False)

Train on 151592 samples, validate on 7979 samples
Epoch 1/2

 ROC-AUC - epoch: 1 - score: 0.986093 

Epoch 2/2

 ROC-AUC - epoch: 2 - score: 0.985398 



KeyboardInterrupt: 

In [None]:
batch_size = 512
epochs = 2

X_tra, X_val, y_tra, y_val = train_test_split(x_train, y_train, train_size=0.95, random_state=233)
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

hist = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),
                 callbacks=[RocAuc])

model.save('../models/Polled_gru_double_GRU_0.4/10epoch.h5')

y_pred = model.predict(x_test, batch_size=1024)
submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
submission.to_csv('../submits/Polled_gru_double_GRU_0.4/10epoch.csv', index=False)