In [1]:
import numpy as np
np.random.seed(42)
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from keras.models import Model ,Sequential
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate, SpatialDropout2D, Dropout, Flatten
from keras.layers import CuDNNGRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import KFold
from keras import regularizers
import warnings

warnings.filterwarnings('ignore')



# Pretrained 300-d word vectors (presumably fastText Common Crawl, judging by the file name).
EMBEDDING_FILE = '../data/crawl-300d-2M.vec'

# Read the three competition CSVs in one pass; order matters for the unpacking.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/train.csv.zip',
        '../data/test.csv.zip',
        '../data/sample_submission.csv.zip',
    )
)

# Raw comment text; missing comments are replaced by the literal token "fillna".
X_train = train["comment_text"].fillna("fillna").values
X_test = test["comment_text"].fillna("fillna").values

# Six binary target columns, one per toxicity label.
y_train = train[
    ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
].values

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
from keras import regularizers

In [3]:
from keras.models import Model ,Sequential

In [4]:
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate, Reshape, SpatialDropout2D, Dropout, Flatten

In [5]:
# Vocabulary cap, padded sequence length and embedding width shared by every model below.
max_features, maxlen, embed_size = 100000, 650, 300

# Fit the vocabulary on train and test comments together.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts([*X_train, *X_test])

# NOTE: X_train / X_test are rebound from raw text to integer sequences here,
# so this cell is not idempotent -- re-run the loading cell first if repeating it.
X_train, X_test = (
    tokenizer.texts_to_sequences(corpus) for corpus in (X_train, X_test)
)

# Pad / truncate every sequence to exactly `maxlen` tokens.
x_train = sequence.pad_sequences(X_train, maxlen=maxlen)
x_test = sequence.pad_sequences(X_test, maxlen=maxlen)


def get_coefs(word, *arr):
    """Pair a token with its coefficients.

    Args:
        word: The token (first field of an embedding line).
        *arr: Remaining fields; converted to a float32 numpy array.

    Returns:
        Tuple ``(word, vector)`` where ``vector`` is a 1-D float32 array.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Parse the pretrained vector file into {word: float32 vector}.
# - The file is opened in a `with` block (and with an explicit encoding) so the
#   handle is closed deterministically instead of being leaked.
# - Rows whose vector length differs from `embed_size` are skipped. fastText
#   `.vec` files start with a "<count> <dim>" header line, which would otherwise
#   be stored as a word with a 1-element vector and later be broadcast across a
#   whole row of the embedding matrix.
embeddings_index = {}
with open(EMBEDDING_FILE, encoding='utf-8') as embedding_file:
    for line in embedding_file:
        word, vector = get_coefs(*line.rstrip().rsplit(' '))
        if vector.shape[0] != embed_size:
            continue
        embeddings_index[word] = vector

# Build the initial weight matrix for the Embedding layer: row i holds the
# pretrained vector of the word with tokenizer index i; unknown words stay zero.
word_index = tokenizer.word_index
# Kept as originally defined in case other cells read it.
nb_words = min(max_features, len(word_index))
# The models construct Embedding(max_features, embed_size, weights=[embedding_matrix]),
# so the matrix must have exactly `max_features` rows. Sizing it with `nb_words`
# while guarding with `i >= max_features` raised IndexError (tokenizer indices
# start at 1) whenever the vocabulary was smaller than `max_features`.
embedding_matrix = np.zeros((max_features, embed_size))
for word, i in word_index.items():
    # Indices at or beyond max_features are never emitted by the tokenizer (num_words cap).
    if i >= max_features:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [6]:
class RocAucEvaluation(Callback):
    """Keras callback that prints the validation ROC-AUC at the end of epochs.

    Args:
        validation_data: ``(X_val, y_val)`` pair used for scoring. Anything
            that does not unpack into exactly two items raises ``ValueError``.
        interval: Score every ``interval``-th epoch (by 0-based epoch index).
    """

    def __init__(self, validation_data=(), interval=1):
        # `super(Callback, self)` started the MRO lookup *after* Callback and
        # therefore skipped Callback.__init__; call the real parent initialiser.
        super().__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the held-out data and print the ROC-AUC score.

        `logs` defaults to None rather than a shared mutable dict; it is not
        read or modified here.
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            # Keras passes a 0-based epoch index; report it 1-based like the progress bar.
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))


def get_model(units=80):
    """Build and compile a single-layer bidirectional GRU classifier.

    The padded token ids are embedded (initialised from `embedding_matrix`),
    passed through spatial dropout and one bidirectional CuDNNGRU, then
    average- and max-pooled over time before a 6-way sigmoid output.

    Args:
        units: Hidden size of the GRU in each direction.

    Returns:
        A compiled Keras `Model` (binary cross-entropy, Adam with lr=0.005).
    """
    tokens = Input(shape=(maxlen, ))
    embedded = Embedding(max_features, embed_size, weights=[embedding_matrix])(tokens)
    embedded = SpatialDropout1D(0.2)(embedded)
    sequence_features = Bidirectional(CuDNNGRU(units, return_sequences=True))(embedded)

    # Summarise the whole sequence with both its mean and its max activation.
    pooled = concatenate([
        GlobalAveragePooling1D()(sequence_features),
        GlobalMaxPooling1D()(sequence_features),
    ])
    probabilities = Dense(6, activation="sigmoid")(pooled)

    model = Model(inputs=tokens, outputs=probabilities)
    model.compile(
        loss='binary_crossentropy',
        optimizer=Adam(lr=0.005),
        metrics=['accuracy'],
    )
    return model
#model = get_model()

In [7]:
def get_pool(unots=80, reg=0.001):
    """Build and compile a two-layer bidirectional GRU with pooled + state features.

    Args:
        unots: Hidden size of each GRU direction (name kept as-is for callers).
        reg: Accepted but not used anywhere in the body.

    Returns:
        A compiled Keras `Model` (binary cross-entropy, Adam with lr=0.005).
    """
    tokens = Input(shape=(maxlen, ))
    embedded = Embedding(max_features, embed_size, weights=[embedding_matrix])(tokens)

    first_pass = Bidirectional(CuDNNGRU(unots, return_sequences=True))(embedded)
    first_pass = SpatialDropout1D(0.5)(first_pass)

    # With return_state=True the layer yields a list: element 0 is the output
    # sequence, elements 1 and 2 are the extra state tensors (presumably the
    # final forward and backward states).
    second_pass = Bidirectional(
        CuDNNGRU(unots, return_sequences=True, return_state=True)
    )(first_pass)
    sequence_out = second_pass[0]

    features = concatenate([
        GlobalAveragePooling1D()(sequence_out),
        GlobalMaxPooling1D()(sequence_out),
        second_pass[1],
        second_pass[2],
    ])
    probabilities = Dense(6, activation="sigmoid")(features)

    model = Model(inputs=tokens, outputs=probabilities)
    model.compile(
        loss='binary_crossentropy',
        optimizer=Adam(lr=0.005),
        metrics=['accuracy'],
    )
    return model

In [8]:
def get(units = 80):
    """Build and compile a stacked bidirectional GRU as a `Sequential` model.

    Layers, in order: pretrained-initialised embedding, bidirectional GRU
    returning the full sequence, spatial dropout (0.5), a second bidirectional
    GRU returning only its final output, and a 6-way sigmoid head.

    Args:
        units: Hidden size of each GRU direction.

    Returns:
        A compiled Keras `Sequential` model (binary cross-entropy, Adam lr=0.005).
    """
    model = Sequential([
        Embedding(max_features, embed_size, weights=[embedding_matrix]),
        Bidirectional(CuDNNGRU(units, return_sequences=True)),
        SpatialDropout1D(0.5),
        Bidirectional(CuDNNGRU(units)),
        Dense(6, activation="sigmoid"),
    ])
    model.compile(
        loss='binary_crossentropy',
        optimizer=Adam(lr=0.005),
        metrics=['accuracy'],
    )
    return model

In [15]:
# Experiment: get_pool() with default size, batch 500, up to 5 epochs.
batch_size, epochs = 500, 5
model = get_pool()

# Fixed-seed 90/10 train/validation split so runs are comparable.
X_tra, X_val, y_tra, y_val = train_test_split(
    x_train, y_train, train_size=0.90, random_state=233
)
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

hist = model.fit(
    X_tra, y_tra,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
    callbacks=[RocAuc],
    verbose=1,
)
#model.save('../models/vanilla_Polled_GRU_onlypooling/2epoch.h5')

#y_pred = model.predict(x_test, batch_size=1024)
#submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
#submission.to_csv('../submits/vanilla_Polled_GRU_onlypooling/2epoch.csv', index=False)

Train on 143613 samples, validate on 15958 samples
Epoch 1/5

 ROC-AUC - epoch: 1 - score: 0.982955 

Epoch 2/5

 ROC-AUC - epoch: 2 - score: 0.986208 

Epoch 3/5

 ROC-AUC - epoch: 3 - score: 0.984967 

Epoch 4/5
  5000/143613 [>.............................] - ETA: 2:06 - loss: 0.0239 - acc: 0.9907

KeyboardInterrupt: 

In [8]:
# Experiment: get_pool() with default size, batch 300, up to 5 epochs.
batch_size, epochs = 300, 5
model = get_pool()

# Fixed-seed 90/10 train/validation split so runs are comparable.
X_tra, X_val, y_tra, y_val = train_test_split(
    x_train, y_train, train_size=0.90, random_state=233
)
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

hist = model.fit(
    X_tra, y_tra,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
    callbacks=[RocAuc],
    verbose=1,
)

Train on 143613 samples, validate on 15958 samples
Epoch 1/5

 ROC-AUC - epoch: 1 - score: 0.983991 

Epoch 2/5

 ROC-AUC - epoch: 2 - score: 0.986527 

Epoch 3/5

 ROC-AUC - epoch: 3 - score: 0.985882 

Epoch 4/5

 ROC-AUC - epoch: 4 - score: 0.985036 

Epoch 5/5
 12600/143613 [=>............................] - ETA: 2:17 - loss: 0.0233 - acc: 0.9910

KeyboardInterrupt: 

In [None]:
# Experiment: get_pool(60), batch 300, up to 5 epochs.
batch_size, epochs = 300, 5
model = get_pool(60)

# Fixed-seed 90/10 train/validation split so runs are comparable.
X_tra, X_val, y_tra, y_val = train_test_split(
    x_train, y_train, train_size=0.90, random_state=233
)
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

hist = model.fit(
    X_tra, y_tra,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
    callbacks=[RocAuc],
    verbose=1,
)

Train on 143613 samples, validate on 15958 samples
Epoch 1/5

 ROC-AUC - epoch: 1 - score: 0.987218 

Epoch 2/5

In [None]:
# Experiment: get_pool(40), batch 500, up to 5 epochs.
batch_size, epochs = 500, 5
model = get_pool(40)

# Fixed-seed 90/10 train/validation split so runs are comparable.
X_tra, X_val, y_tra, y_val = train_test_split(
    x_train, y_train, train_size=0.90, random_state=233
)
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

hist = model.fit(
    X_tra, y_tra,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
    callbacks=[RocAuc],
    verbose=1,
)

In [None]:
# Experiment: get_pool(25), batch 500, up to 5 epochs.
batch_size, epochs = 500, 5
model = get_pool(25)

# Fixed-seed 90/10 train/validation split so runs are comparable.
X_tra, X_val, y_tra, y_val = train_test_split(
    x_train, y_train, train_size=0.90, random_state=233
)
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

hist = model.fit(
    X_tra, y_tra,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
    callbacks=[RocAuc],
    verbose=1,
)

In [17]:
# Experiment: get_pool(40), batch 400, up to 5 epochs.
batch_size, epochs = 400, 5
model = get_pool(40)

# Fixed-seed 90/10 train/validation split so runs are comparable.
X_tra, X_val, y_tra, y_val = train_test_split(
    x_train, y_train, train_size=0.90, random_state=233
)
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

hist = model.fit(
    X_tra, y_tra,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
    callbacks=[RocAuc],
    verbose=1,
)

Train on 143613 samples, validate on 15958 samples
Epoch 1/5

 ROC-AUC - epoch: 1 - score: 0.988462 

Epoch 2/5

 ROC-AUC - epoch: 2 - score: 0.987222 

Epoch 3/5

KeyboardInterrupt: 

In [21]:
# Experiment: get_pool(25), batch 400, up to 5 epochs.
batch_size, epochs = 400, 5
model = get_pool(25)

# Fixed-seed 90/10 train/validation split so runs are comparable.
X_tra, X_val, y_tra, y_val = train_test_split(
    x_train, y_train, train_size=0.90, random_state=233
)
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

hist = model.fit(
    X_tra, y_tra,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
    callbacks=[RocAuc],
    verbose=1,
)

Train on 143613 samples, validate on 15958 samples
Epoch 1/5

 ROC-AUC - epoch: 1 - score: 0.983103 

Epoch 2/5

 ROC-AUC - epoch: 2 - score: 0.985528 

Epoch 3/5

 ROC-AUC - epoch: 3 - score: 0.983477 

Epoch 4/5

KeyboardInterrupt: 

In [None]:
# Experiment: get_pool(25), batch 300, up to 5 epochs.
batch_size, epochs = 300, 5
model = get_pool(25)

# Fixed-seed 90/10 train/validation split so runs are comparable.
X_tra, X_val, y_tra, y_val = train_test_split(
    x_train, y_train, train_size=0.90, random_state=233
)
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

hist = model.fit(
    X_tra, y_tra,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
    callbacks=[RocAuc],
    verbose=1,
)

In [14]:
# Experiment: batch 400, up to 5 epochs, on whatever `model` already exists.
batch_size, epochs = 400, 5
# NOTE(review): model creation is commented out, so this cell depends on a
# `model` left in the kernel by an earlier cell -- confirm which one was intended.
#model = get(100)

# Fixed-seed 90/10 train/validation split so runs are comparable.
X_tra, X_val, y_tra, y_val = train_test_split(
    x_train, y_train, train_size=0.90, random_state=233
)
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

hist = model.fit(
    X_tra, y_tra,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
    callbacks=[RocAuc],
    verbose=1,
)

Train on 143613 samples, validate on 15958 samples
Epoch 1/5

 ROC-AUC - epoch: 1 - score: 0.989048 

Epoch 2/5

 ROC-AUC - epoch: 2 - score: 0.988309 

Epoch 3/5

 ROC-AUC - epoch: 3 - score: 0.986570 

Epoch 4/5
 21200/143613 [===>..........................] - ETA: 1:14 - loss: 0.0145 - acc: 0.9947

KeyboardInterrupt: 

In [None]:
# Experiment: stacked Sequential GRU get(125), batch 500, up to 5 epochs.
batch_size, epochs = 500, 5
model = get(125)

# Fixed-seed 90/10 train/validation split so runs are comparable.
X_tra, X_val, y_tra, y_val = train_test_split(
    x_train, y_train, train_size=0.90, random_state=233
)
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

hist = model.fit(
    X_tra, y_tra,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
    callbacks=[RocAuc],
    verbose=1,
)

In [6]:
# Preview the first rows of `gru_for_stack`.
# NOTE(review): `gru_for_stack` is not defined in any cell visible here -- presumably a
# DataFrame of saved predictions loaded in a since-removed cell; confirm before re-running.
gru_for_stack.head()

Unnamed: 0,id,identity_hate,insult,obscene,severe_toxic,threat,toxic
0,0000997932d777bf,2.7e-05,0.000102,9.7e-05,1.5e-05,3.7e-05,0.000405
1,000103f0d9cfb60f,6.2e-05,8.2e-05,0.000109,1.9e-05,1.5e-05,0.000741
2,000113f07ec002fd,6e-05,0.000136,0.000473,7.6e-05,9e-05,0.001095
3,0001b41b1c6bb37e,1.8e-05,5.9e-05,9.4e-05,2.2e-05,1.8e-05,0.000213
4,0001d958c54c6e35,0.0004,0.004207,0.001973,0.000241,0.000584,0.034808


In [9]:
# Start the 10-fold ensemble: load the best fold-1 checkpoint and predict on the test set.
# (A stray "0.01" in front of `model` made this line a SyntaxError; it has been removed.)
# NOTE(review): `model` must already exist in the kernel with the architecture that
# produced these checkpoints -- it is not created in this cell.
model.load_weights("../models/vanilla_gru_cv_10/fold_1.best.hdf5")
y_pred = model.predict(x_test, batch_size=1024)

In [10]:
%%time
# Multiply the fold-1 predictions in `y_pred` by the predictions of folds 2..10,
# leaving the element-wise product of all ten folds in `y_pred`.
#
# Keras predictions are float32 by default, and a product of ten probabilities
# around 1e-5 is ~1e-50 -- below the smallest float32 value -- so confident
# negatives would all collapse to exactly 0 and lose their ranking. Accumulating
# in float64 keeps those products representable.
# NOTE: this cell mutates `y_pred` in place; re-run the fold-1 cell before repeating it.
y_pred = np.asarray(y_pred, dtype=np.float64)
for i in range(2,11):
    model.load_weights(f"../models/vanilla_gru_cv_10/fold_{i}.best.hdf5")
    y_pred *= model.predict(x_test, batch_size=1024)


CPU times: user 2min, sys: 20.7 s, total: 2min 20s
Wall time: 2min 15s


In [11]:
# `y_pred` holds the product of ten fold predictions, so raising it to 0.1
# (the 10th root) gives their geometric mean, which is written as the submission.
submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = np.power(y_pred, 0.1)
submission.to_csv(
    '../submits/vanilla_gru_cv_10/submitiongeomaverage.csv',
    index=False,
)

In [None]:
# One more epoch at batch size 32 on the `model` already in the kernel (this
# cell does not create one), then checkpoint it and write a test-set submission.
batch_size, epochs = 32, 1

# Fixed-seed 95/5 train/validation split.
X_tra, X_val, y_tra, y_val = train_test_split(
    x_train, y_train, train_size=0.95, random_state=233
)
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

hist = model.fit(
    X_tra, y_tra,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
    callbacks=[RocAuc],
    verbose=1,
)
model.save('../models/vanilla_Polled_GRU_onlypooling/2epoch.h5')

# Predict the six label probabilities for the test set and save them.
y_pred = model.predict(x_test, batch_size=1024)
submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
submission.to_csv(
    '../submits/vanilla_Polled_GRU_onlypooling/2epoch.csv',
    index=False,
)

In [8]:
# One more epoch at batch size 32 on the `model` already in the kernel (this
# cell does not create one), then checkpoint it and write a test-set submission.
batch_size, epochs = 32, 1

# Fixed-seed 95/5 train/validation split.
X_tra, X_val, y_tra, y_val = train_test_split(
    x_train, y_train, train_size=0.95, random_state=233
)
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

hist = model.fit(
    X_tra, y_tra,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
    callbacks=[RocAuc],
    verbose=1,
)
model.save('../models/vanilla_Polled_GRU_onlypooling/2epoch.h5')

# Predict the six label probabilities for the test set and save them.
y_pred = model.predict(x_test, batch_size=1024)
submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
submission.to_csv(
    '../submits/vanilla_Polled_GRU_onlypooling/2epoch.csv',
    index=False,
)

Train on 151592 samples, validate on 7979 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.989550 



In [9]:
# One more epoch at batch size 32 on the `model` already in the kernel (this
# cell does not create one), then checkpoint it and write a test-set submission.
batch_size, epochs = 32, 1

# Fixed-seed 95/5 train/validation split.
X_tra, X_val, y_tra, y_val = train_test_split(
    x_train, y_train, train_size=0.95, random_state=233
)
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

hist = model.fit(
    X_tra, y_tra,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
    callbacks=[RocAuc],
    verbose=1,
)
model.save('../models/vanilla_Polled_GRU_onlypooling/3epoch.h5')

# Predict the six label probabilities for the test set and save them.
y_pred = model.predict(x_test, batch_size=1024)
submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
submission.to_csv(
    '../submits/vanilla_Polled_GRU_onlypooling/3epoch.csv',
    index=False,
)

Train on 151592 samples, validate on 7979 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.988765 



In [10]:
# One more epoch at batch size 32 on the `model` already in the kernel (this
# cell does not create one), then checkpoint it and write a test-set submission.
batch_size, epochs = 32, 1

# Fixed-seed 95/5 train/validation split.
X_tra, X_val, y_tra, y_val = train_test_split(
    x_train, y_train, train_size=0.95, random_state=233
)
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

hist = model.fit(
    X_tra, y_tra,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
    callbacks=[RocAuc],
    verbose=1,
)
model.save('../models/vanilla_Polled_GRU_onlypooling/4epoch.h5')

# Predict the six label probabilities for the test set and save them.
y_pred = model.predict(x_test, batch_size=1024)
submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
submission.to_csv(
    '../submits/vanilla_Polled_GRU_onlypooling/4epoch.csv',
    index=False,
)

Train on 151592 samples, validate on 7979 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.988463 

