In [1]:
# In this notebook we fit a bidirectional GRU neural network on pre-trained GloVe
# embeddings and evaluate its results against a held-out validation set.

import numpy as np
np.random.seed(666)
import pandas as pd
import csv

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback, ModelCheckpoint


import warnings
warnings.filterwarnings('ignore')

Using TensorFlow backend.


In [2]:
# Hyper-parameters shared by the tokenizer, the embedding matrix and the model.
max_features = 50000   # vocabulary cap handed to the tokenizer (num_words)
maxlen = 150           # every comment is padded / truncated to this many tokens
embed_size = 300       # width of each embedding vector

# Raw competition data.
train = pd.read_csv('inputs/train.csv')
test = pd.read_csv('inputs/test.csv')

# Comment texts (missing comments are replaced by the literal string "fillna")
# and the six binary label columns.
X_train = train["comment_text"].fillna("fillna").values
X_test = test["comment_text"].fillna("fillna").values
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values

# The vocabulary is fitted on train and test comments together.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train) + list(X_test))

# From here on X_train / X_test hold integer token sequences instead of raw text.
X_train, X_test = [tokenizer.texts_to_sequences(corpus) for corpus in (X_train, X_test)]

# Fixed-length model inputs (note the lower-case names).
x_train, x_test = [sequence.pad_sequences(seqs, maxlen=maxlen) for seqs in (X_train, X_test)]

In [30]:
embedding_path = 'inputs/glove.840B.300d.txt'

# Each line of the file is "<token> <v1> <v2> ...": column 0 becomes the index and
# the remaining columns form the vector.
#  * quoting=csv.QUOTE_NONE keeps tokens that are (or contain) quote characters literal.
#  * na_filter=False stops pandas from converting tokens such as "null", "nan" or
#    "NA" into a NaN index label, which would make those words impossible to look up.
embeddings = pd.read_table(embedding_path, sep=" ", index_col=0, header=None,
                           quoting=csv.QUOTE_NONE, na_filter=False)

# Global statistics over every embedding component.
embedding_values = embeddings.values
emb_mean, emb_std = np.mean(embedding_values), np.std(embedding_values)

print(emb_mean)

-0.00583849458846


In [31]:
from datetime import datetime
now = datetime.now()  # timestamp taken before the (slow) matrix construction

word_index = tokenizer.word_index
# Tokenizer word indices start at 1, so a vocabulary of N words needs N + 1 rows
# (row 0 stays all-zero). Without the "+ 1" a vocabulary smaller than max_features
# would make the last word's index fall outside the matrix.
nb_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.zeros((nb_words, embed_size))

known_words = embeddings.index
p = 0  # number of in-vocabulary words processed so far (progress counter)
for word, i in word_index.items():
    # Indices at or beyond max_features have no row in the matrix.
    if i >= max_features:
        continue
    # Words missing from the pre-trained table keep their all-zero row.
    if word in known_words:
        # `.values` replaces Series.as_matrix(), which was removed in pandas 1.0.
        embedding_matrix[i] = embeddings.loc[word].values
    p = p + 1
    if p % 5000 == 0:
        print(word)

meetup
retrieve
xyz
pogroms
binomial
gaston
inquiring
engulfed
раз


In [32]:
def get_model():
    """Build, compile and summarise the bidirectional-GRU classifier.

    Architecture: embedding (initialised from the module-level `embedding_matrix`)
    -> spatial dropout -> bidirectional GRU returning the full sequence ->
    concatenated global average and max pooling -> 6 sigmoid outputs.

    Reads the module-level `maxlen` and `embedding_matrix`.

    Returns:
        The compiled Keras `Model` (binary cross-entropy loss, Adam optimizer).
        As a side effect the layer summary is printed.
    """
    # Take both embedding dimensions from the weight matrix itself so the layer
    # always matches the weights it is initialised with, even when the matrix
    # has fewer rows than max_features.
    vocab_size, vector_size = embedding_matrix.shape

    inp = Input(shape=(maxlen, ))
    x = Embedding(vocab_size, vector_size, weights=[embedding_matrix])(inp)
    x = SpatialDropout1D(0.2)(x)
    x = Bidirectional(GRU(80, return_sequences=True))(x)

    # Summarise the GRU output sequence in two ways and use both.
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])

    # One independent sigmoid per label.
    outp = Dense(6, activation="sigmoid")(conc)

    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    model.summary()

    return model

# Build the network once; get_model() also prints the layer summary shown below.
model = get_model()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 150)          0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 150, 300)     15000000    input_3[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_3 (SpatialDro (None, 150, 300)     0           embedding_3[0][0]                
__________________________________________________________________________________________________
bidirectional_3 (Bidirectional) (None, 150, 160)     182880      spatial_dropout1d_3[0][0]        
__________________________________________________________________________________________________
global_ave

In [25]:
def get_callbacks(filepath):
    """Return the training callbacks as a list.

    The list holds a single ModelCheckpoint that writes to `filepath` and is
    created with save_best_only=True.
    """
    return [ModelCheckpoint(filepath, save_best_only=True)]



class RocAucEvaluation(Callback):
    """Keras callback that prints the validation ROC-AUC at the end of epochs.

    Args:
        validation_data: an ``(X_val, y_val)`` pair; unpacking anything that is
            not a two-item iterable (including the default ``()``) raises
            ValueError.
        interval: the score is computed on epochs whose zero-based index is a
            multiple of this value.
    """

    def __init__(self, validation_data=(), interval=1):
        # super(Callback, self) would start the lookup *after* Callback and so
        # never run Callback.__init__; name this class to initialise the base.
        super(RocAucEvaluation, self).__init__()
        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the validation inputs and print the ROC-AUC score.

        `logs` is accepted for Keras API compatibility and is not used; it
        defaults to None rather than a shared mutable dict.
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            # Keras passes a zero-based epoch; report it one-based like the progress log.
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

In [33]:
batch_size = 32
epochs = 2

# Weights file used by the checkpoint callback. It is defined here, before its
# first use, so the notebook runs top to bottom on a fresh kernel.
file_path = "Model_W/glove_80_D2_LR001_F50.hdf5"

# Hold out 5% of the training data for validation (fixed seed for a stable split).
X_tra, X_val, y_tra, y_val = train_test_split(x_train, y_train, train_size=0.95,
                                              random_state=233)

RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)
callbacks = get_callbacks(filepath=file_path)

# To resume from previously saved weights instead of starting fresh:
#model.load_weights(filepath=file_path)

# Pass the checkpoint callback(s) as well as the ROC-AUC reporter; previously
# `callbacks` was built but never handed to fit(), so nothing was checkpointed
# and an interrupted run lost its progress.
hist = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs,
                 validation_data=(X_val, y_val),
                 callbacks=callbacks + [RocAuc])

Train on 151592 samples, validate on 7979 samples
Epoch 1/2
 ROC-AUC - epoch: 1 - score: 0.988144 

Epoch 2/2

KeyboardInterrupt: 

In [34]:
# Persist the current weights of `model` to disk.
file_path = "Model_W/glove_80_D2_LR001_F50.hdf5"
model.save_weights(file_path)

In [None]:
# Score the padded test sequences in batches of 600, showing a progress bar.
y_pred = model.predict(x_test, batch_size=600, verbose=1)

In [16]:
# Start from the sample submission file: it supplies the `id` column and the
# six label columns that are filled with predictions afterwards.
submission = pd.read_csv('inputs/sample_submission.csv')
submission.head(5)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.5,0.5,0.5,0.5,0.5,0.5
1,0000247867823ef7,0.5,0.5,0.5,0.5,0.5,0.5
2,00013b17ad220c46,0.5,0.5,0.5,0.5,0.5,0.5
3,00017563c3f7919a,0.5,0.5,0.5,0.5,0.5,0.5
4,00017695ad8997eb,0.5,0.5,0.5,0.5,0.5,0.5


In [17]:
label_cols = ["toxic", "severe_toxic", "obscene", "threat",
              "insult", "identity_hate"]

# Fail loudly if the predictions do not line up with the submission rows and
# label columns, instead of letting index alignment fill the gaps with NaN.
expected_shape = (len(submission), len(label_cols))
if y_pred.shape != expected_shape:
    raise ValueError("y_pred has shape %s, expected %s" % (y_pred.shape, expected_shape))

# Reuse the submission's own index so rows are matched one-to-one by position.
submission[label_cols] = pd.DataFrame(y_pred, index=submission.index)
submission.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.992737,0.675259,0.979341,0.067553,0.920044,0.133756
1,0000247867823ef7,0.000337,7e-06,0.000124,1.8e-05,0.000116,1.6e-05
2,00013b17ad220c46,0.001146,2.3e-05,0.000303,1.3e-05,0.00017,4.7e-05
3,00017563c3f7919a,0.000252,5e-06,0.000134,1.1e-05,8.2e-05,7e-06
4,00017695ad8997eb,0.002492,4.4e-05,0.000471,8.1e-05,0.000345,4.1e-05


In [18]:
# Write the filled-in submission; index=False keeps the row index out of the CSV.
submission.to_csv("./submissions/glove_v1.csv", index=False)