In [15]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, CuDNNLSTM, CuDNNGRU, GRU, Embedding, Dropout, Activation, BatchNormalization
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks

import tcc_data_preparation as tdp
from tcc_config import TccConfig

In [16]:
train_data = tdp.load_data(TccConfig.TRAIN_FILE)
test_data = tdp.load_data(TccConfig.COMP_FILE)

list_sentences_train = train_data["comment_text"].fillna(TccConfig.EMPTY).values
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train_data[list_classes].values
list_sentences_test = test_data["comment_text"].fillna(TccConfig.EMPTY).values

In [3]:
tokenizer = Tokenizer(num_words=TccConfig.VOCAB_SIZE)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_t = pad_sequences(list_tokenized_train, maxlen=TccConfig.MAX_LEN)
X_te = pad_sequences(list_tokenized_test, maxlen=TccConfig.MAX_LEN)

In [4]:
def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefs(*o.strip().split()) 
                        for o in open(os.path.join(TccConfig.INPUT_DIR, TccConfig.EMBEDDING_FILE)))

In [5]:
all_embs = np.stack(embeddings_index.values())
emb_mean,emb_std = all_embs.mean(), all_embs.std()

In [6]:
word_index = tokenizer.word_index
nb_words = min(TccConfig.VOCAB_SIZE, len(word_index))
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, TccConfig.EMBED_DIMS))
for word, i in word_index.items():
    if i >= TccConfig.VOCAB_SIZE: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

ROC AUC for CV in Keras see for details: https://gist.github.com/smly/d29d079100f8d81b905e

In [7]:
import logging
from sklearn.metrics import roc_auc_score
from keras.callbacks import Callback
from sklearn.model_selection import train_test_split

class RocAucEvaluation(Callback):
    def __init__(self, validation_data=(), interval=1):
        super(Callback, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: {:d} - score: {:.6f}".format(epoch, score))


Bidirectional LSTM with half-size embedding with two fully connected layers

In [8]:
inp = Input(shape=(TccConfig.MAX_LEN,))
x = Embedding(TccConfig.VOCAB_SIZE, TccConfig.EMBED_DIMS, weights=[embedding_matrix], trainable=True)(inp)
x = Bidirectional(CuDNNLSTM(50, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = BatchNormalization()(x)
x = Dense(50, activation="relu")(x)
#x = BatchNormalization()(x)
x = Dropout(0.1)(x)
x = Dense(6, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)

import keras.backend as K
def loss(y_true, y_pred):
     return K.binary_crossentropy(y_true, y_pred)
    
model.compile(loss=loss, optimizer='nadam', metrics=['accuracy'])

Now we're ready to fit out model! Use `validation_split` when for hyperparams tuning

In [9]:

def schedule(ind):
    a = [0.002,0.003, 0.000]
    return a[ind]
lr = callbacks.LearningRateScheduler(schedule)
[X_train, X_val, y_train, y_val] = train_test_split(X_t, y, train_size=0.95)

ra_val = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

model.fit(X_train, y_train, batch_size=64, epochs=3, validation_data=(X_val, y_val), callbacks=[lr, ra_val])
#model.fit(X_t, y, batch_size=64, epochs=3, callbacks=[lr])



Train on 151592 samples, validate on 7979 samples
Epoch 1/3
 ROC-AUC - epoch: 0 - score: 0.982002
Epoch 2/3
 ROC-AUC - epoch: 1 - score: 0.982788
Epoch 3/3
 ROC-AUC - epoch: 2 - score: 0.983533


<keras.callbacks.History at 0x7f55f0c186d8>

And finally, get predictions for the test set and prepare a submission CSV:

In [10]:
y_test = model.predict([X_te], batch_size=1024, verbose=1)



In [12]:
#sub_data = tdp.load_data(TccConfig.SUB_FILE)

sub_data = pd.read_csv(filepath_or_buffer=os.path.join(TccConfig.INPUT_DIR, TccConfig.SUB_FILE), 
                       sep=',', header=0, index_col=None)


sub_data[list_classes] = y_test

#test_data.to_csv('submission.csv', index=False)

In [13]:
sub_data

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.997731,2.468449e-01,0.972719,6.654608e-02,0.880693,1.145393e-01
1,0000247867823ef7,0.000190,2.367500e-09,0.000039,6.872422e-08,0.000020,3.302660e-06
2,00013b17ad220c46,0.000194,2.602408e-09,0.000011,2.587083e-07,0.000011,2.145531e-06
3,00017563c3f7919a,0.001626,8.106312e-09,0.000117,3.899091e-07,0.000166,4.575241e-06
4,00017695ad8997eb,0.001889,4.746978e-08,0.000170,2.122937e-06,0.000155,1.199651e-05
5,0001ea8717f6de06,0.001724,2.523314e-08,0.000140,5.362678e-06,0.000162,2.092952e-05
6,00024115d4cbde0f,0.012364,4.646811e-07,0.000612,1.256447e-05,0.002004,4.047031e-05
7,000247e83dcc1211,0.470873,1.416247e-04,0.015386,9.666924e-05,0.063967,2.309555e-03
8,00025358d4737918,0.017824,4.428366e-06,0.002185,6.219179e-05,0.003907,4.191853e-05
9,00026d1092fe71cc,0.008214,2.171432e-07,0.002090,3.474956e-06,0.000906,2.129035e-05


In [14]:
sub_data.to_csv('submission.csv', index=False)

In [19]:
test_data["comment_text"]

0         Yo bitch Ja Rule is more succesful then you'll...
1         == From RfC == \n\n The title is fine as it is...
2         " \n\n == Sources == \n\n * Zawe Ashton on Lap...
3         :If you have a look back at the source, the in...
4                 I don't anonymously edit articles at all.
5         Thank you for understanding. I think very high...
6         Please do not add nonsense to Wikipedia. Such ...
7                          :Dear god this site is horrible.
8         " \n Only a fool can believe in such numbers. ...
9         == Double Redirects == \n\n When fixing double...
10        I think its crap that the link to roggenbier i...
11        "::: Somebody will invariably try to add Relig...
12        , 25 February 2010 (UTC) \n\n :::Looking it ov...
13        " \n\n It says it right there that it IS a typ...
14        " \n\n == Before adding a new product to the l...
15        ==Current Position== \n Anyone have confirmati...
16                                 this 

In [None]:
["comment_text"]