In [1]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, GRU, Embedding, Dropout, Activation, BatchNormalization
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks

import data_preparation as mdp
from data_preparation import MerLConfig

Using TensorFlow backend.


In [2]:
# Load the training data through the project helper, using the path configured
# in MerLConfig (presumably returns a pandas DataFrame — TODO confirm).
train_data = mdp.load_data(MerLConfig.TRAIN_PREP_FILE, sep=',')

# Raw values of the "item_description" column; these are the texts that the
# tokenizer below is fitted on.
list_sentences_train = train_data["item_description"].values

Standard Keras preprocessing to turn each item description into a list of word indexes of equal length (with truncation or padding as needed).

In [3]:
# Fit a word-index vocabulary on the training descriptions; `num_words` caps
# which word indexes are emitted when texts are converted to sequences.
tokenizer = Tokenizer(num_words=MerLConfig.MAX_WORDS_FROM_INDEX_ITEM_DESC)
tokenizer.fit_on_texts(list(list_sentences_train))
# Convert every description to a list of word indexes ...
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
# ... and pad/truncate each list to the same configured length.
X_t = pad_sequences(list_tokenized_train, maxlen=MerLConfig.MAX_WORDS_IN_ITEM_DESC)

In [4]:
# Sanity check: (number of descriptions, padded sequence length).
X_t.shape

(266856, 300)

Read the GloVe word vectors (space-delimited strings) into a dictionary mapping word -> vector.

In [5]:
def get_coefs(word, *arr):
    """Split one parsed embedding line into its token and its vector.

    Args:
        word: the token (first field of the line).
        *arr: the remaining fields, i.e. the vector components.

    Returns:
        A ``(word, vector)`` tuple where ``vector`` is a float32 NumPy array
        built from ``arr``.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector

# Build the word -> vector dictionary from the embedding file.
# The file is opened in a `with` block so the handle is closed once the
# dictionary is built (the bare `open(...)` used before was never closed), and
# the encoding is pinned to UTF-8 so parsing does not depend on the platform's
# default encoding.
embed_path = os.path.join(MerLConfig.INPUT_DIR, MerLConfig.EMBED_FILE)
with open(embed_path, encoding='utf-8') as embed_file:
    embeddings_index = dict(get_coefs(*line.strip().split())
                            for line in embed_file)

Use these vectors to create our embedding matrix, with random initialization for words that aren't in GloVe. We'll draw the random initialization from a normal distribution with the same mean and standard deviation as the GloVe embeddings.

In [6]:
# Stack all embedding vectors into one 2-D array to compute global statistics.
# np.stack requires a real sequence (list/tuple); a dict_values view is
# rejected by current NumPy, so materialise it as a list first.
all_embs = np.stack(list(embeddings_index.values()))
# Mean / standard deviation over every component of every vector; used to
# draw the random initialisation for words missing from the embedding file.
emb_mean, emb_std = all_embs.mean(), all_embs.std()
emb_mean, emb_std

(0.020940479, 0.64410418)

In [7]:
word_index = tokenizer.word_index
# Tokenizer word indexes start at 1 (0 is the padding index), so a vocabulary
# of N words needs N + 1 rows. Without the "+ 1", a vocabulary smaller than the
# configured cap made the last word index fall outside the matrix (IndexError).
nb_words = min(MerLConfig.MAX_WORDS_FROM_INDEX_ITEM_DESC, len(word_index) + 1)
# Start from random vectors with the same mean/std as the pretrained ones, so
# words without a pretrained vector still get a plausible initialisation.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, MerLConfig.WORD_EMBED_DIMS))
for word, i in word_index.items():
    # Skip words whose index is beyond the rows we allocated.
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    # Overwrite the random row only when a pretrained vector exists.
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

ROC AUC evaluation for cross-validation in Keras; for details see: https://gist.github.com/smly/d29d079100f8d81b905e

In [9]:
import logging
from sklearn.metrics import roc_auc_score
from keras.callbacks import Callback
from sklearn.model_selection import train_test_split

class RocAucEvaluation(Callback):
    """Keras callback that prints the ROC-AUC on a held-out set after epochs.

    Args:
        validation_data: ``(X_val, y_val)`` pair to evaluate on. The default
            ``()`` is kept for interface compatibility but cannot be unpacked,
            so a pair must always be supplied.
        interval: evaluate at the end of every epoch whose 0-based index is a
            multiple of this value.
    """

    def __init__(self, validation_data=(), interval=1):
        # Name this class (not `Callback`) in super() so that Callback.__init__
        # actually runs; `super(Callback, self)` skipped the base initialiser.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the validation inputs and print the ROC-AUC score.

        `logs` is accepted for Keras' callback signature but not used; it
        defaults to None rather than a shared mutable dict.
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: {:d} - score: {:.6f}".format(epoch, score))


Bidirectional LSTM with half-size embedding with two fully connected layers

In [12]:
import keras.backend as K

# `TccConfig` is never imported or defined in this notebook (only MerLConfig
# is), so the layer sizes are taken from what the earlier cells actually built:
# the padded length used for X_t and the shape of the embedding matrix that is
# loaded as the Embedding layer's initial weights.
vocab_size, embed_dims = embedding_matrix.shape

inp = Input(shape=(MerLConfig.MAX_WORDS_IN_ITEM_DESC,))
x = Embedding(vocab_size, embed_dims, weights=[embedding_matrix], trainable=True)(inp)
x = Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
# Collapse the sequence dimension by taking the max over time steps.
x = GlobalMaxPool1D()(x)
x = BatchNormalization()(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.1)(x)
# NOTE(review): 6 independent sigmoid outputs — this must match the number of
# label columns in the training targets; TODO confirm for this dataset.
x = Dense(6, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)


def loss(y_true, y_pred):
    """Binary cross-entropy between targets and predictions (Keras backend op)."""
    return K.binary_crossentropy(y_true, y_pred)


model.compile(loss=loss, optimizer='nadam', metrics=['accuracy'])

Now we're ready to fit our model! Use `validation_split` when tuning hyperparameters.

In [13]:

def schedule(ind):
    """Return the learning rate for the epoch with index `ind`.

    The rates are a fixed three-entry table, so the function only covers
    three epochs; an index past the end raises IndexError. Note that the
    third entry is 0.0.
    """
    rates = (0.002, 0.003, 0.000)
    return rates[ind]
# Apply the per-epoch learning rates returned by `schedule`.
lr = callbacks.LearningRateScheduler(schedule)
# NOTE(review): `y` (the training targets) is not defined in any cell visible
# in this notebook — it must be created before this cell can run on a fresh
# kernel.
# Hold out 5% of the data for validation; random_state is fixed so the split
# (and therefore the reported validation scores) is reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(X_t, y, train_size=0.95, random_state=42)

# Print ROC-AUC on the hold-out set after every epoch.
ra_val = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

model.fit(X_train, y_train, batch_size=64, epochs=3, validation_data=(X_val, y_val), callbacks=[lr, ra_val])
# Alternative: train on all data, with no hold-out set.
#model.fit(X_t, y, batch_size=64, epochs=3, callbacks=[lr])



Train on 151592 samples, validate on 7979 samples
Epoch 1/3

KeyboardInterrupt: 

And finally, get predictions for the test set and prepare a submission CSV:

In [14]:
# NOTE(review): `X_te`, `path`, `comp` and `list_classes` are not defined in
# any cell visible in this notebook — they must be defined before this cell
# can run on a fresh kernel.
# Predict on the test inputs in batches of 1024.
y_test = model.predict([X_te], batch_size=1024, verbose=1)
# Load the sample submission, overwrite the `list_classes` columns with the
# predictions, and write the result to disk without the index column.
sample_submission = pd.read_csv(f'{path}{comp}sample_submission.csv')
sample_submission[list_classes] = y_test
sample_submission.to_csv('LSTM-submission.csv', index=False)

