In [1]:
import os
import gc
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

import tensorflow as tf
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, LSTM, embeddings
from keras.callbacks import EarlyStopping

from utility_functions import *

Using TensorFlow backend.


# Model: LSTM

Training an LSTM model in Keras. Will be uploaded to Kaggle for use by the inference script

## Parameters

In [2]:
SEQUENCE_PATH = os.path.join(os.pardir, 'Data', 
                             'Sequenced_Text', 'word2vec_train.csv')
TRAIN_PATH = os.path.join(os.pardir, 'Data', 'train.csv')

EMBEDDING_PATH = os.path.join(os.pardir, 'Embedding_Build',
                              'Trained_Embeddings', 'word2vec_keras_embedding.pkl')

SAMPLE_SIZE = None

In [3]:
# Model
LSTM_NODES = 254

# Training 
NUM_FOLDS = 3
RANDOM_SEED = 0
BATCH_SIZE = 2048
MAX_EPOCHS = 5

## Load data

In [4]:
X = pd.read_csv(SEQUENCE_PATH, nrows=SAMPLE_SIZE)
if 'Unnamed: 0' in X.columns:
    X.drop(columns=['Unnamed: 0'], inplace=True)
print('Input data:\n\tn sequences = {}\n\tsequence length = {}'.
      format(X.shape[0], X.shape[1]))
X = X.values

Input data:
	n sequences = 1804874
	sequence length = 100


In [5]:
train = pd.read_csv(TRAIN_PATH, nrows=SAMPLE_SIZE)
print(train.shape)
y = train.pop('target')
del train
gc.collect()

(1804874, 45)


14

In [6]:
# Make binary - right thing to do??
y = (y >= .5).astype(int)

In [7]:
def embedding_as_keras_layer(path):
    embedding = pickle.load(open(path, 'rb'))
    if isinstance(embedding, embeddings.Embedding):
        return embedding
    else:
        raise TypeError('Embedding at {} can\'t be converted to keras layer'.
                        format(path))

## Model build

In [8]:
def auc(y_true, y_pred):
    auc = tf.metrics.auc(y_true, y_pred)[1]
    K.get_session().run(tf.local_variables_initializer())
    return auc

In [9]:
def create_model():
    model = Sequential()
    embedding = embedding_as_keras_layer(EMBEDDING_PATH)
    model.add(embedding)
    model.add(LSTM(units=LSTM_NODES, 
                   dropout=0.2, 
                   recurrent_dropout=0.2))
    model.add(Dense(units=1, 
                    activation='sigmoid'))
    model.compile(
        loss='binary_crossentropy',
        optimizer='nadam',
        metrics=[auc]
    )
    return model

## Train model

Define CV strategy

In [10]:
cv = StratifiedKFold(NUM_FOLDS, random_state=RANDOM_SEED)

Train in CV loop

In [11]:
comp_metric = []
identities = get_identities()

for fold_no, (train_idx, val_idx) in enumerate(cv.split(X, y)):
    X_train = X[train_idx]
    X_val = X[val_idx]
    y_train = y.iloc[train_idx]
    y_val = y.iloc[val_idx]
    
    model = create_model()
    print('Fitting fold {} of {}'.format(fold_no + 1, NUM_FOLDS))
    result = model.fit(
        X_train,
        y_train,
        validation_data=[X_val, y_val],
        batch_size=BATCH_SIZE,
        epochs=MAX_EPOCHS,
        callbacks=[
            EarlyStopping(
                monitor='val_loss',
                min_delta=0.001,
                patience=3,
                verbose=1
            )
        ]
    )
    
    train = pd.read_csv(TRAIN_PATH, nrows=SAMPLE_SIZE)
    train = train.iloc[val_idx, :]
    train[identities].fillna(0, inplace=True)
    train.loc[:, identities] = train.loc[:, identities].astype(bool)
    
    y_pred = model.predict(X_val)
    bias_metrics_df = compute_bias_metrics_for_model(train, identities, y_val, y_pred)
    final_metric = get_final_metric(bias_metrics_df, roc_auc_score(y_val, y_pred))
    comp_metric.append(final_metric)
    
    print('Best score of fold {}: {:.5f}'.format(fold_no, final_metric))
    
    if fold_no < NUM_FOLDS - 1:
        K.clear_session()
        del model, train
        gc.collect()

Fitting fold 0 of 3
Train on 1203248 samples, validate on 601626 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


Best score of fold 0: 0.94994
Fitting fold 1 of 3
Train on 1203250 samples, validate on 601624 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


Best score of fold 1: 0.95089
Fitting fold 2 of 3
Train on 1203250 samples, validate on 601624 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


Best score of fold 2: 0.94665


In [12]:
result.history

{'val_loss': [0.15212768617153372,
  0.1410643961000951,
  0.13747609232054026,
  0.13653970084635736,
  0.13483899799057975],
 'val_auc': [0.893425086219138,
  0.9174960886669792,
  0.9269087519797613,
  0.9321841462492941,
  0.9356987246816743],
 'loss': [0.18613842613412712,
  0.14758948411410003,
  0.14035621128180356,
  0.1365640114120057,
  0.1336347901522556],
 'auc': [0.7906602386925985,
  0.9085386226547252,
  0.922916608734436,
  0.9299130936254465,
  0.9340788901908814]}

In [15]:
pickle.dump(model, open('Trained_Models/lstm_notebook_model.pkl', 'wb'))