In [1]:
import pandas as pd
import numpy as np
import datetime
import time 
import os
import gc
import re
import sys
from functools import partial

from utils import ignore_warnings, load_data

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
%%time
# nrows = 10000
nrows = None
train = load_data('train', nrows=nrows)#, verbose=True)

CPU times: user 18.5 s, sys: 1.37 s, total: 19.9 s
Wall time: 19.9 s


In [3]:
# select the rows that is clickout
is_clickout = train.action_type == 'clickout item'
# # and it is not nan
# not_na = train.re.notna()
# and the impressions are not nans
imp_not_na = train.impressions.notna()
# only select the ones with 25 lens 
train['nimp'] = train.impressions.str.split('|').str.len()
twenty_five = train['nimp'] == 25

select_mask = is_clickout & imp_not_na & twenty_five

In [4]:
train = train[select_mask].reset_index(drop=True)

In [5]:
train.loc[train['current_filters'].isna(), 'current_filters'] = 'no_filter'
train.loc[train['reference'].isna(), 'reference'] = 'no_reference'

train['cfs'] = train['current_filters'].str.split('|')
train['imps'] = train['impressions'].str.split('|')

In [6]:
train = train[['session_id', 'timestamp', 'reference', 'imps']]

In [7]:
train.shape

(1232016, 4)

In [8]:
all_imps = train.imps.values 
all_imps = list(set([j for i in all_imps for j in i]))
# we only embed the ones appeared in the impression list, otherwise uncomment below
# all_imps = list(set(all_imps + list(train['reference'].unique())))
imp2natural = {v: k for k, v in enumerate(all_imps)}
# only select 25 length impressions 
train['reference'] = train.reference.map(imp2natural)
# drop the one reference does not have a mapping
print('before:', train.shape)
train = train[train.reference.notna()].reset_index(drop=True)
print('after:', train.shape)
train = train[train.imps.str.len()==25].reset_index(drop=True)
train['imps'] = train.imps.apply(lambda x: [imp2natural[i] for i in x])
def assign_target(row):
    ref = row.reference
    imp = row.imps
    if ref in imp:
        return imp.index(ref)
    else:
        return 25
#         return -1
train['target'] = train.apply(assign_target, axis=1)
# remove the target 25 (i.e. not appearing in the list)
print('before:', train.shape)
train = train[train.target != 25].reset_index(drop=True)
print('after:', train.shape)


before: (1232016, 4)
after: (1232011, 4)
before: (1232011, 5)
after: (1231380, 5)


In [9]:
len(imp2natural)

743076

In [10]:
train = train[['session_id', 'imps', 'target']]

In [11]:
# split train and val
unique_sids = train['session_id'].unique()
train_sids = unique_sids[:-int(len(unique_sids)*0.1)]

In [12]:
trn_mask = train.session_id.isin(train_sids)
xtrain = train.loc[trn_mask, ['imps', 'target']].reset_index(drop=True)
xval = train.loc[~trn_mask, ['imps', 'target']].reset_index(drop=True)
del train


In [13]:
print(xtrain.shape, xval.shape)

(1108640, 2) (122740, 2)


In [14]:
ytrain = xtrain['target'].values
yval = xval['target'].values
del xtrain['target'], xval['target']

In [19]:
from keras import optimizers
from keras.layers import Activation, concatenate, Dense, Dropout, Embedding, Input, Reshape, Flatten, Conv1D
from keras.models import Model
from keras.callbacks import Callback

n_item_ids = len(imp2natural)
n_embed = 30
input_layer = Input(shape = (25, ), dtype = "int32")
impression_embedding = Embedding(n_item_ids, n_embed, input_length=25)(input_layer)
impression_embedding = Dropout(0.5)(impression_embedding)
# h1 = Conv1D(16, 3)(impression_embedding)
h0 = Flatten()(impression_embedding)
h1 = Dense(units=10, activation='relu')(h0)
h1 = Dropout(0.4)(h1)
# h1 = Flatten()(h1)
output_layer = Dense(25, activation='softmax')(h1)


model = Model(inputs=input_layer, outputs=output_layer)
# opt = optimizers.SGD(lr = 0.001, decay = 1e-6, momentum = 0.9, nesterov = True)
opt = optimizers.Adam(lr=0.002)
model.compile(optimizer = opt, loss = "categorical_crossentropy", metrics=['accuracy'])
# model.compile(optimizer = opt, loss = "sparse_categorical_crossentropy", metrics=['accuracy'])

In [20]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 25)                0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 25, 30)            22292280  
_________________________________________________________________
dropout_3 (Dropout)          (None, 25, 30)            0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 750)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 10)                7510      
_________________________________________________________________
dropout_4 (Dropout)          (None, 10)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 25)                275       
Total para

In [21]:
# from clr import CyclicLR
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard
from datetime import datetime as dt
model_file = 'test.model'

callbacks = [ModelCheckpoint(model_file, save_best_only=True, verbose=1)]
# callbacks.append(EarlyStopping(patience=150, verbose=1))
# callbacks.append(ReduceLROnPlateau(factor=0.5, patience=20, min_lr=5e-4, verbose=1))
log_dir = "logs/{}".format(dt.now().strftime('%m-%d-%H-%M'))
# tb = TensorBoard(log_dir=log_dir, histogram_freq=2, write_graph=True, write_grads=True, write_images=True,
#                  embeddings_freq=10, embeddings_layer_names=['embedding_1'], embeddings_data=next(val_gen))
tb = TensorBoard(log_dir=log_dir, write_graph=True, write_grads=True)
callbacks.append(tb)


In [None]:
batch_size = 128
n_epochs = 3000
# keras requires 0, 1 binary label input
from keras.utils import to_categorical
train_y_binary = to_categorical(ytrain)
val_y_binary = to_categorical(yval)

history = model.fit(np.array(xtrain.imps.tolist()), 
                    train_y_binary, 
                    epochs=n_epochs, 
                    batch_size=batch_size,
                    validation_data=(np.array(xval.imps.tolist()), val_y_binary),
                    verbose = 2, 
                    shuffle = True,
                   callbacks=callbacks)

Train on 1108640 samples, validate on 122740 samples
Epoch 1/3000
 - 141s - loss: 2.5620 - acc: 0.3254 - val_loss: 2.5312 - val_acc: 0.3261

Epoch 00001: val_loss improved from inf to 2.53123, saving model to test.model
Epoch 2/3000
 - 138s - loss: 2.4482 - acc: 0.3276 - val_loss: 2.5390 - val_acc: 0.3261

Epoch 00002: val_loss did not improve from 2.53123
Epoch 3/3000
 - 134s - loss: 2.3664 - acc: 0.3309 - val_loss: 2.5675 - val_acc: 0.3258

Epoch 00003: val_loss did not improve from 2.53123
Epoch 4/3000
 - 133s - loss: 2.3115 - acc: 0.3358 - val_loss: 2.5928 - val_acc: 0.3253

Epoch 00004: val_loss did not improve from 2.53123
Epoch 5/3000
 - 133s - loss: 2.2740 - acc: 0.3407 - val_loss: 2.6232 - val_acc: 0.3250

Epoch 00005: val_loss did not improve from 2.53123
Epoch 6/3000
 - 133s - loss: 2.2462 - acc: 0.3448 - val_loss: 2.6389 - val_acc: 0.3233

Epoch 00006: val_loss did not improve from 2.53123
Epoch 7/3000
 - 134s - loss: 2.2251 - acc: 0.3486 - val_loss: 2.6495 - val_acc: 0.322

 - 133s - loss: 2.0635 - acc: 0.3781 - val_loss: 2.7756 - val_acc: 0.3145

Epoch 00059: val_loss did not improve from 2.53123
Epoch 60/3000
 - 133s - loss: 2.0639 - acc: 0.3784 - val_loss: 2.7631 - val_acc: 0.3173

Epoch 00060: val_loss did not improve from 2.53123
Epoch 61/3000
 - 133s - loss: 2.0639 - acc: 0.3782 - val_loss: 2.7627 - val_acc: 0.3177

Epoch 00061: val_loss did not improve from 2.53123
Epoch 62/3000
 - 133s - loss: 2.0632 - acc: 0.3790 - val_loss: 2.7555 - val_acc: 0.3172

Epoch 00062: val_loss did not improve from 2.53123
Epoch 63/3000
 - 133s - loss: 2.0623 - acc: 0.3782 - val_loss: 2.7640 - val_acc: 0.3163

Epoch 00063: val_loss did not improve from 2.53123
Epoch 64/3000
 - 133s - loss: 2.0624 - acc: 0.3788 - val_loss: 2.7612 - val_acc: 0.3176

Epoch 00064: val_loss did not improve from 2.53123
Epoch 65/3000
 - 133s - loss: 2.0617 - acc: 0.3789 - val_loss: 2.7512 - val_acc: 0.3186

Epoch 00065: val_loss did not improve from 2.53123
Epoch 66/3000
 - 133s - loss: 2.06


Epoch 00117: val_loss did not improve from 2.53123
Epoch 118/3000
 - 133s - loss: 2.0526 - acc: 0.3816 - val_loss: 2.7855 - val_acc: 0.3177

Epoch 00118: val_loss did not improve from 2.53123
Epoch 119/3000
 - 133s - loss: 2.0532 - acc: 0.3812 - val_loss: 2.7786 - val_acc: 0.3191

Epoch 00119: val_loss did not improve from 2.53123
Epoch 120/3000
 - 133s - loss: 2.0527 - acc: 0.3815 - val_loss: 2.7753 - val_acc: 0.3181

Epoch 00120: val_loss did not improve from 2.53123
Epoch 121/3000
 - 133s - loss: 2.0522 - acc: 0.3818 - val_loss: 2.7645 - val_acc: 0.3178

Epoch 00121: val_loss did not improve from 2.53123
Epoch 122/3000
 - 133s - loss: 2.0530 - acc: 0.3819 - val_loss: 2.7769 - val_acc: 0.3197

Epoch 00122: val_loss did not improve from 2.53123
Epoch 123/3000
 - 133s - loss: 2.0532 - acc: 0.3817 - val_loss: 2.7678 - val_acc: 0.3196

Epoch 00123: val_loss did not improve from 2.53123
Epoch 124/3000
 - 134s - loss: 2.0535 - acc: 0.3816 - val_loss: 2.7586 - val_acc: 0.3184

Epoch 00124:

 - 133s - loss: 2.0543 - acc: 0.3822 - val_loss: 2.7716 - val_acc: 0.3203

Epoch 00176: val_loss did not improve from 2.53123
Epoch 177/3000
 - 133s - loss: 2.0542 - acc: 0.3826 - val_loss: 2.7903 - val_acc: 0.3178

Epoch 00177: val_loss did not improve from 2.53123
Epoch 178/3000
 - 133s - loss: 2.0535 - acc: 0.3825 - val_loss: 2.7619 - val_acc: 0.3211

Epoch 00178: val_loss did not improve from 2.53123
Epoch 179/3000
 - 133s - loss: 2.0526 - acc: 0.3829 - val_loss: 2.7803 - val_acc: 0.3186

Epoch 00179: val_loss did not improve from 2.53123
Epoch 180/3000
 - 133s - loss: 2.0538 - acc: 0.3827 - val_loss: 2.7857 - val_acc: 0.3214

Epoch 00180: val_loss did not improve from 2.53123
Epoch 181/3000
 - 133s - loss: 2.0545 - acc: 0.3828 - val_loss: 2.7812 - val_acc: 0.3212

Epoch 00181: val_loss did not improve from 2.53123
Epoch 182/3000
 - 133s - loss: 2.0527 - acc: 0.3833 - val_loss: 2.7688 - val_acc: 0.3210

Epoch 00182: val_loss did not improve from 2.53123
Epoch 183/3000
 - 133s - los