In [1]:
import pandas as pd
import numpy as np
import datetime
import time 
import os
import gc
import re
import sys
from functools import partial
from utils import ignore_warnings, load_data
from gensim.models import Word2Vec

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
%%time
# Row cap forwarded to load_data: use an int (e.g. 1000000) for a quick
# experiment. None presumably loads every row -- confirm in utils.load_data.
# nrows = 1000000
nrows = None
train = load_data('train', nrows=nrows)#, verbose=True)

CPU times: user 18.6 s, sys: 1.49 s, total: 20.1 s
Wall time: 20.1 s


In [3]:
# Number of items displayed with each row (stays NaN when impressions is missing).
train['nimp'] = train['impressions'].str.split('|').str.len()

# Row filters: the action is a clickout, an impression list is present,
# and exactly 25 items were shown.
is_clickout = train['action_type'] == 'clickout item'
imp_not_na = train['impressions'].notna()
twenty_five = train['nimp'] == 25

# A row is kept only when all three conditions hold.
select_mask = twenty_five & imp_not_na & is_clickout

In [4]:
# Keep only the selected rows and renumber them from 0.
train = train.loc[select_mask].reset_index(drop=True)

In [5]:
# Replace missing values with an explicit placeholder string per column.
for col, placeholder in (('current_filters', 'no_filter'),
                         ('reference', 'no_reference')):
    train.loc[train[col].isna(), col] = placeholder

# Turn the pipe-delimited strings into Python lists stored in new columns.
for src, dst in (('current_filters', 'cfs'),
                 ('impressions', 'imps')):
    train[dst] = train[src].str.split('|')

In [6]:
# Narrow the frame to the columns the following cells read.
train = train.loc[:, ['session_id', 'timestamp', 'reference', 'imps']]

In [7]:
# (rows, columns) left after filtering and column selection.
train.shape

(1232016, 4)

In [8]:
def assign_target(row):
    """Return the position of the clicked item inside the impression list.

    row: object exposing `.reference` (the clicked item id) and `.imps`
         (the sequence of displayed item ids).
    Returns the 0-based index of the first occurrence of the reference in
    the impressions, or the sentinel 25 when it does not appear there.
    """
    clicked, shown = row.reference, row.imps
    return shown.index(clicked) if clicked in shown else 25
# Label every row with the clicked position (25 means "not in the list").
train['target'] = train.apply(assign_target, axis=1)

# Drop the rows whose reference was not among the displayed items.
print('before:', train.shape)
in_list = train['target'] != 25
train = train.loc[in_list].reset_index(drop=True)
print('after:', train.shape)

before: (1232016, 5)
after: (1231380, 5)


In [9]:
# Preview the first rows to eyeball the new target column.
train.head()

Unnamed: 0,session_id,timestamp,reference,imps,target
0,aff3928535f48,1541037543,109038,"[3400638, 1253714, 3367857, 5100540, 1088584, ...",9
1,aff3928535f48,1541038485,1257342,"[55109, 129343, 54824, 2297972, 109014, 125734...",5
2,3599a6f709eab,1541063864,2795374,"[2795374, 5582964, 1088390, 2781070, 1258068, ...",0
3,ec139e10b9238,1541100322,1032816,"[12693, 46363, 81657, 18448, 47687, 152913, 18...",15
4,ec139e10b9238,1541100652,1032816,"[12693, 46363, 81657, 18448, 47687, 152913, 18...",15


In [10]:
# Pre-trained item embeddings. Bound to its own name rather than `model`,
# because a later cell rebinds `model` to the Keras network; helpers that
# looked vectors up through a global called `model` would then silently
# index the wrong object when this notebook is re-run out of order.
w2v_model = Word2Vec.load('./model.bin')
# Vectors are read through `.wv`: indexing the Word2Vec object directly is
# deprecated in gensim 3.x (it emits a DeprecationWarning) and removed in 4.x.
word_vectors = w2v_model.wv


def encoding(imps):
    """Embed one impression list.

    imps: iterable of item ids that are keys of the Word2Vec vocabulary.
    Returns an array with one embedding row per id, in input order.
    Raises KeyError if an id is not in the vocabulary.
    """
    return np.array([word_vectors[i] for i in imps])


def encoding_depth(imps):
    """Same as `encoding`, with a leading singleton axis: (1, n_items, dim)."""
    return encoding(imps)[None, :, :]


def encoding_column(imps):
    """Same as `encoding`, with a trailing singleton axis: (n_items, dim, 1)."""
    return encoding(imps)[:, :, None]

In [11]:
%%time
# Swap each list of item ids for its stacked embedding matrix.
train['imps'] = train['imps'].apply(encoding)

  import sys


CPU times: user 1min 40s, sys: 2.37 s, total: 1min 42s
Wall time: 1min 42s


In [12]:
# all_imps = train.imps.values 
# all_imps = list(set([j for i in all_imps for j in i]))
# # we only embed the ones appeared in the impression list, otherwise uncomment below
# # all_imps = list(set(all_imps + list(train['reference'].unique())))
# imp2natural = {v: k for k, v in enumerate(all_imps)}
# # only select 25 length impressions 
# train['reference'] = train.reference.map(imp2natural)
# # drop the one reference does not have a mapping
# print('before:', train.shape)
# train = train[train.reference.notna()].reset_index(drop=True)
# print('after:', train.shape)
# train = train[train.imps.str.len()==25].reset_index(drop=True)
# train['imps'] = train.imps.apply(lambda x: [imp2natural[i] for i in x])
# def assign_target(row):
#     ref = row.reference
#     imp = row.imps
#     if ref in imp:
#         return imp.index(ref)
#     else:
#         return 25
# #         return -1
# train['target'] = train.apply(assign_target, axis=1)
# # remove the target 25 (i.e. not appearing in the list)
# print('before:', train.shape)
# train = train[train.target != 25].reset_index(drop=True)
# print('after:', train.shape)


In [13]:
# Sanity check on the first row: the encoded impressions should be a
# (n_items, embedding_dim) array; the recorded run shows (25, 100).
train.imps[0].shape

(25, 100)

In [14]:
# len(imp2natural)

In [15]:
# Only the session id (for splitting), the features and the label are needed from here on.
train = train.loc[:, ['session_id', 'imps', 'target']]

In [16]:
# split train and val
unique_sids = train['session_id'].unique()
train_sids = unique_sids[:-int(len(unique_sids)*0.1)]

In [17]:
# True for rows whose session belongs to the training side of the split.
trn_mask = train['session_id'].isin(train_sids)

model_cols = ['imps', 'target']
xtrain = train.loc[trn_mask, model_cols].reset_index(drop=True)
xval = train.loc[~trn_mask, model_cols].reset_index(drop=True)

# The combined frame is no longer needed once both splits exist.
del train

In [18]:
# Report the (rows, columns) of the training and validation frames.
print(xtrain.shape, xval.shape)

(1108640, 2) (122740, 2)


In [19]:
# Pull the labels out as plain arrays ...
ytrain, yval = xtrain['target'].values, xval['target'].values
# ... then remove the label column so the frames hold features only.
for frame in (xtrain, xval):
    del frame['target']

In [20]:
from keras.models import Sequential
from keras.layers import Dense, Conv1D, Conv2D, Flatten, Dropout, BatchNormalization

# 1-D convolutional classifier over a (25 items, 100-dim embedding) input.
# Each conv stage is Conv1D(kernel_size=3, relu) followed by batch
# normalisation; the filter counts per stage are listed below.
conv_filters = (16, 32, 32, 32)

model = Sequential()
for depth, n_filters in enumerate(conv_filters):
    # Only the first layer declares the input shape.
    first_layer_kwargs = {'input_shape': (25, 100)} if depth == 0 else {}
    model.add(Conv1D(n_filters, kernel_size=3, activation='relu', **first_layer_kwargs))
    model.add(BatchNormalization())

# Classification head: flatten, light dropout, softmax over the 25 slots.
model.add(Flatten())
model.add(Dropout(0.2))
model.add(Dense(25, activation='softmax'))

Using TensorFlow backend.


In [21]:
# Print the per-layer output shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, 23, 16)            4816      
_________________________________________________________________
batch_normalization_1 (Batch (None, 23, 16)            64        
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 21, 32)            1568      
_________________________________________________________________
batch_normalization_2 (Batch (None, 21, 32)            128       
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 19, 32)            3104      
_________________________________________________________________
batch_normalization_3 (Batch (None, 19, 32)            128       
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 17, 32)            3104      
__________

In [22]:
# Number of training samples.
len(xtrain)

1108640

In [23]:
from keras import optimizers

# Adam with learning rate 0.001; an SGD+Nesterov setup was tried as an alternative:
# opt = optimizers.SGD(lr = 0.001, decay = 1e-6, momentum = 0.9, nesterov = True)
opt = optimizers.Adam(lr=0.001)

# Multi-class setup: one-hot targets, cross-entropy loss, accuracy as the metric.
model.compile(
    loss="categorical_crossentropy",
    optimizer=opt,
    metrics=['accuracy'],
)

In [24]:
# from clr import CyclicLR
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard
from datetime import datetime as dt

# Checkpoint file and a TensorBoard directory stamped with the start time.
model_file = 'conv1d.model'
log_dir = "logs/{}".format(dt.now().strftime('%m-%d-%H-%M'))
tb = TensorBoard(log_dir=log_dir, write_graph=True, write_grads=True)

callbacks = [
    # Overwrite the checkpoint only when the monitored metric improves.
    ModelCheckpoint(model_file, save_best_only=True, verbose=1),
    # Stop after 100 epochs without improvement.
    EarlyStopping(patience=100, verbose=1),
    # Halve the learning rate after 50 stagnant epochs, never going below 5e-4.
    ReduceLROnPlateau(factor=0.5, patience=50, min_lr=5e-4, verbose=1),
    tb,
]


In [25]:
batch_size = 128
n_epochs = 3000
from keras.utils import to_categorical
# categorical_crossentropy needs one-hot labels. The width is pinned to 25
# (one class per impression slot, matching the network's softmax layer).
# Without num_classes, to_categorical infers the width from max(label) + 1,
# so a split that never contains the last slot would produce a narrower
# matrix that no longer fits the model output.
n_classes = 25
train_y_binary = to_categorical(ytrain, num_classes=n_classes)
val_y_binary = to_categorical(yval, num_classes=n_classes)

In [26]:
# Both one-hot matrices must agree with each other AND with the 25 output
# units of the network; agreeing only with each other would let a mismatch
# against the model slip through until fit() fails.
assert train_y_binary.shape[1] == val_y_binary.shape[1] == 25, (
    train_y_binary.shape, val_y_binary.shape)

In [27]:
t_int = time.time()

# Settings shared by the whole training run.
fit_options = dict(
    epochs=n_epochs,
    batch_size=batch_size,
    verbose=2,
    shuffle=True,
    callbacks=callbacks,
)

# The per-row embedding matrices are stacked into dense 3-D arrays on the fly.
history = model.fit(
    x=np.array(xtrain.imps.tolist()),
    y=train_y_binary,
    validation_data=(np.array(xval.imps.tolist()), val_y_binary),
    **fit_options
)

elapsed_min = (time.time() - t_int) / 60
print(f'total time took: {elapsed_min:.2f}mins')

Train on 1108640 samples, validate on 122740 samples
Epoch 1/3000
 - 48s - loss: 2.6485 - acc: 0.3218 - val_loss: 2.5961 - val_acc: 0.3261

Epoch 00001: val_loss improved from inf to 2.59611, saving model to conv1d.model
Epoch 2/3000
 - 46s - loss: 2.5930 - acc: 0.3263 - val_loss: 2.5882 - val_acc: 0.3261

Epoch 00002: val_loss improved from 2.59611 to 2.58817, saving model to conv1d.model
Epoch 3/3000
 - 45s - loss: 2.5859 - acc: 0.3263 - val_loss: 2.5836 - val_acc: 0.3261

Epoch 00003: val_loss improved from 2.58817 to 2.58355, saving model to conv1d.model
Epoch 4/3000
 - 45s - loss: 2.5823 - acc: 0.3263 - val_loss: 2.5818 - val_acc: 0.3261

Epoch 00004: val_loss improved from 2.58355 to 2.58183, saving model to conv1d.model
Epoch 5/3000
 - 46s - loss: 2.5794 - acc: 0.3263 - val_loss: 2.5793 - val_acc: 0.3261

Epoch 00005: val_loss improved from 2.58183 to 2.57928, saving model to conv1d.model
Epoch 6/3000
 - 45s - loss: 2.5775 - acc: 0.3263 - val_loss: 2.5777 - val_acc: 0.3260

Epoc


Epoch 00055: val_loss improved from 2.56905 to 2.56808, saving model to conv1d.model
Epoch 56/3000
 - 45s - loss: 2.5551 - acc: 0.3263 - val_loss: 2.5716 - val_acc: 0.3260

Epoch 00056: val_loss did not improve from 2.56808
Epoch 57/3000
 - 45s - loss: 2.5551 - acc: 0.3263 - val_loss: 2.5701 - val_acc: 0.3259

Epoch 00057: val_loss did not improve from 2.56808
Epoch 58/3000
 - 45s - loss: 2.5549 - acc: 0.3263 - val_loss: 2.5675 - val_acc: 0.3259

Epoch 00058: val_loss improved from 2.56808 to 2.56753, saving model to conv1d.model
Epoch 59/3000
 - 45s - loss: 2.5548 - acc: 0.3263 - val_loss: 2.5695 - val_acc: 0.3259

Epoch 00059: val_loss did not improve from 2.56753
Epoch 60/3000
 - 45s - loss: 2.5548 - acc: 0.3263 - val_loss: 2.5687 - val_acc: 0.3260

Epoch 00060: val_loss did not improve from 2.56753
Epoch 61/3000
 - 45s - loss: 2.5545 - acc: 0.3263 - val_loss: 2.5687 - val_acc: 0.3261

Epoch 00061: val_loss did not improve from 2.56753
Epoch 62/3000
 - 45s - loss: 2.5546 - acc: 0.3

 - 45s - loss: 2.5497 - acc: 0.3263 - val_loss: 2.5686 - val_acc: 0.3261

Epoch 00114: val_loss did not improve from 2.56716
Epoch 115/3000
 - 45s - loss: 2.5494 - acc: 0.3264 - val_loss: 2.5697 - val_acc: 0.3263

Epoch 00115: val_loss did not improve from 2.56716
Epoch 116/3000
 - 45s - loss: 2.5495 - acc: 0.3263 - val_loss: 2.5687 - val_acc: 0.3260

Epoch 00116: val_loss did not improve from 2.56716
Epoch 117/3000
 - 46s - loss: 2.5494 - acc: 0.3263 - val_loss: 2.5694 - val_acc: 0.3260

Epoch 00117: val_loss did not improve from 2.56716
Epoch 118/3000
 - 45s - loss: 2.5494 - acc: 0.3263 - val_loss: 2.5688 - val_acc: 0.3260

Epoch 00118: val_loss did not improve from 2.56716
Epoch 119/3000
 - 45s - loss: 2.5491 - acc: 0.3263 - val_loss: 2.5682 - val_acc: 0.3261

Epoch 00119: val_loss did not improve from 2.56716
Epoch 120/3000
 - 45s - loss: 2.5491 - acc: 0.3263 - val_loss: 2.5674 - val_acc: 0.3258

Epoch 00120: val_loss did not improve from 2.56716
Epoch 121/3000
 - 45s - loss: 2.549


Epoch 00171: val_loss did not improve from 2.56639
Epoch 172/3000
 - 45s - loss: 2.5427 - acc: 0.3263 - val_loss: 2.5699 - val_acc: 0.3258

Epoch 00172: val_loss did not improve from 2.56639
Epoch 173/3000
 - 45s - loss: 2.5426 - acc: 0.3264 - val_loss: 2.5686 - val_acc: 0.3257

Epoch 00173: val_loss did not improve from 2.56639
Epoch 174/3000
 - 45s - loss: 2.5430 - acc: 0.3263 - val_loss: 2.5682 - val_acc: 0.3254

Epoch 00174: val_loss did not improve from 2.56639
Epoch 175/3000
 - 45s - loss: 2.5430 - acc: 0.3263 - val_loss: 2.5676 - val_acc: 0.3261

Epoch 00175: val_loss did not improve from 2.56639
Epoch 176/3000
 - 45s - loss: 2.5427 - acc: 0.3264 - val_loss: 2.5685 - val_acc: 0.3258

Epoch 00176: val_loss did not improve from 2.56639
Epoch 177/3000
 - 45s - loss: 2.5428 - acc: 0.3263 - val_loss: 2.5683 - val_acc: 0.3257

Epoch 00177: val_loss did not improve from 2.56639
Epoch 178/3000
 - 45s - loss: 2.5425 - acc: 0.3263 - val_loss: 2.5687 - val_acc: 0.3252

Epoch 00178: val_lo

 - 45s - loss: 2.5415 - acc: 0.3263 - val_loss: 2.5673 - val_acc: 0.3258

Epoch 00230: val_loss did not improve from 2.56597
Epoch 231/3000
 - 45s - loss: 2.5417 - acc: 0.3264 - val_loss: 2.5687 - val_acc: 0.3261

Epoch 00231: val_loss did not improve from 2.56597

Epoch 00231: ReduceLROnPlateau reducing learning rate to 0.0005.
Epoch 232/3000
 - 45s - loss: 2.5418 - acc: 0.3263 - val_loss: 2.5674 - val_acc: 0.3259

Epoch 00232: val_loss did not improve from 2.56597
Epoch 233/3000
 - 45s - loss: 2.5415 - acc: 0.3263 - val_loss: 2.5690 - val_acc: 0.3253

Epoch 00233: val_loss did not improve from 2.56597
Epoch 234/3000
 - 45s - loss: 2.5415 - acc: 0.3263 - val_loss: 2.5681 - val_acc: 0.3256

Epoch 00234: val_loss did not improve from 2.56597
Epoch 235/3000
 - 45s - loss: 2.5416 - acc: 0.3264 - val_loss: 2.5679 - val_acc: 0.3258

Epoch 00235: val_loss did not improve from 2.56597
Epoch 236/3000
 - 45s - loss: 2.5418 - acc: 0.3262 - val_loss: 2.5670 - val_acc: 0.3260

Epoch 00236: val_los