In [1]:
import pandas as pd
import numpy as np
import datetime
import time 
import os
import gc
import re
import sys
from functools import partial
from gensim.models import Word2Vec

from utils import ignore_warnings, load_data

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
%%time
# nrows = 10000
nrows = None
train = load_data('train', nrows=nrows)

CPU times: user 18.7 s, sys: 1.4 s, total: 20.1 s
Wall time: 20.2 s


In [3]:
from clean_session import preprocess_sessions
train = preprocess_sessions(train,data_source='data')

[04-28 11:21:44 - utils - preprocess_sessions - INFO] Cliping session dataframe up to last click out (if there is clickout)
[04-28 11:28:24 - utils - preprocess_sessions - INFO] filtering out sessions without clickouts, reference, or clickout is nan
[04-28 11:28:24 - utils - preprocess_sessions - INFO] data length before filtering: 13,034,626
[04-28 11:44:42 - utils - preprocess_sessions - INFO] data length after filtering: 13,034,626
[04-28 11:44:42 - utils - preprocess_sessions - INFO] Saving ./cache/preprocessed_data.snappy


In [4]:
train.shape

(13034626, 12)

In [5]:
# select the rows that is clickout
is_clickout = train.action_type == 'clickout item'
# # and it is not nan
# not_na = train.re.notna()
# and the impressions are not nans
imp_not_na = train.impressions.notna()
# only select the ones with 25 lens 
train['nimp'] = train.impressions.str.split('|').str.len()
twenty_five = train['nimp'] == 25

select_mask = is_clickout & imp_not_na & twenty_five

In [6]:
train = train[select_mask].reset_index(drop=True)
train.shape

(1232016, 13)

In [7]:
train.loc[train['current_filters'].isna(), 'current_filters'] = 'no_filter'
train.loc[train['reference'].isna(), 'reference'] = 'no_reference'

train['cfs'] = train['current_filters'].str.split('|')
train['imps'] = train['impressions'].str.split('|')

In [8]:
print('before:', train.shape)
train = train[train.reference.notna()].reset_index(drop=True)
print('after:', train.shape)
train = train[train.imps.str.len()==25].reset_index(drop=True)
def assign_target(row):
    ref = row.reference
    imp = row.imps
    if ref in imp:
        return imp.index(ref)
    else:
        return 25
#         return -1
train['target'] = train.apply(assign_target, axis=1)
# remove the target 25 (i.e. not appearing in the list)
print('before:', train.shape)
train = train[train.target != 25].reset_index(drop=True)
print('after:', train.shape)

before: (1232016, 15)
after: (1232016, 15)
before: (1232016, 16)
after: (1231380, 16)


In [9]:
train.head(2)

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices,nimp,cfs,imps,target
0,WT30CXPIG450,00000510f1adc,1541064087,1,clickout item,7281198,IN,"Ganpatipule, India",desktop,no_filter,2661832|9222426|7051844|4079190|5752778|468398...,46|26|16|38|12|20|21|27|13|21|36|9|144|19|8|19...,25.0,[no_filter],"[2661832, 9222426, 7051844, 4079190, 5752778, ...",6
1,CITFOTN2IT5P,00003f3b20954,1541097696,1,clickout item,979325,ES,"La Manga, Spain",mobile,no_filter,87132|886881|486611|979325|87173|87175|149508|...,330|187|437|159|499|324|476|381|424|159|144|19...,25.0,[no_filter],"[87132, 886881, 486611, 979325, 87173, 87175, ...",3


In [10]:
model = Word2Vec.load('./cache/hotel_2vec/model.bin')

def encoding_depth(imps):
    return np.array([model.wv[i] for i in imps])[None, :, :]

def encoding(imps):
    return np.array([model.wv[i] for i in imps])

def encoding_column(imps):
    return np.array([model.wv[i] for i in imps])[:, :, None]

In [11]:
%%time
train['imps'] = train.imps.apply(encoding)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.05 µs


In [12]:
# encode city, platform and device
def categorize(df, cols):
    for col in cols:
        print('converting', col)
        unique_values = df[col].unique()
        mapping = {v: k for k, v in enumerate(unique_values)}
        df[col] = df[col].map(mapping)
        
categorize(train, ['city', 'platform', 'device'])

converting city
converting platform
converting device


In [13]:
train.device.unique()

array([0, 1, 2])

In [14]:
train = pd.get_dummies(train, columns=['device'], drop_first=True)

In [15]:
# train = train[['session_id', 'timestamp', 'reference', 'imps', 'city', 'device', 'platform', 'prices']]

In [16]:
train.shape

(1231380, 17)

In [17]:
train['prices'] = train.prices.str.split('|')
train['prices'] = train['prices'].apply(lambda prices: [int(p) for p in prices])

In [18]:
all_prices = train.prices.values
all_prices = [j for i in all_prices for j in i]

In [19]:
import matplotlib.pyplot as plt
# _ = plt.hist(all_prices, bins=100)

In [20]:
# _ = plt.hist(np.log1p(all_prices), bins=100)

In [21]:
# _ = plt.hist(train_prices, bins=100)

In [22]:
price_mu = np.mean(all_prices)
price_sd = np.std(all_prices)
prices = np.array(list(train.prices.values))
prices = (prices - price_mu)/price_sd
del train['prices']

In [23]:
price_mu

119.15391521707353

In [24]:
impressions = np.array(list(train.imps.values))
del train['imps']

cities = train.city.values
ncity = train.city.nunique()
del train['city']

platforms = train.platform.values
nplat = train.platform.nunique()

del train['platform']
sids = train.session_id.values
del train['session_id']
targets = train.target.values
del train['target']
devices = train[['device_1', 'device_2']].values
del train['device_1'], train['device_2']

In [25]:
from keras.callbacks import Callback
import tensorflow as tf

class TestCallback(Callback):
    def __init__(self, test_data):
        self.test_data = test_data

    def on_epoch_end(self, epoch, logs={}):
        x, y = self.test_data
        loss, acc = self.model.evaluate(x, y, verbose=0)        
        print('Testing loss: {0:.4f}, acc: {1:.4f}'.format(loss, acc))

Using TensorFlow backend.


In [26]:
# import keras.backend as K

# def mrr(y_true, y_pred):
# #     return K.mean(y_pred)
#     y_true_item = K.argmax(y_true, axis=-1)
#     print(y_true_item)
    
#     y_pred_sorted = tf.nn.top_k(input, k=25, sorted=True).indices
#     return K.mean(1/tf.where(y_pred_sorted==y_true))

In [37]:
from keras import optimizers
from keras.layers import concatenate, Dense, Dropout, Embedding, Input, Flatten, Conv1D, BatchNormalization
from keras.models import Model
from keras.callbacks import Callback
# split_per = 0.1
from sklearn.model_selection import StratifiedKFold
from keras import backend as K

K.clear_session()


skf = StratifiedKFold(n_splits=5)

for trn_ind, val_ind in skf.split(targets, targets):
    trn_imp, val_imp = impressions[trn_ind], impressions[val_ind]
    trn_price, val_price = prices[trn_ind], prices[val_ind]
    trn_city, val_city = cities[trn_ind], cities[val_ind]
    trn_plat, val_plat = platforms[trn_ind], platforms[val_ind]
    trn_dev, val_dev = devices[trn_ind], devices[val_ind]
    
    # TEMP
#     del impressions, prices, cities, platforms, devices
#     gc.collect()
    
    y_trn, y_val = targets[trn_ind], targets[val_ind]
    
    # build model
    # impressions
    imp_input = Input(shape=(25, 100))
    imp_conv = Conv1D(16, kernel_size=5, activation='relu')(imp_input)
    imp_conv = BatchNormalization()(imp_conv)
    imp_flatten = Flatten()(imp_conv)
    # city embeddings
    city_input = Input(shape = (1, ), dtype = 'int32')
    city_embedding = Embedding(ncity, 20, input_length=1)(city_input)
    city_embedding = Flatten()(city_embedding)
    
    # platform input 
    plat_input = Input(shape = (1, ), dtype = 'int32')
    plat_embedding = Embedding(nplat, 10, input_length=1)(plat_input)
    plat_embedding = Flatten()(plat_embedding)
                       
    # device
    device_input =  Input(shape = (2, ))
    
    # price input
    price_input =  Input(shape = (25, ))
    
    # concatenate
    concat1 = concatenate([imp_flatten, price_input])
    concat1 = BatchNormalization()(concat1)
    concat1 = Dense(units=30, activation='relu')(concat1)
    concat1 = Dropout(0.2)(concat1)
    concat2 = concatenate([concat1, city_embedding, plat_embedding, device_input])
    concat2 = BatchNormalization()(concat2)
    concat2 = Dropout(0.2)(concat2)
    
    h = Dense(units=30, activation='relu')(concat2)
    output_layer = Dense(25, activation='softmax')(h)


    model = Model(inputs=[imp_input, city_input, plat_input, device_input, price_input], 
                  outputs=output_layer)

    opt = optimizers.Adam(lr=0.001)
    model.compile(optimizer = opt, loss = "categorical_crossentropy", metrics=['accuracy'])

#     model.compile(optimizer = opt, loss = "categorical_crossentropy", metrics=['accuracy', mrr])
# model.compile(optimizer = opt, loss = "sparse_categorical_crossentropy", metrics=['accuracy'])

    print(model.summary())
    
    # from clr import CyclicLR
    from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard
    from datetime import datetime as dt
    model_file = 'test.model'

    callbacks = [ModelCheckpoint(model_file, save_best_only=True, verbose=1)]
    # callbacks.append(EarlyStopping(patience=150, verbose=1))
    # callbacks.append(ReduceLROnPlateau(factor=0.5, patience=20, min_lr=5e-4, verbose=1))
    log_dir = "logs/{}".format(dt.now().strftime('%m-%d-%H-%M'))
    # tb = TensorBoard(log_dir=log_dir, histogram_freq=2, write_graph=True, write_grads=True, write_images=True,
    #                  embeddings_freq=10, embeddings_layer_names=['embedding_1'], embeddings_data=next(val_gen))
    tb = TensorBoard(log_dir=log_dir, write_graph=True, write_grads=True)
    callbacks.append(tb)

    
    
    batch_size = 256
    n_epochs = 300
    # keras requires 0, 1 binary label input
    from keras.utils import to_categorical
    train_y_binary = to_categorical(y_trn)
    val_y_binary = to_categorical(y_val)

    history = model.fit([trn_imp, trn_city, trn_plat, trn_dev, trn_price], 
                        train_y_binary, 
                        epochs=n_epochs, 
                        batch_size=batch_size,
                        verbose = 2, 
                        shuffle = True,
                        callbacks=callbacks+[TestCallback(([val_imp, val_city, val_plat, val_dev, val_price],
                                                           val_y_binary))])
    
    # make predictions
    trn_pred = model.predict([trn_imp, trn_city, trn_plat, trn_dev, trn_price])
    trn_mrr = np.mean(1/(np.where(np.argsort(trn_pred)[:, ::-1] == y_trn.reshape(-1, 1))[1]+1))
    
    val_pred = model.predict([val_imp, val_city, val_plat, val_dev, val_price])
    val_mrr = np.mean(1/(np.where(np.argsort(val_pred)[:, ::-1] == y_val.reshape(-1, 1))[1]+1))
    print(f'train mrr: {trn_mrr:.2f} | val mrr: {val_mrr:.2f}')
    
    
    break

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 25, 100)      0                                            
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 21, 16)       8016        input_1[0][0]                    
__________________________________________________________________________________________________
batch_normalization_1 (BatchNor (None, 21, 16)       64          conv1d_1[0][0]                   
__________________________________________________________________________________________________
flatten_1 (Flatten)             (None, 336)          0           batch_normalization_1[0][0]      
__________________________________________________________________________________________________
input_5 (I



Testing loss: 2.5747, acc: 0.3263
Epoch 2/300
 - 21s - loss: 2.5630 - acc: 0.3264
Testing loss: 2.5690, acc: 0.3262
Epoch 3/300
 - 21s - loss: 2.5478 - acc: 0.3268
Testing loss: 2.5680, acc: 0.3257
Epoch 4/300
 - 21s - loss: 2.5363 - acc: 0.3278
Testing loss: 2.5707, acc: 0.3249
Epoch 5/300
 - 21s - loss: 2.5266 - acc: 0.3288
Testing loss: 2.5730, acc: 0.3240
Epoch 6/300
 - 21s - loss: 2.5188 - acc: 0.3297
Testing loss: 2.5735, acc: 0.3232
Epoch 7/300
 - 21s - loss: 2.5130 - acc: 0.3304
Testing loss: 2.5770, acc: 0.3224
Epoch 8/300
 - 21s - loss: 2.5084 - acc: 0.3309
Testing loss: 2.5805, acc: 0.3223
Epoch 9/300
 - 21s - loss: 2.5039 - acc: 0.3312
Testing loss: 2.5813, acc: 0.3220
Epoch 10/300
 - 21s - loss: 2.5001 - acc: 0.3314
Testing loss: 2.5820, acc: 0.3218
Epoch 11/300
 - 21s - loss: 2.4969 - acc: 0.3318
Testing loss: 2.5842, acc: 0.3221
Epoch 12/300
 - 21s - loss: 2.4943 - acc: 0.3320
Testing loss: 2.5844, acc: 0.3221
Epoch 13/300
 - 21s - loss: 2.4919 - acc: 0.3319
Testing loss

Testing loss: 2.6119, acc: 0.3183
Epoch 101/300
 - 21s - loss: 2.4540 - acc: 0.3350
Testing loss: 2.6081, acc: 0.3201
Epoch 102/300
 - 21s - loss: 2.4536 - acc: 0.3349
Testing loss: 2.6123, acc: 0.3198
Epoch 103/300
 - 21s - loss: 2.4535 - acc: 0.3353
Testing loss: 2.6153, acc: 0.3176
Epoch 104/300
 - 21s - loss: 2.4533 - acc: 0.3351
Testing loss: 2.6145, acc: 0.3183
Epoch 105/300
 - 21s - loss: 2.4532 - acc: 0.3351
Testing loss: 2.6138, acc: 0.3190
Epoch 106/300
 - 21s - loss: 2.4534 - acc: 0.3352
Testing loss: 2.6171, acc: 0.3187
Epoch 107/300
 - 21s - loss: 2.4532 - acc: 0.3350
Testing loss: 2.6103, acc: 0.3189
Epoch 108/300
 - 21s - loss: 2.4534 - acc: 0.3348
Testing loss: 2.6140, acc: 0.3187
Epoch 109/300
 - 21s - loss: 2.4534 - acc: 0.3350
Testing loss: 2.6112, acc: 0.3201
Epoch 110/300
 - 21s - loss: 2.4531 - acc: 0.3350
Testing loss: 2.6169, acc: 0.3183
Epoch 111/300
 - 21s - loss: 2.4530 - acc: 0.3350
Testing loss: 2.6090, acc: 0.3189
Epoch 112/300
 - 21s - loss: 2.4529 - acc:

 - 21s - loss: 2.4471 - acc: 0.3358
Testing loss: 2.6181, acc: 0.3176
Epoch 199/300
 - 21s - loss: 2.4471 - acc: 0.3358
Testing loss: 2.6194, acc: 0.3185
Epoch 200/300
 - 21s - loss: 2.4470 - acc: 0.3357
Testing loss: 2.6142, acc: 0.3184
Epoch 201/300
 - 21s - loss: 2.4477 - acc: 0.3357
Testing loss: 2.6183, acc: 0.3173
Epoch 202/300
 - 21s - loss: 2.4472 - acc: 0.3357
Testing loss: 2.6145, acc: 0.3196
Epoch 203/300
 - 21s - loss: 2.4471 - acc: 0.3357
Testing loss: 2.6170, acc: 0.3172
Epoch 204/300
 - 21s - loss: 2.4469 - acc: 0.3357
Testing loss: 2.6175, acc: 0.3177
Epoch 205/300
 - 21s - loss: 2.4470 - acc: 0.3358
Testing loss: 2.6225, acc: 0.3163
Epoch 206/300
 - 21s - loss: 2.4471 - acc: 0.3355
Testing loss: 2.6143, acc: 0.3181
Epoch 207/300
 - 21s - loss: 2.4469 - acc: 0.3358
Testing loss: 2.6153, acc: 0.3183
Epoch 208/300
 - 21s - loss: 2.4465 - acc: 0.3357
Testing loss: 2.6180, acc: 0.3177
Epoch 209/300
 - 21s - loss: 2.4472 - acc: 0.3356
Testing loss: 2.6181, acc: 0.3167
Epoch 

 - 21s - loss: 2.4439 - acc: 0.3360
Testing loss: 2.6237, acc: 0.3181
Epoch 297/300
 - 21s - loss: 2.4440 - acc: 0.3361
Testing loss: 2.6182, acc: 0.3176
Epoch 298/300
 - 21s - loss: 2.4438 - acc: 0.3359
Testing loss: 2.6268, acc: 0.3169
Epoch 299/300
 - 21s - loss: 2.4443 - acc: 0.3363
Testing loss: 2.6215, acc: 0.3181
Epoch 300/300
 - 21s - loss: 2.4440 - acc: 0.3359
Testing loss: 2.6190, acc: 0.3183
train mrr: 0.48 | val mrr: 0.45


In [None]:
# train mrr: 0.47 | val mrr: 0.45