In [1]:
import pandas as pd
import numpy as np
import datetime
import time 
import os
import gc
import re
import sys
from functools import partial
from gensim.models import Word2Vec

from utils import ignore_warnings, load_data

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
%%time
# nrows = 10000
# Row limit forwarded to load_data; presumably None means "read everything" — confirm in utils.load_data.
nrows = None
# Read the 'train' split through the project helper imported from utils.
train = load_data('train', nrows=nrows)

CPU times: user 18.7 s, sys: 1.4 s, total: 20.1 s
Wall time: 20.2 s


In [None]:
# Project-local cleaning step (imported here rather than in the top import cell).
from clean_session import preprocess_sessions
# Rebinds `train` to the cleaned frame; the log line emitted by this call says
# sessions are clipped up to their last click-out.
train = preprocess_sessions(train,data_source='data')

[04-28 11:21:44 - utils - preprocess_sessions - INFO] Cliping session dataframe up to last click out (if there is clickout)


In [None]:
# (rows, columns) of the frame after session preprocessing.
train.shape

In [None]:
# Build the row filter: click-out actions whose impression list holds exactly
# 25 items.

# Number of pipe-separated impressions per row (NaN where impressions is missing).
train['nimp'] = train['impressions'].str.split('|').str.len()

# Row is a click-out event.
is_clickout = train['action_type'].eq('clickout item')
# Row actually carries an impression list.
imp_not_na = train['impressions'].notna()
# Impression list has the full 25 entries.
twenty_five = train['nimp'].eq(25)

select_mask = is_clickout & imp_not_na & twenty_five

In [None]:
# Keep only the rows flagged by the mask and renumber the index from zero.
train = train.loc[select_mask].reset_index(drop=True)
train.shape

In [None]:
# Swap missing values for explicit placeholder tokens.
train['current_filters'] = train['current_filters'].fillna('no_filter')
train['reference'] = train['reference'].fillna('no_reference')

# Turn the pipe-delimited strings into Python lists.
train['cfs'] = train['current_filters'].str.split('|')
train['imps'] = train['impressions'].str.split('|')

In [None]:
print('before:', train.shape)
# NOTE(review): an earlier cell replaces missing references with 'no_reference',
# so this filter probably removes nothing — confirm and consider dropping it.
train = train[train.reference.notna()].reset_index(drop=True)
print('after:', train.shape)
# Keep only rows whose impression list has exactly 25 entries.
train = train[train.imps.str.len()==25].reset_index(drop=True)
def assign_target(row):
    """Return the position of the clicked item inside the row's impression list.

    Args:
        row: object exposing ``reference`` (the clicked item) and ``imps``
            (the list of shown items).

    Returns:
        The zero-based index of ``reference`` in ``imps``, or the sentinel 25
        when the reference is not among the impressions.
    """
    shown, clicked = row.imps, row.reference
    if clicked not in shown:
        # Sentinel one past the last valid slot; callers filter these rows out.
        return 25
    return shown.index(clicked)
# Label every row with the slot of the clicked item in its impression list.
train['target'] = train.apply(assign_target, axis=1)
# A label of 25 means the reference was not in the list; discard those rows.
print('before:', train.shape)
train = train.loc[train['target'].ne(25)].reset_index(drop=True)
print('after:', train.shape)

In [None]:
# Quick look at the first two labelled rows.
train.head(2)

In [None]:
# Keep the embedding model under its own name: the training cell later rebinds
# `model` to the Keras network, which would break these helpers if the encoding
# cell were re-run afterwards.
hotel2vec = Word2Vec.load('./cache/hotel_2vec/model.bin')
# Backward-compatible alias for code that still reads `model` at this point.
model = hotel2vec


def encoding(imps):
    """Stack the Word2Vec vectors of the given items into one 2-D array.

    Args:
        imps: iterable of item keys; each must exist in the model vocabulary.

    Returns:
        np.ndarray with one row per item, in the order given.

    Raises:
        KeyError: if an item is missing from the Word2Vec vocabulary.
    """
    return np.array([hotel2vec.wv[i] for i in imps])


def encoding_depth(imps):
    """Same as `encoding`, with an extra leading axis of length 1."""
    return encoding(imps)[None, :, :]


def encoding_column(imps):
    """Same as `encoding`, with an extra trailing axis of length 1."""
    return encoding(imps)[:, :, None]

In [None]:
%time
train['imps'] = train.imps.apply(encoding)

In [None]:
# encode city, platform and device
def categorize(df, cols):
    """Integer-encode the given columns of `df` in place.

    Each distinct value of a column is replaced by a code assigned in order of
    first appearance (0, 1, 2, ...).

    Args:
        df: DataFrame to modify in place.
        cols: iterable of column names to encode.

    Returns:
        None; `df` is mutated.
    """
    for column in cols:
        print('converting', column)
        codes = {value: code for code, value in enumerate(df[column].unique())}
        df[column] = df[column].map(codes)
        
# Integer-encode these three columns in place (codes follow order of first appearance).
categorize(train, ['city', 'platform', 'device'])

In [None]:
# Inspect the distinct integer codes now stored in `device`.
train.device.unique()

In [None]:
# One-hot encode `device`, dropping the first level as the baseline.
# NOTE(review): a later cell reads exactly `device_1` and `device_2`, i.e. it
# assumes three device codes (0, 1, 2) — confirm against the data.
train = pd.get_dummies(train, columns=['device'], drop_first=True)

In [None]:
# train = train[['session_id', 'timestamp', 'reference', 'imps', 'city', 'device', 'platform', 'prices']]

In [None]:
# (rows, columns) after the categorical encoding steps.
train.shape

In [None]:
# Parse each row's pipe-delimited price string into a list of ints.
train['prices'] = train['prices'].str.split('|').apply(lambda raw: list(map(int, raw)))

In [None]:
# Flatten the per-row price lists into one long list of individual prices.
all_prices = [price for row_prices in train['prices'].values for price in row_prices]

In [None]:
import matplotlib.pyplot as plt
# _ = plt.hist(all_prices, bins=100)

In [None]:
# _ = plt.hist(np.log1p(all_prices), bins=100)

In [None]:
# _ = plt.hist(train_prices, bins=100)

In [None]:
# Standardise prices with the mean / std taken over every individual price,
# then drop the raw column from the frame.
price_mu, price_sd = np.mean(all_prices), np.std(all_prices)
prices = (np.array(list(train.pop('prices').values)) - price_mu) / price_sd

In [None]:
# Mean over all individual prices, as used for the standardisation above.
price_mu

In [None]:
# Pull every model input out of `train` into a NumPy array, removing each
# column from the frame once it has been extracted.
impressions = np.array(list(train.pop('imps').values))

# City codes, plus the number of distinct cities used to size the embedding.
ncity = train['city'].nunique()
cities = train.pop('city').values

# Platform codes, plus the number of distinct platforms.
nplat = train['platform'].nunique()
platforms = train.pop('platform').values

sids = train.pop('session_id').values
targets = train.pop('target').values

# The two one-hot device indicator columns as a single 2-column array.
devices = train[['device_1', 'device_2']].values
del train['device_1'], train['device_2']

In [None]:
from keras.callbacks import Callback
import tensorflow as tf

class TestCallback(Callback):
    """Keras callback that evaluates the model on held-out data after each epoch.

    Args:
        test_data: ``(x, y)`` pair forwarded unchanged to ``model.evaluate``.
    """

    def __init__(self, test_data):
        # Let the base class initialise its own attributes before adding ours.
        super().__init__()
        self.test_data = test_data

    def on_epoch_end(self, epoch, logs=None):
        """Print the held-out loss and accuracy at the end of an epoch.

        Args:
            epoch: index of the epoch that just finished (unused).
            logs: metrics dict supplied by Keras (unused); defaults to None
                rather than a shared mutable ``{}``.
        """
        x, y = self.test_data
        # Unpacking two values assumes the model was compiled with exactly one
        # metric, so evaluate returns [loss, metric].
        loss, acc = self.model.evaluate(x, y, verbose=0)
        print('Testing loss: {0:.4f}, acc: {1:.4f}'.format(loss, acc))

In [None]:
# import keras.backend as K

# def mrr(y_true, y_pred):
# #     return K.mean(y_pred)
#     y_true_item = K.argmax(y_true, axis=-1)
#     print(y_true_item)
    
#     y_pred_sorted = tf.nn.top_k(input, k=25, sorted=True).indices
#     return K.mean(1/tf.where(y_pred_sorted==y_true))

In [None]:
from datetime import datetime as dt

from keras import backend as K
from keras import optimizers
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard
from keras.layers import Activation, concatenate, Dense, Dropout, Embedding, Input, Reshape, Flatten, Conv1D
from keras.models import Model
from keras.utils import to_categorical
from sklearn.model_selection import StratifiedKFold


def build_model(n_items, emb_dim, n_cities, n_platforms):
    """Build the (uncompiled) click-position classifier.

    Inputs, in order: impression embeddings ``(n_items, emb_dim)``, city code
    ``(1,)``, platform code ``(1,)``, device one-hot ``(2,)`` and standardised
    prices ``(n_items,)``. The output is a softmax over the ``n_items`` slots.

    Args:
        n_items: number of impressions per row (also the number of classes).
        emb_dim: width of each item embedding vector.
        n_cities: number of distinct city codes (embedding vocabulary size).
        n_platforms: number of distinct platform codes.

    Returns:
        keras.models.Model ready to be compiled.
    """
    # impressions: 1-D convolution across neighbouring list positions
    imp_input = Input(shape=(n_items, emb_dim))
    imp_conv = Conv1D(16, kernel_size=3, activation='relu')(imp_input)
    imp_flatten = Flatten()(imp_conv)

    # city embeddings
    city_input = Input(shape=(1, ), dtype='int32')
    city_embedding = Embedding(n_cities, 20, input_length=1)(city_input)
    city_embedding = Flatten()(city_embedding)

    # platform embeddings
    plat_input = Input(shape=(1, ), dtype='int32')
    plat_embedding = Embedding(n_platforms, 10, input_length=1)(plat_input)
    plat_embedding = Flatten()(plat_embedding)

    # device one-hot columns and per-item prices are fed in as-is
    device_input = Input(shape=(2, ))
    price_input = Input(shape=(n_items, ))

    # item-level features first, then the session-level context
    concat1 = concatenate([imp_flatten, price_input])
    concat1 = Dense(units=30, activation='relu')(concat1)
    concat1 = Dropout(0.2)(concat1)
    concat2 = concatenate([concat1, city_embedding, plat_embedding, device_input])
    concat2 = Dropout(0.2)(concat2)

    h = Dense(units=30, activation='relu')(concat2)
    output_layer = Dense(n_items, activation='softmax')(h)

    return Model(inputs=[imp_input, city_input, plat_input, device_input, price_input],
                 outputs=output_layer)


def mean_reciprocal_rank(pred, y_true):
    """Mean reciprocal rank of the true class under the predicted scores.

    Args:
        pred: array of shape (n_samples, n_classes) with one score per class.
        y_true: integer class label per sample, shape (n_samples,).

    Returns:
        Mean over samples of 1 / (1-based rank of the true class).
    """
    # Class indices per row, best score first.
    ranking = np.argsort(pred)[:, ::-1]
    # Each row matches its label exactly once, so the column index is the 0-based rank.
    hit_pos = np.where(ranking == np.asarray(y_true).reshape(-1, 1))[1]
    return np.mean(1 / (hit_pos + 1))


K.clear_session()

# Take the input geometry from the data instead of hard-coding 25 x 100.
n_items, emb_dim = impressions.shape[1:]

skf = StratifiedKFold(n_splits=5)

for trn_ind, val_ind in skf.split(targets, targets):
    trn_imp, val_imp = impressions[trn_ind], impressions[val_ind]
    trn_price, val_price = prices[trn_ind], prices[val_ind]
    trn_city, val_city = cities[trn_ind], cities[val_ind]
    trn_plat, val_plat = platforms[trn_ind], platforms[val_ind]
    trn_dev, val_dev = devices[trn_ind], devices[val_ind]

    y_trn, y_val = targets[trn_ind], targets[val_ind]

    trn_inputs = [trn_imp, trn_city, trn_plat, trn_dev, trn_price]
    val_inputs = [val_imp, val_city, val_plat, val_dev, val_price]

    # build and compile the model
    model = build_model(n_items, emb_dim, ncity, nplat)
    opt = optimizers.Adam(lr=0.001)
    model.compile(optimizer = opt, loss = "categorical_crossentropy", metrics=['accuracy'])

    # summary() prints by itself; wrapping it in print() only adds a stray "None".
    model.summary()

    model_file = 'test.model'
    # save_best_only monitors `val_loss`, which only exists when fit() receives
    # validation data (passed below).
    callbacks = [ModelCheckpoint(model_file, save_best_only=True, verbose=1)]
    # callbacks.append(EarlyStopping(patience=150, verbose=1))
    # callbacks.append(ReduceLROnPlateau(factor=0.5, patience=20, min_lr=5e-4, verbose=1))
    log_dir = "logs/{}".format(dt.now().strftime('%m-%d-%H-%M'))
    tb = TensorBoard(log_dir=log_dir, write_graph=True, write_grads=True)
    callbacks.append(tb)

    batch_size = 128
    n_epochs = 30
    # One-hot labels for categorical_crossentropy. num_classes is pinned so the
    # label width always matches the softmax layer, even if a fold happens not
    # to contain the highest position.
    train_y_binary = to_categorical(y_trn, num_classes=n_items)
    val_y_binary = to_categorical(y_val, num_classes=n_items)

    # validation_data makes Keras report val_loss / val_acc every epoch, so the
    # checkpoint callback can actually save the best model.
    history = model.fit(trn_inputs,
                        train_y_binary,
                        epochs=n_epochs,
                        batch_size=batch_size,
                        verbose = 2,
                        shuffle = True,
                        validation_data=(val_inputs, val_y_binary),
                        callbacks=callbacks)

    # make predictions and score them
    trn_pred = model.predict(trn_inputs)
    trn_mrr = mean_reciprocal_rank(trn_pred, y_trn)

    val_pred = model.predict(val_inputs)
    val_mrr = mean_reciprocal_rank(val_pred, y_val)
    print(f'train mrr: {trn_mrr:.2f} | val mrr: {val_mrr:.2f}')

    # Only the first fold is trained for now.
    break

In [None]:
# Per validation row: class indices ordered from highest to lowest predicted score.
np.argsort(val_pred)[:, ::-1]

In [None]:
# Validation mean reciprocal rank: mean of 1 / (1-based rank of the true label).
np.mean(1/(np.where(np.argsort(val_pred)[:, ::-1] == y_val.reshape(-1, 1))[1]+1))

In [None]:
# Largest label present in the validation fold.
y_val.max()

In [None]:
# Leftover interactive help lookup (IPython `?` syntax); safe to delete once the notebook is finalised.
?np.where

In [None]:
# 0-based rank of the true label within each validation row's ranking.
np.where(np.argsort(val_pred)[:, ::-1] == y_val.reshape(-1, 1))[1]

In [None]:
# Ranking (best class first) for the first validation row only.
np.argsort(val_pred)[:, ::-1][0]

In [None]:
# Integer labels of the validation fold.
y_val

In [None]:
# NOTE(review): this broadcasts the first row's ranking against every validation
# label and then takes the column of the first match, which appears to give the
# 0-based rank of y_val[0] in the first row — confirm this is the intent.
np.where(np.argsort(val_pred)[:, ::-1][0]==y_val.reshape(-1, 1))[1][0]