In [None]:
import pandas as pd
import numpy as np
import datetime
import time 
import os
import gc
import re
import sys
from functools import partial

from utils import ignore_warnings, load_data

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
%%time
# nrows = 10000
nrows = None
train = load_data('train', nrows=nrows)#, verbose=True)

In [None]:
# Row filter, built from three conditions that must all hold:
#   1. the action is a clickout,
is_clickout = train['action_type'].eq('clickout item')
#   2. the impressions string is present,
imp_not_na = train['impressions'].notna()
#   3. the '|'-separated impressions string holds exactly 25 entries.
train['nimp'] = train['impressions'].str.split('|').str.len()
twenty_five = train['nimp'].eq(25)

select_mask = is_clickout & imp_not_na & twenty_five

In [None]:
# Keep only the rows flagged by select_mask and renumber the index from 0.
train = train.loc[select_mask].reset_index(drop=True)

In [None]:
# Replace missing values with explicit placeholder strings.
train['current_filters'] = train['current_filters'].fillna('no_filter')
train['reference'] = train['reference'].fillna('no_reference')

# Turn the '|'-separated strings into Python lists.
train['cfs'] = train['current_filters'].str.split('|')
train['imps'] = train['impressions'].str.split('|')

In [None]:
# Drop everything except the three columns used from here on.
train = train.loc[:, ['timestamp', 'reference', 'imps']]

In [None]:
# Display (n_rows, n_columns) of the current `train` frame.
train.shape

In [None]:
# Build the item vocabulary: every id that appears in an impression list
# plus every value of `reference` (a clicked id that is missing from its own
# impression list still needs an index).
all_imps = {item for imp_list in train.imps.values for item in imp_list}
all_imps.update(train['reference'].unique())
# Sort before enumerating: iterating a set of strings follows hash order,
# which changes between interpreter sessions, so an unsorted vocabulary would
# assign different indices after every kernel restart.
# key=str avoids a TypeError if ids of mixed types are present.
all_imps = sorted(all_imps, key=str)
imp2natural = {v: k for k, v in enumerate(all_imps)}

# Encode the clicked reference with the same vocabulary as the impressions.
train['reference'] = train.reference.map(imp2natural)
# Keep only rows whose impression list has exactly 25 entries.
train = train[train.imps.str.len()==25].reset_index(drop=True)
# Encode each impression list as a list of vocabulary indices.
train['imps'] = train.imps.apply(lambda x: [imp2natural[i] for i in x])
def assign_target(row):
    """Return the position of the clicked item within the row's impressions.

    Args:
        row: object exposing `reference` (the clicked item) and `imps`
            (the list of shown items) as attributes.

    Returns:
        The zero-based index of the first occurrence of `row.reference` in
        `row.imps`, or 25 when the reference is not in the list.
    """
    clicked, shown = row.reference, row.imps
    try:
        return shown.index(clicked)
    except ValueError:
        # list.index signals "not found" with ValueError.
        return 25
# Compute the label row by row: the click position, or 25 when not found.
train['target'] = train.apply(assign_target, axis='columns')

In [None]:
# Number of distinct ids in the imp2natural vocabulary.
len(imp2natural)

In [None]:
# Only the encoded impressions and the label are needed from here on.
train = train.loc[:, ['imps', 'target']]

In [None]:
# Hold out the last 10% of rows (in their current order) for validation.
# Slice on an explicit split index rather than on -n_val: when n_val is 0,
# iloc[:-0] is empty and iloc[-0:] is the whole frame, which would leave the
# training set empty and put every row in validation.
n_val = int(len(train) * 0.1)
split_at = len(train) - n_val
xtrain = train.iloc[:split_at].reset_index(drop=True)
xval = train.iloc[split_at:].reset_index(drop=True)
del train

In [None]:
# Report (n_rows, n_columns) of the training and validation frames.
print(xtrain.shape, xval.shape)

In [None]:
# Move the label column out of each feature frame: pop returns the column
# and removes it from the frame in one step.
ytrain = xtrain.pop('target')
yval = xval.pop('target')

In [None]:
from keras import optimizers
from keras.layers import Activation, concatenate, Dense, Dropout, Embedding, Input, Reshape, Flatten
from keras.models import Model
from keras.callbacks import Callback

# Vocabulary size (one embedding row per encoded id) and embedding width.
n_item_ids = len(imp2natural)
n_embed = 10

# Network: 25 item indices -> per-item embedding -> flatten -> 32-unit ReLU
# layer -> 26-way softmax.
input_layer = Input(shape=(25,), dtype="int32")
embedding_layer = Embedding(n_item_ids, n_embed, input_length=25)
impression_embedding = embedding_layer(input_layer)
h0 = Flatten()(impression_embedding)
h1 = Dense(units=32, activation='relu')(h0)
output_layer = Dense(26, activation='softmax')(h1)
model = Model(inputs=input_layer, outputs=output_layer)

# Adam with default settings; accuracy is tracked alongside the loss.
opt = optimizers.Adam()
model.compile(optimizer=opt, loss="categorical_crossentropy", metrics=['accuracy'])

In [None]:
# Show the model's layer summary.
model.summary()

In [None]:
batch_size = 126
n_epochs = 10
from keras.utils import to_categorical

# One-hot encode the integer targets for the categorical_crossentropy loss.
# num_classes is pinned to 26 to match the model's Dense(26) softmax output:
# without it, to_categorical sizes the matrix from max(y) + 1, so a split
# that happens to contain no target of 25 would produce fewer than 26
# columns and fail with a shape mismatch.
n_classes = 26
train_y_binary = to_categorical(ytrain, num_classes=n_classes)
val_y_binary = to_categorical(yval, num_classes=n_classes)

# Each row's `imps` list becomes one row of a 2-D integer array.
history = model.fit(np.array(xtrain.imps.tolist()),
                    train_y_binary,
                    epochs=n_epochs,
                    batch_size=batch_size,
                    validation_data=(np.array(xval.imps.tolist()), val_y_binary),
                    verbose=2,
                    shuffle=True)