# Transaction Level Models

While it was expected that aggregation of features in a somewhat arbitrary manner would result in the loss of information, it was not expected that such an approach would not be able to beat the baseline score of guessing only zeros.    

In [1]:
import pandas as pd
import sqlalchemy
import numpy as np
import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

from lightgbm import LGBMRegressor
import catboost as cb

from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.callbacks import EarlyStopping

%matplotlib inline

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
engine = sqlalchemy.create_engine('postgresql://romandtse:duckthewut@localhost:5432/training')

In [3]:
with open('../pickles/field_dict.pkl', 'rb') as f:
    field_dict = pickle.load(f)
    
with open('../pickles/useless_fields.pkl', 'rb') as f:
    useless_fields = pickle.load(f)

with open('../pickles/adwordsClickInfo_keys.pkl', 'rb') as f:
    adwordsClickInfo_keys = pickle.load(f)
    
with open('../pickles/channel_groups.pkl', 'rb') as f:
    channel_groups = pickle.load(f)
    
with open('../pickles/field_vals.pkl', 'rb') as f:
    field_vals = pickle.load(f)

In [4]:
with open('../pickles/train_customer_ids.pkl', 'rb') as f:
    train_customer_ids = pickle.load(f)

There is the question of scaling the revenue.  By coincidence, or maybe by design, the target is actually scaled by the natural log.  When we fit a regressor, there will necessarily be negative values.  Since we will send these values through another model, we should consider scaling everything by the natural log here; the argument is not that we pretend a single session should imitate a 

In [5]:
def revTemplate(key, name, num_type = 'FLOAT'):
    return f"""COALESCE(CAST({key} ->> '{name}' AS {num_type}), 0)/10^6 AS {name}"""

def jnumTemplate(key, name, num_type = 'INT'):
    return f"""COALESCE(CAST({key} ->> '{name}' AS {num_type}), 0) AS {name}"""

def numTemplate(name):
    return f"""COALESCE({name}, 0) AS {name}"""

def jstrTemplate(key, name):
    return f"{key} ->> '{name}' AS {name}"

def strTemplate(name):
    return f"{name}"

def adwordsTemplate(name):
    return f"CAST(trafficSource ->> 'adwordsClickInfo' AS JSONB) ->> '{name}' AS {name}"

In [6]:
def getQuery(dataset = 'train'):
    numeric_cols = ['visitNumber', 'newVisits', 'bounces', 'pageviews', 'visits', 'hits', 'transactionRevenue']
    if dataset != 'train':
        numeric_cols.pop()
    selects = []
    for cat, subcats in field_dict.items():
        for subcat in subcats:
            if subcat not in useless_fields[dataset]:
                if subcat == 'transactionRevenue':
                    selects.append(revTemplate(cat, subcat, 'NUMERIC'))
                elif subcat in numeric_cols:
                    selects.append(jnumTemplate(cat, subcat))
                elif subcat == 'adwordsClickInfo':
                    for key in adwordsClickInfo_keys:
                        selects.append(adwordsTemplate(key))
                else:
                    selects.append(jstrTemplate(cat, subcat))
    selects.extend([numTemplate('visitNumber'), 
                    strTemplate('channelGrouping'),
                    strTemplate('fullVisitorId'),
                    numTemplate('visitStartTime'),
                   ])
    return ', '.join(selects)

In [7]:
qstring = getQuery()

In [8]:
with open('../pickles/top_order.pkl', 'rb') as f:
    top_order = pickle.load(f)

In [9]:
#Note to self: if you don't change the ids list, you will lose a lot of ids.
#you will train on less data than you think, but you won't inflate the score at least by having those you excluded.
def getUserData(user_list):
    users = "\', \'".join(user_list)
    query = f"""
    SELECT {qstring}
    FROM train_data
    WHERE fullVisitorId IN (\'{users}\')
    """

    return pd.read_sql_query(query, engine, parse_dates=['visitstarttime'])

In [10]:
with open('../pickles/objects.pkl', 'rb') as f:
    objects = pickle.load(f)

By introducing all possible values of fields ahead of time for dummying, independent of whether they show up in the training set or not, we fail to simulate the fact that we have no idea whether we have captured all the features.  The categories included here, though are fairly set in stone; there probably are not many sub continents that have yet to appear in the store's history.

In [295]:
def adjustCols(df, drop_ids = True, ignore_bounce = True):
    back_looking = ['bounces', 'hits', 'newvisits', 'pageviews']
    
    df = df.sort_values(['fullvisitorid','visitstarttime'])
    #turns out the for loop checks col_order dynamically, temporary list needed to avoid infinite loop
    for col in back_looking:
        df[f'{col}last'] = df.groupby('fullvisitorid')[col].shift(1)
        df[f'{col}two'] = df.groupby('fullvisitorid')[col].shift(2)
    df['sincelast'] = df.groupby('fullvisitorid').visitstarttime.diff().map(lambda x: x.days + x.seconds/86400)
    df['sincetwo'] = df.groupby('fullvisitorid').visitstarttime.diff(2).map(lambda x: x.days + x.seconds/86400)
    df['hour'] = df.visitstarttime.map(lambda x: x.hour)
    df['weekday'] = df.visitstarttime.map(lambda x: x.dayofweek)
    df['month'] = df.visitstarttime.map(lambda x: x.month)
    
    if drop_ids:
        df = df.drop('fullvisitorid', axis=1)
        
    if ignore_bounce:
        df = df.query('bounces==0')
 
    return df.drop('visitstarttime', axis=1)

In [12]:
with open('../pickles/train_customer_ids.pkl', 'rb') as f:
    train_customer_ids = pickle.load(f)

In [13]:
with open('../pickles/train_looker_ids.pkl', 'rb') as f:
    train_looker_ids = pickle.load(f)

In [14]:
#shuffle now so we can just iterate through lists
from random import shuffle

def stratifiedIdSplit(test_size=0.5):
    customer_size = int(len(train_customer_ids)*test_size)
    looker_size = int(len(train_looker_ids)*test_size)
    
    test_customers = list(np.random.choice(train_customer_ids.T.values[0], replace=False, size=customer_size))
    test_lookers = list(np.random.choice(train_looker_ids.T.values[0], replace=False, size=looker_size))
    
    train_customers = list(set(train_customer_ids.T.values[0]).difference(set(test_customers)))
    train_lookers = list(set(train_looker_ids.T.values[0]).difference(set(test_lookers)))
    
    test_customers.extend(test_lookers)
    train_customers.extend(train_lookers)
    
    shuffle(test_customers)
    shuffle(train_customers)
    
    return train_customers, test_customers

In [15]:
train_ids, test_ids = stratifiedIdSplit(0.3)

While we can hope a machine can learn what is unique about this outlier, the fact is that there is no one else like this user.  In our ensemble later, considering it is a system of gradient boosted trees, there is a very good chance that one of the forests will be awful at guessing because it was fit to minimize the error it would get from this outlier point.  With all its activity, though, perhaps the behavior could still fall in line with the other points of data.  For the fear of overfitting to this point, and because our validation can tell us nothing about how well it does on similar outliers (because there are none like it), we remove it from our model for now.

In [16]:
try:
    train_ids.remove(top_order.iloc[0,0])
except:
    test_ids.remove(top_order.iloc[0,0])

In [17]:
getUserData([test_ids[0]]).columns

Index(['devicecategory', 'ismobile', 'browser', 'operatingsystem', 'city',
       'continent', 'country', 'metro', 'networkdomain', 'region',
       'subcontinent', 'bounces', 'hits', 'newvisits', 'pageviews',
       'transactionrevenue', 'adcontent', 'adnetworktype',
       'criteriaparameters', 'gclid', 'isvideoad', 'page', 'slot',
       'targetingcriteria', 'campaign', 'campaigncode', 'istruedirect',
       'keyword', 'medium', 'referralpath', 'source', 'visitnumber',
       'channelgrouping', 'fullvisitorid', 'visitstarttime'],
      dtype='object')

In [18]:
def createChunk(ids_list, size, drop_ids = True, ignore_bounce=True):
    new_size = len(ids_list)
    if  new_size > size:
        new_size = size
    someppl = ids_list
    shuffle(someppl)
    someppl = someppl[:new_size]
    chunk = getUserData(someppl)
    chunk = adjustCols(chunk, drop_ids, ignore_bounce)
    
    return chunk.fillna(0), ids_list[new_size:]

In [19]:
trial_df = adjustCols(getUserData([top_order.iloc[0][0]])).drop('transactionrevenue', axis=1)

In [20]:
trial_vals = adjustCols(getUserData([top_order.iloc[0][0]])).transactionrevenue

In [21]:
cat_feets = np.where(trial_df.dtypes == object)[0]

In [22]:
trial_df.columns[cat_feets]

Index(['devicecategory', 'ismobile', 'browser', 'operatingsystem', 'city',
       'continent', 'country', 'metro', 'networkdomain', 'region',
       'subcontinent', 'adcontent', 'adnetworktype', 'criteriaparameters',
       'gclid', 'isvideoad', 'page', 'slot', 'targetingcriteria', 'campaign',
       'campaigncode', 'istruedirect', 'keyword', 'medium', 'referralpath',
       'source', 'channelgrouping'],
      dtype='object')

Instead of training a set of trees and losing most of the trees, let's make an ensemble.  We can blend it and fit to the actual target with a single ensemble in two stages; we fit by session while still preventing user leakage before using these to predict the sum of all sessions and fitting on a separate validation set.

In [309]:
id_holder = train_ids
fold = 13
holdouts = 4
fold_size = len(id_holder)//fold

models = []
first_ids = id_holder[:fold_size].copy()


def trainCB(train_set, eval_pool, model_list):
    model_list.append(cb.CatBoostRegressor(iterations = 1000,
                                           learning_rate  = 5,
                                           l2_leaf_reg = 100,
                                           cat_features = cat_feets,
                                           depth = 5,
                                           verbose = True))
    model_list[-1].fit(X = train_set.drop('transactionrevenue', axis=1),
                       y = train_set.transactionrevenue,
                       #use_best_model = True, 
                       #eval_set = eval_pool, 
                       #early_stopping_rounds = 10, 
                       metric_period = 250)

eval_chunk, id_holder = createChunk(id_holder, fold_size, ignore_bounce=False)
eval_chunk = cb.Pool(eval_chunk.drop('transactionrevenue', axis=1), 
                 eval_chunk.transactionrevenue, 
                 cat_features=cat_feets)
chunk, id_holder = createChunk(id_holder, fold_size, ignore_bounce=False)
trainCB(chunk, eval_chunk, models)


for n in range(fold - holdouts - 2):
    eval_chunk = chunk
    eval_chunk = cb.Pool(eval_chunk.drop('transactionrevenue', axis=1), 
                     eval_chunk.transactionrevenue, 
                     cat_features=cat_feets)
    if n < (fold - holdouts - 3):
        chunk, id_holder = createChunk(id_holder, fold_size, ignore_bounce = False)
    else:
        chunk = createChunk(first_ids, fold_size)[0]
    trainCB(chunk, eval_chunk, models)


0:	learn: 28.5226220	total: 186ms	remaining: 3m 5s
250:	learn: 19.2779875	total: 47.7s	remaining: 2m 22s
500:	learn: 17.5186773	total: 1m 33s	remaining: 1m 33s
750:	learn: 16.3650917	total: 2m 19s	remaining: 46.4s
999:	learn: 15.6677882	total: 3m 5s	remaining: 0us
0:	learn: 23.0670469	total: 213ms	remaining: 3m 33s
250:	learn: 16.2089718	total: 46.9s	remaining: 2m 19s
500:	learn: 14.8258192	total: 1m 34s	remaining: 1m 33s
750:	learn: 14.0626531	total: 2m 21s	remaining: 47s
999:	learn: 13.4558894	total: 3m 8s	remaining: 0us
0:	learn: 22.5475749	total: 203ms	remaining: 3m 22s
250:	learn: 16.6372347	total: 45.9s	remaining: 2m 16s
500:	learn: 15.2279871	total: 1m 31s	remaining: 1m 31s
750:	learn: 14.0748189	total: 2m 17s	remaining: 45.6s
999:	learn: 13.5086973	total: 3m 3s	remaining: 0us
0:	learn: 21.6369849	total: 178ms	remaining: 2m 57s
250:	learn: 17.0823143	total: 46.3s	remaining: 2m 18s
500:	learn: 15.8162432	total: 1m 32s	remaining: 1m 32s
750:	learn: 14.7308049	total: 2m 18s	remaini

At the arbitrary fold size of 8, we still get over sixty-two thousand users, and expect a good number of users to be paying customers.  We reserve 3 of these folds for validation and predicting the log of the sums.  We will also use the previous fold for validation, mostly to save space and time to be honest, though the first and last rounds must be treated separately.

At a L2 coefficient of 100, trees tend to stop around 100 iterations.  At a coefficient of 1, they stop $O(10)$ iterations, with the RMSE still increasing only 1 unit by the end.  At 1000, the training runs for more iterations, but the gains are just as modest.  There is still the question of whether it is better to let the ensemble components overfit, but we might run into the problem we usually do with random forests.  At this rate, though, the system will do not much better than a small collection of independent decision trees.  We'll go ahead and try anyways.

In [310]:
del chunk, eval_chunk

In [None]:
with open('../models/cb_overfit_stage1.pkl', 'wb') as f:
    pickle.dump(models, f)

In [311]:
val_df = createChunk(id_holder, len(id_holder), drop_ids=False, ignore_bounce=False)[0]

In [312]:
val_x = val_df.drop('transactionrevenue', axis=1)
val_y = val_df[['fullvisitorid', 'transactionrevenue']]

We could inject zeros here, but catboost doesn't like predicting on series objects for some reason, and transforming them into dataframes and transposing those seems to take longer I think.  Should time.

In [313]:
%%time
predicts = [model.predict(val_x) for model in models]

CPU times: user 2min 16s, sys: 3.6 s, total: 2min 19s
Wall time: 1min 2s


In [314]:
bounces_index = np.where(val_x.columns == 'bounces')[0][0]

In [315]:
for i in range(len(models)):
    val_x[f'predicted_{i}'] = predicts[i]

In [316]:
kept_columns = ['fullvisitorid', 'pageviews', 'newvisits', 'visitnumber', 'bounces']
col_mask = [column in kept_columns or 'predicted_' in column for column in val_x.columns.values]
x = val_x.loc[:,col_mask].groupby('fullvisitorid').sum()
x2 = val_x.loc[:,col_mask].groupby('fullvisitorid').std()
#x3 = val_x.loc[:,col_mask].groupby('fullvisitorid').max()
x = x.join(x2, lsuffix='_mean', rsuffix='_std')
#x = x.join(x3, rsuffix='_max')

Just realized some of these will be negative.  We need to transform them with a function that runs from zero to infinity and accepts negative infinity to infinity, or else use a different model.  We try the former here.  Why not, headed down a bad way at this point.

In [317]:
y = val_y.groupby('fullvisitorid').sum().applymap(lambda x: np.log(10**6*x + 1)).loc[x.index, :]

In [318]:
split = int(len(x)*.3)

In [319]:
eval_set = cb.Pool(data = x.iloc[:split, :], label = y.iloc[:split, :])
train_set = cb.Pool(data = x.iloc[split:, :], label = y.iloc[split:, :])

In [324]:
censemble = cb.CatBoostRegressor(iterations = 3000,
                                 learning_rate  = .03,
                                 l2_leaf_reg = 100,
                                 use_best_model = True,
                                 depth = 4,
                                 verbose = True)

In [325]:
censemble.fit(train_set, eval_set=eval_set, use_best_model=True, early_stopping_rounds = 10, metric_period = 20)



0:	learn: 2.1109433	test: 2.0913975	best: 2.0913975 (0)	total: 23ms	remaining: 1m 8s
20:	learn: 1.8636055	test: 1.8473563	best: 1.8473563 (20)	total: 501ms	remaining: 1m 11s
40:	learn: 1.7516807	test: 1.7391473	best: 1.7391473 (40)	total: 871ms	remaining: 1m 2s
60:	learn: 1.7014753	test: 1.6913214	best: 1.6913214 (60)	total: 1.24s	remaining: 59.9s
80:	learn: 1.6767566	test: 1.6682762	best: 1.6682762 (80)	total: 1.59s	remaining: 57.3s
100:	learn: 1.6641107	test: 1.6573460	best: 1.6573460 (100)	total: 1.95s	remaining: 55.9s
120:	learn: 1.6567981	test: 1.6514157	best: 1.6514157 (120)	total: 2.3s	remaining: 54.7s
140:	learn: 1.6519951	test: 1.6487142	best: 1.6487142 (140)	total: 2.68s	remaining: 54.3s
160:	learn: 1.6484803	test: 1.6469983	best: 1.6469983 (160)	total: 3.02s	remaining: 53.2s
180:	learn: 1.6460921	test: 1.6458916	best: 1.6458824 (179)	total: 3.37s	remaining: 52.5s
200:	learn: 1.6440410	test: 1.6450608	best: 1.6450608 (200)	total: 3.75s	remaining: 52.3s
220:	learn: 1.6421804	t

<catboost.core.CatBoostRegressor at 0x1a7ecb1828>

In [None]:
test_df = createChunk(test_ids, len(test_ids), drop_ids=False, ignore_bounce=False)