# Transaction Level Models

While it was expected that aggregating features in a somewhat arbitrary manner would lose information, it was not expected that such an approach would fail to beat the baseline score of guessing only zeros.

In [135]:
import os
import pickle

import numpy as np
import pandas as pd
import sqlalchemy

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

from lightgbm import LGBMRegressor
import catboost as cb

from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.callbacks import EarlyStopping

%matplotlib inline

In [6]:
# The connection URL is read from the environment so that no user name or
# password is stored in the notebook. Set TRAINING_DB_URL to a full SQLAlchemy
# URL (postgresql://user:password@host:port/dbname). Without it we fall back to
# a credential-free local URL and leave authentication to the PostgreSQL client
# defaults (PGUSER / PGPASSWORD / ~/.pgpass).
engine = sqlalchemy.create_engine(
    os.environ.get('TRAINING_DB_URL', 'postgresql://localhost:5432/training')
)

## Column Queries

We format columns to insert into our query in this section.  These will include:
- Sum of the visit numbers (inspired by previous analysis)
- Sum of page views, assumed the more intuitive alternative over hits
- Bounce rate, at least to rule out those with a bounce rate of 100%

Let's try brute forcing our way through modeling with user level aggregation features.  First, we remind ourselves of the types involved.

In [29]:
def _read_pickle(path):
    """Open ``path`` in binary mode and return the object unpickled from it."""
    # NOTE(review): unpickling runs code embedded in the file -- only load
    # pickles that were produced by this project.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Lookup structures consumed by getQuery() and the exploratory cells below.
field_dict = _read_pickle('../pickles/field_dict.pkl')
useless_fields = _read_pickle('../pickles/useless_fields.pkl')
adwordsClickInfo_keys = _read_pickle('../pickles/adwordsClickInfo_keys.pkl')
channel_groups = _read_pickle('../pickles/channel_groups.pkl')
field_vals = _read_pickle('../pickles/field_vals.pkl')

In [11]:
# Load train_customer_ids from its pickle; stratifiedIdSplit() reads this
# module-level name. An identical cell further down loads the same file again.
# NOTE(review): pickle.load runs code embedded in the file -- only load
# pickles that were produced by this project.
with open('../pickles/train_customer_ids.pkl', 'rb') as f:
    train_customer_ids = pickle.load(f)

In [13]:
def revTemplate(key, name, num_type = 'FLOAT'):
    """Return the SELECT fragment for a revenue field stored inside a JSON column.

    The fragment extracts ``name`` from ``key`` as text (``->>``), casts it to
    ``num_type``, replaces NULL with 0, divides by ``10^6`` and aliases the
    result as ``name``.

    Parameters
    ----------
    key : name of the JSON column.
    name : field inside the JSON document; also used as the output alias.
    num_type : SQL type used in the CAST (default ``'FLOAT'``).
    """
    pattern = "COALESCE(CAST({column} ->> '{field}' AS {sql_type}), 0)/10^6 AS {field}"
    return pattern.format(column=key, field=name, sql_type=num_type)

def jnumTemplate(key, name, num_type = 'INT'):
    """Return the SELECT fragment for a numeric field stored inside a JSON column.

    The fragment extracts ``name`` from ``key`` as text (``->>``), casts it to
    ``num_type`` (default ``'INT'``), replaces NULL with 0 and aliases the
    result as ``name``.
    """
    pattern = "COALESCE(CAST({column} ->> '{field}' AS {sql_type}), 0) AS {field}"
    return pattern.format(column=key, field=name, sql_type=num_type)

def numTemplate(name):
    """Return the SELECT fragment for a plain numeric column, with NULL replaced by 0."""
    return "COALESCE({column}, 0) AS {column}".format(column=name)

def jstrTemplate(key, name):
    """Return the SELECT fragment that pulls text field ``name`` out of JSON column ``key``.

    The extracted value is aliased as ``name``; no NULL replacement is applied.
    """
    return "{column} ->> '{field}' AS {field}".format(column=key, field=name)

def strTemplate(name):
    """Return the SELECT fragment for a plain column: just the column name as text."""
    # format() with an empty spec renders the value the same way an f-string does
    return format(name)

def adwordsTemplate(name):
    """Return the SELECT fragment for one key of the nested adwordsClickInfo document.

    ``adwordsClickInfo`` is read as text from the ``trafficSource`` column, cast
    back to JSONB, and ``name`` is then extracted from it and used as the alias.
    """
    nested_doc = "CAST(trafficSource ->> 'adwordsClickInfo' AS JSONB)"
    return "{doc} ->> '{field}' AS {field}".format(doc=nested_doc, field=name)

In [116]:
def getQuery(dataset = 'train'):
    """Assemble the comma-separated SELECT column list for one dataset.

    Walks ``field_dict`` (JSON column -> sub-fields), skips every sub-field
    listed in ``useless_fields[dataset]`` and picks a template per sub-field:

    * ``transactionRevenue``  -> ``revTemplate`` with type ``NUMERIC``
    * known numeric sub-field -> ``jnumTemplate``
    * ``adwordsClickInfo``    -> one ``adwordsTemplate`` per key in
      ``adwordsClickInfo_keys``
    * anything else           -> ``jstrTemplate``

    Four top-level columns (visitNumber, channelGrouping, fullVisitorId,
    visitStartTime) are always appended at the end.

    Parameters
    ----------
    dataset : key into ``useless_fields``; ``'train'`` additionally counts
        ``transactionRevenue`` as a numeric sub-field.

    Returns
    -------
    str : the column expressions joined with ``', '``.
    """
    numeric_cols = ['visitNumber', 'newVisits', 'bounces', 'pageviews', 'visits', 'hits']
    if dataset == 'train':
        numeric_cols.append('transactionRevenue')

    columns = []
    for json_col, sub_fields in field_dict.items():
        for field in sub_fields:
            if field in useless_fields[dataset]:
                continue
            # NOTE(review): this test runs before the numeric-list test, so
            # transactionRevenue gets revTemplate for every dataset unless it
            # is listed in useless_fields[dataset] -- confirm that is intended.
            if field == 'transactionRevenue':
                columns.append(revTemplate(json_col, field, 'NUMERIC'))
            elif field in numeric_cols:
                columns.append(jnumTemplate(json_col, field))
            elif field == 'adwordsClickInfo':
                columns.extend(adwordsTemplate(sub_key) for sub_key in adwordsClickInfo_keys)
            else:
                columns.append(jstrTemplate(json_col, field))

    columns += [
        numTemplate('visitNumber'),
        strTemplate('channelGrouping'),
        strTemplate('fullVisitorId'),
        numTemplate('visitStartTime'),
    ]
    return ', '.join(columns)

In [122]:
# Build the SELECT column list once (dataset defaults to 'train');
# getUserData() interpolates this module-level string into every query.
qstring = getQuery()

In [24]:
# Load top_order; below, its first cell (row 0, column 0) is used as a visitor
# id: it is removed from the id lists and its rows are fetched into trial_df.
# Presumably this is the visitor with the largest order -- TODO confirm.
with open('../pickles/top_order.pkl', 'rb') as f:
    top_order = pickle.load(f)

In [165]:
def getUserData(user_list):
    """Fetch every ``train_data`` row belonging to the given visitors.

    Parameters
    ----------
    user_list : iterable of fullVisitorId strings (for example a slice of the
        lists returned by ``stratifiedIdSplit``).

    Returns
    -------
    pandas.DataFrame with the columns described by the module-level
    ``qstring``; ``visitstarttime`` is parsed as a date.

    NOTE(review): the ids are pasted into the SQL text without escaping, so
    this is only safe for trusted ids, and a very long ``user_list`` yields a
    very long statement.
    """
    quoted_ids = "'" + "', '".join(user_list) + "'"
    sql = (
        "\n    SELECT {}"
        "\n    FROM train_data"
        "\n    WHERE fullVisitorId IN ({})"
        "\n    "
    ).format(qstring, quoted_ids)

    return pd.read_sql_query(sql, engine, parse_dates=['visitstarttime'])

In [27]:
# Load the pickled `objects` value. It is not referenced again in the visible
# part of this notebook -- presumably kept for reference; TODO confirm or drop.
with open('../pickles/objects.pkl', 'rb') as f:
    objects = pickle.load(f)

In [30]:
# Peek at how field_vals is nested: dataset -> JSON column -> sub-field ->
# sequence of entries, where the first element of an entry is a field value.
field_vals['train']['device']['deviceCategory'][0][0]

'tablet'

By introducing all possible values of fields ahead of time for dummying, independent of whether they show up in the training set or not, we fail to simulate the fact that we have no idea whether we have captured all the features.  The categories included here, though, are fairly set in stone; there probably are not many subcontinents that have yet to appear in the store's history.

In [186]:
# Seconds per day (60 s * 60 min * 24 h); adjustCols() divides by this value
# to express time gaps between visits in days.
60*60*24

86400

In [232]:
def adjustCols(df, dataset='train'):
    """Add per-visitor history features and drop the id / timestamp columns.

    Rows are sorted by visitor and visit start time. For each of bounces,
    hits, newvisits and pageviews the value from the same visitor's previous
    visit (``<col>last``) and the one before that (``<col>two``) is added.
    ``sincelast`` / ``sincetwo`` hold the time gap in days to those visits,
    ``hour`` and ``weekday`` come from the visit start time. Where a visitor
    has no such earlier visit, the lag and gap columns are NaN.

    Parameters
    ----------
    df : DataFrame with ``fullvisitorid``, a datetime ``visitstarttime`` and
        the four source columns named above. It is not modified.
    dataset : accepted for interface compatibility; not used in the body.

    Returns
    -------
    A new DataFrame without ``fullvisitorid`` and ``visitstarttime``.
    """
    lag_sources = ('bounces', 'hits', 'newvisits', 'pageviews')

    def _as_days(gap):
        # whole days plus the leftover seconds expressed as a fraction of a day
        return gap.days + gap.seconds/86400

    # sort_values returns a new frame, so the caller's frame stays untouched
    df = df.sort_values(['fullvisitorid','visitstarttime'])

    for source in lag_sources:
        per_visitor = df.groupby('fullvisitorid')[source]
        df[source + 'last'] = per_visitor.shift(1)
        df[source + 'two'] = per_visitor.shift(2)

    visit_times = df.groupby('fullvisitorid')['visitstarttime']
    df['sincelast'] = visit_times.diff().map(_as_days)
    df['sincetwo'] = visit_times.diff(2).map(_as_days)
    df['hour'] = df['visitstarttime'].map(lambda stamp: stamp.hour)
    df['weekday'] = df['visitstarttime'].map(lambda stamp: stamp.dayofweek)

    return df.drop(columns=['fullvisitorid', 'visitstarttime'])

In [33]:
# Loads train_customer_ids from the same pickle as an identical cell earlier
# in the notebook; one of the two cells is redundant.
with open('../pickles/train_customer_ids.pkl', 'rb') as f:
    train_customer_ids = pickle.load(f)

In [34]:
# Load train_looker_ids; stratifiedIdSplit() samples from it separately from
# train_customer_ids. Presumably these are visitors who never purchased --
# TODO confirm against the notebook that wrote the pickle.
with open('../pickles/train_looker_ids.pkl', 'rb') as f:
    train_looker_ids = pickle.load(f)

In [35]:
#shuffle now so we can just iterate through lists
from random import shuffle

def stratifiedIdSplit(test_size=0.5, customer_ids=None, looker_ids=None, seed=None):
    """Split visitor ids into train and test lists, stratified by id group.

    The same fraction ``test_size`` is drawn without replacement from the
    customer ids and from the looker ids, so both groups are represented in
    the same proportion on each side of the split.

    Parameters
    ----------
    test_size : fraction of each group that goes to the test list; the count
        is ``int(len(group) * test_size)``.
    customer_ids, looker_ids : optional single-column frames of ids. When
        omitted, the module-level ``train_customer_ids`` / ``train_looker_ids``
        are used.
    seed : optional integer. With a seed, the sampling and the final shuffles
        are reproducible; without one, numpy's global random state is used.

    Returns
    -------
    (train_ids, test_ids) : two shuffled lists, in that order.
    """
    customer_source = train_customer_ids if customer_ids is None else customer_ids
    looker_source = train_looker_ids if looker_ids is None else looker_ids
    rng = np.random if seed is None else np.random.RandomState(seed)

    def _split_group(id_frame):
        # first column of the frame as a flat array of ids
        pool = id_frame.T.values[0]
        n_test = int(len(id_frame)*test_size)
        held_out = list(rng.choice(pool, replace=False, size=n_test))
        held_out_set = set(held_out)
        # De-duplicate while keeping input order. A plain set difference would
        # hand the shuffle an arbitrary order, which can change between
        # interpreter runs and would defeat the seed.
        kept = [uid for uid in dict.fromkeys(pool) if uid not in held_out_set]
        return kept, held_out

    train_customers, test_customers = _split_group(customer_source)
    train_lookers, test_lookers = _split_group(looker_source)

    test_customers.extend(test_lookers)
    train_customers.extend(train_lookers)

    # mix the two groups so that consecutive slices contain both
    rng.shuffle(test_customers)
    rng.shuffle(train_customers)

    return train_customers, test_customers

In [36]:
# Hold out 30% of each id group. Note the return order is (train, test).
# The split is random and no seed is fixed here, so the ids differ between runs.
train_ids, test_ids = stratifiedIdSplit(0.3)

In [315]:
# Take the visitor in the first cell of top_order out of whichever id list
# holds it. Explicit membership checks replace the bare try/except: an
# unrelated error is no longer swallowed, and re-running the cell after the
# id has already been removed is a no-op instead of raising ValueError.
top_customer_id = top_order.iloc[0, 0]
for id_list in (train_ids, test_ids):
    if top_customer_id in id_list:
        id_list.remove(top_customer_id)
        break

In [115]:
# Sanity check: fetch one visitor's rows and list the resulting column names
# (they come back lower-cased). NOTE(review): the recorded output lacks
# 'fullvisitorid', which the current getQuery() selects, so it appears to come
# from an earlier version of the query -- re-run to refresh.
getUserData([test_ids[0]]).columns

Index(['devicecategory', 'ismobile', 'browser', 'operatingsystem', 'city',
       'continent', 'country', 'metro', 'networkdomain', 'region',
       'subcontinent', 'bounces', 'hits', 'newvisits', 'pageviews',
       'transactionrevenue', 'adcontent', 'adnetworktype',
       'criteriaparameters', 'gclid', 'isvideoad', 'page', 'slot',
       'targetingcriteria', 'campaign', 'campaigncode', 'istruedirect',
       'keyword', 'medium', 'referralpath', 'source', 'visitnumber',
       'channelgrouping', 'visitstarttime'],
      dtype='object')

In [333]:
def createChunk(ids_list, size=10000):
    """Draw a random batch of visitors and return their prepared rows.

    Up to ``size`` ids are picked at random from ``ids_list``, their rows are
    fetched with ``getUserData`` and passed through ``adjustCols``.

    Parameters
    ----------
    ids_list : list of visitor ids to draw from. It is not modified; the
        shuffle works on a copy.
    size : maximum number of visitors (not rows) in the batch.

    Returns
    -------
    (chunk, remaining_ids) : the prepared DataFrame with missing values
    replaced by 0, and the list of ids that were not used for this batch.
    """
    batch_size = min(len(ids_list), size)
    # Shuffle a copy: shuffling ids_list itself would silently reorder the
    # caller's list through the shared reference.
    shuffled_ids = list(ids_list)
    shuffle(shuffled_ids)

    chunk = adjustCols(getUserData(shuffled_ids[:batch_size]))

    return chunk.fillna(0), shuffled_ids[batch_size:]

In [233]:
# Feature frame for the single visitor in the first cell of top_order, without
# the target column. It serves as the column-layout reference for cat_feets and
# as prediction input later on. .iloc[0, 0] selects that cell by position,
# matching the lookup used when removing this id from the id lists; the
# previous .iloc[0][0] relied on integer keys falling back to positions on a
# label-indexed Series, which pandas has deprecated.
trial_df = adjustCols(getUserData([top_order.iloc[0, 0]])).drop('transactionrevenue', axis=1)

In [263]:
# Positional indices of the object-dtype columns of trial_df; handed to
# CatBoost as cat_features. The positions are only valid for frames with the
# same column order as trial_df (i.e. with transactionrevenue dropped).
cat_feets = np.where(trial_df.dtypes == object)[0]

In [256]:
# Names of the columns that will be treated as categorical features.
trial_df.columns[cat_feets]

Index(['devicecategory', 'ismobile', 'browser', 'operatingsystem', 'city',
       'continent', 'country', 'metro', 'networkdomain', 'region',
       'subcontinent', 'adcontent', 'adnetworktype', 'criteriaparameters',
       'gclid', 'isvideoad', 'page', 'slot', 'targetingcriteria', 'campaign',
       'campaigncode', 'istruedirect', 'keyword', 'medium', 'referralpath',
       'source', 'channelgrouping'],
      dtype='object')

Instead of training a set of trees and losing most of the trees, let's make an ensemble.  We can blend it and fit to the actual target at the same time.

In [338]:
# Train three CatBoost regressors, each on a different batch of training
# visitors, with early stopping against a random batch of held-out visitors.
models = []
# Work on a copy: id_holder is consumed batch by batch, train_ids stays intact.
id_holder = list(train_ids)

for n in range(3):
    if not id_holder:
        # every training visitor has been used; an empty batch cannot be fitted
        break

    # Fit on a local name and append only afterwards, so `models` never holds
    # an unfitted regressor if data loading or fitting fails part-way.
    model = cb.CatBoostRegressor(iterations = 500,
                                 learning_rate  = .01,
                                 l2_leaf_reg = 100,
                                 cat_features = cat_feets,
                                 verbose = True)

    # random sample of up to 10000 held-out visitors, redrawn for every model
    eval_chunk = createChunk(test_ids, size=10000)[0]
    eval_chunk = cb.Pool(eval_chunk.drop('transactionrevenue', axis=1),
                         eval_chunk.transactionrevenue,
                         cat_features=cat_feets)
    # up to 200000 training visitors that no earlier model has seen
    chunk, id_holder = createChunk(id_holder, 200000)

    model.fit(X = chunk.drop('transactionrevenue', axis=1),
              y = chunk.transactionrevenue,
              use_best_model = True,
              eval_set = eval_chunk,
              early_stopping_rounds = 10,
              metric_period = 50)
    models.append(model)



0:	learn: 48.5402163	test: 18.1288499	best: 18.1288499 (0)	total: 1.53s	remaining: 12m 45s
50:	learn: 48.0358898	test: 17.4573471	best: 17.4573471 (50)	total: 50.1s	remaining: 7m 20s
100:	learn: 47.7134432	test: 17.1657098	best: 17.1657098 (100)	total: 1m 46s	remaining: 7m
150:	learn: 47.4278259	test: 16.9622723	best: 16.9622723 (150)	total: 2m 40s	remaining: 6m 10s
200:	learn: 47.2177783	test: 16.8770005	best: 16.8770005 (200)	total: 3m 35s	remaining: 5m 19s
250:	learn: 47.0499854	test: 16.8358820	best: 16.8358820 (250)	total: 4m 30s	remaining: 4m 28s
300:	learn: 46.8798508	test: 16.7676452	best: 16.7676452 (300)	total: 5m 20s	remaining: 3m 31s
Stopped by overfitting detector  (10 iterations wait)

bestTest = 16.74211914
bestIteration = 330

Shrink model to first 331 iterations.




0:	learn: 31.4606848	test: 36.2361573	best: 36.2361573 (0)	total: 1.39s	remaining: 11m 34s
50:	learn: 30.7259039	test: 35.5534834	best: 35.5534834 (50)	total: 54.9s	remaining: 8m 3s
100:	learn: 30.3572606	test: 35.2246750	best: 35.2246750 (100)	total: 1m 53s	remaining: 7m 27s
150:	learn: 30.1465167	test: 35.0415525	best: 35.0415525 (150)	total: 2m 50s	remaining: 6m 33s
200:	learn: 30.0112226	test: 34.9168558	best: 34.9168558 (200)	total: 3m 46s	remaining: 5m 37s
250:	learn: 29.9031159	test: 34.8341956	best: 34.8341956 (250)	total: 4m 43s	remaining: 4m 41s
300:	learn: 29.8244846	test: 34.7709446	best: 34.7709446 (300)	total: 5m 38s	remaining: 3m 43s
350:	learn: 29.7616933	test: 34.7300594	best: 34.7300594 (350)	total: 6m 32s	remaining: 2m 46s
400:	learn: 29.7105514	test: 34.6933535	best: 34.6933535 (400)	total: 7m 28s	remaining: 1m 50s
450:	learn: 29.6683074	test: 34.6665705	best: 34.6665469 (449)	total: 8m 25s	remaining: 54.9s
499:	learn: 29.6315791	test: 34.6398282	best: 34.6398282 (4



0:	learn: 53.6561854	test: 31.4136747	best: 31.4136747 (0)	total: 647ms	remaining: 5m 22s
50:	learn: 53.2021717	test: 30.9177321	best: 30.9177321 (50)	total: 29.9s	remaining: 4m 23s
100:	learn: 52.9469816	test: 30.6596638	best: 30.6596638 (100)	total: 59.5s	remaining: 3m 55s
150:	learn: 52.7502911	test: 30.4948680	best: 30.4948680 (150)	total: 1m 28s	remaining: 3m 25s
200:	learn: 52.6244472	test: 30.3985902	best: 30.3985902 (200)	total: 1m 58s	remaining: 2m 56s
250:	learn: 52.4885832	test: 30.2997999	best: 30.2997999 (250)	total: 2m 28s	remaining: 2m 27s
300:	learn: 52.3829378	test: 30.2234138	best: 30.2234138 (300)	total: 2m 58s	remaining: 1m 57s
350:	learn: 52.2801877	test: 30.1486081	best: 30.1486081 (350)	total: 3m 23s	remaining: 1m 26s
400:	learn: 52.1974426	test: 30.0949433	best: 30.0949433 (400)	total: 3m 51s	remaining: 57.2s
450:	learn: 52.1229274	test: 30.0589655	best: 30.0589655 (450)	total: 4m 21s	remaining: 28.4s
499:	learn: 52.0710153	test: 30.0376473	best: 30.0374015 (497

In [339]:
# Persist the list of fitted models as a single pickle. The ../models
# directory has to exist already; open() does not create it.
with open('../models/cb_stage1.pkl', 'wb') as f:
    pickle.dump(models, f)

In [343]:
# Shorthand for the first fitted model, used for the spot check below.
a = models[0]

In [349]:
# Spot check: predict for every visit of the single visitor in trial_df.
# .values passes a bare array, so features are matched by column position only.
a.predict(trial_df.fillna(0).values)

array([  1.43887379,   2.09041463,   2.13443144,  30.63082051,
         6.14186948,   4.92549653,   3.26583912,  12.23771665,
         3.64188799,   2.96331654,   2.39309946,   2.891769  ,
        55.88019241,  54.80778874,   6.76576579,   4.83453945,
        14.01917674,  43.51965988,   4.4404235 ,   3.9145384 ,
         1.97714659,   2.25224988,   1.97622336,   6.20713324,
         3.06626783,   3.20466875,   2.88152807,   2.59649341,
         2.26739905,   2.40105429,   2.40343054,  21.30958303,
        41.68076616,   4.51447914,   4.06670563,   2.64752226,
         2.25843178,   2.10092599,   2.29311546,   2.43263089,
         3.5717793 ,   2.11428051,  28.21907233,   3.09908691,
         3.03800652,   2.11943979,   2.38327876,   5.55381907,
         3.03711559,   2.79859438,   2.17268498,   2.13170997,
         2.45694892,   2.96278594,   3.25751395,   2.40100831,
         2.33981821,   2.3718854 ,   2.31465942,  84.6293663 ,
         3.57374416,   5.53296242,   2.94804094,   3.02

In [346]:
# Feature importances of the second model. Presumably one value per input
# column, in the column order of trial_df -- TODO confirm and label with names.
models[1].feature_importances_

array([4.85108456e-02, 7.25119994e-02, 4.55854392e-02, 1.21623825e+00,
       4.15575644e-01, 1.41576503e-01, 5.90191617e+00, 1.01879190e+00,
       0.00000000e+00, 3.52872296e-01, 1.63046754e+00, 0.00000000e+00,
       2.20779925e+00, 6.87047479e-01, 4.76432251e+01, 2.82564728e-03,
       1.99394295e-02, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 6.20280024e-05, 0.00000000e+00, 2.24364599e-02,
       0.00000000e+00, 2.14863939e-01, 3.04441799e-04, 1.12960637e+00,
       1.04691732e-01, 5.08616319e+00, 9.59868810e+00, 3.00888174e+00,
       9.10760339e-02, 2.47730159e-02, 2.85645934e+00, 1.24323187e+00,
       5.49018910e-02, 8.44569499e-02, 2.11521717e+00, 1.04423630e+00,
       5.54662693e+00, 4.13838316e+00, 1.44773965e+00, 7.82316235e-01])

In [347]:
# Feature importances of the third model, for comparison with the second.
# Presumably in the column order of trial_df -- TODO confirm.
models[2].feature_importances_

array([4.66664420e-01, 0.00000000e+00, 1.44410648e-01, 2.17275747e+00,
       4.08309547e-01, 4.49246639e-01, 2.50887662e+00, 1.02677038e+00,
       0.00000000e+00, 9.56556733e-02, 2.30852154e-01, 0.00000000e+00,
       1.72013310e+01, 4.46517524e-01, 3.71837715e+01, 8.74014945e-02,
       3.48476501e-02, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 1.22320724e-02, 0.00000000e+00, 2.00483268e-02,
       0.00000000e+00, 7.37575857e-01, 6.13022456e-02, 2.60513432e+00,
       5.12790129e-02, 5.23423000e+00, 5.66812443e+00, 9.85837553e-01,
       1.45774428e-01, 2.25418348e-03, 3.13644883e+00, 1.14502693e+00,
       9.25309034e-02, 1.77953812e-02, 3.82732536e+00, 1.18899612e+00,
       4.30960873e+00, 1.82336846e+00, 4.25300268e+00, 2.22469147e+00])