# Transaction Level Models

While it was expected that aggregating features in a somewhat arbitrary manner would result in a loss of information, it was not expected that such an approach would fail to beat even the baseline score of guessing only zeros.

In [135]:
import pandas as pd
import sqlalchemy
import numpy as np
import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

from lightgbm import LGBMRegressor
import catboost as cb

from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.callbacks import EarlyStopping

%matplotlib inline

In [6]:
# Connection to the PostgreSQL database that holds the training data.
# The URL is taken from the TRAINING_DB_URL environment variable when it is set, so
# credentials do not have to live in the notebook.
# FIXME: the fallback below still embeds a username and password; remove it (and
# rotate that password) once TRAINING_DB_URL is configured wherever this runs.
import os

engine = sqlalchemy.create_engine(
    os.environ.get('TRAINING_DB_URL',
                   'postgresql://romandtse:duckthewut@localhost:5432/training')
)

## Column Queries

We format columns to insert into our query in this section.  These will include:
- Sum of the visit numbers (inspired by previous analysis)
- Sum of page views, assumed the more intuitive alternative over hits
- Bounce rate, at least to rule out those with a bounce rate of 100%

Let's try brute forcing our way through modeling with user level aggregation features.  First, we remind ourselves of the types involved.

In [29]:
def _read_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    NOTE: unpickling executes arbitrary code, so only point this at files
    produced by this project.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Field metadata prepared by earlier notebooks.
field_dict = _read_pickle('../pickles/field_dict.pkl')
useless_fields = _read_pickle('../pickles/useless_fields.pkl')
adwordsClickInfo_keys = _read_pickle('../pickles/adwordsClickInfo_keys.pkl')
channel_groups = _read_pickle('../pickles/channel_groups.pkl')
field_vals = _read_pickle('../pickles/field_vals.pkl')

In [11]:
# Load the pickled customer ids; stratifiedIdSplit later reads them through
# `train_customer_ids.T.values[0]`.
with open('../pickles/train_customer_ids.pkl', 'rb') as f:
    train_customer_ids = pickle.load(f)

In [13]:
def revTemplate(key, name, num_type = 'FLOAT'):
    """Build the SELECT expression for a revenue value stored inside a JSON column.

    The value at ``key ->> name`` is cast to ``num_type``, missing values become 0,
    and the result is divided by 10^6 and aliased as ``name``.
    """
    cast_expr = "CAST({} ->> '{}' AS {})".format(key, name, num_type)
    return "COALESCE({}, 0)/10^6 AS {}".format(cast_expr, name)

def jnumTemplate(key, name, num_type = 'INT'):
    """Build the SELECT expression for a numeric value stored inside a JSON column.

    The value at ``key ->> name`` is cast to ``num_type``, missing values become 0,
    and the result is aliased as ``name``.
    """
    cast_expr = "CAST({} ->> '{}' AS {})".format(key, name, num_type)
    return "COALESCE({}, 0) AS {}".format(cast_expr, name)

def numTemplate(name):
    """Build the SELECT expression for a plain numeric column, with NULL replaced by 0."""
    return "COALESCE({0}, 0) AS {0}".format(name)

def jstrTemplate(key, name):
    """Build the SELECT expression that extracts ``name`` as text from the JSON column ``key``."""
    return "{} ->> '{}' AS {}".format(key, name, name)

def strTemplate(name):
    """Build the SELECT expression for a plain column: just the formatted column name."""
    return "{}".format(name)

def adwordsTemplate(name):
    """Build the SELECT expression for one key of the nested adwordsClickInfo JSON.

    ``trafficSource ->> 'adwordsClickInfo'`` is cast to JSONB, the ``name`` key is
    extracted as text, and the result is aliased as ``name``.
    """
    nested_json = "CAST(trafficSource ->> 'adwordsClickInfo' AS JSONB)"
    return "{} ->> '{}' AS {}".format(nested_json, name, name)

In [116]:
def getQuery(dataset = 'train'):
    """Assemble the comma-separated SELECT column list for ``dataset``.

    Walks every (JSON column, sub-field) pair in ``field_dict``, skips sub-fields
    listed in ``useless_fields[dataset]``, and picks a template per sub-field:

    * ``transactionRevenue``  -> ``revTemplate`` with a NUMERIC cast
    * known numeric sub-field -> ``jnumTemplate``
    * ``adwordsClickInfo``    -> one ``adwordsTemplate`` per key in ``adwordsClickInfo_keys``
    * anything else           -> ``jstrTemplate``

    The plain columns visitNumber, channelGrouping, fullVisitorId and
    visitStartTime are appended at the end.
    """
    json_numeric = ['visitNumber', 'newVisits', 'bounces', 'pageviews', 'visits', 'hits']
    if dataset == 'train':
        json_numeric.append('transactionRevenue')

    columns = []
    for parent, children in field_dict.items():
        for child in children:
            if child in useless_fields[dataset]:
                continue
            # The revenue check comes first, so it applies whatever the dataset is.
            if child == 'transactionRevenue':
                columns.append(revTemplate(parent, child, 'NUMERIC'))
            elif child in json_numeric:
                columns.append(jnumTemplate(parent, child))
            elif child == 'adwordsClickInfo':
                columns.extend(adwordsTemplate(key) for key in adwordsClickInfo_keys)
            else:
                columns.append(jstrTemplate(parent, child))

    # Top-level (non-JSON) columns always go last.
    columns.append(numTemplate('visitNumber'))
    columns.append(strTemplate('channelGrouping'))
    columns.append(strTemplate('fullVisitorId'))
    columns.append(numTemplate('visitStartTime'))
    return ', '.join(columns)

In [123]:
# Display the generated SELECT column list.
# NOTE(review): in file order this cell comes before the `qstring = getQuery()` cell,
# so a fresh Restart & Run All raises NameError here.
qstring

"device ->> 'deviceCategory' AS deviceCategory, device ->> 'isMobile' AS isMobile, device ->> 'browser' AS browser, device ->> 'operatingSystem' AS operatingSystem, geoNetwork ->> 'city' AS city, geoNetwork ->> 'continent' AS continent, geoNetwork ->> 'country' AS country, geoNetwork ->> 'metro' AS metro, geoNetwork ->> 'networkDomain' AS networkDomain, geoNetwork ->> 'region' AS region, geoNetwork ->> 'subContinent' AS subContinent, COALESCE(CAST(totals ->> 'bounces' AS INT), 0) AS bounces, COALESCE(CAST(totals ->> 'hits' AS INT), 0) AS hits, COALESCE(CAST(totals ->> 'newVisits' AS INT), 0) AS newVisits, COALESCE(CAST(totals ->> 'pageviews' AS INT), 0) AS pageviews, COALESCE(CAST(totals ->> 'transactionRevenue' AS NUMERIC), 0)/10^6 AS transactionRevenue, trafficsource ->> 'adContent' AS adContent, CAST(trafficSource ->> 'adwordsClickInfo' AS JSONB) ->> 'adNetworkType' AS adNetworkType, CAST(trafficSource ->> 'adwordsClickInfo' AS JSONB) ->> 'criteriaParameters' AS criteriaParameters, 

In [122]:
# Build the SELECT column list once (getQuery defaults to dataset='train');
# getUserData interpolates it into every query.
qstring = getQuery()

In [24]:
# Load the pickled top_order object; its first entry (`top_order.iloc[0][0]`) is used
# further down as the visitor id for a one-user trial query.
with open('../pickles/top_order.pkl', 'rb') as f:
    top_order = pickle.load(f)

In [165]:
def getUserData(user_list):
    """Fetch every ``train_data`` row that belongs to the given visitors.

    Parameters
    ----------
    user_list : iterable of str
        fullVisitorId values to select.

    Returns
    -------
    pandas.DataFrame
        One row per matching visit, with the columns named in ``qstring`` and
        ``visitstarttime`` parsed as a date.
    """
    # An empty id list used to render as IN (''); keep that exact behaviour.
    user_ids = [str(user) for user in user_list] or ['']

    # Bind the ids as parameters instead of splicing them into the SQL text, so an id
    # containing a quote can neither break nor alter the statement.
    query = sqlalchemy.text(f"""
    SELECT {qstring}
    FROM train_data
    WHERE fullVisitorId IN :users
    """).bindparams(sqlalchemy.bindparam('users', expanding=True))

    return pd.read_sql_query(query, engine,
                             params={'users': user_ids},
                             parse_dates=['visitstarttime'])

In [27]:
# Load the pickled `objects` value.
# NOTE(review): nothing in the visible part of the notebook reads `objects` afterwards.
with open('../pickles/objects.pkl', 'rb') as f:
    objects = pickle.load(f)

In [30]:
# Peek at the first entry recorded for train / device / deviceCategory.
field_vals['train']['device']['deviceCategory'][0][0]

'tablet'

By introducing all possible values of fields ahead of time for dummying, independent of whether they show up in the training set or not, we fail to simulate the fact that we have no idea whether we have captured all the features.  The categories included here, though, are fairly set in stone; there probably are not many sub continents that have yet to appear in the store's history.

In [186]:
# Seconds in one day: the divisor adjustCols uses to express gaps between visits in days.
60*60*24

86400

In [232]:
def adjustCols(df, dataset='train'):
    """Add per-visitor lag and timing features, then drop the id and timestamp columns.

    Rows are ordered by ``fullvisitorid`` and ``visitstarttime`` so that every
    shift/diff below looks back at the same visitor's earlier visits.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``fullvisitorid``, a datetime-typed ``visitstarttime`` and the
        columns ``bounces``, ``hits``, ``newvisits`` and ``pageviews``.
    dataset : str
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        A sorted copy of ``df`` (the input is not modified) with these columns added:
        ``<col>last`` / ``<col>two`` (value one / two visits earlier for each
        back-looking column), ``sincelast`` / ``sincetwo`` (days since the visit one /
        two steps back), ``hour`` and ``weekday``. ``fullvisitorid`` and
        ``visitstarttime`` are removed. Visits with no earlier visit to look back at
        get NaN in the lag and gap columns.
    """
    back_looking = ['bounces', 'hits', 'newvisits', 'pageviews']
    seconds_per_day = 60 * 60 * 24

    df = df.sort_values(['fullvisitorid', 'visitstarttime'])
    by_visitor = df.groupby('fullvisitorid')

    for col in back_looking:
        df[f'{col}last'] = by_visitor[col].shift(1)
        df[f'{col}two'] = by_visitor[col].shift(2)

    # Gap to the visit one and two steps back, in (fractional) days.
    for name, periods in (('sincelast', 1), ('sincetwo', 2)):
        gap = by_visitor['visitstarttime'].diff(periods)
        df[name] = gap.dt.days + gap.dt.seconds / seconds_per_day

    df['hour'] = df['visitstarttime'].dt.hour
    df['weekday'] = df['visitstarttime'].dt.dayofweek

    return df.drop(['fullvisitorid', 'visitstarttime'], axis=1)

In [33]:
# NOTE(review): this repeats the earlier cell that already loads train_customer_ids
# from the same pickle.
with open('../pickles/train_customer_ids.pkl', 'rb') as f:
    train_customer_ids = pickle.load(f)

In [34]:
# Load the pickled looker ids; stratifiedIdSplit reads them through
# `train_looker_ids.T.values[0]`.
with open('../pickles/train_looker_ids.pkl', 'rb') as f:
    train_looker_ids = pickle.load(f)

In [35]:
#shuffle now so we can just iterate through lists
from random import shuffle

def stratifiedIdSplit(test_size=0.5, seed=None):
    """Split visitor ids into train and test lists, stratified by customer vs. looker.

    The same fraction ``test_size`` is drawn (without replacement) from
    ``train_customer_ids`` and from ``train_looker_ids``. The two groups are then
    merged and shuffled within each split.

    Assumes both id containers expose their ids as ``.T.values[0]`` (e.g. a
    single-column DataFrame) - TODO confirm against the code that writes the pickles.

    Parameters
    ----------
    test_size : float
        Fraction of each group that goes to the test split.
    seed : int, optional
        When given, the split is reproducible: all draws and shuffles come from one
        ``np.random.RandomState(seed)``. When omitted, the global ``np.random`` and
        ``random`` generators are used, as before.

    Returns
    -------
    (list, list)
        ``(train_ids, test_ids)``.
    """
    customer_ids = train_customer_ids.T.values[0]
    looker_ids = train_looker_ids.T.values[0]
    customer_size = int(len(train_customer_ids)*test_size)
    looker_size = int(len(train_looker_ids)*test_size)

    if seed is None:
        choose, mix, as_list = np.random.choice, shuffle, list
    else:
        rng = np.random.RandomState(seed)
        # Set iteration order is not guaranteed to repeat between runs, so sort the
        # leftovers first to make the seeded shuffle deterministic.
        choose, mix, as_list = rng.choice, rng.shuffle, sorted

    test_customers = list(choose(customer_ids, replace=False, size=customer_size))
    test_lookers = list(choose(looker_ids, replace=False, size=looker_size))

    # Whatever was not drawn for the test split becomes training data.
    train_customers = as_list(set(customer_ids).difference(set(test_customers)))
    train_lookers = as_list(set(looker_ids).difference(set(test_lookers)))

    test_customers.extend(test_lookers)
    train_customers.extend(train_lookers)

    mix(test_customers)
    mix(train_customers)

    return train_customers, test_customers

In [36]:
# Hold out 30% of the customer ids and 30% of the looker ids as the test split.
train_ids, test_ids = stratifiedIdSplit(0.3)

Now that we have the ids, we must train our model.  We will do this in chunks, sampling with replacement for now.  We will also pull from customers and non-customers at an equal pace, so that we see as many customers with transactions as possible.

In [57]:
# NOTE(review): scratch value; nothing else in the visible notebook reads `a`.
a = [1,2,3,4,5]

In [115]:
# Inspect the column names returned when querying a single test-split visitor.
getUserData([test_ids[0]]).columns

Index(['devicecategory', 'ismobile', 'browser', 'operatingsystem', 'city',
       'continent', 'country', 'metro', 'networkdomain', 'region',
       'subcontinent', 'bounces', 'hits', 'newvisits', 'pageviews',
       'transactionrevenue', 'adcontent', 'adnetworktype',
       'criteriaparameters', 'gclid', 'isvideoad', 'page', 'slot',
       'targetingcriteria', 'campaign', 'campaigncode', 'istruedirect',
       'keyword', 'medium', 'referralpath', 'source', 'visitnumber',
       'channelgrouping', 'visitstarttime'],
      dtype='object')

In [130]:
def createChunk(ids_list, size=10000):
    """Build the feature frame for the next batch of visitor ids.

    Takes up to ``size`` ids from the front of ``ids_list``, queries their visits
    with ``getUserData`` and derives features with ``adjustCols``.

    Returns
    -------
    (pandas.DataFrame, list)
        The batch's features with missing values filled with 0, and the ids that
        were not consumed by this batch.
    """
    cutoff = min(len(ids_list), size)
    batch_ids, remaining_ids = ids_list[:cutoff], ids_list[cutoff:]
    features = adjustCols(getUserData(batch_ids))
    return features.fillna(0), remaining_ids

In [233]:
# One-visitor feature frame (visitor id taken from the first entry of top_order) with
# the target column removed; used below to find which columns are categorical.
trial_df = adjustCols(getUserData([top_order.iloc[0][0]])).drop('transactionrevenue', axis=1)

In [263]:
# Positional indices of the object-dtype columns in trial_df; passed to CatBoost as
# cat_features.
cat_feets = np.where(trial_df.dtypes == object)[0]

In [264]:
# Show the categorical column positions.
cat_feets

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 15, 16, 17, 18, 19, 20,
       21, 22, 23, 24, 25, 26, 27, 28, 29, 31])

In [256]:
# Map the positions back to column names as a sanity check.
trial_df.columns[cat_feets]

Index(['devicecategory', 'ismobile', 'browser', 'operatingsystem', 'city',
       'continent', 'country', 'metro', 'networkdomain', 'region',
       'subcontinent', 'adcontent', 'adnetworktype', 'criteriaparameters',
       'gclid', 'isvideoad', 'page', 'slot', 'targetingcriteria', 'campaign',
       'campaigncode', 'istruedirect', 'keyword', 'medium', 'referralpath',
       'source', 'channelgrouping'],
      dtype='object')

In [274]:
# CatBoost regressor for transaction revenue. The columns at the positions in
# cat_feets are declared categorical, and per-iteration progress is printed.
# NOTE(review): no random seed is passed here.
cboost = cb.CatBoostRegressor(iterations = 10,
                              l2_leaf_reg = 1, 
                              learning_rate  = .1,
                              cat_features = cat_feets,
                              verbose = True)

In [275]:
# Fit the regressor repeatedly on the first chunk of training ids, feeding each
# round's predictions back into the pool as the baseline for the next fit.
id_holder = train_ids
chunk, id_holder = createChunk(id_holder)
pool = cb.Pool(data=chunk.drop('transactionrevenue', axis=1), 
               label=chunk.transactionrevenue, 
               cat_features = cat_feets)
i = 0
for n in range(10):
    cboost.fit(pool)
    # predict() needs the data to score; calling it with no argument raises TypeError.
    # NOTE(review): confirm whether these predictions already include the pool's
    # current baseline. If they do not, the baseline should be accumulated across
    # rounds rather than replaced.
    baseline = cboost.predict(pool)
#     chunk, id_holder = createChunk(id_holder)
#     pool = cb.Pool(data=chunk.drop('transactionrevenue', axis=1), 
#                    label=chunk.transactionrevenue, 
#                    cat_features = cat_feets)
    pool.set_baseline(baseline)
    i += 1
    #if i%10 == 0:
        

0:	learn: 27.5759373	total: 50.9ms	remaining: 458ms
1:	learn: 27.3685531	total: 147ms	remaining: 589ms
2:	learn: 27.0307578	total: 242ms	remaining: 564ms
3:	learn: 26.8856374	total: 337ms	remaining: 505ms
4:	learn: 26.6856148	total: 420ms	remaining: 420ms
5:	learn: 26.4814091	total: 507ms	remaining: 338ms
6:	learn: 26.3702799	total: 603ms	remaining: 258ms
7:	learn: 25.6702139	total: 693ms	remaining: 173ms
8:	learn: 25.5263265	total: 779ms	remaining: 86.6ms
9:	learn: 25.3824464	total: 886ms	remaining: 0us
0:	learn: 27.4239436	total: 78.7ms	remaining: 709ms
1:	learn: 27.1791703	total: 170ms	remaining: 678ms
2:	learn: 26.9681669	total: 264ms	remaining: 616ms
3:	learn: 26.5714883	total: 344ms	remaining: 516ms
4:	learn: 26.4211970	total: 450ms	remaining: 450ms
5:	learn: 26.1505852	total: 537ms	remaining: 358ms
6:	learn: 25.9342168	total: 641ms	remaining: 275ms
7:	learn: 25.7092398	total: 730ms	remaining: 182ms
8:	learn: 25.5667712	total: 837ms	remaining: 93ms
9:	learn: 25.4175490	total: 880