# Transaction Level Models

While we expected that aggregating features in a somewhat arbitrary manner would lose information, we did not expect that approach to fail to beat even the baseline score of guessing only zeros.

In [9]:
import os
import pickle

import numpy as np
import pandas as pd
import sqlalchemy

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

from lightgbm import LGBMRegressor

from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.callbacks import EarlyStopping

%matplotlib inline

In [6]:
# The connection URL is read from the environment so that no username or
# password is stored in the notebook, e.g.
#   export TRAINING_DB_URL='postgresql://user:password@localhost:5432/training'
# When the variable is unset we fall back to the local `training` database
# with no embedded credentials.
engine = sqlalchemy.create_engine(
    os.environ.get('TRAINING_DB_URL', 'postgresql://localhost:5432/training')
)

## Column Queries

We format columns to insert into our query in this section.  These will include:
- Sum of the visit numbers (inspired by previous analysis)
- Sum of page views, assumed the more intuitive alternative over hits
- Bounce rate, at least to rule out those with a bounce rate of 100%

Let's try brute-forcing our way through modeling with user-level aggregation features.  First, we remind ourselves of the types involved.

In [29]:
def _unpickle(path):
    """Return the object stored in the pickle file at ``path``.

    NOTE: unpickling can execute arbitrary code, so only load files that
    were produced by this project.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Lookup tables prepared ahead of time and consumed by the query and
# feature-building helpers further down.
field_dict = _unpickle('../pickles/field_dict.pkl')
useless_fields = _unpickle('../pickles/useless_fields.pkl')
adwordsClickInfo_keys = _unpickle('../pickles/adwordsClickInfo_keys.pkl')
channel_groups = _unpickle('../pickles/channel_groups.pkl')
field_vals = _unpickle('../pickles/field_vals.pkl')

In [11]:
# Ids that stratifiedIdSplit later treats as the "customer" stratum.
with open('../pickles/train_customer_ids.pkl', mode='rb') as pickle_file:
    train_customer_ids = pickle.load(pickle_file)

In [13]:
def revTemplate(key, name, num_type = 'FLOAT'):
    """Build the SELECT fragment for a revenue value stored inside a JSON column.

    The text at ``key ->> 'name'`` is cast to ``num_type``, NULL is replaced
    by 0, and the result is divided by 10^6 and aliased as ``name``.
    """
    pattern = "COALESCE(CAST({0} ->> '{1}' AS {2}), 0)/10^6 AS {1}"
    return pattern.format(key, name, num_type)

def jnumTemplate(key, name, num_type = 'INT'):
    """Build the SELECT fragment for a numeric value stored inside a JSON column.

    The text at ``key ->> 'name'`` is cast to ``num_type``; NULL becomes 0.
    The resulting column is aliased as ``name``.
    """
    json_text = f"{key} ->> '{name}'"
    as_number = f"CAST({json_text} AS {num_type})"
    return f"COALESCE({as_number}, 0) AS {name}"

def numTemplate(name):
    """Build the SELECT fragment for a plain numeric column, mapping NULL to 0."""
    return "COALESCE({col}, 0) AS {col}".format(col=name)

def jstrTemplate(key, name):
    """Build the SELECT fragment that pulls ``name`` as text out of JSON column ``key``."""
    return "{} ->> '{}' AS {}".format(key, name, name)

def strTemplate(name):
    """Return the SELECT fragment for a plain column: just its name, formatted as a string."""
    return format(name)

def adwordsTemplate(name):
    """Build the SELECT fragment for one key of the nested ``adwordsClickInfo`` object.

    ``adwordsClickInfo`` is read as text from ``trafficSource``, cast back to
    JSONB, and then ``name`` is extracted as text and aliased as ``name``.
    """
    click_info = "CAST(trafficSource ->> 'adwordsClickInfo' AS JSONB)"
    return "{0} ->> '{1}' AS {1}".format(click_info, name)

In [14]:
def getQuery(dataset = 'train'):
    """Assemble the comma-separated SELECT list for ``dataset``.

    Every sub-field in ``field_dict`` that is not listed in
    ``useless_fields[dataset]`` contributes one column expression (or one per
    ``adwordsClickInfo_keys`` entry for the nested adwords object).  The plain
    columns visitNumber, channelGrouping and visitStartTime are appended last.

    Args:
        dataset: key into ``useless_fields``; for any value other than
            'train', transactionRevenue is not treated as a numeric column.

    Returns:
        str: the column expressions joined by ', '.
    """
    numeric_cols = ['visitNumber', 'newVisits', 'bounces', 'pageviews', 'visits', 'hits']
    if dataset == 'train':
        numeric_cols.append('transactionRevenue')

    columns = []
    for cat, subcats in field_dict.items():
        for subcat in subcats:
            if subcat in useless_fields[dataset]:
                continue
            # Revenue is checked first, so it always uses the NUMERIC revenue template.
            if subcat == 'transactionRevenue':
                columns.append(revTemplate(cat, subcat, 'NUMERIC'))
                continue
            if subcat in numeric_cols:
                columns.append(jnumTemplate(cat, subcat))
                continue
            if subcat == 'adwordsClickInfo':
                # The nested object expands into one column per known key.
                columns += [adwordsTemplate(key) for key in adwordsClickInfo_keys]
                continue
            columns.append(jstrTemplate(cat, subcat))

    columns += [
        numTemplate('visitNumber'),
        strTemplate('channelGrouping'),
        numTemplate('visitStartTime'),
    ]
    return ', '.join(columns)

In [22]:
# SELECT list for the training table; getUserData interpolates it into its query.
qstring = getQuery(dataset='train')

In [24]:
# NOTE(review): top_order is not referenced anywhere else in the visible notebook.
with open('../pickles/top_order.pkl', mode='rb') as pickle_file:
    top_order = pickle.load(pickle_file)

In [54]:
def getUserData(user_list):
    """Load every ``train_data`` row whose fullVisitorId is in ``user_list``.

    The columns are those described by the module-level ``qstring``.  The ids
    are pasted into the SQL text as quoted literals, so they must come from a
    trusted source and must not contain quote characters.

    Args:
        user_list: iterable of visitor id strings.

    Returns:
        pandas.DataFrame with ``visitstarttime`` parsed as a date column.
    """
    quoted_ids = "'" + "', '".join(user_list) + "'"
    sql_lines = [
        "",
        f"    SELECT {qstring}",
        "    FROM train_data",
        f"    WHERE fullVisitorId IN ({quoted_ids})",
        "    ",
    ]
    query = "\n".join(sql_lines)

    return pd.read_sql_query(query, engine, parse_dates=['visitstarttime'])

In [27]:
# NOTE(review): objects is not referenced anywhere else in the visible notebook.
with open('../pickles/objects.pkl', mode='rb') as pickle_file:
    objects = pickle.load(pickle_file)

In [30]:
# Peek at one entry to recall the layout: each item is a sequence whose first
# element is the category value itself.
device_categories = field_vals['train']['device']['deviceCategory']
device_categories[0][0]

'tablet'

By introducing all possible values of fields ahead of time for dummying, independent of whether they show up in the training set or not, we fail to simulate the fact that we have no idea whether we have captured all the features.  The categories included here, though, are fairly set in stone; there probably are not many subcontinents that have yet to appear in the store's history.

In [31]:
def adjustCols(df, dataset='train'):
    """Turn raw visit rows into the model's feature frame.

    The rows are sorted by ``visitstarttime`` and the following columns are
    produced, in this order:

    * bounces, hits, newvisits, pageviews
    * ``<col>last`` / ``<col>two``: the value one and two rows earlier
    * one boolean column per known device category, sub-continent and
      channel group (substring match against the row's value)
    * fromgoogle, istruedirect, sincelast, sincetwo, hour, weekday
    * transactionrevenue (only when ``dataset == 'train'``)

    NOTE(review): the lag and time-delta features are computed over the whole
    frame in time order, not per visitor, so a frame holding several visitors
    mixes their histories.  No visitor id column is used here.

    Args:
        df: frame with at least visitstarttime (datetime), bounces, hits,
            newvisits, pageviews, istruedirect, source, devicecategory,
            subcontinent and channelgrouping (plus transactionrevenue for
            the training set).
        dataset: 'train' keeps the target column; anything else drops it.

    Returns:
        A new DataFrame restricted to the feature columns; ``df`` itself is
        not modified.
    """
    col_order = ['bounces', 'hits', 'newvisits', 'pageviews']

    # sort_values returns a new frame, so the caller's frame stays untouched.
    df = df.sort_values('visitstarttime')

    # Collect the lag names separately: extending col_order while looping
    # over it would never terminate.
    lag_cols = []
    for col in col_order:
        df[f'{col}last'] = df[col].shift(1)
        df[f'{col}two'] = df[col].shift(2)
        lag_cols.extend([f'{col}last', f'{col}two'])
    col_order.extend(lag_cols)

    df['istruedirect'] = df['istruedirect'].notna()
    # Fill with a real Timedelta: an integer 0 is not a timedelta value and
    # is not reliably coerced to one by newer pandas releases.
    no_gap = pd.Timedelta(0)
    df['sincelast'] = df['visitstarttime'].diff().fillna(no_gap)
    df['sincetwo'] = df['visitstarttime'].diff(2).fillna(no_gap)
    # Done after the lag columns so the lags keep the raw (possibly NaN) values.
    df['newvisits'] = df['newvisits'].fillna(0)
    # looking for those mail.google(plex) sources; missing sources count as False
    df['fromgoogle'] = df['source'].str.contains('mail.google', regex=False, na=False)

    # (source column, known values) for each family of dummy columns.
    # Sub-continent dummies are read from `subcontinent`; they were previously
    # computed from `devicecategory`, which made every one of them False.
    dummy_sources = [
        ('devicecategory', field_vals['train']['device']['deviceCategory']),
        ('subcontinent', field_vals['train']['geoNetwork']['subContinent']),
        ('channelgrouping', channel_groups.values),
    ]
    for source_col, known_values in dummy_sources:
        for entry in known_values:
            value = entry[0]
            df[value] = df[source_col].str.contains(value, regex=False, na=False)
            col_order.append(value)

    df['hour'] = df['visitstarttime'].dt.hour
    df['weekday'] = df['visitstarttime'].dt.dayofweek
    col_order.extend(['fromgoogle', 'istruedirect', 'sincelast', 'sincetwo', 'hour', 'weekday'])

    if dataset == 'train':
        col_order.append('transactionrevenue')
    return df[col_order]

In [33]:
# Same file as the earlier train_customer_ids load; re-read here so that the
# split below starts from the ids as stored on disk.
with open('../pickles/train_customer_ids.pkl', mode='rb') as pickle_file:
    train_customer_ids = pickle.load(pickle_file)

In [34]:
# Ids that stratifiedIdSplit treats as the "looker" stratum.
with open('../pickles/train_looker_ids.pkl', mode='rb') as pickle_file:
    train_looker_ids = pickle.load(pickle_file)

In [35]:
#shuffle now so we can just iterate through lists
from random import shuffle

def stratifiedIdSplit(test_size=0.5, seed=None):
    """Split visitor ids into train and test lists, stratified by customer vs. looker.

    The same fraction ``test_size`` is held out from ``train_customer_ids``
    and from ``train_looker_ids`` (both read from module scope, first column
    only), so both strata keep their proportion in each split.  Each returned
    list is shuffled.

    Args:
        test_size: fraction of each stratum to hold out; the count is
            truncated with ``int``.
        seed: optional seed.  When given, the split is fully reproducible:
            a private ``RandomState`` is used and the remaining ids are
            sorted before shuffling, because set iteration order is not
            stable between runs.  When ``None`` the global NumPy random
            state is used.

    Returns:
        (train_ids, test_ids): two lists of ids with no id in both.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)

    train_split, test_split = [], []
    for id_frame in (train_customer_ids, train_looker_ids):
        ids = id_frame.T.values[0]
        n_held_out = int(len(id_frame) * test_size)
        held_out = list(rng.choice(ids, replace=False, size=n_held_out))
        # Everything not held out stays in the training split.
        kept = sorted(set(ids).difference(held_out))
        test_split.extend(held_out)
        train_split.extend(kept)

    # Mix the two strata so callers can simply iterate through the lists.
    rng.shuffle(test_split)
    rng.shuffle(train_split)

    return train_split, test_split

In [36]:
# Hold out 30% of each stratum for testing.
train_ids, test_ids = stratifiedIdSplit(test_size=0.3)

Now that we have the ids, we must train our model.  We will do this in chunks, sampling with replacement for now.  We will also pull from customers and non-customers at an equal pace, so that we see as many customers with transactions as possible.

In [57]:
# Small scratch list.
a = list(range(1, 6))

In [68]:
def createChunk(customer_list, looker_list, size=1000):
    """Build one feature chunk from the front of both id lists.

    Takes the same number of ids from each list: ``size``, or the length of
    the shorter list if that is smaller.  Their rows are fetched with
    ``getUserData`` and transformed with ``adjustCols``.

    Args:
        customer_list: ids to draw from first.
        looker_list: ids to draw from second.
        size: maximum number of ids taken from each list.

    Returns:
        (chunk, remaining_customers, remaining_lookers): the feature frame
        and the two lists with the consumed ids removed.  The input lists
        are not modified.
    """
    take = min(size, len(customer_list), len(looker_list))
    selected_ids = [*customer_list[:take], *looker_list[:take]]
    features = adjustCols(getUserData(selected_ids))

    return features, customer_list[take:], looker_list[take:]

In [None]:
# Scratch check of the slice-and-combine step that createChunk performs.
# The previous line was not valid Python (`.extend([:new_size])`) and relied
# on an undefined `new_size`; presumably it was meant to append the first
# test ids to the first train ids — TODO confirm the intent or delete this cell.
new_size = 10
train_ids[:new_size] + test_ids[:new_size]

In [69]:
# Smoke-test createChunk with 10 ids from each list and display the features.
sample_chunk = createChunk(train_ids, test_ids, size=10)[0]
sample_chunk

Unnamed: 0,bounces,hits,newvisits,pageviews,bounceslast,bouncestwo,hitslast,hitstwo,newvisitslast,newvisitstwo,...,Direct,Social,(Other),fromgoogle,istruedirect,sincelast,sincetwo,hour,weekday,transactionrevenue
12,1,1,1,1,,,,,,,...,False,True,False,False,False,0 days 00:00:00,0 days 00:00:00,8,3,0.0
8,1,1,1,1,1.0,,1.0,,1.0,,...,False,True,False,False,False,14 days 12:07:19,0 days 00:00:00,20,3,0.0
7,0,7,1,2,1.0,1.0,1.0,1.0,1.0,1.0,...,False,True,False,False,False,31 days 02:53:52,45 days 15:01:11,23,6,0.0
9,0,3,1,3,0.0,1.0,7.0,1.0,1.0,1.0,...,False,False,False,True,False,23 days 11:00:45,54 days 13:54:37,10,2,0.0
10,0,9,1,5,0.0,0.0,3.0,7.0,1.0,1.0,...,True,False,False,False,True,6 days 09:01:34,29 days 20:02:19,19,1,0.0
15,0,12,1,9,0.0,0.0,9.0,3.0,1.0,1.0,...,False,False,False,True,False,2 days 22:42:08,9 days 07:43:42,17,4,0.0
14,1,1,1,1,0.0,0.0,12.0,9.0,1.0,1.0,...,False,True,False,False,False,1 days 18:49:05,4 days 17:31:13,12,6,0.0
22,0,2,1,2,1.0,0.0,1.0,12.0,1.0,1.0,...,False,True,False,False,False,24 days 21:50:14,26 days 16:39:19,10,3,0.0
19,0,2,1,2,0.0,1.0,2.0,1.0,1.0,1.0,...,False,False,False,True,False,2 days 08:52:45,27 days 06:42:59,19,5,0.0
21,0,7,1,6,0.0,0.0,2.0,2.0,1.0,1.0,...,False,False,False,True,False,24 days 11:08:07,26 days 20:00:52,6,2,0.0


In [62]:
# Small feed-forward regressor: input dropout, a 16-unit tanh layer, dropout,
# an 8-unit relu layer and a single linear output, trained on squared error.
# The input size is declared on the FIRST layer so the model is built
# immediately; declaring it on the second layer left the Sequential model
# unbuilt until fit() was called.
# NOTE(review): xtrain is not defined anywhere in the visible notebook; it
# must exist before this cell runs.
model = Sequential([
    Dropout(0.25, input_shape=(xtrain.shape[1],)),
    Dense(16, activation='tanh'),
    Dropout(0.25),
    Dense(8, activation='relu'),
    Dense(1)
])

model.compile(loss='mse', optimizer='adam')

In [63]:
# Stop training once the validation loss has not improved for 2 epochs.
stopper = EarlyStopping(monitor='val_loss', patience=2)

In [64]:
# Train for up to 30 epochs with early stopping on the held-out set.
# The recorded run below was interrupted by hand during the third epoch.
result = model.fit(
    x=xtrain,
    y=ytrain.target.values,
    validation_data=(xtest, ytest.target.values),
    epochs=30,
    shuffle=True,
    callbacks=[stopper],
)

Train on 985838 samples, validate on 214251 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
144064/985838 [===>..........................] - ETA: 1:13 - loss: 3.1297

KeyboardInterrupt: 