# Transaction Level Models

Aggregating features in a somewhat arbitrary manner was expected to lose some information, but it was surprising that such an approach could not even beat the baseline score of guessing only zeros.

In [9]:
import os
import pickle

import numpy as np
import pandas as pd
import sqlalchemy

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

from lightgbm import LGBMRegressor

from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.callbacks import EarlyStopping

%matplotlib inline

In [6]:
# Database connection used by every query in this notebook.
# The URL is read from the TRAINING_DB_URL environment variable when it is set,
# so credentials no longer have to live in the notebook.
# FIXME: the literal fallback below is kept only so existing setups keep
# working; it embeds a password and should be rotated and removed.
engine = sqlalchemy.create_engine(
    os.environ.get(
        'TRAINING_DB_URL',
        'postgresql://romandtse:duckthewut@localhost:5432/training',
    )
)

## Column Queries

We format columns to insert into our query in this section.  These will include:
- Sum of the visit numbers (inspired by previous analysis)
- Sum of page views, assumed the more intuitive alternative over hits
- Bounce rate, at least to rule out those with a bounce rate of 100%

Let's try brute-forcing our way through modeling with user-level aggregation features.  First, we remind ourselves of the types involved.

In [29]:
# Load lookup objects that were pickled ahead of time.
# NOTE: pickle.load executes arbitrary code from the file, so these files
# must come from a trusted source.

# field_dict: iterated in getQuery as {JSON column -> list of sub-field names}.
with open('../pickles/field_dict.pkl', 'rb') as f:
    field_dict = pickle.load(f)

# useless_fields: indexed by dataset name in getQuery; listed sub-fields are skipped.
with open('../pickles/useless_fields.pkl', 'rb') as f:
    useless_fields = pickle.load(f)

# adwordsClickInfo_keys: each key becomes its own column in getQuery.
with open('../pickles/adwordsClickInfo_keys.pkl', 'rb') as f:
    adwordsClickInfo_keys = pickle.load(f)

# channel_groups: adjustCols iterates `.values` and uses element [0] of each row
# as a channel-grouping label.
with open('../pickles/channel_groups.pkl', 'rb') as f:
    channel_groups = pickle.load(f)

# field_vals: nested as field_vals[dataset][JSON column][sub-field] -> sequence of
# entries whose element [0] is the observed value (see adjustCols).
with open('../pickles/field_vals.pkl', 'rb') as f:
    field_vals = pickle.load(f)

In [11]:
# Visitor ids loaded from train_customer_ids.pkl; stratifiedIdSplit samples
# from this group separately from train_looker_ids.
with open('../pickles/train_customer_ids.pkl', 'rb') as f:
    train_customer_ids = pickle.load(f)

In [13]:
def revTemplate(key, name, num_type = 'FLOAT'):
    """Return a SELECT expression for a revenue sub-field of a JSON column.

    The sub-field ``name`` is extracted from ``key`` as text, cast to
    ``num_type``, defaulted to 0 when NULL, divided by 10^6 and aliased
    as ``name``.
    """
    cast_expr = "CAST({} ->> '{}' AS {})".format(key, name, num_type)
    return "COALESCE({}, 0)/10^6 AS {}".format(cast_expr, name)

def jnumTemplate(key, name, num_type = 'INT'):
    """Return a SELECT expression for a numeric sub-field of a JSON column.

    The sub-field ``name`` is extracted from ``key`` as text, cast to
    ``num_type``, defaulted to 0 when NULL and aliased as ``name``.
    """
    cast_expr = "CAST({} ->> '{}' AS {})".format(key, name, num_type)
    return "COALESCE({}, 0) AS {}".format(cast_expr, name)

def numTemplate(name):
    """Return a SELECT expression for a plain column, defaulting NULL to 0."""
    return "COALESCE({0}, 0) AS {0}".format(name)

def jstrTemplate(key, name):
    """Return a SELECT expression that extracts sub-field ``name`` of JSON column ``key`` as text."""
    extraction = "{} ->> '{}'".format(key, name)
    return "{} AS {}".format(extraction, name)

def strTemplate(name):
    """Return the column name unchanged, formatted as a string, for use in a SELECT list."""
    return "{}".format(name)

def adwordsTemplate(name):
    """Return a SELECT expression for key ``name`` of the nested adwordsClickInfo object.

    The ``adwordsClickInfo`` entry of ``trafficSource`` is extracted as text,
    cast to JSONB, and ``name`` is then extracted from it as text.
    """
    click_info = "CAST(trafficSource ->> 'adwordsClickInfo' AS JSONB)"
    return "{} ->> '{}' AS {}".format(click_info, name, name)

In [14]:
def getQuery(dataset = 'train'):
    """Build the comma-separated SELECT list for the visit-level query.

    Every sub-field in the module-level ``field_dict`` that is not listed in
    ``useless_fields[dataset]`` contributes one expression (or one per
    ``adwordsClickInfo_keys`` entry for ``adwordsClickInfo``).  The plain
    columns visitNumber, channelGrouping and visitStartTime are appended last.

    Parameters
    ----------
    dataset : str
        Key into ``useless_fields``.  For any value other than 'train',
        'transactionRevenue' is left out of the numeric column list.

    Returns
    -------
    str
        The expressions joined with ', '.
    """
    numeric_cols = ('visitNumber', 'newVisits', 'bounces', 'pageviews', 'visits', 'hits')
    if dataset == 'train':
        numeric_cols += ('transactionRevenue',)

    def expressionsFor(cat, subcat):
        # The revenue branch is checked first, so it applies whenever the
        # field is present and not filtered out, whatever the dataset.
        if subcat == 'transactionRevenue':
            return [revTemplate(cat, subcat, 'NUMERIC')]
        if subcat in numeric_cols:
            return [jnumTemplate(cat, subcat)]
        if subcat == 'adwordsClickInfo':
            return [adwordsTemplate(key) for key in adwordsClickInfo_keys]
        return [jstrTemplate(cat, subcat)]

    selects = [
        expression
        for cat, subcats in field_dict.items()
        for subcat in subcats
        if subcat not in useless_fields[dataset]
        for expression in expressionsFor(cat, subcat)
    ]
    selects += [
        numTemplate('visitNumber'),
        strTemplate('channelGrouping'),
        numTemplate('visitStartTime'),
    ]
    return ', '.join(selects)

In [22]:
# Build the SELECT list once for the training data; getUserData reads this
# module-level string when it assembles each query.
qstring = getQuery()

In [24]:
# Load the pickled `top_order` object; the value at `.iloc[0][0]` is used
# below as a visitor id to sanity-check getUserData.
with open('../pickles/top_order.pkl', 'rb') as f:
    top_order = pickle.load(f)

In [25]:
def getUserData(user_id):
    """Fetch every row of ``train_data`` belonging to one visitor.

    Parameters
    ----------
    user_id
        Visitor id compared against ``fullVisitorId``.  It is converted with
        ``str()`` and sent as a bound parameter, so quotes or other special
        characters in the id cannot alter the SQL statement.

    Returns
    -------
    pandas.DataFrame
        One row per matching record, with the columns selected by the
        module-level ``qstring`` and ``visitstarttime`` parsed as datetimes.
    """
    # Only the column list (built by this notebook) is interpolated; the id
    # itself goes through the :user_id bind parameter.
    query = sqlalchemy.text(f"""
    SELECT {qstring}
    FROM train_data
    WHERE fullVisitorId = :user_id
    """)

    return pd.read_sql_query(
        query,
        engine,
        params={'user_id': str(user_id)},
        parse_dates=['visitstarttime'],
    )

In [26]:
# Sanity check: display the rows returned for the id stored at top_order.iloc[0][0].
# NOTE(review): this assumes top_order is a DataFrame whose first cell is a
# visitor id — confirm against the notebook that produced the pickle.
getUserData(top_order.iloc[0][0])

Unnamed: 0,devicecategory,ismobile,browser,operatingsystem,city,continent,country,metro,networkdomain,region,...,campaign,campaigncode,istruedirect,keyword,medium,referralpath,source,visitnumber,channelgrouping,visitstarttime
0,desktop,false,Firefox,Windows,not available in demo dataset,Americas,United States,not available in demo dataset,comcastbusiness.net,not available in demo dataset,...,(not set),,true,(not provided),organic,,google,85,Organic Search,2016-09-02 18:42:51
1,desktop,false,Firefox,Windows,not available in demo dataset,Americas,United States,not available in demo dataset,comcastbusiness.net,not available in demo dataset,...,(not set),,true,(not provided),organic,,google,83,Organic Search,2016-09-02 13:08:23
2,desktop,false,Firefox,Windows,not available in demo dataset,Americas,United States,not available in demo dataset,comcastbusiness.net,not available in demo dataset,...,(not set),,true,(not provided),organic,,google,84,Organic Search,2016-09-02 13:43:24
3,desktop,false,Firefox,Windows,not available in demo dataset,Americas,United States,not available in demo dataset,unknown.unknown,not available in demo dataset,...,(not set),,true,(not provided),organic,,google,178,Organic Search,2017-01-26 15:33:01
4,desktop,false,Firefox,Windows,not available in demo dataset,Americas,United States,not available in demo dataset,comcastbusiness.net,not available in demo dataset,...,(not set),,true,,cpm,,dfa,309,Display,2017-06-23 20:01:04
5,desktop,false,Firefox,Windows,not available in demo dataset,Americas,United States,not available in demo dataset,comcastbusiness.net,not available in demo dataset,...,(not set),,true,(not provided),organic,,google,46,Organic Search,2016-08-11 16:59:13
6,desktop,false,Firefox,Windows,not available in demo dataset,Americas,United States,not available in demo dataset,comcastbusiness.net,not available in demo dataset,...,(not set),,true,(not provided),organic,,google,45,Organic Search,2016-08-11 16:10:24
7,desktop,false,Firefox,Windows,not available in demo dataset,Americas,United States,not available in demo dataset,comcastbusiness.net,not available in demo dataset,...,(not set),,true,,cpm,,dfa,304,Display,2017-06-13 17:11:11
8,desktop,false,Firefox,Windows,not available in demo dataset,Americas,United States,not available in demo dataset,(not set),not available in demo dataset,...,(not set),,true,(not provided),organic,,google,143,Organic Search,2016-11-18 19:53:42
9,desktop,false,Firefox,Windows,not available in demo dataset,Americas,United States,not available in demo dataset,(not set),not available in demo dataset,...,(not set),,true,(not provided),organic,,google,142,Organic Search,2016-11-18 16:26:37


In [27]:
# Load the pickled `objects` value.
# NOTE(review): `objects` is not referenced by any cell visible in this part
# of the notebook — confirm it is still needed.
with open('../pickles/objects.pkl', 'rb') as f:
    objects = pickle.load(f)

In [30]:
# Peek at the structure of field_vals: element [0] of the first deviceCategory
# entry is the category label that adjustCols turns into an indicator column.
field_vals['train']['device']['deviceCategory'][0][0]

'tablet'

By introducing all possible values of fields ahead of time for dummying, independent of whether they show up in the training set or not, we fail to simulate the fact that we have no idea whether we have captured all the features.  The categories included here, though, are fairly set in stone; there probably are not many subcontinents that have yet to appear in the store's history.

In [31]:
def adjustCols(df, dataset='train'):
    """Turn raw visit rows into the feature columns used for modelling.

    The rows are sorted by ``visitstarttime`` and extended with lagged
    counters, time gaps between visits, boolean indicator columns and
    calendar features.  The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns bounces, hits, newvisits, pageviews,
        istruedirect, visitstarttime (datetime), source, devicecategory,
        subcontinent, channelgrouping and, when ``dataset == 'train'``,
        transactionrevenue.
    dataset : str
        When 'train', ``transactionrevenue`` is appended as the last column.

    Returns
    -------
    pandas.DataFrame
        The sorted rows restricted to, in order: the four base counters,
        their ``<col>last`` / ``<col>two`` lags, one indicator column per
        device category, sub-continent and channel grouping, then fromgoogle,
        istruedirect, sincelast, sincetwo, hour, weekday (and the target).

    Notes
    -----
    Indicator categories always come from the module-level
    ``field_vals['train']`` and ``channel_groups``, whatever ``dataset`` is.
    """
    base_cols = ['bounces', 'hits', 'newvisits', 'pageviews']
    col_order = list(base_cols)

    # sort_values returns a new frame, so the caller's frame stays untouched
    df = df.sort_values('visitstarttime')

    # values of the previous visit and of the visit before that
    for col in base_cols:
        df[f'{col}last'] = df[col].shift(1)
        df[f'{col}two'] = df[col].shift(2)
        col_order.extend([f'{col}last', f'{col}two'])

    # a missing value means "not a true direct visit"
    df['istruedirect'] = df.istruedirect.notna()

    # time since the previous / second-previous visit; rows without an earlier
    # visit get a zero gap (an explicit Timedelta keeps the timedelta dtype)
    no_gap = pd.Timedelta(0)
    df['sincelast'] = df.visitstarttime.diff().fillna(no_gap)
    df['sincetwo'] = df.visitstarttime.diff(2).fillna(no_gap)
    df['newvisits'] = df.newvisits.fillna(0)

    # looking for those mail.google(plex) sources; null sources count as False
    df['fromgoogle'] = df.source.map(
        lambda src: isinstance(src, str) and 'mail.google' in src
    )

    # One boolean column per known category.  Each group is compared against
    # its OWN source column (sub-continents were previously matched against
    # devicecategory and therefore always False), using exact equality so
    # nulls yield False and one label cannot match inside another.
    indicator_groups = (
        ('devicecategory', field_vals['train']['device']['deviceCategory']),
        ('subcontinent', field_vals['train']['geoNetwork']['subContinent']),
        ('channelgrouping', channel_groups.values),
    )
    for source_col, entries in indicator_groups:
        for entry in entries:
            category = entry[0]
            df[category] = df[source_col] == category
            col_order.append(category)

    df['hour'] = df.visitstarttime.dt.hour
    df['weekday'] = df.visitstarttime.dt.dayofweek
    col_order.extend(['fromgoogle', 'istruedirect', 'sincelast', 'sincetwo', 'hour', 'weekday'])

    if dataset == 'train':
        col_order.append('transactionrevenue')
    return df[col_order]

In [33]:
# NOTE(review): train_customer_ids was already loaded from this same pickle in
# an earlier cell; this reload is redundant and could be removed.
with open('../pickles/train_customer_ids.pkl', 'rb') as f:
    train_customer_ids = pickle.load(f)

In [34]:
# Visitor ids loaded from train_looker_ids.pkl; stratifiedIdSplit samples
# from this group separately from train_customer_ids.
with open('../pickles/train_looker_ids.pkl', 'rb') as f:
    train_looker_ids = pickle.load(f)

In [35]:
#shuffle now so we can just iterate through lists
from random import shuffle

def stratifiedIdSplit(test_size=0.5, seed=None):
    """Split visitor ids into train and test lists, stratified by id group.

    The same fraction is held out from ``train_customer_ids`` and from
    ``train_looker_ids`` (both module-level), so the two groups keep their
    proportions in each split.

    Parameters
    ----------
    test_size : float
        Fraction of each group placed in the test list; the per-group count
        is ``int(len(group) * test_size)``.
    seed : int or None
        Seed for a private random generator, making the split reproducible.
        With ``None`` (the default) NumPy's global generator is used, so the
        result varies between calls unless ``np.random.seed`` was set.

    Returns
    -------
    (list, list)
        ``(train_ids, test_ids)``, each shuffled so the two groups are
        interleaved.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)

    train_split, test_split = [], []
    for id_frame in (train_customer_ids, train_looker_ids):
        group_ids = id_frame.T.values[0]  # first column of the id frame
        n_test = int(len(id_frame) * test_size)

        group_test = list(rng.choice(group_ids, replace=False, size=n_test))
        held_out = set(group_test)
        # dict.fromkeys de-duplicates like a set but keeps a stable order,
        # which a seeded run needs in order to be repeatable
        group_train = [i for i in dict.fromkeys(group_ids) if i not in held_out]

        test_split.extend(group_test)
        train_split.extend(group_train)

    # shuffle now so we can just iterate through lists
    rng.shuffle(test_split)
    rng.shuffle(train_split)

    return train_split, test_split

In [36]:
# Hold out 30% of each id group for testing.  No seed is passed, so the split
# differs from run to run.
train_ids, test_ids = stratifiedIdSplit(0.3)

In [35]:
# Standardise the features: the scaler is fitted on the training matrix only
# and the same fitted transform is then applied to the test matrix.
# NOTE(review): xtrain/xtest are not defined in any cell visible here, and both
# names are overwritten in place, so re-running this cell would scale data
# that has already been scaled.
scaler = StandardScaler()
xtrain = scaler.fit_transform(xtrain)
xtest = scaler.transform(xtest)

In [62]:
# Small feed-forward regressor assembled layer by layer: dropout on the inputs,
# a 16-unit tanh layer, dropout again, an 8-unit relu layer and a single
# linear output unit.
model = Sequential()
model.add(Dropout(0.25))
model.add(Dense(16, input_dim=xtrain.shape[1], activation='tanh'))
model.add(Dropout(0.25))
model.add(Dense(8, activation='relu'))
model.add(Dense(1))

# Mean squared error objective, optimised with Adam.
model.compile(optimizer='adam', loss='mse')

In [63]:
# Early-stopping callback with a patience of 2 epochs; no `monitor` is given,
# so Keras' default monitored quantity applies.
stopper = EarlyStopping(patience=2)

In [64]:
# Train for at most 30 epochs, evaluating on the held-out matrix after every
# epoch so that `stopper` can end training early.
# NOTE(review): ytrain/ytest are not defined in any cell visible here; they are
# assumed to be frames exposing a `target` column — confirm.
result = model.fit(xtrain, 
          ytrain.target.values, 
          validation_data=(xtest, ytest.target.values),
          epochs = 30,
          shuffle=True,
          callbacks=[stopper]
         )

Train on 985838 samples, validate on 214251 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
144064/985838 [===>..........................] - ETA: 1:13 - loss: 3.1297

KeyboardInterrupt: 