# Transaction Level Models

While it was expected that aggregation of features in a somewhat arbitrary manner would result in the loss of information, it was not expected that such an approach would not be able to beat the baseline score of guessing only zeros.    

In [1]:
import pandas as pd
import sqlalchemy
import numpy as np
from helpers import *

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.callbacks import EarlyStopping

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
engine = sqlalchemy.create_engine('postgresql://romandtse:duckthewut@localhost:5432/training')

## Column Queries

We format columns to insert into our query in this section.  These will include:
- Sum of the visit numbers (inspired by previous analysis)
- Sum of page views, assumed the more intuitive alternative over hits
- Bounce rate, at least to rule out those with a bounce rate of 100%

Let's try brute forcing our way through modeling with user level aggregation features.  First, we remind ourselves the types involved.

In [4]:
field_dict = {}
for key in field_vals["train"].keys():
    query = f"""
        SELECT DISTINCT jsonb_object_keys({key})
        FROM train_data
    """
    field_dict[key] = [field for field in pd.read_sql_query(query, engine).jsonb_object_keys if field not in useless_fields['train']]

In [5]:
def jnumTemplate(key, name):
    return f"""SUM(COALESCE(CAST({key} ->> '{name}' AS NUMERIC), 0)) AS {name}_sum, AVG(COALESCE(CAST({key} ->> '{name}' AS NUMERIC), 0)) AS {name}_avg"""

def numTemplate(name):
    return f"""SUM(COALESCE({name}, 0)) AS {name}_sum, AVG(COALESCE({name}, 0)) AS {name}_avg"""

In [6]:
def jstrTemplate(key, name):
    return f"MODE() WITHIN GROUP (ORDER BY {key} ->> '{name}') AS {name}"

def strTemplate(name):
    return f"MODE() WITHIN GROUP (ORDER BY {name}) AS {name}"

In [7]:
def adwordsTemplate(name):
    return f"MODE() WITHIN GROUP (ORDER BY CAST(trafficSource ->> 'adwordsClickInfo' AS JSONB) ->> '{name}') AS {name}"

In [8]:
numeric_cols = ['visitNumber', 'bounces', 'pageviews', 'visits', 'hits', 'transactionRevenue']

In [9]:
selects = []
for cat, subcats in field_dict.items():
    for subcat in subcats:
        if subcat in numeric_cols:
            selects.append(jnumTemplate(cat, subcat))
        elif subcat == 'adwordsClickInfo':
            for key in adwordsClickInfo_keys:
                selects.append(adwordsTemplate(key))
        else:
            selects.append(jstrTemplate(cat, subcat))

In [10]:
selects.extend([numTemplate('visitNumber'), strTemplate('channelGrouping')])

In [11]:
query = f"""
SELECT {', '.join(selects)}
FROM train_data
GROUP BY fullVisitorId
"""

feature_summary = pd.read_sql_query(query, engine)

In [12]:
obj_cols = feature_summary.columns[feature_summary.dtypes == 'O']

In [13]:
for col in obj_cols:
    print(f'{col}: {len(feature_summary[col].dropna().unique())}')

devicecategory: 3
ismobile: 2
browser: 54
operatingsystem: 20
city: 649
continent: 6
country: 222
metro: 94
networkdomain: 27498
region: 376
subcontinent: 23
newvisits: 1
adcontent: 43
adnetworktype: 2
criteriaparameters: 1
gclid: 16229
isvideoad: 1
page: 8
slot: 2
targetingcriteria: 1
campaign: 10
campaigncode: 1
istruedirect: 1
keyword: 3519
medium: 7
referralpath: 1321
source: 331
channelgrouping: 8


In [14]:
objects = [field for field in obj_cols if len(feature_summary[field].unique()) < 11 
           and 'not available in demo dataset' not in feature_summary[field].values]

In [15]:
objects_simple = ['channelgrouping', 'continent', 'devicecategory']

In [16]:
feature_summary['target'] = np.log(feature_summary.transactionrevenue_sum + 1)

In [17]:
feature_summary.bounces_avg = 1 - feature_summary.bounces_avg

In [18]:
feature_summary['spentmoney'] = feature_summary.transactionrevenue_sum > 0

In [19]:
numeric_cols.pop()

'transactionRevenue'

In [20]:
numerics = [name.lower() + '_sum' for name in numeric_cols if name != 'visits']
numerics.extend([name.lower() + '_avg' for name in numeric_cols if name != 'visits']) 

In [21]:
feature_summary = feature_summary.fillna('none')

In [22]:
formula = f'target + spentmoney ~ {" + ".join(objects)} + {" + ".join(numerics)}'

y, x = patsy.dmatrices(formula, feature_summary, return_type='dataframe')

In [32]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size = 0.3, stratify=y['spentmoney[True]'])

In [33]:
def upsample(x, y):
    size = int(y.iloc[:,0].sum() - y.iloc[:,1].sum())
    indices = np.random.choice(y['spentmoney[True]'].index, size)
    xappend = x.loc[indices, :]
    yappend = y.loc[indices, :]
    return pd.concat([x, xappend]), pd.concat([y, yappend])

In [34]:
xtrain, ytrain = upsample(xtrain, ytrain)

In [35]:
scaler = StandardScaler()
xtrain = scaler.fit_transform(xtrain)
xtest = scaler.transform(xtest)

In [62]:
model = Sequential([
    Dropout(0.25),
    Dense(16, input_dim=xtrain.shape[1], activation='tanh'),
    Dropout(0.25),
    Dense(8, activation='relu'),
    Dense(1)
])

model.compile(loss='mse', optimizer='adam')

In [63]:
stopper = EarlyStopping(patience=2)

In [64]:
result = model.fit(xtrain, 
          ytrain.target.values, 
          validation_data=(xtest, ytest.target.values),
          epochs = 30,
          shuffle=True,
          callbacks=[stopper]
         )

Train on 985838 samples, validate on 214251 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
144064/985838 [===>..........................] - ETA: 1:13 - loss: 3.1297

KeyboardInterrupt: 