In [1]:
import pandas as pd
import numpy as np
import os
import json

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb
import gc

In [2]:
def set_rf_samples(n):
    """ Changes Scikit learn's random forests to give each tree a random sample of
    n random rows.
    """
    forest._generate_sample_indices = (lambda rs, n_samples:
        forest.check_random_state(rs).randint(0, n_samples, n))

def reset_rf_samples():
    """ Undoes the changes produced by set_rf_samples.
    """
    forest._generate_sample_indices = (lambda rs, n_samples:
        forest.check_random_state(rs).randint(0, n_samples, n_samples))

In [3]:
PATH = '../data/'

# Load the data

In [4]:
'../data/preprocessed_df.csv'

'../data/preprocessed_df.csv'

In [5]:
nrows = None
# nrows = 2000

In [6]:
df = pd.read_hdf(f'{PATH}'+"preprocessed_df.hdf")

# Implement the loss function

In [7]:
def rmse(y_pred, y):
    y_pred, y = np.array(y_pred), np.array(y)
    return np.sqrt(np.mean((y_pred - y)**2))

# Make train & validation sets

In [12]:
[col for col in df.columns]

['channelGrouping_(Other)',
 'channelGrouping_Affiliates',
 'channelGrouping_Direct',
 'channelGrouping_Display',
 'channelGrouping_Organic Search',
 'channelGrouping_Paid Search',
 'channelGrouping_Referral',
 'channelGrouping_Social',
 'channelGrouping_nan',
 'socialEngagementType_Not Socially Engaged',
 'socialEngagementType_nan',
 'device.browserSize_not available in demo dataset',
 'device.browserSize_nan',
 'device.browserVersion_not available in demo dataset',
 'device.browserVersion_nan',
 'device.deviceCategory_desktop',
 'device.deviceCategory_mobile',
 'device.deviceCategory_tablet',
 'device.deviceCategory_nan',
 'device.flashVersion_not available in demo dataset',
 'device.flashVersion_nan',
 'device.language_not available in demo dataset',
 'device.language_nan',
 'device.mobileDeviceBranding_not available in demo dataset',
 'device.mobileDeviceBranding_nan',
 'device.mobileDeviceInfo_not available in demo dataset',
 'device.mobileDeviceInfo_nan',
 'device.mobileDeviceMar

In [13]:
# Making a validation set out of the last 3 months of data available
# df.date.min(), df.date.max()
# df_train = df[df.date < pd.datetime(2017, 5, 1)].copy()
# df_val = df[df.date >= pd.datetime(2017, 5, 1)].copy()

df_train = df[df.date < 20170501].copy()
df_val = df[df.date >= 20170501].copy()

print("df_train date span: {} - {}".format(df_train.date.min(), df_train.date.max()))

print("df_val date span:   {} - {}".format(df_val.date.min(), df_val.date.max()))
print('\n')

df_train.drop("date",axis=1,inplace=True)
df_val.drop("date",axis=1,inplace=True)

train_y = df_train["totals.transactionRevenue"]
train_x = df_train.drop(["totals.transactionRevenue", 
                         ], axis=1)

val_y = df_val["totals.transactionRevenue"]
val_x = df_val.drop(["totals.transactionRevenue", 
                         ],axis=1)


print("train_y {}\n\
train_x {}\n\
--------------------\n\
val_y   {}\n\
val_x   {}\n".format(train_y.shape, train_x.shape, val_y.shape, val_x.shape))


df_train date span: 20160801 - 20170430
df_val date span:   20170501 - 20170801


train_y (700336,)
train_x (700336, 144)
--------------------
val_y   (203317,)
val_x   (203317, 144)



In [14]:
df_val["totals.transactionRevenue"].describe()

count    203317.000000
mean          0.273481
std           2.190162
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max          23.083095
Name: totals.transactionRevenue, dtype: float64

In [15]:
df_val["totals.transactionRevenue"][df_val["totals.transactionRevenue"] == 8.581669].count() / len(df_val["totals.transactionRevenue"])

0.0

In [16]:
del df, df_train, df_val
gc.collect()

54

In [17]:
val_y.describe()

count    203317.000000
mean          0.273481
std           2.190162
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max          23.083095
Name: totals.transactionRevenue, dtype: float64

In [18]:
# Run a linear model

In [19]:
model = LinearRegression()
model.fit(train_x, train_y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [20]:
model.score(val_x, val_y)

0.04852775085479066

In [21]:
# Run a baseline Random Forest model on a subset of data 

In [46]:
model = RandomForestRegressor(n_estimators = 100, min_samples_leaf=20, oob_score=False)
model.fit(train_x, train_y)


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=20, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [23]:
def rmse(x,y): return math.sqrt(((x-y)**2).mean())


In [24]:
train_x.shape

(700336, 144)

In [25]:
train_y[:,None].shape

(700336, 1)

In [26]:
val_x.shape

(203317, 144)

In [27]:
val_y.shape

(203317,)

In [30]:
import math

In [41]:
def print_score(model=model):
    
    scores = {"train_rsquare":  model.score(train_x, train_y), 
                "val_rsquare": model.score(val_x, val_y),
                "train_rmse": rmse(model.predict(train_x), train_y),
                "val_rmse": rmse(model.predict(val_x), val_y)}
    
    print(scores)
    return 
    

In [45]:
print_score(model)

{'train_rsquare': 0.3136312622647459, 'val_rsquare': 0.21400109925165, 'train_rmse': 1.6122361900703697, 'val_rmse': 1.9417179851794746}


In [43]:
model.oob_score_

0.06706374953607497

In [34]:
def rf_feat_importance(m, df):
    return pd.DataFrame({'cols':df.columns, 'imp':m.feature_importances_}
                       ).sort_values('imp', ascending=False)


In [35]:
len(train_x.columns)

144

In [36]:
len(model.feature_importances_)

144

In [37]:
rf_feat_importance(model,train_x)

Unnamed: 0,cols,imp
125,totals.pageviews,0.479296
124,totals.hits,0.131078
129,trafficSource.referralPath,0.054614
112,visitId,0.045517
114,visitStartTime,0.042652
113,visitNumber,0.032749
111,sessionId,0.032078
110,fullVisitorId,0.029854
119,geoNetwork.country,0.019277
121,geoNetwork.networkDomain,0.018921


In [None]:
train_x.visitId

In [38]:
lgb_train = lgb.Dataset(train_x, train_y)
lgb_eval = lgb.Dataset(val_x, val_y, reference=lgb_train)

# specify your configurations as a dict
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2', 'l1'},
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

print('Starting training...')
# train
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=20,
                valid_sets=lgb_eval,
                early_stopping_rounds=5)

Starting training...
[1]	valid_0's l1: 0.476127	valid_0's l2: 4.70574
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's l1: 0.471588	valid_0's l2: 4.61214
[3]	valid_0's l1: 0.467605	valid_0's l2: 4.53458
[4]	valid_0's l1: 0.463538	valid_0's l2: 4.46431
[5]	valid_0's l1: 0.45983	valid_0's l2: 4.39628
[6]	valid_0's l1: 0.456254	valid_0's l2: 4.3208
[7]	valid_0's l1: 0.454399	valid_0's l2: 4.2835
[8]	valid_0's l1: 0.45078	valid_0's l2: 4.21324
[9]	valid_0's l1: 0.447535	valid_0's l2: 4.15498
[10]	valid_0's l1: 0.444955	valid_0's l2: 4.10789
[11]	valid_0's l1: 0.442493	valid_0's l2: 4.07155
[12]	valid_0's l1: 0.439944	valid_0's l2: 4.03035
[13]	valid_0's l1: 0.436886	valid_0's l2: 3.97592
[14]	valid_0's l1: 0.434479	valid_0's l2: 3.93784
[15]	valid_0's l1: 0.432746	valid_0's l2: 3.91392
[16]	valid_0's l1: 0.430391	valid_0's l2: 3.87565
[17]	valid_0's l1: 0.428051	valid_0's l2: 3.84387
[18]	valid_0's l1: 0.426481	valid_0's l2: 3.82235
[19]	valid_0's l1: 0.424922	val

In [40]:
print_score(gbm)

train RMSE: 1.754 
 val RMSE: 1.94


[1.7536174790679735, 1.9396632300245589]

In [None]:
model.feature_importances_

In [None]:
model.score(train_x,train_y)

In [None]:
model.score(val_x,val_y)

In [None]:
# Feature importance analysis

In [None]:
# other kinds of feature importance

In [None]:
# investigate important features 

In [None]:
# rerun the model

In [None]:

# run a random search on hyperparameters 

In [None]:
[col for col in train_x.columns]

In [None]:
train_x["fullVisitorId"]

In [None]:
train_y.max()

In [None]:
|