In [149]:
import pandas as pd
import numpy as np
import os
import json

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost


ModuleNotFoundError: No module named 'xgboost'

In [34]:
def set_rf_samples(n):
    """ Changes Scikit learn's random forests to give each tree a random sample of
    n random rows.
    """
    forest._generate_sample_indices = (lambda rs, n_samples:
        forest.check_random_state(rs).randint(0, n_samples, n))

def reset_rf_samples():
    """ Undoes the changes produced by set_rf_samples.
    """
    forest._generate_sample_indices = (lambda rs, n_samples:
        forest.check_random_state(rs).randint(0, n_samples, n_samples))

In [35]:
PATH = '../data/'

# Load the data

In [36]:
'../data/preprocessed_df.csv'

'../data/preprocessed_df.csv'

In [37]:
nrows = None
# nrows = 2000

In [38]:
df = pd.read_hdf(f'{PATH}'+"preprocessed_df.hdf")

# Implement the loss function

In [46]:
def rmse(y_pred, y):
    y_pred, y = np.array(y_pred), np.array(y)
    return np.sqrt(np.mean((y_pred - y)**2))

# Make train & validation sets

In [132]:
# Making a validation set out of the last 3 months of data available
# df.date.min(), df.date.max()
df_train = df[df.date < pd.datetime(2017, 5, 1)].copy()
df_val = df[df.date >= pd.datetime(2017, 5, 1)].copy()

print("df_train date span: {} - {}".format(df_train.date.min(), df_train.date.max()))

print("df_val date span:   {} - {}".format(df_val.date.min(), df_val.date.max()))
print('\n')

df_train.drop("date",axis=1,inplace=True)
df_val.drop("date",axis=1,inplace=True)

train_y = df_train["totals.transactionRevenue"]
train_x = df_train.drop(["totals.transactionRevenue", 
                         "totals.transactionRevenue_na"], axis=1)

val_y = df_val["totals.transactionRevenue"]
val_x = df_val.drop(["totals.transactionRevenue", 
                         "totals.transactionRevenue_na"],axis=1)


print("train_y {}\n\
train_x {}\n\
--------------------\n\
val_y   {}\n\
val_x   {}\n".format(train_y.shape, train_x.shape, val_y.shape, val_x.shape))


df_train date span: 2016-08-01 00:00:00 - 2017-04-30 00:00:00
df_val date span:   2017-05-01 00:00:00 - 2017-08-01 00:00:00


train_y (700336,)
train_x (700336, 157)
--------------------
val_y   (203317,)
val_x   (203317, 157)



In [143]:
df_train.columns

Index(['channelGrouping_(Other)', 'channelGrouping_Affiliates',
       'channelGrouping_Direct', 'channelGrouping_Display',
       'channelGrouping_Organic Search', 'channelGrouping_Paid Search',
       'channelGrouping_Referral', 'channelGrouping_Social',
       'channelGrouping_nan', 'socialEngagementType_Not Socially Engaged',
       ...
       'trafficSource.adContent_na',
       'trafficSource.adwordsClickInfo.adNetworkType_na',
       'trafficSource.adwordsClickInfo.gclId_na',
       'trafficSource.adwordsClickInfo.isVideoAd_na',
       'trafficSource.adwordsClickInfo.page_na',
       'trafficSource.adwordsClickInfo.slot_na',
       'trafficSource.campaignCode_na', 'trafficSource.isTrueDirect_na',
       'trafficSource.keyword_na', 'trafficSource.referralPath_na'],
      dtype='object', length=159)

In [136]:
df_train["totals.transactionRevenue"].head()

0    8.581669
1    8.581669
2    8.581669
3    8.581669
4    8.581669
Name: totals.transactionRevenue, dtype: float32

In [142]:
val_y.describe()

count    203317.000000
mean          8.566182
std           0.173626
min           0.693147
25%           8.581669
50%           8.581669
75%           8.581669
max           8.581669
Name: totals.transactionRevenue, dtype: float64

In [114]:
# Run a linear model

In [115]:
model = LinearRegression()
model.fit(train_x, train_y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [117]:
model.score(val_x, val_y)

0.02457984858822249

In [118]:
# Run a baseline Random Forest model on a subset of data 

In [119]:
model = RandomForestRegressor(n_estimators = 20, min_samples_leaf=20)
model.fit(train_x, train_y)
    
    
# m = RandomForestRegressor(n_jobs=-1)
# # The following code is supposed to fail due to string values in the input data
# m.fit(df_raw.drop('SalePrice', axis=1), df_raw.SalePrice

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=20, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [120]:
def print_score(model=model):
    train_rmse = rmse(model.predict(train_x),train_y)
    val_rmse = rmse(model.predict(val_x),val_y)
    
    print('train RMSE: {:.4} \n val RMSE: {:.4}'.format(train_rmse, val_rmse))
    
    return [train_rmse, val_rmse]
    

In [121]:
print_score(model)

train RMSE: 0.1234 
 val RMSE: 0.1599


[0.12335777186354023, 0.15991153245439504]

In [122]:
def rf_feat_importance(m, df):
    return pd.DataFrame({'cols':df.columns, 'imp':m.feature_importances_}
                       ).sort_values('imp', ascending=False)


In [123]:
len(train_x.columns)

157

In [124]:
len(model.feature_importances_)

157

In [125]:
rf_feat_importance(model,train_x)

Unnamed: 0,cols,imp
125,totals.pageviews,0.381137
124,totals.hits,0.137963
134,Day,0.040989
114,visitStartTime,0.040983
112,visitId,0.040088
129,trafficSource.referralPath,0.039951
111,sessionId,0.036764
110,fullVisitorId,0.035610
136,Dayofyear,0.031201
121,geoNetwork.networkDomain,0.026611


In [128]:
train_x.visitId

0         1472830385
1         1472880147
2         1472865386
3         1472881213
4         1472822600
5         1472807194
6         1472817241
7         1472812602
8         1472805784
9         1472812272
10        1472834967
11        1472849434
12        1472839882
13        1472803483
14        1472868337
15        1472824614
16        1472801099
17        1472826820
18        1472804607
19        1472856874
20        1472826420
21        1472863754
22        1472872530
23        1472808484
24        1472806593
25        1472816048
26        1472808002
27        1472885255
28        1472828340
29        1472839261
             ...    
903623    1483525429
903624    1483573406
903625    1483580548
903626    1483555912
903627    1483518594
903628    1483520492
903629    1483534287
903630    1483593323
903631    1483535458
903632    1483533556
903633    1483550538
903634    1483534920
903635    1483556696
903636    1483561580
903637    1483548841
903638    1483533179
903639    148

In [None]:
model.feature_importances_

In [None]:
model.score(train_x,train_y)

In [43]:
model.score(val_x,val_y)

0.4957868653123717

In [None]:
# Feature importance analysis

In [None]:
# other kinds of feature importance

In [None]:
# investigate important features 

In [None]:
# rerun the model

In [None]:
# run a random search on hyperparameters 