In [10]:

import os
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
%matplotlib inline
from sklearn import preprocessing
import datetime

from sklearn import metrics

Now let's read the data from our clean CSVs.

In [5]:
train_df = pd.read_csv("trainEncoded.csv")
test_df = pd.read_csv("testEncoded.csv")

Unnamed: 0,channelGrouping,date,fullVisitorId,visitId,visitNumber,visitStartTime,device.browser,device.deviceCategory,device.isMobile,device.operatingSystem,...,trafficSource.campaign,trafficSource.isTrueDirect,trafficSource.keyword,trafficSource.medium,trafficSource.referralPath,trafficSource.source,year,month,day,weekday
0,4,2018-05-11,7.460955e+18,1526099341,2,1526099000.0,25,1,True,0,...,25,0,516,4,1754,62,2018,5,11,4
1,2,2018-05-11,4.602525e+17,1526064483,166,1526064000.0,25,0,False,6,...,25,0,516,3,1754,0,2018,5,11,4
2,4,2018-05-11,3.461809e+18,1526067157,2,1526067000.0,25,0,False,2,...,25,0,516,4,1754,62,2018,5,11,4
3,2,2018-05-11,9.751295e+17,1526107551,4,1526108000.0,25,1,True,20,...,25,0,516,3,1754,0,2018,5,11,4
4,4,2018-05-11,8.381673e+18,1526060254,1,1526060000.0,33,2,True,17,...,25,1,516,4,1754,62,2018,5,11,4


Now let's modify the data a bit. We need to encode the string type columns into numerical type using label encoding. We will also split the train and dev data and remove some columns that seem to be unimportant.

Since the target value has such a huge range, lets take the log and predict for the log of transactionRevenue. It will be reversed using exponent function before calculating RMSE.

In [21]:
train_df.fillna(train_df.mean(), inplace=True)
test_df.fillna(test_df.mean(), inplace=True)

train_df['date'] = pd.to_datetime(train_df['date'], format='%Y-%m-%d', errors='ignore')
test_df['date'] = pd.to_datetime(test_df['date'], format='%Y-%m-%d', errors='ignore')

train_y = train_df["totals.transactionRevenue"].values
train_id = train_df["fullVisitorId"].values

test_y = test_df["totals.transactionRevenue"].values
test_id = test_df["fullVisitorId"].values

# Split the train dataset into development and valid based on time 
dev_df = train_df[train_df['date']<=datetime.date(2018, 1, 1)]
dev_df = dev_df[dev_df['date']>=datetime.date(2017, 1, 1)]
val_df = train_df[train_df['date']>datetime.date(2018, 1, 1)]
dev_y = np.log1p(dev_df["totals.transactionRevenue"].values)
val_y = np.log1p(val_df["totals.transactionRevenue"].values)

#exclude irrelevant data like ID and also target variables from train data!
cols_to_exclude = ['totals.transactionRevenue','totals.totalTransactionRevenue','totals.transactions','date','fullVisitorId', 'visitId']
dev_X = dev_df.copy()
val_X = val_df.copy()
test_X = test_df.copy()
dev_X.drop(cols_to_exclude, axis=1, inplace=True)
val_X.drop(cols_to_exclude, axis=1, inplace=True)
test_X.drop(cols_to_exclude, axis=1, inplace=True)


'datetime.date' is coerced to a datetime. In the future pandas will
not coerce, and a TypeError will be raised. To retain the current
behavior, convert the 'datetime.date' to a datetime with
'pd.Timestamp'.
  
'datetime.date' is coerced to a datetime. In the future pandas will
not coerce, and a TypeError will be raised. To retain the current
behavior, convert the 'datetime.date' to a datetime with
'pd.Timestamp'.
  from ipykernel import kernelapp as app
'datetime.date' is coerced to a datetime. In the future pandas will
not coerce, and a TypeError will be raised. To retain the current
behavior, convert the 'datetime.date' to a datetime with
'pd.Timestamp'.
  app.launch_new_instance()


Now for a baseline!

Taking all zero's is not exactly right since although most values are zeroes, we are finally looking at revenue per customer (not per visit). So let's take the mean revenue of all customers.

RMSE: 13.85

That's pretty bad!

In [23]:
%%time
#BASELINE!

val_pred_rf1 = pd.DataFrame({"fullVisitorId":val_df["fullVisitorId"].values})
val_pred_rf1["transactionRevenue"] = val_df["totals.transactionRevenue"].values

val_pred_rf1 = val_pred_rf1.groupby("fullVisitorId")["transactionRevenue"].sum().reset_index()
pred_val_group_mean = val_pred_rf1["transactionRevenue"].mean()
val_pred_rf1["PredictedRevenue"] = pred_val_group_mean
print(np.sqrt(metrics.mean_squared_error(np.log1p(val_pred_rf1["transactionRevenue"].values), np.log1p(val_pred_rf1["PredictedRevenue"].values))))

13.89539511019656
CPU times: user 104 ms, sys: 17.9 ms, total: 122 ms
Wall time: 128 ms


Let's try a random forest to see if it helps.

In [24]:
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor
# Instantiate model with 100 decision trees

def run_rf(train_X, train_y, val_X, val_y, test_X):
    rf = RandomForestRegressor(n_estimators = 100, random_state = 42)
    rf.fit(train_X, train_y);
    feature_importances = pd.DataFrame(rf.feature_importances_,
                                   index = train_X.columns,
                                    columns=['importance']).sort_values('importance', ascending=False)
    print (feature_importances)
    pred_val_y = rf.predict(val_X)
    pred_test_y = rf.predict(test_X)
    return rf, pred_val_y, pred_test_y
# Train the model on training data
rf, pred_val_rf, pred_test_rf = run_rf(dev_X, dev_y, val_X, val_y, test_X);

                                              importance
totals.pageviews                                0.219785
totals.timeOnSite                               0.118630
visitStartTime                                  0.112234
day                                             0.065504
totals.hits                                     0.065438
geoNetwork.country                              0.063040
totals.sessionQualityDim                        0.052284
visitNumber                                     0.043714
geoNetwork.networkDomain                        0.038508
weekday                                         0.036407
geoNetwork.city                                 0.027396
trafficSource.referralPath                      0.024322
device.operatingSystem                          0.021062
geoNetwork.metro                                0.017790
geoNetwork.region                               0.017645
month                                           0.013310
trafficSource.isTrueDirect     

In [25]:
#validation

pred_val_rf[pred_val_rf<0] = 0
val_pred_rf = pd.DataFrame({"fullVisitorId":val_df["fullVisitorId"].values})
val_pred_rf["transactionRevenue"] = val_df["totals.transactionRevenue"].values
val_pred_rf["PredictedRevenue"] = np.expm1(pred_val_rf)
val_pred_rf = val_pred_rf.groupby("fullVisitorId")["transactionRevenue", "PredictedRevenue"].sum().reset_index()
print(np.sqrt(metrics.mean_squared_error(np.log1p(val_pred_rf["transactionRevenue"].values), np.log1p(val_pred_rf["PredictedRevenue"].values))))

1.5089051375290736


In [26]:
#test
pred_test_rf[pred_test_rf<0] = 0
test_pred_rf = pd.DataFrame({"fullVisitorId":test_df["fullVisitorId"].values})
test_pred_rf["transactionRevenue"] = test_df["totals.transactionRevenue"].values
test_pred_rf["PredictedRevenue"] = np.expm1(pred_test_rf)
test_pred_rf = test_pred_rf.groupby("fullVisitorId")["transactionRevenue", "PredictedRevenue"].sum().reset_index()
print(np.sqrt(metrics.mean_squared_error(np.log1p(test_pred_rf["transactionRevenue"].values), np.log1p(test_pred_rf["PredictedRevenue"].values))))

3.2573563949499444


That's pretty impressive! A really nominally tuned random forest did loads better than baseline!

Since this is promising, let's try to tune those hyper parameters some more. 

In [35]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 20, stop = 300, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(6, 20, num = 4)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)


{'n_estimators': [20, 51, 82, 113, 144, 175, 206, 237, 268, 300], 'max_features': ['auto', 'sqrt'], 'max_depth': [6, 10, 15, 20, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [36]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(dev_X, dev_y)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 68.2min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 284.6min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 607.4min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=100, n_jobs=-1,
          param_distributions={'n_estimators': [20, 51, 82, 113, 144, 175, 206, 237, 268, 300], 'max_features': ['auto', 'sqrt'], 'max_depth': [6, 10, 15, 20, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

In [37]:
rf_random.best_params_


{'n_estimators': 300,
 'min_samples_split': 5,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': None,
 'bootstrap': False}

That took 12 hours! And we've got ourselves a new set of hyperparameter values.

Phew! Now let's see how much of a difference all that computing made.

In [39]:
%%time
best_random = rf_random.best_estimator_

def run_rf_model(model, train_X, train_y, val_X, val_y, test_X):
    #rf = RandomForestRegressor(n_estimators = 100, max_depth = 6, min_samples_split = , max_features = "sqrt", random_state = 42)
    model.fit(train_X, train_y);
    
    pred_val_y = model.predict(val_X)
    pred_test_y = model.predict(test_X)
    feature_importances = pd.DataFrame(model.feature_importances_,
                                   index = train_X.columns,
                                    columns=['importance']).sort_values('importance', ascending=False)
    print (feature_importances)
    return rf, pred_val_y, pred_test_y
# Train the model on training data
rf, pred_val_rf, pred_test_rf = run_rf_model(best_random, dev_X, dev_y, val_X, val_y, test_X);

                                                importance
totals.pageviews                              1.892682e-01
totals.hits                                   1.628767e-01
totals.timeOnSite                             1.073618e-01
totals.sessionQualityDim                      9.738054e-02
visitStartTime                                6.867285e-02
day                                           4.158431e-02
visitNumber                                   3.140015e-02
geoNetwork.country                            3.008232e-02
month                                         2.741987e-02
geoNetwork.networkDomain                      2.470687e-02
weekday                                       2.427301e-02
trafficSource.referralPath                    2.285246e-02
geoNetwork.city                               2.257888e-02
geoNetwork.metro                              1.738148e-02
geoNetwork.continent                          1.653013e-02
geoNetwork.region                             1.585994e-

In [40]:
%%time
#validation

pred_val_rf[pred_val_rf<0] = 0
val_pred_rf = pd.DataFrame({"fullVisitorId":val_df["fullVisitorId"].values})
val_pred_rf["transactionRevenue"] = val_df["totals.transactionRevenue"].values
val_pred_rf["PredictedRevenue"] = np.expm1(pred_val_rf)
val_pred_rf = val_pred_rf.groupby("fullVisitorId")["transactionRevenue", "PredictedRevenue"].sum().reset_index()
print(np.sqrt(metrics.mean_squared_error(np.log1p(val_pred_rf["transactionRevenue"].values), np.log1p(val_pred_rf["PredictedRevenue"].values))))

1.4320818834553026
CPU times: user 546 ms, sys: 133 ms, total: 679 ms
Wall time: 912 ms


In [41]:
#test
pred_test_rf[pred_test_rf<0] = 0
test_pred_rf = pd.DataFrame({"fullVisitorId":test_df["fullVisitorId"].values})
test_pred_rf["transactionRevenue"] = test_df["totals.transactionRevenue"].values
test_pred_rf["PredictedRevenue"] = np.expm1(pred_test_rf)
test_pred_rf = test_pred_rf.groupby("fullVisitorId")["transactionRevenue", "PredictedRevenue"].sum().reset_index()
print(np.sqrt(metrics.mean_squared_error(np.log1p(test_pred_rf["transactionRevenue"].values), np.log1p(test_pred_rf["PredictedRevenue"].values))))

2.017829627946382
CPU times: user 655 ms, sys: 87.1 ms, total: 742 ms
Wall time: 776 ms


That's the best so far! And since we're working in the log space, I would say that's pretty impressive.

Training time of 6 minutes isn't that terrible either!

Let's see if there are any other models that do as well and then decide if this is worth exploring further. I don't want to tune my hyperparameters for another 12 hours if there is a better model out there!