In [1]:
# load necessary libraries
import numpy as np
import pandas as pd
import os
import sys
import collections

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import json
from pandas.io.json import json_normalize

import xgboost as xgb
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.model_selection import train_test_split



#use sys.executable to see which python we are using
# to install packages do:
#<path/to/python>/python -m pip install <package>


# Install sklearn
#!{sys.executable} -m pip install sklearn

In [2]:
sys.executable

'/Users/sklasfeld/anaconda3/envs/ipykernel_py3/bin/python'

In [140]:
train_df = pd.read_csv('train_expanded.csv', low_memory=False, dtype={'fullVisitorId': 'str'}, index_col=0)
test_df = pd.read_csv('test_expanded.csv', low_memory=False, dtype={'fullVisitorId': 'str'}, index_col=0)

## Clean Data

In [69]:
[c+":"+str(train_df[c].nunique(dropna=False)) for c in train_df.columns]

['channelGrouping:8',
 'date:366',
 'fullVisitorId:714167',
 'sessionId:902755',
 'visitId:886303',
 'visitNumber:384',
 'visitStartTime:887159',
 'device.browser:54',
 'device.deviceCategory:3',
 'device.isMobile:2',
 'device.operatingSystem:20',
 'geoNetwork.city:649',
 'geoNetwork.continent:6',
 'geoNetwork.country:222',
 'geoNetwork.metro:94',
 'geoNetwork.networkDomain:28064',
 'geoNetwork.region:376',
 'geoNetwork.subContinent:23',
 'totals.bounces:2',
 'totals.hits:274',
 'totals.newVisits:2',
 'totals.pageviews:214',
 'totals.transactionRevenue:5333',
 'trafficSource.adContent:45',
 'trafficSource.adwordsClickInfo.adNetworkType:3',
 'trafficSource.adwordsClickInfo.gclId:17775',
 'trafficSource.adwordsClickInfo.isVideoAd:2',
 'trafficSource.adwordsClickInfo.page:9',
 'trafficSource.adwordsClickInfo.slot:3',
 'trafficSource.campaign:10',
 'trafficSource.isTrueDirect:2',
 'trafficSource.keyword:3660',
 'trafficSource.medium:7',
 'trafficSource.referralPath:1476',
 'trafficSource.s

### fill some NA columns 

In [141]:
# "totals.transactionRevenue" set to 0 if null
train_df["totals.transactionRevenue"].fillna(0, inplace=True)

# "totals.bounces" set to 0 if null
train_df["totals.bounces"].fillna(0, inplace=True)
test_df["totals.bounces"].fillna(0, inplace=True)

# "totals.newVisits" set to 0 if null
train_df["totals.newVisits"].fillna(0, inplace=True)
test_df["totals.newVisits"].fillna(0, inplace=True)

### Remove Columns with constant values

In [144]:
const_cols = [c for c in train_df.columns if train_df[c].nunique(dropna=False)==1 ]
train_df = train_df.drop(const_cols, axis=1)
test_df = test_df.drop(const_cols, axis=1)

only one row has a `trafficSource.campaignCode` so we can delete that one too.

### Typecast columns

type cast columns with numbers

In [None]:
# make sure that the numerical columns are float type
num_cols = ["totals.hits", "totals.pageviews", "visitNumber", "visitStartTime", 'totals.bounces',  'totals.newVisits']    
for col in num_cols:
    train_df[col] = train_df[col].astype(float)
    test_df[col] = test_df[col].astype(float)

In [156]:
# make sure that these numerical columns are int type
#num_cols = ["sessionId"]    
#for col in num_cols:
#    train_df[col] = train_df[col].astype(np.int64)
#    test_df[col] = test_df[col].astype(np.int64)

Typecast columns with strings into labels

In [152]:
# convert categorical columns of strings into ints
def labelCol(df, col):
    labels=list(df[col].unique())
    label_dict={}
    for i in range(0,len(labels)):
        label_dict[labels[i]]=i
    return(df[col].apply(lambda x:label_dict[x]))

In [161]:
num_cols = [ x for x in train_df.columns if train_df[x].dtype == 'O']
for col in num_cols:
    if col != "fullVisitorId" and col in list(test_df.columns):
        print(col)
        train_df[col] = labelCol(train_df,col).astype(np.int64)
        test_df[col] = labelCol(test_df,col).astype(np.int64)

trafficSource.isTrueDirect
trafficSource.keyword
trafficSource.medium
trafficSource.referralPath
trafficSource.source


### Open up Date Column

In [163]:
# "visitStartTime" column contains the same information as "date", so it can be replaced
# i'll also generate features for the day of the week, hour, month and day of the month
train_df['date'] = pd.to_datetime(train_df['visitStartTime'], unit='s')
train_df['day_of_week'] = train_df['date'].dt.dayofweek
train_df['hour'] = train_df['date'].dt.hour
train_df['day_of_month'] = train_df['date'].dt.day
train_df['month'] = train_df['date'].dt.month

test_df['date'] = pd.to_datetime(test_df['visitStartTime'], unit='s')
test_df['day_of_week'] = test_df['date'].dt.dayofweek
test_df['hour'] = test_df['date'].dt.hour
test_df['day_of_month'] = test_df['date'].dt.day
test_df['month'] = test_df['date'].dt.month

# delete the "date" feature
train_df.drop('date', axis = 1, inplace = True)
test_df.drop('date', axis = 1, inplace = True)

## XGBoost

look for column in the training data that is not in the testing data

In [164]:
colsNotInTest=[x for x in list(train_df.columns) if x not in list(test_df.columns)]
colsNotInTest

['totals.transactionRevenue', 'trafficSource.campaignCode']

split into training and testing data into x and y sets...

In [165]:
# full training set
x_train_full = train_df.drop(["fullVisitorId"]+colsNotInTest, axis = 1)
id_train_full = train_df["fullVisitorId"].values
y_train_full = train_df["totals.transactionRevenue"].values
# full test set
X_test = test_df.drop(["fullVisitorId"], axis = 1)
id_test = test_df["fullVisitorId"].values

In [166]:
X_test.head()

Unnamed: 0,channelGrouping,sessionId,visitId,visitNumber,visitStartTime,device.browser,device.deviceCategory,device.isMobile,device.operatingSystem,geoNetwork.city,...,trafficSource.campaign,trafficSource.isTrueDirect,trafficSource.keyword,trafficSource.medium,trafficSource.referralPath,trafficSource.source,day_of_week,hour,day_of_month,month
0,0,0,1508151024,2.0,1508151000.0,0,0,False,0,0,...,0,0,0,0,0,0,0,10,16,10
1,0,1,1508175522,1.0,1508176000.0,0,0,False,1,1,...,0,1,0,0,0,0,0,17,16,10
2,0,2,1508143220,1.0,1508143000.0,0,0,False,0,2,...,0,1,0,0,0,0,0,8,16,10
3,0,3,1508193530,1.0,1508194000.0,1,1,True,2,3,...,0,1,0,0,0,0,0,22,16,10
4,0,4,1508217442,1.0,1508217000.0,1,0,False,0,4,...,0,1,0,0,0,0,1,5,17,10


In [168]:
X_train, X_val, y_train, y_val = train_test_split(x_train_full, y_train_full, test_size=0.15, random_state=1)
print("Train shape:" + str(X_train.shape))
print("Validation shape:" + str(X_val.shape))
print("Test (submit) shape:" + str(X_test.shape))

Train shape:(768105, 36)
Validation shape:(135548, 36)
Test (submit) shape:(804684, 36)


create rmse (root mean squared error) function

In [169]:
def rmse(y_true, y_pred):
    return round(np.sqrt(mean_squared_error(y_true, y_pred)), 5)

create xgboost function

In [170]:
def run_xgb(X_train, y_train, X_val, y_val, X_test):
    params = {'objective': 'reg:linear',
              'eval_metric': 'rmse',
              'eta': 0.001,
              'max_depth': 10,
              'subsample': 0.6,
              'colsample_bytree': 0.6,
              'alpha':0.001,
              'random_state': 42,
              'silent': True}

    xgb_train_data = xgb.DMatrix(X_train, y_train)
    xgb_val_data = xgb.DMatrix(X_val, y_val)
    xgb_submit_data = xgb.DMatrix(X_test)

    model = xgb.train(params, xgb_train_data, 
                      num_boost_round=2000, 
                      evals= [(xgb_train_data, 'train'), (xgb_val_data, 'valid')],
                      early_stopping_rounds=100, 
                      verbose_eval=500
                     )

    y_pred_train = model.predict(xgb_train_data, ntree_limit=model.best_ntree_limit)
    y_pred_val = model.predict(xgb_val_data, ntree_limit=model.best_ntree_limit)
    y_pred_submit = model.predict(xgb_submit_data, ntree_limit=model.best_ntree_limit)

    print("XGB : RMSE val: %f  - RMSE train: %f" % (rmse(y_val, y_pred_val),
                                                     rmse(y_train, y_pred_train)))
    return y_pred_submit, model

In [171]:
%%time
xgb_preds, xgb_model = run_xgb(X_train, y_train, X_val, y_val, X_test)

[0]	train-rmse:5.43423e+07	valid-rmse:4.30128e+07
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 100 rounds.
[500]	train-rmse:4.81833e+07	valid-rmse:4.13877e+07
[1000]	train-rmse:4.30832e+07	valid-rmse:4.02498e+07
[1500]	train-rmse:3.8748e+07	valid-rmse:3.96508e+07
[1999]	train-rmse:3.51951e+07	valid-rmse:3.93013e+07
XGB : RMSE val: 39300386.701560  - RMSE train: 35210090.520490
CPU times: user 2h 15min 35s, sys: 37.7 s, total: 2h 16min 13s
Wall time: 3h 38min 47s


In [174]:
xgb_model

<xgboost.core.Booster at 0x1c2ac13c8>

creating submission file...

In [None]:
final_ungrouped_df = pd.DataFrame({"fullVisitorId":id_test, "PredictedLogRevenue":xgb_preds})
final_ungrouped_df.head()
#sub.to_csv("xgb.csv", index = False)

In [None]:

sub = pd.DataFrame()
sub["id"] = id_test
sub["target"] = p_test
sub.to_csv("../submissions/xgb.csv", index = False)