In [7]:
import os
import json
import datetime
import numpy as np
import pandas as pd
import lightgbm as lgb
import seaborn as sns
from ast import literal_eval
import matplotlib.pyplot as plt
from pandas.io.json import json_normalize
from sklearn.metrics import mean_squared_error


import category_encoders as ce
from sklearn import preprocessing

%matplotlib inline
pd.options.display.max_columns = 999

In [8]:
def add_time_features(df):
    df['date'] = pd.to_datetime(df['date'], format='%Y%m%d', errors='ignore')
    df['year'] = df['date'].apply(lambda x: x.year)
    df['month'] = df['date'].apply(lambda x: x.month)
    df['day'] = df['date'].apply(lambda x: x.day)
    df['weekday'] = df['date'].apply(lambda x: x.weekday())
    
    return df

In [9]:
train_df = pd.read_csv('final\\train.csv', dtype={'fullVisitorId': 'str'})

## Feature Engineering

In [10]:
# Impute 0 for missing target values
train_df["totals.transactionRevenue"].fillna(0, inplace=True)

# label encode the categorical variables and convert the numerical variables to float 
# scikit.rf needs numerical data. One hot encoding is not good on rf.
cat_cols = ["channelGrouping", "device.browser", 
            "device.deviceCategory", "device.operatingSystem", 
            "geoNetwork.city", "geoNetwork.continent", 
            "geoNetwork.country", "geoNetwork.metro",
            "geoNetwork.networkDomain", "geoNetwork.region", 
            "geoNetwork.subContinent", "trafficSource.adContent", 
            "trafficSource.adwordsClickInfo.adNetworkType", 
            "trafficSource.adwordsClickInfo.gclId", 
            "trafficSource.adwordsClickInfo.page", 
            "trafficSource.adwordsClickInfo.slot", "trafficSource.campaign",
            "trafficSource.keyword", "trafficSource.medium", 
            "trafficSource.referralPath", "trafficSource.source",
            'trafficSource.adwordsClickInfo.isVideoAd', 'trafficSource.isTrueDirect']

#these columns should be numbers
num_cols = ["fullVisitorId", "totals.pageviews", "visitNumber", "visitStartTime", 'totals.bounces',  'totals.newVisits', 'totals.transactionRevenue']    


#ordinal encoding
encoder = ce.OrdinalEncoder(cols=cat_cols)

train_df = encoder.fit_transform(train_df, train_df["totals.transactionRevenue"])

for col in num_cols:
    train_df[col] = train_df[col].astype(float)

In [11]:
train_df.head()

Unnamed: 0,channelGrouping,date,device.browser,device.deviceCategory,device.isMobile,device.operatingSystem,fullVisitorId,geoNetwork.city,geoNetwork.continent,geoNetwork.country,geoNetwork.metro,geoNetwork.networkDomain,geoNetwork.region,geoNetwork.subContinent,totals.bounces,totals.hits,totals.newVisits,totals.pageviews,totals.sessionQualityDim,totals.timeOnSite,totals.totalTransactionRevenue,totals.transactionRevenue,totals.transactions,trafficSource.adContent,trafficSource.adwordsClickInfo.adNetworkType,trafficSource.adwordsClickInfo.gclId,trafficSource.adwordsClickInfo.isVideoAd,trafficSource.adwordsClickInfo.page,trafficSource.adwordsClickInfo.slot,trafficSource.campaign,trafficSource.isTrueDirect,trafficSource.keyword,trafficSource.medium,trafficSource.referralPath,trafficSource.source,visitId,visitNumber,visitStartTime
0,1,20160902,1,1,False,1,9.499785e+18,1,1,1,1,1,1,1,1.0,1.0,1.0,1.0,,,,0.0,,1,1,1,1,0,1,1,1,1,1,1,1,1472812272,1.0,1472812000.0
1,1,20160902,2,1,False,1,3.696907e+18,2,2,2,2,2,2,2,1.0,1.0,1.0,1.0,,,,0.0,,1,1,1,1,0,1,1,1,1,1,1,1,1472856874,1.0,1472857000.0
2,1,20160902,2,1,False,2,5.688512e+18,3,3,3,2,3,3,3,1.0,1.0,1.0,1.0,,,,0.0,,1,1,1,1,0,1,1,1,1,1,1,1,1472811524,1.0,1472812000.0
3,1,20160902,3,2,True,3,9.716453e+18,4,3,3,2,4,4,3,1.0,1.0,1.0,1.0,,,,0.0,,1,1,1,1,0,1,1,1,1,1,1,1,1472823508,1.0,1472824000.0
4,1,20160902,2,1,False,1,5.330454e+18,5,2,4,2,5,5,2,1.0,1.0,1.0,1.0,,,,0.0,,1,1,1,1,0,1,1,1,1,1,1,1,1472839619,1.0,1472840000.0


In [12]:
cols_to_remove = ['totals.sessionQualityDim', 'totals.timeOnSite', 'totals.totalTransactionRevenue', 'totals.transactions']
train_df.drop(cols_to_remove, axis=1, inplace=True)

In [13]:
train_df = add_time_features(train_df)

### Train validation split

In [14]:
X_train = train_df[train_df['date']<=datetime.date(2017, 12, 31)]
X_val = train_df[train_df['date']>datetime.date(2017, 12, 31)]

'datetime.date' is coerced to a datetime. In the future pandas will
not coerce, and a TypeError will be raised. To retain the current
behavior, convert the 'datetime.date' to a datetime with
'pd.Timestamp'.
  """Entry point for launching an IPython kernel.
'datetime.date' is coerced to a datetime. In the future pandas will
not coerce, and a TypeError will be raised. To retain the current
behavior, convert the 'datetime.date' to a datetime with
'pd.Timestamp'.
  


In [15]:
print(X_train.shape)
print(X_val.shape)

(1365253, 38)
(343084, 38)


In [16]:
# Get labels
Y_train = X_train['totals.transactionRevenue'].values
Y_val = X_val['totals.transactionRevenue'].values
X_train = X_train.drop(['totals.transactionRevenue'], axis=1)
X_val = X_val.drop(['totals.transactionRevenue'], axis=1)
# Log transform the labels
Y_train = np.log1p(Y_train)
Y_val = np.log1p(Y_val)

In [17]:
# drop date

X_train.drop(['date'], axis=1, inplace=True)
X_val.drop(['date'], axis=1, inplace=True)

In [18]:
features = X_train.columns.values
print('TRAIN SET')
print('Rows: %s' % X_train.shape[0])
print('Columns: %s' % X_train.shape[1])
print('Features: %s' % X_train.columns.values)

TRAIN SET
Rows: 1365253
Columns: 36
Features: ['channelGrouping' 'device.browser' 'device.deviceCategory'
 'device.isMobile' 'device.operatingSystem' 'fullVisitorId'
 'geoNetwork.city' 'geoNetwork.continent' 'geoNetwork.country'
 'geoNetwork.metro' 'geoNetwork.networkDomain' 'geoNetwork.region'
 'geoNetwork.subContinent' 'totals.bounces' 'totals.hits'
 'totals.newVisits' 'totals.pageviews' 'trafficSource.adContent'
 'trafficSource.adwordsClickInfo.adNetworkType'
 'trafficSource.adwordsClickInfo.gclId'
 'trafficSource.adwordsClickInfo.isVideoAd'
 'trafficSource.adwordsClickInfo.page'
 'trafficSource.adwordsClickInfo.slot' 'trafficSource.campaign'
 'trafficSource.isTrueDirect' 'trafficSource.keyword'
 'trafficSource.medium' 'trafficSource.referralPath'
 'trafficSource.source' 'visitId' 'visitNumber' 'visitStartTime' 'year'
 'month' 'day' 'weekday']


### Start modelling

In [19]:
params = {
"objective" : "regression",
"metric" : "rmse", 
"num_leaves" : 600,
"min_child_samples" : 20,
"learning_rate" : 0.003,
"bagging_fraction" : 0.6,
"feature_fraction" : 0.7,
"bagging_frequency" : 1,
"bagging_seed" : 1,
"lambda_l1": 3,
'min_data_in_leaf': 50
}

In [20]:
lgb_train = lgb.Dataset(X_train, label=Y_train)
lgb_val = lgb.Dataset(X_val, label=Y_val)
model = lgb.train(params, lgb_train, 10000, valid_sets=[lgb_train, lgb_val], early_stopping_rounds=100, verbose_eval=100)

Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 1.75182	valid_1's rmse: 1.54021
[200]	training's rmse: 1.64675	valid_1's rmse: 1.48891
[300]	training's rmse: 1.57706	valid_1's rmse: 1.45977
[400]	training's rmse: 1.52862	valid_1's rmse: 1.44273
[500]	training's rmse: 1.49028	valid_1's rmse: 1.43216
[600]	training's rmse: 1.46002	valid_1's rmse: 1.42602
[700]	training's rmse: 1.43573	valid_1's rmse: 1.42255
[800]	training's rmse: 1.41446	valid_1's rmse: 1.42049
[900]	training's rmse: 1.39482	valid_1's rmse: 1.41927
[1000]	training's rmse: 1.37808	valid_1's rmse: 1.41875
[1100]	training's rmse: 1.36178	valid_1's rmse: 1.41843
[1200]	training's rmse: 1.34709	valid_1's rmse: 1.41841
[1300]	training's rmse: 1.33286	valid_1's rmse: 1.41814
[1400]	training's rmse: 1.31929	valid_1's rmse: 1.41809
Early stopping, best iteration is:
[1360]	training's rmse: 1.32461	valid_1's rmse: 1.41797


In [21]:
test_df = pd.read_csv('final\\test.csv', dtype={'fullVisitorId': 'str'})

In [22]:
#apply encoing
test_df = encoder.transform(test_df)

for col in num_cols:
    test_df[col] = test_df[col].astype(float)

In [23]:
test_df.drop(cols_to_remove, axis=1, inplace=True)

In [24]:
test_df = add_time_features(test_df)

In [25]:
y_true = test_df['totals.transactionRevenue']
# drop
test_df.drop(['totals.transactionRevenue', 'date'], axis=1, inplace=True)

In [26]:
print('TEST SET')
print('Rows: %s' % test_df.shape[0])
print('Columns: %s' % test_df.shape[1])
print('Features: %s' % test_df.columns.values)

TEST SET
Rows: 401589
Columns: 36
Features: ['channelGrouping' 'fullVisitorId' 'visitId' 'visitNumber'
 'visitStartTime' 'device.browser' 'device.deviceCategory'
 'device.isMobile' 'device.operatingSystem' 'geoNetwork.city'
 'geoNetwork.continent' 'geoNetwork.country' 'geoNetwork.metro'
 'geoNetwork.networkDomain' 'geoNetwork.region' 'geoNetwork.subContinent'
 'totals.bounces' 'totals.hits' 'totals.newVisits' 'totals.pageviews'
 'trafficSource.adContent' 'trafficSource.adwordsClickInfo.adNetworkType'
 'trafficSource.adwordsClickInfo.gclId'
 'trafficSource.adwordsClickInfo.isVideoAd'
 'trafficSource.adwordsClickInfo.page'
 'trafficSource.adwordsClickInfo.slot' 'trafficSource.campaign'
 'trafficSource.isTrueDirect' 'trafficSource.keyword'
 'trafficSource.medium' 'trafficSource.referralPath'
 'trafficSource.source' 'year' 'month' 'day' 'weekday']


In [27]:
predictions = model.predict(test_df, num_iteration=model.best_iteration)

In [28]:
rms = np.sqrt(mean_squared_error(y_true, predictions))

In [29]:
rms

59039153.91616643

In [30]:
predictions[predictions<0] = 0
result_df = pd.DataFrame({"fullVisitorId":test_df["fullVisitorId"].values})
result_df["transactionRevenue"] = y_true.values
result_df["PredictedRevenue"] = np.expm1(predictions)

result_df = result_df.groupby("fullVisitorId")["transactionRevenue", "PredictedRevenue"].sum().reset_index()
print(np.sqrt(mean_squared_error(np.log1p(result_df["transactionRevenue"].values), np.log1p(result_df["PredictedRevenue"].values))))

2.1132495872184633
