In [1]:
import os
import json
import datetime
import numpy as np
import pandas as pd
import lightgbm as lgb
import seaborn as sns
from ast import literal_eval
import matplotlib.pyplot as plt
from pandas.io.json import json_normalize
from sklearn.metrics import mean_squared_error
from sklearn.neural_network import MLPClassifier, MLPRegressor


import category_encoders as ce
from sklearn import preprocessing

%matplotlib inline
pd.options.display.max_columns = 999

In [5]:
def add_time_features(df):
    df['date'] = pd.to_datetime(df['date'], format='%Y%m%d', errors='ignore')
    df['year'] = df['date'].apply(lambda x: x.year)
    df['month'] = df['date'].apply(lambda x: x.month)
    df['day'] = df['date'].apply(lambda x: x.day)
    df['weekday'] = df['date'].apply(lambda x: x.weekday())
    df['weekend'] = df['date'].apply(lambda x: x.weekday())
    
    return df

In [6]:
train_df = pd.read_csv('final\\train.csv', dtype={'fullVisitorId': 'str'})

## Feature Engineering

In [7]:
# Impute 0 for missing target values
train_df["totals.transactionRevenue"].fillna(0, inplace=True)

# label encode the categorical variables and convert the numerical variables to float 
# scikit.rf needs numerical data. One hot encoding is not good on rf.
cat_cols = ["channelGrouping", "device.browser", 
            "device.deviceCategory", "device.operatingSystem", 
            "geoNetwork.city", "geoNetwork.continent", 
            "geoNetwork.country", "geoNetwork.metro",
            "geoNetwork.networkDomain", "geoNetwork.region", 
            "geoNetwork.subContinent", "trafficSource.adContent", 
            "trafficSource.adwordsClickInfo.adNetworkType", 
            "trafficSource.adwordsClickInfo.gclId", 
            "trafficSource.adwordsClickInfo.page", 
            "trafficSource.adwordsClickInfo.slot", "trafficSource.campaign",
            "trafficSource.keyword", "trafficSource.medium", 
            "trafficSource.referralPath", "trafficSource.source",
            'trafficSource.adwordsClickInfo.isVideoAd', 'trafficSource.isTrueDirect']

#these columns should be numbers
num_cols = ["fullVisitorId", "totals.pageviews", "visitNumber", "visitStartTime", 'totals.bounces',  'totals.newVisits', 'totals.transactionRevenue']    


#ordinal encoding
encoder = ce.OrdinalEncoder(cols=cat_cols)

train_df = encoder.fit_transform(train_df, train_df["totals.transactionRevenue"])

for col in num_cols:
    train_df[col] = train_df[col].astype(float)

In [8]:
cols_to_remove = ['totals.sessionQualityDim', 'totals.timeOnSite', 'totals.totalTransactionRevenue', 'totals.transactions', 'fullVisitorId', 'visitId']
train_df.drop(cols_to_remove, axis=1, inplace=True)

In [9]:
train_df.describe()

Unnamed: 0,channelGrouping,date,device.browser,device.deviceCategory,device.operatingSystem,geoNetwork.city,geoNetwork.continent,geoNetwork.country,geoNetwork.metro,geoNetwork.networkDomain,geoNetwork.region,geoNetwork.subContinent,totals.bounces,totals.hits,totals.newVisits,totals.pageviews,totals.transactionRevenue,trafficSource.adContent,trafficSource.adwordsClickInfo.adNetworkType,trafficSource.adwordsClickInfo.gclId,trafficSource.adwordsClickInfo.isVideoAd,trafficSource.adwordsClickInfo.page,trafficSource.adwordsClickInfo.slot,trafficSource.campaign,trafficSource.isTrueDirect,trafficSource.keyword,trafficSource.medium,trafficSource.referralPath,trafficSource.source,visitNumber,visitStartTime
count,1708337.0,1708337.0,1708337.0,1708337.0,1708337.0,1708337.0,1708337.0,1708337.0,1708337.0,1708337.0,1708337.0,1708337.0,871578.0,1708337.0,1307430.0,1708098.0,1708337.0,1708337.0,1708337.0,1708337.0,1708337.0,1708337.0,1708337.0,1708337.0,1708337.0,1708337.0,1708337.0,1708337.0,1708337.0,1708337.0,1708337.0
mean,3.181249,20170160.0,3.268186,1.352495,2.984312,31.49186,2.117982,18.96126,3.135793,1614.638,19.33759,5.77206,1.0,4.429598,1.0,3.696202,1355906.0,2.107075,1.093499,1224.693,1.044063,0.09004312,1.069177,1.342838,1.312888,23.48832,2.584999,90.54825,7.627207,2.33517,1498352000.0
std,2.338352,6485.62,5.08924,0.5521176,1.748305,85.78939,0.8427465,22.14966,7.055477,5327.21,47.87271,3.249533,0.0,8.991748,0.0,6.473237,45228090.0,6.13719,0.4827646,6707.349,0.2052346,0.4259445,0.33869,1.909708,0.4636693,180.9326,1.306445,295.0553,13.30149,9.354034,16249370.0
min,1.0,20160800.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1470035000.0
25%,1.0,20161220.0,2.0,1.0,1.0,1.0,2.0,6.0,1.0,7.0,1.0,4.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1482738000.0
50%,3.0,20170710.0,2.0,1.0,3.0,1.0,2.0,6.0,1.0,48.0,1.0,5.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,3.0,3.0,1.0,2.0,1.0,1499832000.0
75%,5.0,20171200.0,3.0,2.0,4.0,25.0,3.0,30.0,2.0,317.0,15.0,6.0,1.0,4.0,1.0,4.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,2.0,3.0,4.0,4.0,8.0,1.0,1512513000.0
max,8.0,20180430.0,129.0,3.0,24.0,956.0,6.0,228.0,123.0,41982.0,483.0,23.0,1.0,500.0,1.0,500.0,23129500000.0,77.0,4.0,59009.0,2.0,12.0,4.0,33.0,2.0,4547.0,7.0,3197.0,345.0,457.0,1525158000.0


In [None]:
#scaler = preprocessing.StandardScaler()
#scaled_df = scaler.fit_transform(train_df)

In [10]:
train_df = add_time_features(train_df)

### Train validation split

In [11]:
# Get labels
y_train = train_df['totals.transactionRevenue'].values
train_df.drop(['totals.transactionRevenue'], axis=1, inplace=True)
# Log transform the labels
y_train = np.log1p(y_train)

In [12]:
# drop date and id columns
train_df.drop(['date'], axis=1, inplace=True)

In [13]:
train_df.drop(['visitStartTime','totals.pageviews', 'totals.newVisits', 'totals.bounces'], axis=1, inplace=True)

In [14]:
features = train_df.columns.values
print('TRAIN SET')
print('Rows: %s' % train_df.shape[0])
print('Columns: %s' % train_df.shape[1])
print('Features: %s' % train_df.columns.values)

TRAIN SET
Rows: 1708337
Columns: 31
Features: ['channelGrouping' 'device.browser' 'device.deviceCategory'
 'device.isMobile' 'device.operatingSystem' 'geoNetwork.city'
 'geoNetwork.continent' 'geoNetwork.country' 'geoNetwork.metro'
 'geoNetwork.networkDomain' 'geoNetwork.region' 'geoNetwork.subContinent'
 'totals.hits' 'trafficSource.adContent'
 'trafficSource.adwordsClickInfo.adNetworkType'
 'trafficSource.adwordsClickInfo.gclId'
 'trafficSource.adwordsClickInfo.isVideoAd'
 'trafficSource.adwordsClickInfo.page'
 'trafficSource.adwordsClickInfo.slot' 'trafficSource.campaign'
 'trafficSource.isTrueDirect' 'trafficSource.keyword'
 'trafficSource.medium' 'trafficSource.referralPath'
 'trafficSource.source' 'visitNumber' 'year' 'month' 'day' 'weekday'
 'weekend']


### Start modelling

In [15]:
model = MLPRegressor(activation='tanh',solver='adam',
    hidden_layer_sizes=(20,),
    max_iter=10000,
    shuffle=False,
    random_state=42)
print('model created')

model created


In [16]:
%%time

model.fit(train_df, y_train)

Wall time: 5min 18s


MLPRegressor(activation='tanh', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(20,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=10000, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=42, shuffle=False, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [17]:
test_df = pd.read_csv('final\\test.csv', dtype={'fullVisitorId': 'str'})

In [18]:
#apply encoing
test_df = encoder.transform(test_df)

for col in num_cols:
    test_df[col] = test_df[col].astype(float)

In [19]:
# extract fullVisitorId before removing it


result_df = pd.DataFrame({"fullVisitorId":test_df["fullVisitorId"].values})

In [20]:
test_df = add_time_features(test_df)

In [21]:
test_df.drop(['visitStartTime','totals.pageviews', 'totals.newVisits', 'totals.bounces'], axis=1, inplace=True)

In [22]:
y_true = test_df['totals.transactionRevenue']

additional_cols_remove = ['totals.transactionRevenue', 'date', 'fullVisitorId']
# drop
test_df.drop(cols_to_remove + additional_cols_remove, axis=1, inplace=True)

In [23]:
print('TEST SET')
print('Rows: %s' % test_df.shape[0])
print('Columns: %s' % test_df.shape[1])
print('Features: %s' % test_df.columns.values)

TEST SET
Rows: 401589
Columns: 31
Features: ['channelGrouping' 'visitNumber' 'device.browser' 'device.deviceCategory'
 'device.isMobile' 'device.operatingSystem' 'geoNetwork.city'
 'geoNetwork.continent' 'geoNetwork.country' 'geoNetwork.metro'
 'geoNetwork.networkDomain' 'geoNetwork.region' 'geoNetwork.subContinent'
 'totals.hits' 'trafficSource.adContent'
 'trafficSource.adwordsClickInfo.adNetworkType'
 'trafficSource.adwordsClickInfo.gclId'
 'trafficSource.adwordsClickInfo.isVideoAd'
 'trafficSource.adwordsClickInfo.page'
 'trafficSource.adwordsClickInfo.slot' 'trafficSource.campaign'
 'trafficSource.isTrueDirect' 'trafficSource.keyword'
 'trafficSource.medium' 'trafficSource.referralPath'
 'trafficSource.source' 'year' 'month' 'day' 'weekday' 'weekend']


In [24]:
%%time
predictions = model.predict(test_df)

Wall time: 1.54 s


In [25]:
rms = np.sqrt(mean_squared_error(y_true, predictions))

In [26]:
rms

59039153.911819585

In [27]:
predictions[predictions<0] = 0
result_df["transactionRevenue"] = y_true.values
result_df["PredictedRevenue"] = np.expm1(predictions)

result_df = result_df.groupby("fullVisitorId")["transactionRevenue", "PredictedRevenue"].sum().reset_index()
print(np.sqrt(mean_squared_error(np.log1p(result_df["transactionRevenue"].values), np.log1p(result_df["PredictedRevenue"].values))))

2.8051690231232733
