In [1]:
import pandas as pd
import dask.dataframe as dd

import numpy as np

import plotly.express as px

from sklearn.ensemble import RandomForestRegressor
from sklearn import model_selection, preprocessing, metrics

import lightgbm as lgb

In [2]:
dtype={
    'totals.timeOnSite': 'float64',
    'totals.sessionQualityDim': 'float32',
    'totals.pageviews': 'float32',
    'fullVisitorId': 'object',
    'trafficSource.adwordsClickInfo.adNetworkType': 'object',
    'trafficSource.adwordsClickInfo.gclId': 'object',
    'trafficSource.adwordsClickInfo.slot': 'object',
    'trafficSource.campaignCode': 'object'}


In [3]:
train = dd.read_csv('data/train_v3.csv',dtype= dtype).compute()
test = dd.read_csv('data/test_v3.csv',dtype= dtype).compute()

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1708337 entries, 0 to 504
Data columns (total 41 columns):
 #   Column                                        Dtype  
---  ------                                        -----  
 0   Unnamed: 0                                    int64  
 1   channelGrouping                               object 
 2   customDimensions                              object 
 3   date                                          object 
 4   fullVisitorId                                 object 
 5   hits                                          object 
 6   visitId                                       int64  
 7   visitNumber                                   int64  
 8   visitStartTime                                int64  
 9   device.browser                                object 
 10  device.operatingSystem                        object 
 11  device.isMobile                               bool   
 12  device.deviceCategory                         object 
 13  g

In [5]:
test.drop('totals.totalTransactionRevenue', axis = 1, inplace = True)

In [6]:
train['totals.transactionRevenue'].fillna(0,inplace=True)

In [7]:
train_y = train['totals.transactionRevenue'].values
train_id = train['fullVisitorId'].values

test_id = test['fullVisitorId'].values

In [8]:
cat_cols = ["channelGrouping", "device.browser", 
            "device.deviceCategory", "device.operatingSystem", 
            "geoNetwork.city", "geoNetwork.continent", 
            "geoNetwork.country", "geoNetwork.metro",
            "geoNetwork.networkDomain", "geoNetwork.region", 
            "geoNetwork.subContinent", "trafficSource.adContent", 
            "trafficSource.adwordsClickInfo.adNetworkType", 
            "trafficSource.adwordsClickInfo.gclId", 
            "trafficSource.adwordsClickInfo.page", 
            "trafficSource.adwordsClickInfo.slot", "trafficSource.campaign",
            "trafficSource.keyword", "trafficSource.medium", 
            "trafficSource.referralPath", "trafficSource.source",
            'trafficSource.adwordsClickInfo.isVideoAd', 'trafficSource.isTrueDirect']

In [9]:
for col in cat_cols:
    print(col)
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(train[col].values.astype('str'))+list(test[col].values.astype('str')))
    train[col] = lbl.transform(list(train[col].values.astype('str')))   
    test[col] = lbl.transform(list(test[col].values.astype('str')))

channelGrouping
device.browser
device.deviceCategory
device.operatingSystem
geoNetwork.city
geoNetwork.continent
geoNetwork.country
geoNetwork.metro
geoNetwork.networkDomain
geoNetwork.region
geoNetwork.subContinent
trafficSource.adContent
trafficSource.adwordsClickInfo.adNetworkType
trafficSource.adwordsClickInfo.gclId
trafficSource.adwordsClickInfo.page
trafficSource.adwordsClickInfo.slot
trafficSource.campaign
trafficSource.keyword
trafficSource.medium
trafficSource.referralPath
trafficSource.source
trafficSource.adwordsClickInfo.isVideoAd
trafficSource.isTrueDirect


In [10]:
num_cols = ["totals.hits", "totals.pageviews", "visitNumber", "visitStartTime", 'totals.bounces',  'totals.newVisits']
for col in num_cols:
    train[col] = train[col].astype(float)
    test[col] = test[col].astype(float)

In [17]:
from datetime import datetime

In [44]:
train['date'] = pd.to_datetime(train['date'])
test['date'] = pd.to_datetime(test['date'])

In [75]:
dev_df = train[train['date']<=datetime.strptime('20171231', '%Y%m%d')]
val_df = train[train['date']> datetime.strptime('20171231', '%Y%m%d')]

In [76]:
dev_df.shape,val_df.shape

((1365253, 41), (343084, 41))

In [77]:
dev_y = np.log1p(dev_df['totals.transactionRevenue'].values)
val_y = np.log1p(val_df['totals.transactionRevenue'].values)

In [78]:
dev_X = dev_df[cat_cols + num_cols]
val_X = val_df[cat_cols + num_cols]
test_X = test[cat_cols+num_cols]

In [79]:
def run_lgb(train_X,train_y,val_X,val_y,test_X):
    params = {
        "objective":"regression",
        "metric":"rmse",
        "num_leaves":30,
        "min_child_samples":100,
        "learning_rate":0.1,
        "bagging_fraction":0.7,
        "feature_fraction":0.5,
        "bagging_frequency":5,
        "bagging_seed":2018,
        "verbosity":-1
    }
    lgtrain = lgb.Dataset(train_X,label = train_y)
    lgval = lgb.Dataset(val_X,label = val_y)
    model = lgb.train(params, lgtrain,1000,valid_sets = [lgval],early_stopping_rounds = 100, verbose_eval = 100)
    
    pred_val_y = model.predict(val_X, num_iteration = model.best_iteration)
    pred_test_y = model.predict(test_X,num_iteration = model.best_iteration)
    
    return model, pred_val_y, pred_test_y

In [80]:
model , pred_val,pred_test = run_lgb(dev_X, dev_y, val_X, val_y,test_X)

Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 1.41975
[200]	valid_0's rmse: 1.41881
Early stopping, best iteration is:
[197]	valid_0's rmse: 1.41869


In [81]:
from sklearn import metrics

pred_val[pred_val<0] = 0
pred_test[pred_test<0] = 0

val_pred_df = pd.DataFrame({'fullVisitorId':val_df['fullVisitorId'].values})
val_pred_df['transactionRevenue'] = val_df['totals.transactionRevenue'].values
val_pred_df['predictedRevenue'] = np.expm1(pred_val)

test_pred_df = pd.DataFrame({'fullVisitorId':test_id})
test_pred_df['predictedRevenue'] = np.expm1(pred_test)

In [83]:
val_pred_df = val_pred_df.groupby('fullVisitorId').agg({'transactionRevenue':'sum','predictedRevenue':'sum'}).reset_index()
test_pred_df = test_pred_df.groupby('fullVisitorId').agg({'predictedRevenue':'sum'}).reset_index()

In [85]:
np.sqrt(metrics.mean_squared_error(np.log1p(val_pred_df['transactionRevenue']),np.log1p(val_pred_df['predictedRevenue'])))

1.4902999353727124

In [86]:
test_pred_df.rename(columns={'predictedRevenue':'PredictedLogRevenue'},inplace=True)

In [90]:
test_pred_df['PredictedLogRevenue'] = np.log1p(test_pred_df['PredictedLogRevenue'])

In [91]:
test_pred_df.to_csv('submission.csv',index=False)

In [74]:
sub = pd.read_csv('submission.csv')