In [2]:
import warnings
warnings.filterwarnings('ignore')

import os
import json
import numpy as np
import pandas as pd
try:
    from pandas.io.json import json_normalize
except ImportError:  # newer pandas releases no longer export it from pandas.io.json
    from pandas import json_normalize
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()

%matplotlib inline

from sklearn import model_selection, preprocessing, metrics
# import xgboost as xgb

pd.options.mode.chained_assignment = None
pd.options.display.max_columns = 999

# import fastai

# from fastai.structured import *
# from fastai.column_data import *
np.set_printoptions(threshold=50, edgeitems=20)
from sklearn.model_selection import train_test_split

In [3]:
def load_df(csv_path='data/train.csv', nrows=None):
    """Read a CSV export and flatten its JSON-encoded columns.

    The columns ``device``, ``geoNetwork``, ``totals`` and ``trafficSource``
    are parsed with ``json.loads`` while reading, expanded into one column per
    (possibly nested) key, and replaced by columns named
    ``"<column>.<subcolumn>"``.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.
    nrows : int or None
        Forwarded to ``pd.read_csv``; ``None`` reads the whole file.

    Returns
    -------
    pandas.DataFrame
        The flattened frame. ``fullVisitorId`` is read as a string.
    """
    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']

    # Prefer the top-level pandas function; fall back to the legacy name
    # imported at the top of the notebook only when pandas is too old to have it.
    normalize = getattr(pd, "json_normalize", None) or json_normalize

    df = pd.read_csv(csv_path,
                     converters={column: json.loads for column in JSON_COLUMNS},
                     dtype={'fullVisitorId': 'str'},  # Important!! keep the id as text
                     nrows=nrows)

    for column in JSON_COLUMNS:
        # A plain list of dicts is the documented input of json_normalize.
        column_as_df = normalize(df[column].tolist())
        column_as_df.columns = [f"{column}.{subcolumn}" for subcolumn in column_as_df.columns]
        # The expanded frame is numbered 0..n-1; reuse df's index so the
        # index-on-index merge below pairs rows one to one.
        column_as_df.index = df.index
        df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
    print(f"Loaded {os.path.basename(csv_path)}. Shape: {df.shape}")
    return df

In [4]:
# Raw test set with fullVisitorId kept as a string; the submission cells below
# read only that column from it.
test_df = load_df(csv_path="data/test.csv")

Loaded test.csv. Shape: (804684, 53)


In [5]:
# Pre-cleaned train/test feature tables, read as UTF-8.
train, test = (
    pd.read_csv(path, encoding='utf-8')
    for path in ('cleaned_data/cleaned_train.csv', 'cleaned_data/cleaned_test.csv')
)

In [6]:
# Columns used as categorical features. The order matters: it fixes the column
# order of the feature matrices built from `cat_cols + num_cols`.
cat_cols = [
    "channelGrouping",
    # device.*
    "device.browser",
    "device.deviceCategory",
    "device.operatingSystem",
    # geoNetwork.*
    "geoNetwork.city",
    "geoNetwork.continent",
    "geoNetwork.country",
    "geoNetwork.metro",
    "geoNetwork.networkDomain",
    "geoNetwork.region",
    "geoNetwork.subContinent",
    # trafficSource.*
    "trafficSource.adContent",
    "trafficSource.adwordsClickInfo.adNetworkType",
    "trafficSource.adwordsClickInfo.gclId",
    "trafficSource.adwordsClickInfo.page",
    "trafficSource.adwordsClickInfo.slot",
    "trafficSource.campaign",
    "trafficSource.keyword",
    "trafficSource.medium",
    "trafficSource.referralPath",
    "trafficSource.source",
    "trafficSource.adwordsClickInfo.isVideoAd",
    "trafficSource.isTrueDirect",
    # calendar parts
    "Year",
    "Month",
    "Week",
    "Day",
    "Dayofweek",
    "Dayofyear",
    "Is_month_end",
    "Is_month_start",
    "Is_quarter_end",
    "Is_quarter_start",
    "Is_year_end",
    "Is_year_start",
]

# Columns used as numeric features.
num_cols = [
    "totals.hits",
    "totals.pageviews",
    "visitNumber",
    "visitStartTime",
    "totals.bounces",
    "totals.newVisits",
]

In [7]:
def score_metric(y_pred, targ):
    """Return the RMSE between predictions and targets on the log1p scale.

    Both inputs are mapped through ``expm1``, negative values of the
    transformed predictions are clamped to zero, and the root mean squared
    error is computed after applying ``log1p`` to both again.
    """
    predicted_revenue = np.expm1(y_pred)
    predicted_revenue[predicted_revenue < 0] = 0
    actual_revenue = np.expm1(targ)
    squared_error = metrics.mean_squared_error(
        np.log1p(actual_revenue), np.log1p(predicted_revenue)
    )
    return np.sqrt(squared_error)

In [8]:
def val_err(model, X, y, test_size=0.33, random_state=42):
    """Fit ``model`` on a random split of ``(X, y)`` and score it on the rest.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place on the training part of the split.
    X, y : array-like
        Features and target, forwarded to ``train_test_split``.
    test_size : float
        Share of the rows held out for scoring (default 0.33).
    random_state : int
        Seed of the split, so repeated calls score on the same rows
        (default 42).

    Returns
    -------
    float
        ``score_metric`` of the held-out predictions; the value is also printed.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    model.fit(X_train, y_train)
    score = score_metric(model.predict(X_test), y_test)
    print(score)
    return score

In [9]:
from sklearn.ensemble import RandomForestRegressor

In [12]:
# Random forest of 600 depth-4 trees, built on 4 parallel jobs with a fixed seed.
regr = RandomForestRegressor(
    n_estimators=600,
    max_depth=4,
    n_jobs=4,
    random_state=0,
)

In [13]:
# Feature matrix: the categorical columns followed by the numeric ones.
train_rf = train[cat_cols + num_cols]
# Target on the log1p scale, the inverse of the expm1 that score_metric and the
# submission cells apply to predictions. log1p(0) is 0, so zero-revenue rows no
# longer pass through -inf the way they do with a plain log.
y_rf = np.log1p(train['totals.transactionRevenue']).clip(lower=0)

In [14]:
# Hold-out score of the random forest. val_err fits `regr` on the training
# part of its split as a side effect.
val_err(regr, train_rf, y_rf)

1.7086390322603373


1.7086390322603373

In [15]:
# Test features: same columns, in the same order, as train_rf.
test_rf = test[cat_cols + num_cols]

In [16]:
# Refit on the full training set; val_err only fitted on its training split.
regr.fit(train_rf, y_rf)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=4,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=600, n_jobs=4,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [17]:
# Random-forest predictions for the test rows, on the same scale as y_rf.
preds = regr.predict(test_rf)

In [18]:
# Build the random-forest submission: clamp negative predictions to zero
# (in place), undo the log with expm1, sum per visitor, and log1p the totals.
np.maximum(preds, 0, out=preds)
sub_df = (
    pd.DataFrame({"fullVisitorId": test_df['fullVisitorId']})
    .assign(PredictedLogRevenue=np.expm1(preds))
    .groupby("fullVisitorId")["PredictedLogRevenue"]
    .sum()
    .pipe(np.log1p)
    .reset_index()
)
sub_df.to_csv("rf_4_600.csv", index=False)

In [29]:
import xgboost as xgb

In [30]:
# Import the class directly instead of going through the `xgb` module alias.
# The assignment below rebinds `xgb` to the model (later cells call xgb.fit /
# xgb.predict on it), so `xgb.XGBRegressor(...)` would fail when this cell is
# run a second time.
from xgboost import XGBRegressor
xgb = XGBRegressor(max_depth=4, n_estimators=600, learning_rate=0.02, gamma=0, n_jobs=4)

In [24]:
# Hold-out score of the XGBoost model, on the same split val_err uses for every model.
val_err(xgb, train_rf, y_rf)

1.639705625708901


1.639705625708901

In [31]:
# Refit the XGBoost model on the full training set before predicting on test.
xgb.fit(train_rf, y_rf)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.02, max_delta_step=0,
       max_depth=4, min_child_weight=1, missing=None, n_estimators=600,
       n_jobs=4, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [32]:
# XGBoost predictions for the test rows; this replaces the earlier `preds`.
preds = xgb.predict(test_rf)

In [34]:
# Build the XGBoost submission: clamp negative predictions to zero (in place),
# undo the log with expm1, sum per visitor, and log1p the totals.
np.maximum(preds, 0, out=preds)
sub_df = (
    pd.DataFrame({"fullVisitorId": test_df['fullVisitorId']})
    .assign(PredictedLogRevenue=np.expm1(preds))
    .groupby("fullVisitorId")["PredictedLogRevenue"]
    .sum()
    .pipe(np.log1p)
    .reset_index()
)
sub_df.to_csv("xgb_4_600.csv", index=False)

In [35]:
from sklearn.svm import SVR

In [37]:
# Support vector regressor, constructed with no arguments (library defaults).
clf = SVR()

In [None]:
# Hold-out score of the SVR; val_err fits `clf` on the training part of its split.
val_err(clf, train_rf, y_rf)

In [None]:
preds = xgb.predict(test_rf)

In [None]:
sub_df = pd.DataFrame({"fullVisitorId":test_df['fullVisitorId']})
preds[preds<0] = 0
sub_df["PredictedLogRevenue"] = np.expm1(preds)
sub_df = sub_df.groupby("fullVisitorId")["PredictedLogRevenue"].sum().reset_index()
sub_df.columns = ["fullVisitorId", "PredictedLogRevenue"]
sub_df["PredictedLogRevenue"] = np.log1p(sub_df["PredictedLogRevenue"])
sub_df.to_csv("xgb_4_600.csv", index=False)