# Setup

In [1]:
%%capture
!pip install --upgrade xgboost

In [2]:
import os
import warnings
warnings.filterwarnings(action='ignore')

import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
pd.set_option('precision', 4)
np.set_printoptions(precision=4)

from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor

SEED = 23
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)

In [3]:
data_url = 'https://raw.githubusercontent.com/stiwari-ds/data-science-competitions/main/dphi/juniper_networks_global_challenge/data/'

train = pd.read_csv(data_url + 'raw/train.csv')
test = pd.read_csv(data_url + 'raw/test.csv')

# Data Overview

In [4]:
train.head()

Unnamed: 0.1,Unnamed: 0,observation_id,observation_timestamp,hour_of_day,register__sales_dollar_amt_this_hour,register__payment_types_accepted,register__peak_sales_dollar_amt_per_hour,register__sales_dollar_amt_last_hour,register__sales_quantity_last_hour,register__sales_quantity_rescanned_frac,register__sales_payments_declined_frac,register__peak_returns_dollar_amt_per_hour,register__returns_dollar_amt_last_hour,register__returns_quantity_last_hour,register__returns_quantity_rescanned_frac,cashier__title_level,cashier__n_years_experience,cashier__hours_into_shift,cashier__item_scan_rate_per_min,cashier__item_manual_entry_rate_per_min,store__type_code,store__miles_to_nearest_location,store__target_sales_quantity_per_hour,store__mean_customer_to_staff_ratio,store__mean_service_time_per_customer,store__n_employees_total,store__n_managers,store__n_baggers,store__n_open_registers,store__is_sufficiently_staffed,store__occupancy_main_floor,store__occupancy_grocery,store__occupancy_checkout_areas,store__occupancy_food_court,store__occupancy_backrooms,store__occupancy_indoors,store__occupancy_outdoors,store__outdoor_temperature,store__parking_lot_utilization,store__shelf_freespace_frac,store__hrs_since_last_delivery,store__sales_dollar_amt_last_hour,store__sales_quantity_last_hour,store__sales_quantity_rescanned_frac,store__gift_sales_quantity_last_hour,store__returns_dollar_amt_last_hour,store__returns_quantity_last_hour,store__returns_quantity_rescanned_frac,store__gift_returns_quantity_last_hour,region__n_stores,region__n_open_registers,region__mean_service_time_per_customer,region__stdev_service_time_per_customer,region__sales_dollar_amt_last_hour,region__returns_dollar_amt_last_hour,region__nighttime_open_registers,region__nighttime_service_time_per_customer,region__nighttime_sales_amt_per_hour,region__nighttime_returns_amt_per_hour,region__peak_sales_dollar_amt_per_hour,region__peak_sales_dollar_amt_per_hour_v2,region__peak_returns_dollar_amt_per_hour,region__peak_returns_dollar_amt_per_hour_v2
0,0,704d2a80-d52e-11ec-90ff-c7e6292284b3,2022-05-16 15:39:57,15,347.29,Cash+Credit,-0.7383,-0.127,-0.1993,-0.8299,-0.1247,-0.5721,-0.2582,-0.2621,-0.5337,-2.6036,-2.5371,1.0551,0.9156,0.6773,A,0.4564,-0.0092,0.8432,-0.6449,0.058,-0.3657,0.1657,0.464,-0.3909,-0.6325,-0.7099,-0.4386,-0.5688,0.0721,-0.8838,-0.5614,0.6376,0.1789,0.384,-0.3093,-0.5487,-0.5871,-0.0124,-0.4095,-0.4308,-0.5274,-0.3301,-0.2847,-0.7599,-0.5978,-0.9255,-0.3891,-0.692,-0.4605,-0.518,-1.0062,-0.6462,-0.603,-0.4773,0.1748,-1.7951,-0.8284
1,1,1cacc1d0-e6ac-11ec-b65d-156af70ce36b,2022-06-07 21:52:23,21,361.59,Cash+Credit,0.6483,-0.0362,-0.0777,-0.7395,-0.1135,-0.5251,-0.2157,-0.1252,-0.2853,-0.352,1.0463,-0.2628,0.9156,0.5731,A,0.7155,-0.0114,1.1964,0.6649,-0.296,1.9243,0.1657,-0.5796,-0.3909,-0.4198,0.3457,-0.4386,0.4618,-0.1019,-0.1205,-0.1695,0.6376,-1.1085,-1.4293,-1.1059,-0.4608,-0.3796,-0.0124,-0.4119,-0.1632,-0.2623,-0.3301,-0.2797,-0.7599,-0.7147,0.7815,0.0663,-0.6531,-0.4434,-0.6498,0.9031,-0.6493,-0.6106,0.4998,-0.9816,0.8939,-0.8614
2,2,6dc2b330-d37a-11ec-884e-dfe9ea4a7bd5,2022-05-14 11:38:52,11,850.73,Cash+Credit,-0.495,-0.1268,-0.1974,1.3139,0.1075,-0.5251,-0.2579,-0.2542,-0.1125,-0.352,1.0463,-0.6884,-0.0118,0.052,A,-2.1342,0.0708,-0.2166,-1.1229,0.9549,-0.3657,-0.6371,-0.7536,2.5582,-0.6413,-0.7512,3.4854,-0.2706,0.4125,0.0888,2.6556,-0.2052,1.1999,0.6431,-0.8367,-0.5488,-0.5921,-0.0124,3.5004,-0.4335,-0.5391,-0.3301,0.1894,-0.3367,-0.6782,-1.2559,-0.4826,-0.612,-0.1786,-0.604,-1.4229,-0.6456,-0.6037,1.9337,-0.9093,2.4046,-0.7567
3,3,163ee0a0-0cca-11ed-a73c-8904b24187cc,2022-07-26 10:02:41,10,1175.69,Cash+Credit,-0.5594,-0.127,-0.1991,-0.8299,-0.1247,-0.5251,-0.2582,-0.2616,-0.2958,-0.352,1.0463,-0.2024,-0.2178,-0.0522,A,-2.1342,0.1041,-0.5699,-1.1216,0.9549,-0.3657,-0.6371,-0.7536,2.5582,-0.6456,-0.7712,1.8422,0.6375,0.5038,-0.2011,1.7377,1.4289,1.2568,0.2545,0.055,-0.5492,-0.5956,-0.0124,2.1668,-0.4343,-0.5438,-0.3301,0.1188,-0.3838,-0.6709,-1.228,-0.4809,-0.5472,0.1246,-0.5925,-1.4097,-0.6478,-0.6079,0.1288,-0.9093,-0.4983,-0.7567
4,4,5e3c5df0-d5ee-11ec-a5f2-3b6f99e95850,2022-05-17 14:33:50,14,3204.53,Cash+Credit,0.5693,-0.1221,-0.1632,-0.7071,-0.1247,0.7604,-0.2565,-0.2505,-0.0756,1.8996,-0.7454,-0.3341,-0.3209,-0.4691,D,0.1973,0.1041,0.4899,0.4485,-2.9041,-0.9381,0.1657,-0.2318,-0.3909,-0.624,-0.5775,-0.4386,-0.5855,-0.177,-0.8225,-0.5903,-0.0431,-0.9823,-1.2998,2.7499,-0.5457,-0.5774,-0.0124,-0.4119,-0.4189,-0.4578,-0.3035,-0.2948,-0.6188,-0.6562,0.2442,-0.3021,-0.3838,0.1996,-0.5696,1.1231,-0.6475,-0.6032,1.005,0.1748,1.566,0.2356


In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18512 entries, 0 to 18511
Data columns (total 63 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   Unnamed: 0                                   18512 non-null  int64  
 1   observation_id                               18512 non-null  object 
 2   observation_timestamp                        18512 non-null  object 
 3   hour_of_day                                  18512 non-null  int64  
 4   register__sales_dollar_amt_this_hour         18512 non-null  float64
 5   register__payment_types_accepted             18512 non-null  object 
 6   register__peak_sales_dollar_amt_per_hour     18512 non-null  float64
 7   register__sales_dollar_amt_last_hour         18512 non-null  float64
 8   register__sales_quantity_last_hour           18512 non-null  float64
 9   register__sales_quantity_rescanned_frac      18512 non-null  float64
 10

In [6]:
test.head()

Unnamed: 0,observation_id,observation_timestamp,hour_of_day,register__payment_types_accepted,register__peak_sales_dollar_amt_per_hour,register__sales_dollar_amt_last_hour,register__sales_quantity_last_hour,register__sales_quantity_rescanned_frac,register__sales_payments_declined_frac,register__peak_returns_dollar_amt_per_hour,register__returns_dollar_amt_last_hour,register__returns_quantity_last_hour,register__returns_quantity_rescanned_frac,cashier__title_level,cashier__n_years_experience,cashier__hours_into_shift,cashier__item_scan_rate_per_min,cashier__item_manual_entry_rate_per_min,store__type_code,store__miles_to_nearest_location,store__target_sales_quantity_per_hour,store__mean_customer_to_staff_ratio,store__mean_service_time_per_customer,store__n_employees_total,store__n_managers,store__n_baggers,store__n_open_registers,store__is_sufficiently_staffed,store__occupancy_main_floor,store__occupancy_grocery,store__occupancy_checkout_areas,store__occupancy_food_court,store__occupancy_backrooms,store__occupancy_indoors,store__occupancy_outdoors,store__outdoor_temperature,store__parking_lot_utilization,store__shelf_freespace_frac,store__hrs_since_last_delivery,store__sales_dollar_amt_last_hour,store__sales_quantity_last_hour,store__sales_quantity_rescanned_frac,store__gift_sales_quantity_last_hour,store__returns_dollar_amt_last_hour,store__returns_quantity_last_hour,store__returns_quantity_rescanned_frac,store__gift_returns_quantity_last_hour,region__n_stores,region__n_open_registers,region__mean_service_time_per_customer,region__stdev_service_time_per_customer,region__sales_dollar_amt_last_hour,region__returns_dollar_amt_last_hour,region__nighttime_open_registers,region__nighttime_service_time_per_customer,region__nighttime_sales_amt_per_hour,region__nighttime_returns_amt_per_hour,region__peak_sales_dollar_amt_per_hour,region__peak_sales_dollar_amt_per_hour_v2,region__peak_returns_dollar_amt_per_hour,region__peak_returns_dollar_amt_per_hour_v2
0,3f483640-bc52-11ec-b736-8544dc068949,15/04/2022 00:23,0,Cash+Credit,0.5693,-0.1253,-0.1489,-0.8176,-0.1066,0.9103,-0.193,-0.2177,-0.5078,1.8996,-0.7454,-0.709,2.049,1.8238,C,0.9745,0.1041,0.8432,0.9297,1.3797,-0.9381,0.1657,-0.5796,-0.3909,-0.6463,-0.8466,0.9721,0.0288,-0.1693,-0.5504,0.7565,-0.2052,0.8869,0.9021,-1.034,-0.5493,-0.5959,-0.0124,1.4187,-0.4345,-0.5477,-0.3301,-0.2948,-0.6658,-0.6307,-0.132,0.3398,-0.6373,-0.3791,-0.5581,-0.7148,-0.6487,-0.6091,1.002,-0.5479,0.8761,
1,a88b0bb0-d2ae-11ec-bd0e-f5a7c7895456,13/05/2022 11:20,11,Cash+Credit,0.3084,-0.127,-0.1986,1.6562,-0.1247,-0.5251,-0.2581,-0.2601,-0.5337,-0.352,1.0463,0.0167,1.1217,0.99,C,0.4564,-0.1891,0.4899,0.1748,0.9549,0.2068,0.1657,-0.7536,-0.3909,-0.6438,-0.6615,-0.4386,-0.8726,-0.0441,-0.9067,-0.6919,0.8842,0.9393,-0.2636,-0.4222,-0.5492,-0.5948,-0.0124,-0.4107,-0.4344,-0.5459,0.4436,-0.2948,-0.7599,-0.722,0.8413,0.1972,-0.6483,-0.4419,-0.6498,0.8867,-0.6493,-0.6106,-1.9779,-0.9816,-0.1311,-0.8614
2,90611780-d607-11ec-a5d8-675445c92326,17/05/2022 17:34,17,Cash+Credit,-0.6757,0.1925,0.2534,-0.7517,-0.1247,-0.5251,0.8384,0.7034,-0.2254,-0.352,-0.7454,-0.2369,0.0913,0.3647,A,0.1973,-0.0158,-0.9231,-0.1971,-0.296,-0.3657,-0.6371,-0.0578,-0.3909,2.2737,0.662,0.0212,-0.5885,-0.1733,1.5244,-0.2315,-1.4046,-1.0434,-0.2636,-0.0714,1.8831,1.6804,-0.0124,0.0797,0.5216,0.5253,-0.1531,-0.199,1.4968,1.007,-0.3185,-0.4461,0.6712,0.3971,2.0094,0.0464,1.8275,2.0689,0.0332,1.3795,-0.3489,1.3382
3,a98380e0-e2d2-11ec-b28f-7f4d381958ee,03/06/2022 00:18,0,Cash+Credit,-0.1747,-0.1258,-0.1709,-0.3576,-0.1247,-0.5251,-0.2304,-0.2174,-0.5308,-0.352,1.0463,0.2179,1.0186,0.8858,C,0.9745,0.0508,0.4899,0.3391,0.9549,0.2068,0.1657,-0.7536,-0.3909,-0.556,-0.4236,-0.4386,-0.5051,-0.1467,-0.6975,-0.5558,0.2307,0.9887,0.7726,0.9128,-0.4882,-0.5392,-0.0124,-0.4095,-0.431,-0.4811,-0.3301,-0.2948,-0.7834,-0.722,0.8032,0.1548,-0.6653,-0.4437,-0.6498,0.8937,-0.6493,-0.6106,0.4998,-0.9816,0.8939,-0.8614
4,608b8e60-e4f2-11ec-8115-d3258731976b,05/06/2022 17:10,17,Cash+Credit,0.6483,-0.0612,-0.0995,0.106,-0.1247,-0.5251,-0.2169,-0.1434,-0.5186,-0.352,1.0463,-0.5244,0.9156,0.7816,C,0.9745,0.0463,0.4899,0.591,0.9549,0.2068,0.1657,-0.7536,-0.3909,-0.4622,0.2266,-0.3946,0.4046,-0.1019,-0.1996,-0.1578,0.1216,1.0034,0.7726,1.0884,-0.4619,-0.4192,-0.0124,-0.3288,-0.2378,-0.3132,-0.3301,-0.2645,-0.7834,-0.722,0.9011,0.1806,-0.6495,-0.4418,-0.6498,0.9006,-0.6493,-0.6106,0.4998,-0.9816,0.8939,-0.8614


In [7]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5179 entries, 0 to 5178
Data columns (total 61 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   observation_id                               5179 non-null   object 
 1   observation_timestamp                        5179 non-null   object 
 2   hour_of_day                                  5179 non-null   int64  
 3   register__payment_types_accepted             5179 non-null   object 
 4   register__peak_sales_dollar_amt_per_hour     5179 non-null   float64
 5   register__sales_dollar_amt_last_hour         5179 non-null   float64
 6   register__sales_quantity_last_hour           5179 non-null   float64
 7   register__sales_quantity_rescanned_frac      5179 non-null   float64
 8   register__sales_payments_declined_frac       5179 non-null   float64
 9   register__peak_returns_dollar_amt_per_hour   5179 non-null   float64
 10  

**Checking missing values**

In [8]:
train.isna().any().any(), test.isna().any().any()

(True, True)

*region__peak_sales_dollar_amt_per_hour_v2* and *region__peak_returns_dollar_amt_per_hour_v2* have missing values in both datasets.  
However, these are the experimental features in the dataset which are not supposed to be used. Thus, we can simply drop these columns instead of imputing the missing values.

**Dropping useless column**

In [9]:
train = train.drop(
    labels=[
        'Unnamed: 0', 
        'region__peak_sales_dollar_amt_per_hour_v2', 
        'region__peak_returns_dollar_amt_per_hour_v2'
    ], 
    axis=1
)

test = test.drop(
    labels=[
        'region__peak_sales_dollar_amt_per_hour_v2', 
        'region__peak_returns_dollar_amt_per_hour_v2'
    ], 
    axis=1
)

# Target

In [10]:
TARGET = 'register__sales_dollar_amt_this_hour'

In [11]:
train[TARGET].min(), train[TARGET].max()

(15.71, 4978.65)

# Features
[Data dictionary](https://docs.google.com/spreadsheets/d/1a_Zw27zQfEZF_GnDqXuoYnoOiyrgOcasw0DIUTAhrr4/edit#gid=1521074975)

### observation_id

In [13]:
train.shape[0], train['observation_id'].nunique()

(18512, 18512)

In [14]:
test.shape[0], test['observation_id'].nunique()

(5179, 5179)

In [15]:
set(train['observation_id'].unique()).intersection(set(test['observation_id'].unique()))

set()

Definitely a unique identifier. Can be dropped from feature set for modeling.

In [16]:
train = train.drop(labels='observation_id', axis=1)
test = test.drop(labels='observation_id', axis=1)

### hour_of_day

In [17]:
train['hour_of_day'].min(), train['hour_of_day'].max()

(0, 23)

In [18]:
test['hour_of_day'].min(), test['hour_of_day'].max()

(0, 23)

### observation_timestamp

In [19]:
train[['observation_timestamp', 'hour_of_day']].sample(5, random_state=SEED)

Unnamed: 0,observation_timestamp,hour_of_day
8125,2022-06-04 07:12:50,7
17419,2022-05-21 20:50:26,20
1859,2022-05-16 21:22:43,21
14474,2022-05-21 01:52:23,1
12401,2022-05-17 20:49:20,20


In [20]:
train['observation_timestamp'] = pd.to_datetime(train['observation_timestamp'])
test['observation_timestamp'] = pd.to_datetime(test['observation_timestamp'])

In [21]:
train.observation_timestamp.min(), train.observation_timestamp.max()

(Timestamp('2022-04-07 21:46:33'), Timestamp('2022-07-29 21:15:48'))

In [22]:
test.observation_timestamp.min(), test.observation_timestamp.max()

(Timestamp('2022-01-06 00:04:00'), Timestamp('2022-12-05 23:56:00'))

In [23]:
len(test.loc[(test.observation_timestamp > '2022-04-07 21:46:33') 
             & (test.observation_timestamp < '2022-07-29 21:15:48')])

3594

* Train dataset records only cover early April to late July 2022, while Test dataset records cover early January to early December 2022.  
We have training data for summer sales but it cannot help predict winter sales.  
* However, 3594 out of 5178 test dataset records fall within our train data range. Thus, there is still some value in the timestamp data.

In [24]:
train['month'] = train.observation_timestamp.dt.month.astype('int')
test['month'] = test.observation_timestamp.dt.month.astype('int')

In [25]:
train['day'] = train.observation_timestamp.dt.day.astype('int')
test['day'] = test.observation_timestamp.dt.day.astype('int')

In [26]:
train['day_of_week'] = train.observation_timestamp.dt.day_of_week.astype('int')
test['day_of_week'] = test.observation_timestamp.dt.day_of_week.astype('int')

### object columns

In [27]:
train.describe(include='O').T

Unnamed: 0,count,unique,top,freq
register__payment_types_accepted,18512,3,Cash+Credit,16310
store__type_code,18512,6,A,11691


#### register__payment_types_accepted

In [28]:
train.register__payment_types_accepted.value_counts(normalize=True)

Cash+Credit          0.8811
Cash+Credit+Check    0.1152
Credit               0.0037
Name: register__payment_types_accepted, dtype: float64

In [29]:
test.register__payment_types_accepted.value_counts(normalize=True)

Cash+Credit          0.8828
Cash+Credit+Check    0.1128
Credit               0.0044
Name: register__payment_types_accepted, dtype: float64

#### store__type_code

In [30]:
train.store__type_code.value_counts(normalize=True)

A    0.6315
C    0.3045
B    0.0465
D    0.0171
F    0.0003
E    0.0001
Name: store__type_code, dtype: float64

In [31]:
test.store__type_code.value_counts(normalize=True)

A    0.6584
C    0.2807
B    0.0423
D    0.0180
F    0.0004
E    0.0002
Name: store__type_code, dtype: float64

#### Encoding:

In [34]:
train['register__payment_types_accepted'] = train['register__payment_types_accepted'].astype('category')
test['register__payment_types_accepted'] = test['register__payment_types_accepted'].astype('category')

train['store__type_code'] = train['store__type_code'].astype('category')
test['store__type_code'] = test['store__type_code'].astype('category')

### numerical columns 

In [35]:
train.describe(include=['float64']).T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
register__sales_dollar_amt_this_hour,18512.0,1169.8142,818.4552,15.71,369.16,1165.795,1457.4875,4978.65
register__peak_sales_dollar_amt_per_hour,18512.0,0.0029,1.0075,-0.9858,-0.6241,-0.3357,0.4694,6.0216
register__sales_dollar_amt_last_hour,18512.0,0.0005,1.1195,-0.127,-0.1267,-0.125,-0.0885,146.3522
register__sales_quantity_last_hour,18512.0,0.0012,1.1134,-0.1994,-0.1976,-0.1686,-0.0206,141.5132
register__sales_quantity_rescanned_frac,18512.0,-0.0144,0.9885,-0.8299,-0.7678,-0.3684,0.3613,7.8503
register__sales_payments_declined_frac,18512.0,0.0015,1.0371,-0.1247,-0.1247,-0.1247,-0.1247,55.3798
register__peak_returns_dollar_amt_per_hour,18512.0,0.0067,1.0104,-0.5852,-0.5251,-0.5251,-0.0236,5.6865
register__returns_dollar_amt_last_hour,18512.0,0.0031,1.107,-0.2582,-0.258,-0.2475,0.0366,136.4912
register__returns_quantity_last_hour,18512.0,0.0026,1.1109,-0.2621,-0.2591,-0.2047,0.0419,139.6
register__returns_quantity_rescanned_frac,18512.0,-0.0242,0.9607,-0.5337,-0.5248,-0.3982,-0.0657,9.6933


* A lot of negative values where we would not expect them. For instance, median 'sales_dollar_amount_last_hour' is negative. We would expect any kind of sales amount to be non-negative.  
* Except the target variable, the other numerical columns have zero mean and unit std. deviation, which means they are preprocessed (standardized). Thus, we cannot derive any insights from the original data. We can only do further preprocessing such as outlier removal for sensitive algorithms.

# Baseline

In [36]:
features = list(test.columns)
features.remove('observation_timestamp')

In [60]:
def regression_accuracy(y_meas, y_pred, max_error=20, error_type='relative'):
    '''Compares predicted & measured values, returning the percentage of predictions
       that are within a set error limit. This error limit can be an absolute value
       or a relative percentage'''

    # OPTION 1: Relative percentage
    if ( error_type == 'relative' ):
        mask = 100.0 * abs((y_pred - y_meas) / y_meas) < max_error

    # OPTION 2: Absolute value
    if ( error_type == 'absolute' ):
        mask = abs(y_pred - y_meas) < max_error

    accuracy = sum(mask) / len(mask)

    return -accuracy # '-' for xgboost eval-metric usage 

In [61]:
%%time
scores_racc = []
scores_mae = []
test_preds = []

X = train[features]
y = train[TARGET]

baseline = XGBRegressor(
    n_estimators=1000,
    learning_rate = 0.02,
    max_depth=10,
    tree_method='gpu_hist',
    enable_categorical=True,
    max_cat_to_onehot=6,
    predictor='gpu_predictor',
    eval_metric=regression_accuracy,
    early_stopping_rounds=50,
    seed=SEED
)

cv = KFold(n_splits=5, shuffle=True, random_state=SEED)
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
    X_train, y_train = X.loc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.loc[val_idx], y.iloc[val_idx]
    baseline.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=50
    )
    preds_val = baseline.predict(X_val)
    racc = regression_accuracy(y_val, preds_val)
    scores_racc.append(racc)
    mae = mean_absolute_error(y_val, preds_val)
    scores_mae.append(mae)
    test_preds.append(baseline.predict(test[features]))
    print(f'Fold #{fold}: R-Acc = {racc:.5f}, MAE = {mae:.5f}')

print(f'\nAvg. R-Acc = {np.mean(scores_racc):.5f} +/- {np.std(scores_racc):.5f}')
print(f'Avg. MAE = {np.mean(scores_mae):.5f} +/- {np.std(scores_mae):.5f}')

[0]	validation_0-rmse:1402.60286	validation_0-regression_accuracy:-0.00000
[50]	validation_0-rmse:548.05768	validation_0-regression_accuracy:-0.05887
[100]	validation_0-rmse:266.91579	validation_0-regression_accuracy:-0.78396
[150]	validation_0-rmse:195.50707	validation_0-regression_accuracy:-0.86335
[200]	validation_0-rmse:180.72904	validation_0-regression_accuracy:-0.86660
[218]	validation_0-rmse:179.51492	validation_0-regression_accuracy:-0.86633
Fold #0: R-Acc = -0.86821, MAE = 112.67949
[0]	validation_0-rmse:1397.52647	validation_0-regression_accuracy:-0.00000
[50]	validation_0-rmse:552.52304	validation_0-regression_accuracy:-0.06940
[100]	validation_0-rmse:284.57429	validation_0-regression_accuracy:-0.78396
[150]	validation_0-rmse:225.57809	validation_0-regression_accuracy:-0.86740
[200]	validation_0-rmse:215.18215	validation_0-regression_accuracy:-0.86740
[226]	validation_0-rmse:213.34882	validation_0-regression_accuracy:-0.86740
Fold #1: R-Acc = -0.87011, MAE = 113.21467
[0]	va

In [64]:
preds_df = pd.DataFrame(np.column_stack(test_preds), columns=[f'Fold_{i}' for i in range(5)])
preds_df

Unnamed: 0,Fold_0,Fold_1,Fold_2,Fold_3,Fold_4
0,1716.9926,1684.6437,1705.6852,1764.9753,1798.0679
1,352.2997,352.0203,351.2715,357.1324,350.3503
2,1368.2958,1375.0820,1361.2714,1410.5143,1376.3406
3,297.1163,290.6902,274.1450,301.2448,279.4444
4,348.9647,353.2632,349.1398,356.0157,354.6789
...,...,...,...,...,...
5174,1021.2933,1096.0903,1088.5035,1072.3995,1116.0011
5175,1081.6003,1273.4852,1225.4816,1248.3768,1211.2305
5176,849.8782,839.8718,842.3035,800.8390,816.1514
5177,1076.7432,1084.7090,1245.8694,1016.5651,1061.3091


In [65]:
sub_1 = pd.DataFrame({'prediction': preds_df['Fold_1']})
sub_1.to_csv('01_sub_1.csv', index=False)

In [66]:
sub_0 = pd.DataFrame({'prediction': preds_df['Fold_0']})
sub_0.to_csv('01_sub_0.csv', index=False)

In [69]:
avg_preds = np.mean(np.column_stack(test_preds), axis=1)

In [70]:
sub_avg = pd.DataFrame({'prediction': avg_preds})
sub_avg.to_csv('01_sub_avg.csv', index=False)