# 1. Load packages and datasets

In [102]:
import numpy as np
import pandas as pd
import datetime

In [2]:
air_reserve = pd.read_csv('data/air_reserve.csv', parse_dates=['reserve_datetime', 'visit_datetime'])
hpg_reserve = pd.read_csv('data/hpg_reserve.csv', parse_dates=['reserve_datetime', 'visit_datetime'])
air_store_info = pd.read_csv('data/air_store_info.csv')
hpg_store_info = pd.read_csv('data/hpg_store_info.csv')
store_id_relation = pd.read_csv('data/store_id_relation.csv')
air_visit_data = pd.read_csv('data/air_visit_data.csv', parse_dates=['visit_date'])
date_info = pd.read_csv('data/date_info.csv', parse_dates=['calendar_date'])
sample_df = pd.read_csv('data/sample_submission.csv')
sample_df_new = pd.read_csv('data/to_be_predicted.csv', parse_dates=['date'])

# air_visit_data pivited and filtered for IDs that are in the submission file
air_visit_data_pivoted = air_visit_data[air_visit_data.air_store_id.isin(sample_df_new.air_store_id)].pivot(columns='visit_date', index='air_store_id', values='visitors')

# 2. Prepare data

In [3]:
air_visit_data_pivoted

visit_date,2016-01-01,2016-01-02,2016-01-03,2016-01-04,2016-01-05,2016-01-06,2016-01-07,2016-01-08,2016-01-09,2016-01-10,...,2017-04-13,2017-04-14,2017-04-15,2017-04-16,2017-04-17,2017-04-18,2017-04-19,2017-04-20,2017-04-21,2017-04-22
air_store_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
air_00a91d42b08b08d9,,,,,,,,,,,...,34.0,39.0,,,19.0,35.0,17.0,38.0,55.0,18.0
air_0164b9927d20bcc3,,,,,,,,,,,...,13.0,7.0,1.0,,2.0,1.0,8.0,1.0,26.0,6.0
air_0241aa3964b7f861,,,10.0,9.0,17.0,10.0,,5.0,8.0,16.0,...,,4.0,15.0,10.0,12.0,19.0,8.0,,3.0,13.0
air_0328696196e46f18,,,,,,,,,,,...,,9.0,4.0,3.0,3.0,,24.0,,19.0,8.0
air_034a3d5b40d5b1b1,,,,,,,,,,,...,22.0,18.0,31.0,39.0,25.0,20.0,31.0,12.0,37.0,35.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
air_fea5dc9594450608,,,,,,,,,,,...,17.0,20.0,20.0,15.0,9.0,16.0,17.0,23.0,28.0,14.0
air_fee8dcf4d619598e,,,,,,,,,,,...,11.0,37.0,47.0,32.0,15.0,22.0,32.0,26.0,27.0,53.0
air_fef9ccb3ba0da2f7,,,,,,,,,,,...,6.0,1.0,23.0,9.0,7.0,,13.0,1.0,3.0,5.0
air_ffcc2d5087e1b476,,,,,,,,,,,...,16.0,37.0,25.0,,14.0,28.0,28.0,23.0,54.0,1.0


## 2.1. Replace NaNs

In [4]:
air_visit_data

Unnamed: 0,air_store_id,visit_date,visitors
0,air_ba937bf13d40fb24,2016-01-13,25
1,air_ba937bf13d40fb24,2016-01-14,32
2,air_ba937bf13d40fb24,2016-01-15,29
3,air_ba937bf13d40fb24,2016-01-16,22
4,air_ba937bf13d40fb24,2016-01-18,6
...,...,...,...
252103,air_24e8414b9b07decb,2017-04-18,6
252104,air_24e8414b9b07decb,2017-04-19,6
252105,air_24e8414b9b07decb,2017-04-20,7
252106,air_24e8414b9b07decb,2017-04-21,8


In [61]:
air_visit_data_pivoted_t = air_visit_data_pivoted.T.sort_index()

In [60]:
def replace_nan(time_series):
    first_idx = time_series.first_valid_index()
    # print(first_idx, type(first_idx))

    pre_first_idx = time_series[:(pd.to_datetime(first_idx)-datetime.timedelta(days=1))]
    post_first_idx = time_series[first_idx:]
    post_first_idx = post_first_idx.fillna(0)
    
    if len(pre_first_idx) == 0:
        return post_first_idx
    
    return pd.concat([pre_first_idx, post_first_idx])


In [62]:
# replace NaNs
air_visit_replaced = air_visit_data_pivoted_t.apply(lambda col: replace_nan(col), axis=0)

In [79]:
air_visit_replaced.head()

air_store_id,air_00a91d42b08b08d9,air_0164b9927d20bcc3,air_0241aa3964b7f861,air_0328696196e46f18,air_034a3d5b40d5b1b1,air_036d4f1ee7285390,air_0382c794b73b51ad,air_03963426c9312048,air_04341b588bde96cd,air_049f6d5b402a31b2,...,air_fd6aac1043520e83,air_fdc02ec4a3d21ea4,air_fdcfef8bd859f650,air_fe22ef5a9cbef123,air_fe58c074ec1445ea,air_fea5dc9594450608,air_fee8dcf4d619598e,air_fef9ccb3ba0da2f7,air_ffcc2d5087e1b476,air_fff68b929994bfbd
visit_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-01-01,,,,,,,,,10.0,,...,,,,21.0,,,,,,
2016-01-02,,,,,,,,,0.0,,...,,,,37.0,,,,,,
2016-01-03,,,10.0,,,,,,0.0,,...,,,,14.0,,,,,,
2016-01-04,,,9.0,,,,,62.0,23.0,,...,28.0,,,0.0,,,,,,
2016-01-05,,,17.0,,,,,30.0,35.0,,...,36.0,,,0.0,32.0,,,,,


# 2.2. Train test split

In [80]:
air_visit_replaced_pivoted = air_visit_replaced.T
air_visit_replaced_pivoted.head()

visit_date,2016-01-01,2016-01-02,2016-01-03,2016-01-04,2016-01-05,2016-01-06,2016-01-07,2016-01-08,2016-01-09,2016-01-10,...,2017-04-13,2017-04-14,2017-04-15,2017-04-16,2017-04-17,2017-04-18,2017-04-19,2017-04-20,2017-04-21,2017-04-22
air_store_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
air_00a91d42b08b08d9,,,,,,,,,,,...,34.0,39.0,0.0,0.0,19.0,35.0,17.0,38.0,55.0,18.0
air_0164b9927d20bcc3,,,,,,,,,,,...,13.0,7.0,1.0,0.0,2.0,1.0,8.0,1.0,26.0,6.0
air_0241aa3964b7f861,,,10.0,9.0,17.0,10.0,0.0,5.0,8.0,16.0,...,0.0,4.0,15.0,10.0,12.0,19.0,8.0,0.0,3.0,13.0
air_0328696196e46f18,,,,,,,,,,,...,0.0,9.0,4.0,3.0,3.0,0.0,24.0,0.0,19.0,8.0
air_034a3d5b40d5b1b1,,,,,,,,,,,...,22.0,18.0,31.0,39.0,25.0,20.0,31.0,12.0,37.0,35.0


In [116]:
# focus on data after July 2016
air_visit_replaced_pivoted_july = air_visit_replaced_pivoted.loc[:, '2016-07-01':].dropna()
air_visit_replaced_pivoted_july.shape

(763, 296)

In [117]:
# train test split

train_size = round(air_visit_replaced_pivoted_july.shape[1] * .75)
test_size = air_visit_replaced_pivoted_july.shape[1] - train_size

print(f'train_size = {train_size}, test_size = {test_size}')


train_size = 222, test_size = 74


In [123]:
train_x = air_visit_replaced_pivoted_july.iloc[:, :train_size-1]
train_y = air_visit_replaced_pivoted_july.iloc[:, train_size-1:train_size]
test_x = air_visit_replaced_pivoted_july.iloc[:, train_size:-1]
test_y = air_visit_replaced_pivoted_july.iloc[:, -1:]

## 2.3. Create weekly data

In [124]:
def make_weekly(x, y):
    
    from datetime import timedelta
    
    date_list = []
    date = y.columns[0]

    while date > x.columns.min():
        date = date - timedelta(days=7)
        if date >= x.columns.min():
            date_list.append(date)

    x_weekly = x.filter(date_list, axis=1)

    return x_weekly

In [125]:
train_x_weekly = make_weekly(train_x, train_y)
test_x_weekly = make_weekly(test_x, test_y)

In [126]:
train_x_weekly.head()

visit_date,2017-01-31,2017-01-24,2017-01-17,2017-01-10,2017-01-03,2016-12-27,2016-12-20,2016-12-13,2016-12-06,2016-11-29,...,2016-09-06,2016-08-30,2016-08-23,2016-08-16,2016-08-09,2016-08-02,2016-07-26,2016-07-19,2016-07-12,2016-07-05
air_store_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
air_00a91d42b08b08d9,28.0,29.0,24.0,37.0,0.0,0.0,37.0,31.0,29.0,7.0,...,27.0,17.0,23.0,12.0,27.0,24.0,16.0,25.0,24.0,25.0
air_0241aa3964b7f861,0.0,4.0,14.0,12.0,0.0,13.0,6.0,7.0,34.0,3.0,...,13.0,12.0,8.0,10.0,10.0,13.0,3.0,3.0,2.0,7.0
air_034a3d5b40d5b1b1,1.0,0.0,0.0,0.0,3.0,2.0,1.0,0.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
air_036d4f1ee7285390,31.0,9.0,17.0,12.0,0.0,11.0,11.0,11.0,21.0,12.0,...,26.0,41.0,40.0,0.0,32.0,12.0,23.0,34.0,41.0,34.0
air_0382c794b73b51ad,26.0,16.0,24.0,20.0,0.0,30.0,27.0,17.0,25.0,19.0,...,14.0,19.0,16.0,18.0,20.0,20.0,19.0,11.0,20.0,21.0


In [127]:
test_x_weekly.head()

visit_date,2017-04-15,2017-04-08,2017-04-01,2017-03-25,2017-03-18,2017-03-11,2017-03-04,2017-02-25,2017-02-18,2017-02-11
air_store_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
air_00a91d42b08b08d9,0.0,9.0,7.0,7.0,4.0,9.0,18.0,11.0,18.0,0.0
air_0241aa3964b7f861,15.0,4.0,7.0,14.0,13.0,11.0,6.0,8.0,15.0,6.0
air_034a3d5b40d5b1b1,31.0,40.0,20.0,15.0,33.0,31.0,29.0,51.0,23.0,26.0
air_036d4f1ee7285390,22.0,15.0,7.0,23.0,25.0,20.0,27.0,22.0,33.0,4.0
air_0382c794b73b51ad,35.0,35.0,40.0,30.0,36.0,37.0,43.0,39.0,33.0,43.0


# 3. Model building

In [128]:
from sklearn.tree import DecisionTreeRegressor

In [129]:
model_tree = DecisionTreeRegressor()

In [130]:
model_tree.fit(train_x_weekly, train_y)

In [133]:
test_y

visit_date,2017-04-22
air_store_id,Unnamed: 1_level_1
air_00a91d42b08b08d9,18.0
air_0241aa3964b7f861,13.0
air_034a3d5b40d5b1b1,35.0
air_036d4f1ee7285390,23.0
air_0382c794b73b51ad,40.0
...,...
air_fea5dc9594450608,14.0
air_fee8dcf4d619598e,53.0
air_fef9ccb3ba0da2f7,5.0
air_ffcc2d5087e1b476,1.0


In [134]:
# TODO: Do I use dates as features (lagged) or IDs as features????????

In [131]:
pred_tree = model_tree.predict(test_x_weekly, test_y)

ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().