# Random Forests Method for the January Tabular Data Challenge
## Step 1. Building the Model

In [14]:
from pathlib import Path

import holidays
import numpy as np
import pandas as pd
from IPython.display import display
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder

# Seed handed to every RandomForestRegressor in this notebook.
random_state = 3165

# Load the training rows indexed by row_id, then parse the date column
# using the explicit %Y-%m-%d format.
store_data = pd.read_csv("data/train.csv", index_col='row_id')
store_data = store_data.assign(
    date=pd.to_datetime(store_data['date'], format='%Y-%m-%d')
)

# Preview the first and last rows of the frame.
for preview in (store_data.head(), store_data.tail()):
    display(preview)

Unnamed: 0_level_0,date,country,store,product,num_sold
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2015-01-01,Finland,KaggleMart,Kaggle Mug,329
1,2015-01-01,Finland,KaggleMart,Kaggle Hat,520
2,2015-01-01,Finland,KaggleMart,Kaggle Sticker,146
3,2015-01-01,Finland,KaggleRama,Kaggle Mug,572
4,2015-01-01,Finland,KaggleRama,Kaggle Hat,911


Unnamed: 0_level_0,date,country,store,product,num_sold
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
26293,2018-12-31,Sweden,KaggleMart,Kaggle Hat,823
26294,2018-12-31,Sweden,KaggleMart,Kaggle Sticker,250
26295,2018-12-31,Sweden,KaggleRama,Kaggle Mug,1004
26296,2018-12-31,Sweden,KaggleRama,Kaggle Hat,1441
26297,2018-12-31,Sweden,KaggleRama,Kaggle Sticker,388


In [15]:
# Rows dated on or before the cutoff are used for fitting; later rows are
# held out for validation.
split_date = '2017-12-31'
on_or_before_split = store_data['date'] <= split_date
after_split = store_data['date'] > split_date
store_data_train = store_data[on_or_before_split].copy()
store_data_test = store_data[after_split].copy()

In [16]:
def create_time_features(df, date_key='date'):
    """
    Add calendar-derived feature columns to ``df`` in place.

    Author: Rob Mulla, Robin Onsay
    link: https://www.kaggle.com/robikscube/tutorial-time-series-forecasting-with-xgboost

    :param df: DataFrame holding a datetime column named ``date_key`` and a
        ``country`` column whose values are accepted by
        ``holidays.CountryHoliday``. The new columns are written onto this
        frame directly.
    :param date_key: name of the datetime column the features are derived from.
    :return: list with the names of the feature columns that were added.
    """
    dates = df[date_key].dt

    df['dayofweek'] = dates.dayofweek
    df['quarter'] = dates.quarter
    df['month'] = dates.month
    df['year'] = dates.year
    df['day_of_year'] = dates.dayofyear
    df['day_of_month'] = dates.day
    # isocalendar() yields a nullable UInt32 column; store it as plain int64.
    df['week_of_year'] = dates.isocalendar().week.astype(np.int64)

    # Build one holiday calendar per distinct country instead of one per row:
    # constructing the calendar is the expensive part, and a reused calendar
    # only has to populate each year once.
    calendars = {
        country: holidays.CountryHoliday(country)
        for country in df['country'].unique()
    }
    holiday_flags = [
        day in calendars[country]
        for day, country in zip(df[date_key], df['country'])
    ]
    # Explicit index/dtype keeps the column boolean even for an empty frame.
    df['is_holiday'] = pd.Series(holiday_flags, index=df.index, dtype=bool)

    df['is_month_start'] = dates.is_month_start
    df['is_month_end'] = dates.is_month_end
    df['is_quarter_start'] = dates.is_quarter_start
    df['is_quarter_end'] = dates.is_quarter_end
    df['is_year_start'] = dates.is_year_start
    df['is_year_end'] = dates.is_year_end
    df['is_leap_year'] = dates.is_leap_year

    return ['dayofweek', 'quarter',
            'month', 'year',
            'day_of_year', 'day_of_month',
            'week_of_year', 'is_holiday',
            'is_month_start', 'is_month_end',
            'is_quarter_start', 'is_quarter_end',
            'is_year_start', 'is_year_end',
            'is_leap_year']

In [17]:
# Add the calendar feature columns to the training split (the function writes
# them onto the frame it is given) and keep the returned column names.
time_features = create_time_features(store_data_train)
# Same columns for the held-out split; the returned name list is left as the
# cell's last expression.
create_time_features(store_data_test)

['dayofweek',
 'quarter',
 'month',
 'year',
 'day_of_year',
 'day_of_month',
 'week_of_year',
 'is_holiday',
 'is_month_start',
 'is_month_end',
 'is_quarter_start',
 'is_quarter_end',
 'is_year_start',
 'is_year_end',
 'is_leap_year']

In [18]:
# Model inputs: the derived calendar columns plus the three categorical columns.
features = time_features + ['country', 'store', 'product']

# Feature matrix / target pairs for the fitting and validation splits.
X_train, y_train = store_data_train[features], store_data_train['num_sold']
X_val, y_val = store_data_test[features], store_data_test['num_sold']
display(X_train.head())

Unnamed: 0_level_0,dayofweek,quarter,month,year,day_of_year,day_of_month,week_of_year,is_holiday,is_month_start,is_month_end,is_quarter_start,is_quarter_end,is_year_start,is_year_end,is_leap_year,country,store,product
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,3,1,1,2015,1,1,1,True,True,False,True,False,True,False,False,Finland,KaggleMart,Kaggle Mug
1,3,1,1,2015,1,1,1,True,True,False,True,False,True,False,False,Finland,KaggleMart,Kaggle Hat
2,3,1,1,2015,1,1,1,True,True,False,True,False,True,False,False,Finland,KaggleMart,Kaggle Sticker
3,3,1,1,2015,1,1,1,True,True,False,True,False,True,False,False,Finland,KaggleRama,Kaggle Mug
4,3,1,1,2015,1,1,1,True,True,False,True,False,True,False,False,Finland,KaggleRama,Kaggle Hat


In [19]:
def one_hot_encode(df):
    """
    Return a copy of ``df`` with its object-dtype columns one-hot encoded.

    The non-object columns come first, in their original order, followed by
    the indicator columns. Indicator columns are named by position ("0", "1",
    ...) and every column label is converted to ``str``.

    A fresh encoder is fitted on every call, so two frames only get matching
    indicator columns if they contain the same set of categories.

    :param df: input DataFrame; it is not modified.
    :return: new DataFrame indexed like ``df``.
    """
    is_object = df.dtypes == 'object'
    object_cols = list(is_object[is_object].index)

    # Nothing to encode: the encoder rejects a zero-column input, so just
    # return a copy with string column labels.
    if not object_cols:
        encoded = df.copy()
        encoded.columns = encoded.columns.astype(str)
        return encoded

    # scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the
    # old name; try the new keyword first and fall back for older releases.
    try:
        encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

    one_hot = pd.DataFrame(encoder.fit_transform(df[object_cols]),
                           index=df.index)
    remaining = df.drop(object_cols, axis=1)
    encoded = pd.concat([remaining, one_hot], axis=1)
    encoded.columns = encoded.columns.astype(str)
    return encoded

In [20]:
# One-hot encode the categorical columns of both splits; each call to
# one_hot_encode fits its own encoder on the frame it receives.
X_train, X_val = (one_hot_encode(split) for split in (X_train, X_val))
display(X_train)

Unnamed: 0_level_0,dayofweek,quarter,month,year,day_of_year,day_of_month,week_of_year,is_holiday,is_month_start,is_month_end,...,is_year_end,is_leap_year,0,1,2,3,4,5,6,7
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,3,1,1,2015,1,1,1,True,True,False,...,False,False,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,3,1,1,2015,1,1,1,True,True,False,...,False,False,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,3,1,1,2015,1,1,1,True,True,False,...,False,False,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,3,1,1,2015,1,1,1,True,True,False,...,False,False,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,3,1,1,2015,1,1,1,True,True,False,...,False,False,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19723,6,4,12,2017,365,31,52,True,False,True,...,True,False,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
19724,6,4,12,2017,365,31,52,True,False,True,...,True,False,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
19725,6,4,12,2017,365,31,52,True,False,True,...,True,False,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
19726,6,4,12,2017,365,31,52,True,False,True,...,True,False,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0


## Step 2. Train the Model

In [21]:
# NOTE(review): learning_rate is assigned here but is not passed to the
# RandomForestRegressor below — confirm whether it is still needed.
learning_rate = 0.001
# Fit a forest on the training split; only the seed is set explicitly.
forest_model = RandomForestRegressor(random_state=random_state)
forest_model.fit(X_train, y_train)
# Predict the held-out split so it can be scored in the next step.
predictions = forest_model.predict(X_val)

## Step 3. Validate the Model

In [22]:
def smape(A, F):
    """
    Symmetric mean absolute percentage error between actuals and forecasts.

    Computes ``100 / n * sum(2 * |F - A| / (|A| + |F|))``. A pair where both
    the actual and the forecast are 0 contributes 0 to the sum (a perfect
    prediction) instead of producing NaN from 0/0.

    Inputs are converted to float arrays, so values are paired by position;
    lists, NumPy arrays and pandas Series are all accepted.

    Thanks https://stackoverflow.com/questions/51444630/how-to-use-smape-evaluation-metric-on-train-dataset and https://en.wikipedia.org/wiki/Symmetric_mean_absolute_percentage_error

    :param A: actual values.
    :param F: forecast values, same length as ``A``.
    :return: SMAPE as a percentage.
    :raises ZeroDivisionError: if ``A`` is empty.
    """
    actual = np.asarray(A, dtype=float)
    forecast = np.asarray(F, dtype=float)

    numerator = 2 * np.abs(forecast - actual)
    denominator = np.abs(actual) + np.abs(forecast)
    # Divide only where the denominator is non-zero; the remaining entries
    # keep the 0 they were initialised with.
    terms = np.divide(numerator, denominator,
                      out=np.zeros_like(denominator),
                      where=denominator != 0)
    return 100 / len(actual) * np.sum(terms)

# Score the held-out predictions against the held-out targets.
print(f"SMAPE: {smape(y_val, predictions)}")

SMAPE: 8.953587466032578


## Step 4. Choose/Train the Complete Model

In [24]:
# Stack the training and validation splits back together so the final model
# is fitted on every labelled row.
X = pd.concat((X_train, X_val))
y = pd.concat((y_train, y_val))

# Fresh forest with the same seed as the validation run.
forest_model = RandomForestRegressor(random_state=random_state)
forest_model.fit(X, y)

RandomForestRegressor(random_state=3165)

## Step 5. Predict Test Values

In [25]:
# Load the unlabelled test rows indexed by row_id and parse their dates with
# the explicit %Y-%m-%d format.
test_df = pd.read_csv("data/test.csv", index_col='row_id')
test_df = test_df.assign(
    date=pd.to_datetime(test_df['date'], format='%Y-%m-%d')
)
display(test_df.head())
# Add the calendar feature columns; the returned name list is not needed here.
_ = create_time_features(test_df)

Unnamed: 0_level_0,date,country,store,product
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
26298,2019-01-01,Finland,KaggleMart,Kaggle Mug
26299,2019-01-01,Finland,KaggleMart,Kaggle Hat
26300,2019-01-01,Finland,KaggleMart,Kaggle Sticker
26301,2019-01-01,Finland,KaggleRama,Kaggle Mug
26302,2019-01-01,Finland,KaggleRama,Kaggle Hat


In [26]:
# Pick the model's input columns from the test frame and one-hot encode them.
test_X = one_hot_encode(test_df[features])
display(test_X.head())

Unnamed: 0_level_0,dayofweek,quarter,month,year,day_of_year,day_of_month,week_of_year,is_holiday,is_month_start,is_month_end,...,is_year_end,is_leap_year,0,1,2,3,4,5,6,7
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
26298,1,1,1,2019,1,1,1,True,True,False,...,False,False,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
26299,1,1,1,2019,1,1,1,True,True,False,...,False,False,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
26300,1,1,1,2019,1,1,1,True,True,False,...,False,False,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
26301,1,1,1,2019,1,1,1,True,True,False,...,False,False,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
26302,1,1,1,2019,1,1,1,True,True,False,...,False,False,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0


In [27]:
# Round the forest's predictions to whole numbers and store them as int64.
predictions = np.round(forest_model.predict(test_X))
test_df['num_sold'] = predictions.astype(np.int64)
display(test_df.head())

# Create the output directory first so the export does not fail on a fresh
# checkout where it does not exist yet.
submission_path = Path('predictions/random_forest_predictions.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
test_df['num_sold'].to_csv(submission_path)

Unnamed: 0_level_0,date,country,store,product,dayofweek,quarter,month,year,day_of_year,day_of_month,week_of_year,is_holiday,is_month_start,is_month_end,is_quarter_start,is_quarter_end,is_year_start,is_year_end,is_leap_year,num_sold
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
26298,2019-01-01,Finland,KaggleMart,Kaggle Mug,1,1,1,2019,1,1,1,True,True,False,True,False,True,False,False,385
26299,2019-01-01,Finland,KaggleMart,Kaggle Hat,1,1,1,2019,1,1,1,True,True,False,True,False,True,False,False,563
26300,2019-01-01,Finland,KaggleMart,Kaggle Sticker,1,1,1,2019,1,1,1,True,True,False,True,False,True,False,False,161
26301,2019-01-01,Finland,KaggleRama,Kaggle Mug,1,1,1,2019,1,1,1,True,True,False,True,False,True,False,False,641
26302,2019-01-01,Finland,KaggleRama,Kaggle Hat,1,1,1,2019,1,1,1,True,True,False,True,False,True,False,False,928
