# Libraries

In [None]:
import numpy as np
import pandas as pd

pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 999)

import matplotlib.pyplot as plt
import seaborn as sns

import itertools
import math

import h2o
from h2o.automl import H2OAutoML

# Load the data

In [None]:
# Competition training data; `date` is parsed so the .dt accessor can be used later.
train = pd.read_csv('../input/tabular-playground-series-jan-2022/train.csv')
train['date'] = pd.to_datetime(train['date'])
# DataFrame.info() prints its report itself and returns None, so it is called
# directly instead of being wrapped in print() (which would emit a stray "None").
train.info()
train.head()

In [None]:
# Competition test data; `date` is parsed the same way as for the training frame.
test = pd.read_csv('../input/tabular-playground-series-jan-2022/test.csv')
test['date'] = pd.to_datetime(test['date'])
# DataFrame.info() prints its report itself and returns None, so it is called
# directly instead of being wrapped in print() (which would emit a stray "None").
test.info()
test.head()

In [None]:
holiday_df = pd.read_csv('../input/public-and-unofficial-holidays-nor-fin-swe-201519/holidays.csv')
# Parse `date` like every other frame in this notebook: it is later matched
# against the datetime `date` columns of train/test, and comparing datetimes
# with raw strings depends on implicit casting.
holiday_df['date'] = pd.to_datetime(holiday_df['date'])
# DataFrame.info() prints its report itself and returns None, so it is called
# directly instead of being wrapped in print() (which would emit a stray "None").
holiday_df.info()
holiday_df.head()

In [None]:
# GDP table for Finland, Norway and Sweden, 2015-2019 (per the file name).
gdp_df = pd.read_csv('../input/gdp-20152019-finland-norway-and-sweden/GDP_data_2015_to_2019_Finland_Norway_Sweden.csv')
# DataFrame.info() prints its report itself and returns None, so it is called
# directly instead of being wrapped in print() (which would emit a stray "None").
gdp_df.info()
gdp_df.head()

In [None]:
# Weather table; `date` is parsed so it can be joined to train/test on
# (country, date) with matching dtypes.
weather_df = pd.read_csv('../input/finland-norway-and-sweden-weather-data-20152019/nordics_weather.csv')
weather_df['date'] = pd.to_datetime(weather_df['date'])
# DataFrame.info() prints its report itself and returns None, so it is called
# directly instead of being wrapped in print() (which would emit a stray "None").
weather_df.info()
weather_df.head()

# Competition Metric
https://www.kaggle.com/cpmpml/smape-weirdness

In [None]:
def SMAPE(y_true, y_pred):
    """Symmetric mean absolute percentage error on a 0-200 scale.

    Each term is ``200 * |y_true - y_pred| / (|y_true| + |y_pred|)``; a term
    whose denominator is zero (both values are zero) counts as 0.

    Parameters
    ----------
    y_true, y_pred : array-like or scalar
        Actual and predicted values. Inputs are converted to float arrays and
        compared by position (pandas indexes are ignored); they must be
        broadcastable against each other.

    Returns
    -------
    float
        Mean of the per-element terms (NaN for empty input).
    """
    actual = np.atleast_1d(np.asarray(y_true, dtype=float))
    predicted = np.atleast_1d(np.asarray(y_pred, dtype=float))

    abs_error = np.abs(actual - predicted)
    # Use |y_true| as well as |y_pred| so that negative actuals cannot shrink
    # or cancel the denominator.
    denominator = (np.abs(actual) + np.abs(predicted)) / 200.0

    # Divide only where the denominator is non-zero; the remaining terms keep
    # the 0.0 they were initialised with, and no divide warnings are raised.
    diff = np.divide(abs_error, denominator,
                     out=np.zeros_like(abs_error),
                     where=denominator != 0)
    return np.mean(diff)

# Summary

In [None]:
# Overview plot (currently disabled): num_sold over date as line charts,
# one row per country and one column per store, coloured by product.
# sns.relplot(data=train, x='date', y='num_sold', row='country', col='store', hue='product',
#             aspect=3, height=2.5, kind='line')

# Baseline

In [None]:
# Connect to (or start) the H2O server used by every AutoML run below,
# requesting a minimum of 14 GB of memory for it.
h2o.init(min_mem_size="14G")

In [None]:
# Hand the raw pandas frames to H2O for the baseline run.
H_train, H_test = h2o.H2OFrame(train), h2o.H2OFrame(test)

In [None]:
target = 'num_sold'

# Predictors: every column except the target, the row identifier and the raw date.
features = [col for col in train.columns if col not in (target, 'row_id', 'date')]

# H2OAutoML is already imported in the notebook's import cell, so it is not
# imported a second time here.
# Short 2-minute budget: this run only establishes a baseline leaderboard.
aml = H2OAutoML(max_runtime_secs=120,
                seed=42,
                stopping_metric='MAE')

aml.train(x=features,
          y=target,
          training_frame=H_train)

lb = aml.leaderboard
lb.head(5)

# Feature Engineering


## Weather
- Dataset: https://www.kaggle.com/adamwurdits/finland-norway-and-sweden-weather-data-20152019
- EDA notebook: https://www.kaggle.com/adamwurdits/tps-01-2022-weather-eda

In [None]:
def _report(frame):
    """Print a frame's shape followed by its total number of missing values."""
    print(frame.shape, frame.isna().sum().sum())


# Weather is joined per country and day; a left join keeps every sales row.
weather_keys = ['country', 'date']

_report(train)
train = train.merge(weather_df, how='left', on=weather_keys)
_report(train)

_report(test)
test = test.merge(weather_df, how='left', on=weather_keys)
_report(test)

In [None]:
# Re-create the H2O frames from the current pandas frames (now with weather columns).
H_train, H_test = h2o.H2OFrame(train), h2o.H2OFrame(test)

# Predictors: everything except the target, the row identifier and the raw date.
excluded = {target, 'row_id', 'date'}
features = [col for col in train.columns if col not in excluded]

# 6-minute AutoML run to measure the effect of the weather features.
automl_settings = dict(max_runtime_secs=360, seed=42, stopping_metric='MAE')
aml = H2OAutoML(**automl_settings)
aml.train(x=features, y=target, training_frame=H_train)

lb = aml.leaderboard
lb.head(5)

## Holidays
https://www.kaggle.com/maxencefzr/tps-jan22-catboost-using-pycaret


In [None]:
# https://www.kaggle.com/maxencefzr/tps-jan22-catboost-using-pycaret
import dateutil.easter as easter

def holiday_features(holiday_df, df):
    """Add holiday-related features to ``df`` in place and return it.

    Parameters
    ----------
    holiday_df : pandas.DataFrame
        Holiday calendar with a ``country`` column and a ``date`` column
        (strings or datetimes). It is not modified.
    df : pandas.DataFrame
        Frame with a ``country`` column and a datetime ``date`` column.
        It is MODIFIED IN PLACE.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with these columns added:

        * ``holiday`` - 1 when the row's date is a holiday in the row's own
          country (Finland, Sweden or Norway), otherwise 0.
        * ``days_from_easter`` - days since Easter Sunday of the row's year,
          clipped to [-5, 65].
        * ``days_from_wed_jun`` - days since the last Wednesday of June,
          clipped to [-5, 5].
        * ``days_from_sun_nov`` - days since the first Sunday of November,
          clipped to [-1, 9].

    Notes
    -----
    The June and November anchor dates are tabulated for 2015-2019 only; rows
    from other years get missing values in those two columns.
    """
    # Convert explicitly so that matching against the datetime `df.date` does
    # not rely on pandas implicitly casting strings inside `isin`.
    holiday_dates = pd.to_datetime(holiday_df['date'])

    # A date counts as a holiday only when it is one in the row's own country.
    df['holiday'] = 0
    for country in ('Finland', 'Sweden', 'Norway'):
        country_dates = holiday_dates[holiday_df['country'] == country]
        is_holiday = (df['country'] == country) & df['date'].isin(country_dates)
        df.loc[is_holiday, 'holiday'] = 1

    years = df['date'].dt.year

    # Easter: computed once per distinct year instead of once per row.
    easter_by_year = {int(year): pd.Timestamp(easter.easter(int(year)))
                      for year in years.unique()}
    easter_date = years.map(easter_by_year)
    df['days_from_easter'] = (df['date'] - easter_date).dt.days.clip(-5, 65)

    # A 'days_from_sun_may' feature (last Sunday of May, Mother's Day) was
    # already disabled in this function; its unused date table is dropped and
    # the feature is intentionally not produced.

    # Last Wednesday of June
    wed_june_date = years.map({
        2015: pd.Timestamp('2015-06-24'),
        2016: pd.Timestamp('2016-06-29'),
        2017: pd.Timestamp('2017-06-28'),
        2018: pd.Timestamp('2018-06-27'),
        2019: pd.Timestamp('2019-06-26'),
    })
    df['days_from_wed_jun'] = (df['date'] - wed_june_date).dt.days.clip(-5, 5)

    # First Sunday of November (second Sunday is Father's Day)
    sun_nov_date = years.map({
        2015: pd.Timestamp('2015-11-01'),
        2016: pd.Timestamp('2016-11-06'),
        2017: pd.Timestamp('2017-11-05'),
        2018: pd.Timestamp('2018-11-04'),
        2019: pd.Timestamp('2019-11-03'),
    })
    df['days_from_sun_nov'] = (df['date'] - sun_nov_date).dt.days.clip(-1, 9)

    return df

# Add the holiday features to both frames, printing shape and total missing
# values before and after each call.
frames = {'train': train, 'test': test}
for name in ('train', 'test'):
    frame = frames[name]
    print(frame.shape, frame.isna().sum().sum())
    frame = holiday_features(holiday_df, frame)
    print(frame.shape, frame.isna().sum().sum())
    frames[name] = frame

train, test = frames['train'], frames['test']

In [None]:
# Re-create the H2O frames from the current pandas frames (now with holiday columns).
H_train, H_test = h2o.H2OFrame(train), h2o.H2OFrame(test)

# Predictors: everything except the target, the row identifier and the raw date.
excluded = {target, 'row_id', 'date'}
features = [col for col in train.columns if col not in excluded]

# 6-minute AutoML run to measure the effect of the holiday features.
automl_settings = dict(max_runtime_secs=360, seed=42, stopping_metric='MAE')
aml = H2OAutoML(**automl_settings)
aml.train(x=features, y=target, training_frame=H_train)

lb = aml.leaderboard
lb.head(5)

## Date features

In [None]:
def new_date_features(df):
    """Add calendar features derived from ``df['date']`` in place and return ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime ``date`` column without missing values.
        It is MODIFIED IN PLACE; the ``date`` column itself is kept.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with these columns added: ``year``, ``quarter``,
        ``month``, ``week``, ``day``, ``weekday``, ``day_of_week``,
        ``day_of_year``, ``week_of_year``, ``day_of_month``, ``is_weekend``,
        ``is_friday``.

    Notes
    -----
    * ``week`` and ``week_of_year`` both hold the ISO week number.
    * ``weekday`` and ``day_of_week`` are identical (Monday=0 ... Sunday=6).
    * ``day_of_month`` holds the NUMBER OF DAYS in the row's month
      (``days_in_month``), not the day number; the column name is kept so
      existing feature lists stay valid.
    """
    dates = df['date'].dt

    # Series.dt.week / Series.dt.weekofyear were deprecated in pandas 1.1 and
    # removed in 2.0; isocalendar().week is the replacement. It returns a
    # nullable UInt32 column, so cast back to the plain integer dtype the old
    # accessors produced.
    iso_week = dates.isocalendar().week.astype('int64')

    df['year'] = dates.year
    df['quarter'] = dates.quarter
    df['month'] = dates.month
    df['week'] = iso_week
    df['day'] = dates.day
    df['weekday'] = dates.weekday
    df['day_of_week'] = dates.dayofweek
    df['day_of_year'] = dates.dayofyear
    df['week_of_year'] = iso_week
    df['day_of_month'] = dates.days_in_month
    # Saturday (5) and Sunday (6)
    df['is_weekend'] = np.where(df['weekday'].isin([5, 6]), 1, 0)
    df['is_friday'] = np.where(df['weekday'] == 4, 1, 0)

    return df
    
# Add the calendar features to both frames, printing shape and total missing
# values before and after each call.
frames = {'train': train, 'test': test}
for name in ('train', 'test'):
    frame = frames[name]
    print(frame.shape, frame.isna().sum().sum())
    frame = new_date_features(frame)
    print(frame.shape, frame.isna().sum().sum())
    frames[name] = frame

train, test = frames['train'], frames['test']

In [None]:
# Re-create the H2O frames from the current pandas frames (now with calendar columns).
H_train, H_test = h2o.H2OFrame(train), h2o.H2OFrame(test)

# Predictors: everything except the target, the row identifier and the raw date.
excluded = {target, 'row_id', 'date'}
features = [col for col in train.columns if col not in excluded]

# 6-minute AutoML run to measure the effect of the calendar features.
automl_settings = dict(max_runtime_secs=360, seed=42, stopping_metric='MAE')
aml = H2OAutoML(**automl_settings)
aml.train(x=features, y=target, training_frame=H_train)

lb = aml.leaderboard
lb.head(5)

# H2O Final

In [None]:
H_train = h2o.H2OFrame(train)
H_test = h2o.H2OFrame(test)

target = 'num_sold'

# NOTE(review): unlike the earlier experiment cells, 'date' is NOT excluded
# here, so the raw date column is offered to AutoML as a predictor - confirm
# this is intended.
features = [col for col in train.columns if col not in (target, 'row_id')]

# Cross-validation folds follow the calendar year: fold id = year - 2015.
# 'fold' is added after `features` is built, so it is never used as a predictor.
H_train['fold'] = H_train['year'] - 2015
H_train.summary()

# H2OAutoML is already imported in the notebook's import cell, so it is not
# imported a second time here.
aml = H2OAutoML(max_runtime_secs=10 * 3600,  # 10-hour budget for the final run
                seed=42,
                stopping_metric='MAE')

aml.train(x=features,
          y=target,
          training_frame=H_train,
          fold_column='fold')

lb = aml.leaderboard
lb.head()

# Submission

In [None]:
# Predict on the test frame, round the predictions to whole numbers and write
# the (row_id, num_sold) pairs to the submission file.
predictions = aml.predict(H_test).as_data_frame().values
test['num_sold'] = np.round(predictions)

submission_columns = ['row_id', 'num_sold']
submission = test[submission_columns].copy()
submission.to_csv('submission.csv', index=False)