In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.ensemble import RandomForestRegressor


import os
# Print the full path of every file found below the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for path in (os.path.join(dirname, name) for name in filenames):
        print(path)

In [None]:
# Competition data; `date` is parsed up front so the `.dt` accessor works later on.
INPUT_DIR = '/kaggle/input/tabular-playground-series-jan-2022'

train = pd.read_csv(f'{INPUT_DIR}/train.csv', parse_dates=['date'])
test = pd.read_csv(f'{INPUT_DIR}/test.csv', parse_dates=['date'])

# NOTE: the pre-built nordic_holidays.csv is no longer read here. The frame it
# was bound to (`holidays`) was never used: the name is rebound by
# `import holidays` and then by the holiday table built in the notebook itself,
# so the read only added a hard dependency on a second dataset.

In [None]:
def train_test_split(df, n):
    """Chronological hold-out split.

    Rows are ordered by ``date``, ``country`` and ``store``; the last ``n``
    rows become the second part and everything before them the first part.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``date``, ``country`` and ``store``.
    n : int or float
        Number of rows to hold out. A ``float`` is interpreted as a fraction
        of the total row count (truncated to an integer).

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(head, tail)`` slices of the sorted frame.
    """
    ordered = df.sort_values(['date', 'country', 'store'])
    total = ordered.shape[0]
    holdout = int(total * n) if isinstance(n, float) else n
    cut = total - holdout
    return ordered.iloc[0:cut], ordered.iloc[cut:]

def split_X_y(df, target='num_sold'):
    """Separate a frame into features and target.

    A ``date`` column, when present, is removed before splitting.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is the frame without ``target`` (and without
        ``date``) and ``y`` is the ``target`` column.
    """
    frame = df.drop('date', axis=1) if 'date' in df.columns else df
    features = frame.drop(target, axis=1)
    return features, frame[target]

def add_datepart(X, feat, drop=True):
    """Expand a date column into numeric calendar features.

    Adds the columns ``year``, ``month``, ``day``, ``dayofweek``, ``quarter``
    and ``dayofyear``, taken from the ``.dt`` accessor of ``X[feat]``.

    Parameters
    ----------
    X : pandas.DataFrame
        Input frame. It is not modified; the result is a new frame.
    feat : str
        Name of the column holding the dates (converted with
        ``pd.to_datetime``).
    drop : bool, default True
        Remove ``feat`` from the result once the parts are extracted.

    Returns
    -------
    pandas.DataFrame
        Copy of ``X`` with the calendar columns added.
    """
    # Work on a copy: previously the caller's frame was modified in place even
    # when drop=True returned a different object, and re-running a cell
    # operated on already-altered data.
    X = X.copy()
    X[feat] = pd.to_datetime(X[feat])
    for attr in ('year', 'month', 'day', 'dayofweek', 'quarter', 'dayofyear'):
        X[attr] = getattr(X[feat].dt, attr)
    if drop:
        X = X.drop(columns=feat)
    return X

def encode_cat(X, encodings=None):
    """Convert text/categorical columns to ``category`` dtype with shared categories.

    Parameters
    ----------
    X : pandas.DataFrame
        Input frame. It is not modified; the result is a new frame.
    encodings : dict, optional
        Mapping ``column name -> categories``. For a column found in the
        mapping the stored categories are applied (values outside them become
        missing); for any other encoded column the categories found in ``X``
        are recorded in the mapping. The dict passed in is updated in place.

    Returns
    -------
    tuple of (pandas.DataFrame, dict)
        The encoded copy of ``X`` and the (possibly extended) ``encodings``.
    """
    if encodings is None:
        encodings = {}
    X = X.copy()  # do not modify the caller's frame
    # `.items()` instead of `.iteritems()`, which was removed in pandas 2.0.
    for col, dtype in X.dtypes.items():
        is_text = dtype == 'object' or isinstance(dtype, pd.StringDtype)
        # isinstance() is required here: comparing a dtype instance with the
        # CategoricalDtype class itself is never true, so columns that were
        # already categorical used to be skipped.
        if is_text or isinstance(dtype, pd.CategoricalDtype):
            X[col] = X[col].astype('category')
            if col in encodings:
                X[col] = X[col].cat.set_categories(encodings[col])
            else:
                encodings[col] = X[col].cat.categories
    return X, encodings

def convert_cats(X):
    """Replace every ``category`` column by its integer category codes.

    Missing values get the code ``-1`` (pandas' convention for ``cat.codes``).
    The input frame is not modified; a converted copy is returned.
    """
    X = X.copy()  # do not modify the caller's frame
    # `.items()` instead of `.iteritems()`, which was removed in pandas 2.0.
    for col, dtype in X.dtypes.items():
        if isinstance(dtype, pd.CategoricalDtype):
            X[col] = X[col].cat.codes
    return X


def print_scores(model, X_train, y_train, X_val, y_val, scorer):
    """Print a list of evaluation scores for a fitted model.

    The printed list contains, in order: ``model.score`` on the training data,
    ``scorer(y_train, predictions)`` on the training data, the same two values
    for the validation data, and ``model.oob_score_`` when the model has that
    attribute.
    """
    results = []
    for features, target in ((X_train, y_train), (X_val, y_val)):
        results.append(model.score(features, target))
        results.append(scorer(target, model.predict(features)))

    if hasattr(model, 'oob_score_'):
        results += [model.oob_score_]

    print(results)
    
    
    
def SMAPE_score(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent.

    Computes ``100 / n * sum(|y_true - y_pred| / ((|y_true| + |y_pred|) / 2))``.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values, compared position by position.

    Returns
    -------
    float
        The SMAPE value. A pair where both values are zero contributes 0
        instead of turning the whole score into NaN.
    """
    # Plain arrays: two pandas Series with different indexes would otherwise
    # be aligned on their labels rather than compared position by position.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    abs_error = np.abs(y_true - y_pred)
    mean_magnitude = (np.abs(y_true) + np.abs(y_pred)) / 2
    # Divide only where the denominator is non-zero; 0/0 terms stay at 0.
    ratio = np.divide(abs_error, mean_magnitude,
                      out=np.zeros_like(abs_error),
                      where=mean_magnitude != 0)
    return (100 / len(y_true)) * np.sum(ratio)

In [None]:
# Add the calendar features to the training data; `date` is kept because the
# holiday merge further down joins on it.
train = add_datepart(train, feat='date', drop=False)
train.head()

In [None]:
# The package is imported under an alias so that the name `holidays` refers to
# one thing only: the holiday DataFrame assembled at the end of this cell.
import holidays as holidays_lib
import dateutil.easter as easter

# Years for which holiday rows are generated, and the countries that receive
# the hand-made events below.
HOLIDAY_YEARS = [2014, 2015, 2016, 2017, 2018, 2019, 2020]
COUNTRIES = ['Finland', 'Sweden', 'Norway']

# Rows of [date, holiday name, country]. The order of insertion matters: when
# several events share a date and country only the first one inserted is kept.
holiday_list = []

# Public holidays as provided by the `holidays` package.
for country, calendar in (("Finland", holidays_lib.Finland), ("Norway", holidays_lib.Norway)):
    for day, name in calendar(years=HOLIDAY_YEARS, observed=True).items():
        holiday_list.append([day, name, country])

for day, name in holidays_lib.Sweden(years=HOLIDAY_YEARS, observed=True).items():
    # Entries named just 'Söndag' (Swedish for Sunday) are skipped; on other
    # entries a trailing ", Söndag" is removed from the name.
    if name != 'Söndag':
        holiday_list.append([day, name.replace(", Söndag", ""), "Sweden"])

# Last week of the year (24-31 December)
for year in HOLIDAY_YEARS:
    for day_number, day in enumerate(range(24, 32), start=1):
        for country in COUNTRIES:
            holiday_list.append([pd.Timestamp(year, 12, day).date(),
                                 f"Last week of the year (day {day_number})",
                                 country])

# Swedish Rock Concert: (first day, last day, year), all in June
for start, end, year in [[4, 7, 2014], [3, 6, 2015], [8, 11, 2016], [7, 10, 2017], [6, 10, 2018], [5, 8, 2019]]:
    for day_number, day in enumerate(range(start, end + 1), start=1):
        holiday_list.append([pd.Timestamp(year, 6, day).date(),
                             f"Swedish Rock Concert (day {day_number})",
                             "Sweden"])

# Last Wednesday of June
for date in ['2014-06-25', '2015-06-24', '2016-06-29', '2017-06-28', '2018-06-27', '2019-06-26', '2020-06-24']:
    for country in COUNTRIES:
        holiday_list.append([pd.to_datetime(date).date(),
                             "Last Wednesday of June",
                             country])

# First Sunday of November
for date in ['2014-11-02', '2015-11-1', '2016-11-6', '2017-11-5', '2018-11-4', '2019-11-3', '2020-11-01']:
    for country in COUNTRIES:
        holiday_list.append([pd.to_datetime(date).date(),
                             "First Sunday of November",
                             country])

# Independence Day of Finland (6 December)
for year in HOLIDAY_YEARS:
    holiday_list.append([pd.Timestamp(year, 12, 6).date(),
                         "Independence Day of Finland",
                         'Finland'])

# Easter
easter_date = [easter.easter(y) for y in HOLIDAY_YEARS]
for date in easter_date:
    for country in COUNTRIES:
        holiday_list.append([pd.to_datetime(date).date(),
                             "Easter",
                             country])

# One row per (date, country), ordered by date, with `date` as datetime64 so
# it can be joined against the parsed `date` column of train/test.
holidays = pd.DataFrame(holiday_list, columns=['date', 'holiday', 'country'])
holidays = holidays.drop_duplicates(['date', 'country'], keep='first')
holidays = holidays.sort_values(['date', 'country'])
holidays['date'] = pd.to_datetime(holidays['date'])

In [None]:
def add_time_since(train, holiday, holiday_column, early_year, holiday_table=None):
    """Add the number of days elapsed since the most recent matching holiday.

    Rows are scanned per country in ascending date order. A row counts as a
    holiday when ``train[holiday_column] == holiday``. Before the first
    holiday row of a country is reached, the reference date is the latest
    matching date of ``early_year`` in ``holiday_table``; if there is none,
    the feature is 0 until a holiday row is seen.

    Parameters
    ----------
    train : pandas.DataFrame
        Needs the columns ``date`` (datetime), ``country`` and
        ``holiday_column``. It is not modified.
    holiday : object
        Value of ``holiday_column`` that marks a holiday (e.g. ``True``).
    holiday_column : str
        Column present in both ``train`` and ``holiday_table``.
    early_year : int
        Year of ``holiday_table`` used to seed the scan.
    holiday_table : pandas.DataFrame, optional
        Table with ``date``, ``country`` and ``holiday_column`` columns.
        Defaults to the notebook-level ``holidays`` frame.

    Returns
    -------
    pandas.DataFrame
        Copy of ``train`` sorted by ``date`` and ``country`` (original index
        labels kept) with an integer column ``days_since_{holiday}``.
    """
    if holiday_table is None:
        holiday_table = holidays  # notebook-level table, looked up at call time

    train = train.sort_values(['date', 'country']).copy()

    # Seed: latest matching holiday of `early_year` for every country.
    latest_date = {}
    for country in train['country'].unique():
        seed = holiday_table.loc[(holiday_table[holiday_column] == holiday) &
                                 (holiday_table['date'].dt.year == early_year) &
                                 (holiday_table['country'] == country), 'date']
        # max() rather than "last row", so the result does not depend on the
        # table being sorted by date.
        latest_date[country] = seed.values.max() if len(seed) else None

    no_gap = np.timedelta64(0, 'ns')
    elapsed = []
    matches = (train[holiday_column] == holiday).values

    for date, country, is_holiday in zip(train['date'].values, train['country'].values, matches):
        if is_holiday:
            latest_date[country] = date

        last_seen = latest_date[country]
        # `is None`, not `!= None`: the stored values are numpy datetimes.
        # Every entry is a timedelta so the column has a single dtype.
        elapsed.append(no_gap if last_seen is None else date - last_seen)

    # Whole days. `astype('timedelta64[D]')` is avoided because pandas >= 2.0
    # rejects day resolution for timedelta columns.
    train[f'days_since_{holiday}'] = np.asarray(pd.to_timedelta(elapsed).days)

    return train
        
def add_time_until(train, holiday, holiday_column, latest_year, holiday_table=None):
    """Add the number of days remaining until the next matching holiday.

    Rows are scanned per country in descending date order. A row counts as a
    holiday when ``train[holiday_column] == holiday``. Before the first
    holiday row of a country is reached (i.e. for the latest dates), the
    reference date is the earliest matching date of ``latest_year`` in
    ``holiday_table``; if there is none, the feature is 0 until a holiday row
    is seen.

    Parameters
    ----------
    train : pandas.DataFrame
        Needs the columns ``date`` (datetime), ``country`` and
        ``holiday_column``. It is not modified.
    holiday : object
        Value of ``holiday_column`` that marks a holiday (e.g. ``True``).
    holiday_column : str
        Column present in both ``train`` and ``holiday_table``.
    latest_year : int
        Year of ``holiday_table`` used to seed the scan.
    holiday_table : pandas.DataFrame, optional
        Table with ``date``, ``country`` and ``holiday_column`` columns.
        Defaults to the notebook-level ``holidays`` frame.

    Returns
    -------
    pandas.DataFrame
        Copy of ``train`` sorted by ``date`` and ``country`` in DESCENDING
        order (original index labels kept) with an integer column
        ``days_until_{holiday}``.
    """
    if holiday_table is None:
        holiday_table = holidays  # notebook-level table, looked up at call time

    train = train.sort_values(['date', 'country'], ascending=False).copy()

    # Seed: earliest matching holiday of `latest_year` for every country.
    upcoming_date = {}
    for country in train['country'].unique():
        seed = holiday_table.loc[(holiday_table[holiday_column] == holiday) &
                                 (holiday_table['date'].dt.year == latest_year) &
                                 (holiday_table['country'] == country), 'date']
        # min() rather than "first row", so the result does not depend on the
        # table being sorted by date.
        upcoming_date[country] = seed.values.min() if len(seed) else None

    no_gap = np.timedelta64(0, 'ns')
    remaining = []
    matches = (train[holiday_column] == holiday).values

    for date, country, is_holiday in zip(train['date'].values, train['country'].values, matches):
        if is_holiday:
            upcoming_date[country] = date

        next_seen = upcoming_date[country]
        # `is None`, not `!= None`: the stored values are numpy datetimes.
        # Every entry is a timedelta so the column has a single dtype.
        remaining.append(no_gap if next_seen is None else next_seen - date)

    # Whole days. `astype('timedelta64[D]')` is avoided because pandas >= 2.0
    # rejects day resolution for timedelta columns.
    train[f'days_until_{holiday}'] = np.asarray(pd.to_timedelta(remaining).days)

    return train

In [None]:
# Attach the holiday name (NaN on ordinary days) to every training row.
train = train.merge(holidays, how='left', on=['country', 'date'])

# Every row of the holiday table is a holiday by construction; the flag is
# what add_time_since / add_time_until filter the table on.
holidays['is_holiday'] = True
train['is_holiday'] = train['holiday'].notna()

In [None]:
# Distance features for the training data, seeded from the holiday table's
# 2014 entries (looking back) and 2019 entries (looking ahead).
train = add_time_since(train, holiday=True, holiday_column='is_holiday', early_year=2014)
train = add_time_until(train, holiday=True, holiday_column='is_holiday', latest_year=2019)

In [None]:
# Fit the category encodings on the training data; `encodings` is reused for
# the test set so both share the same categories.
train, encodings = encode_cat(train, encodings=None)
train.head()

In [None]:
# Hold out as many of the latest training rows as there are test rows.
dev, val = train_test_split(train, n=len(test))
(dev.shape, val.shape)

In [None]:
# Features / target for both parts, then category columns -> integer codes.
(X_train, y_train), (X_val, y_val) = split_X_y(dev), split_X_y(val)

X_train, X_val = convert_cats(X_train), convert_cats(X_val)

In [None]:
# Train on the natural log of the target; predictions are mapped back with
# np.exp when the submission is built.
y_train, y_val = np.log(y_train), np.log(y_val)

In [None]:
# Check that validation doesn't overlap with training set:
# every validation row_id must be larger than the largest training row_id.

X_val['row_id'].gt(X_train['row_id'].max()).all()

In [None]:
%%time

model = RandomForestRegressor(n_estimators=1000, min_samples_leaf=50, n_jobs=-1)
model.fit(X_train, y_train)

print_scores(model, X_train, y_train, X_val, y_val, SMAPE_score)

# Submission

In [None]:
# Peek at the raw test data before feature engineering.
test.head(n=5)

In [None]:
# Same calendar features as for the training data; `date` is kept for the merge.
test = add_datepart(test, feat='date', drop=False)

In [None]:
# Holiday name per test row, then the flag derived from it.
test = test.merge(holidays, how='left', on=['country', 'date'])
test['is_holiday'] = test['holiday'].notna()

# Distance features, seeded from the holiday table's 2018 entries (looking
# back) and 2020 entries (looking ahead).
test = add_time_since(test, holiday=True, holiday_column='is_holiday', early_year=2018)
test = add_time_until(test, holiday=True, holiday_column='is_holiday', latest_year=2020)

In [None]:
# Apply the categories learned on the training data to the test set.
test, _ = encode_cat(test, encodings=encodings)

In [None]:
# Category columns -> integer codes, as done for the training features.
test = convert_cats(X=test)
test.head()

In [None]:
# The raw date is not a model feature (it was removed from the training features too).
test = test.drop(columns='date')

In [None]:
# Predictions are on the log scale the model was trained on.
preds = model.predict(X=test)

In [None]:
# row_id straight from the test frame (so rows stay aligned with `preds`),
# predictions mapped back from log scale with np.exp.
submission = test[['row_id']].assign(num_sold=np.exp(preds))

submission.head()

In [None]:
# Write the submission file without the DataFrame index.
submission.to_csv(path_or_buf='submission.csv', index=False)