In [None]:
# Install the `holidays` package, which this notebook imports further down.
!pip install holidays

In [None]:
# Download the GDP CSV (2015-2019, Finland/Norway/Sweden) into the working directory;
# the data-loading cell reads it back from './GDP_data_2015_to_2019_Finland_Norway_Sweden.csv'.
!wget https://gist.githubusercontent.com/creotiv/a9385c95afa076240144a447e050f572/raw/8d5d4326f6c3db12b642a7218a1a1c66b6d6911b/GDP_data_2015_to_2019_Finland_Norway_Sweden.csv

In [None]:
import warnings
warnings.filterwarnings('ignore')
import math
import os
import dateutil.easter as easter
from datetime import datetime, date, timedelta
from collections import defaultdict

import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from scipy.signal import periodogram

import pandas as pd
import numpy as np
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import GroupKFold
from sklearn.linear_model import RidgeCV

import holidays

In [None]:
# Directory holding the competition CSVs (Kaggle input mount).
DIR = "/kaggle/input/tabular-playground-series-jan-2022"

# Read the train and test splits from the same directory.
train, test = (
    pd.read_csv(os.path.join(DIR, csv_name))
    for csv_name in ('train.csv', 'test.csv')
)

# GDP table indexed by year so it can be looked up with gdp.loc[year, column].
gdp = pd.read_csv(
    './GDP_data_2015_to_2019_Finland_Norway_Sweden.csv'
).set_index('year')

In [None]:
def find_scale(y, y_t, model, preproc, limits=(0.8, 1.2), threshold=0, step=0.01, correction=False):
    """Grid-search a multiplicative factor for the predictions that minimises SMAPE.

    Relies on two module-level names: the global ``features`` (columns of ``y``
    fed to the model) and the function ``smape_loss``.

    Parameters
    ----------
    y : DataFrame
        Feature rows to predict on; ``y[features]`` is passed to ``preproc.transform``.
    y_t : DataFrame
        Frame with the true targets in column ``'num_sold'``.
    model, preproc
        Fitted estimator (``predict`` returns log-targets) and fitted transformer.
    limits : (low, high)
        Inclusive range of scale factors to try.
    threshold : float
        When ``correction`` is falsy, only predictions strictly above this
        value are scaled; the rest are left untouched.
    step : float
        Grid spacing; must be positive.
    correction : bool
        When truthy, every prediction is scaled regardless of ``threshold``.

    Returns
    -------
    (best_score, best_scale)
        The lowest SMAPE found and the scale that produced it (first minimum wins).

    Raises
    ------
    ValueError
        If ``step`` is not positive or ``limits`` describes an empty range.
    """
    lower, upper = limits
    if step <= 0:
        raise ValueError("step must be positive")

    # Derive the grid from an integer step count instead of accumulating
    # `i += step`: accumulated rounding error could push the last value just
    # above `upper` and silently drop the upper limit from the search.
    n_steps = int(np.floor((upper - lower) / step + 1e-9)) + 1
    if n_steps < 1:
        raise ValueError("limits must satisfy limits[0] <= limits[1]")
    scales = [lower + k * step for k in range(n_steps)]

    # The unscaled prediction does not depend on the scale, so compute it once.
    base_pred = np.ravel(np.exp(model.predict(preproc.transform(y[features]))))

    scores = []
    for scale in scales:
        if correction:
            pred = base_pred * scale
        else:
            pred = np.where(base_pred > threshold, base_pred * scale, base_pred)
        # A fresh Series with a default RangeIndex, like the original scratch frame.
        pred = pd.Series(pred).round()
        scores.append(np.mean(smape_loss(y_t['num_sold'], pred)))

    best = int(np.argmin(scores))
    print("SMAPE Best", scores[best], scales[best])
    return scores[best], scales[best]

def smape_loss(y_true, y_pred):
    """Mean symmetric absolute percentage error, on a 0-200 scale.

    Each element contributes ``200 * |y_true - y_pred| / (|y_true| + |y_pred|)``.
    An element where truth and prediction are both zero contributes 0 instead
    of the NaN that 0/0 would produce, so one such row no longer turns the
    whole score into NaN.

    Accepts NumPy arrays or pandas objects and returns ``.mean()`` of the
    element-wise values (a scalar for arrays and Series).
    """
    abs_err = np.abs(y_true - y_pred)
    denom = np.abs(y_true) + np.abs(y_pred)
    # 0/0 is handled explicitly below; silence NumPy's warning for it here.
    with np.errstate(divide='ignore', invalid='ignore'):
        ratio = abs_err / denom
    if isinstance(ratio, (pd.Series, pd.DataFrame)):
        # Stay in pandas so .mean() keeps skipping NaN from index alignment.
        ratio = ratio.mask(denom == 0, 0.0)
    else:
        ratio = np.where(denom == 0, 0.0, ratio)
    return (ratio * 200).mean()

def check_missing(df):
    """Report whether ``df`` contains any missing values.

    Prints a message for each check that fires and returns ``True`` if any
    value is missing, ``False`` otherwise. ``isnull`` is pandas' alias of
    ``isna``, so the two checks always agree; both are kept so the printed
    output is unchanged.
    """
    missing = False
    if df.isna().to_numpy().any():
        print('NaN values detected.')
        missing = True
    if df.isnull().to_numpy().any():
        print('Null values detected.')
        # Was `missing == True`: a comparison whose result was thrown away.
        missing = True
    return missing

"""
    residuals - pd.Series(index=date, values=values)
"""
def plot_all_residuals(residuals):
    """Scatter the residuals over their index and mark month and year ends.

    residuals - pd.Series whose index holds dates and whose values are the
                residuals to plot.
    """
    first_day, last_day = residuals.index.min(), residuals.index.max()

    _, ax = plt.subplots(figsize=(20, 6))
    ax.scatter(residuals.index, residuals, s=1, color='k')

    # Month ends as faint lines, year ends as solid red lines. The y-limits
    # are re-read before each call because drawing lines can change them.
    for freq, line_style in (('M', {'alpha': 0.5}),
                             ('Y', {'alpha': 1, 'color': 'r'})):
        y_low, y_high = ax.get_ylim()
        ax.vlines(pd.date_range(first_day, last_day, freq=freq),
                  y_low, y_high, **line_style)

    ax.set_title('Residuals')
    plt.show()


"""
    ts         - pd.Series(index=doesnt_matter, values=values) | np.array
"""
def plot_periodogram(ts, detrend='linear', ax=None):
    """Plot the periodogram of a daily series with the x-axis in cycles per year.

    Parameters
    ----------
    ts : pd.Series or np.ndarray
        Values sampled once per day; any index is ignored.
    detrend : str
        Passed straight to ``scipy.signal.periodogram``.
    ax : matplotlib Axes, optional
        Axes to draw on; a new 20x6 figure is created when omitted.

    Returns
    -------
    The Axes that was drawn on.
    """
    # Samples per year for daily data. This used to be computed as
    # pd.Timedelta("1Y") / pd.Timedelta("1D"), but pandas >= 2.0 rejects the
    # ambiguous "Y" unit; 365.2425 is the value that expression evaluated to.
    samples_per_year = 365.2425
    frequencies, spectrum = periodogram(
        ts,
        fs=samples_per_year,
        detrend=detrend,
        window="boxcar",
        scaling='spectrum',
    )
    if ax is None:
        _, ax = plt.subplots(figsize=(20,6))
    ax.step(frequencies, spectrum, color="purple")
    ax.set_xscale("log")
    ax.set_xticks([1, 2, 4, 6, 12, 26, 52, 104])
    ax.set_xticklabels(
        [
            "Annual (1)",
            "Semiannual (2)",
            "Quarterly (4)",
            "Bimonthly (6)",
            "Monthly (12)",
            "Biweekly (26)",
            "Weekly (52)",
            "Semiweekly (104)",
        ],
        rotation=90,
    )
    ax.ticklabel_format(axis="y", style="sci", scilimits=(0, 0))
    ax.set_ylabel("Variance")
    ax.set_title("Periodogram")
    return ax

def plot_result(y_true, y_pred):
    """Scatter predictions against true values with a dashed identity line."""
    _, ax = plt.subplots(figsize=(10, 10))
    ax.scatter(y_true, y_pred, s=1, color='r')

    # The diagonal spans the x-range that the scatter produced.
    x_low, x_high = ax.get_xlim()
    ax.plot([x_low, x_high], [x_low, x_high], '--', color='k')

    ax.set_aspect('equal')
    ax.set_xlabel('True')
    ax.set_ylabel('Pred')
    ax.set_title('Predictions')
    plt.show()

def plot_ts(dd, name):
    """Draw one line per frame in ``dd`` ('num_sold' over 'date') on a shared plot titled ``name``."""
    _, ax = plt.subplots(figsize=(30, 5))
    for frame in dd:
        ax.plot(frame['date'], frame['num_sold'])
    ax.set_xlabel('Date')
    ax.set_ylabel('Sold')
    ax.set_title(name)
    plt.show()

In [None]:
def make_features_simple(_df, gdp):
    """Build the base feature frame; the input frame is left untouched.

    The result holds float32 feature columns (log GDP, weekday flags, one-hot
    flags, Fourier terms and their product/country interactions) followed by
    the raw 'country', 'product', 'store' and 'date' columns, plus 'num_sold'
    when the input has it.
    """
    src = _df.copy()
    src['date'] = pd.to_datetime(src['date'])

    def lookup_gdp(rec):
        # The GDP table is indexed by year with one 'GDP_<country>' column per country.
        return gdp.loc[rec.date.year, 'GDP_' + rec.country]

    weekday = src.date.dt.weekday
    new_df = pd.DataFrame({
        'gdp': np.log(src.apply(lookup_gdp, axis=1)),
        'friday': weekday == 4,      # Friday
        'weekends': weekday >= 5,    # Saturday and Sunday
    })

    # One-hot flags; the last category of each variable is the implicit baseline.
    for country in ('Finland', 'Norway'):
        new_df[country] = src.country == country
    new_df['KaggleRama'] = src.store == 'KaggleRama'
    for product in ('Kaggle Mug', 'Kaggle Hat'):
        new_df[product] = src['product'] == product

    # Fourier terms on day-of-year, each also interacted with the product and
    # country flags so every group can have its own seasonal shape.
    angle = src.date.dt.dayofyear / 365 * 2 * math.pi
    interaction_flags = ['Kaggle Mug', 'Kaggle Hat', 'Finland', 'Norway']
    for k in (1, 2, 3, 4, 5, 6, 7, 14, 21, 28, 30, 31, 91):
        sin_col, cos_col = f'sin{k}', f'cos{k}'
        new_df[sin_col] = np.sin(angle * k)
        new_df[cos_col] = np.cos(angle * k)
        for flag in interaction_flags:
            prefix = flag.replace(' ', '_').lower()
            new_df[f'{prefix}_{sin_col}'] = new_df[sin_col] * new_df[flag]
            new_df[f'{prefix}_{cos_col}'] = new_df[cos_col] * new_df[flag]

    # Everything built so far is numeric; cast it before the raw columns are attached.
    numeric_cols = new_df.columns
    new_df[numeric_cols] = new_df[numeric_cols].astype(np.float32)

    new_df['country'] = src.country
    new_df['product'] = src['product']
    new_df['store'] = src.store
    new_df['date'] = pd.to_datetime(src['date'])
    if 'num_sold' in src.columns:
        new_df['num_sold'] = src.num_sold.astype(np.float32)

    return new_df

def add_holiday(df, name, date, country=None, product=None, store=None, offset=(-3, 3)):
    """Append one boolean column per day offset around a yearly recurring date.

    Parameters
    ----------
    df : DataFrame
        Must have a datetime 'date' column, plus 'country' / 'product' /
        'store' columns for whichever filters are supplied.
    name : str
        Column prefix; it is lower-cased and spaces become '-'.
    date : str or list of str
        A single date is repeated on the same month/day in every year present
        in ``df`` (presumably a Feb-29 anchor would fail in non-leap years via
        ``Timestamp.replace`` -- not handled here). A list supplies one explicit
        date per year; years without an entry never match.
    country, product, store : value, list of values, or None
        Restrict the flags to matching rows. ``None`` means "no restriction".
    offset : (start, stop)
        Day offsets relative to the holiday, half-open like ``range``:
        ``(-3, 3)`` creates the columns for -3 .. 2.

    Returns
    -------
    A new DataFrame: ``df`` with columns ``f"{prefix}{d}"`` appended.
    """
    years = df.date.dt.year
    if isinstance(date, list):
        by_year = {pd.Timestamp(d).year: pd.Timestamp(d) for d in date}
    else:
        anchor = pd.Timestamp(date)
        by_year = {yr: anchor.replace(year=yr)
                   for yr in years.dropna().unique().astype(int).tolist()}
    holiday_dates = years.map(by_year)

    # Start from "every row with a valid date" and narrow by the given filters.
    select = years > 0
    for column, wanted in (('country', country), ('product', product), ('store', store)):
        # An omitted filter must not restrict anything. The old code wrapped
        # None into pd.Series([None]) *before* testing `is not None`, so
        # omitted filters became isin([None]) and zeroed every flag.
        if wanted is None:
            continue
        if not isinstance(wanted, (list, tuple, set)):
            wanted = [wanted]
        select = select & df[column].isin(list(wanted))

    days_from_holiday = df.date - holiday_dates
    prefix = name.lower().replace(' ', '-')
    flags = pd.DataFrame(
        {f"{prefix}{d}": (days_from_holiday == np.timedelta64(d, "D")) & select
         for d in range(offset[0], offset[1])},
        index=df.index,
    )
    return pd.concat([df, flags], axis=1)

def add_holidays(df):
    """Append holiday-window flags for Sweden, Finland and Norway plus a set of hand-picked dates.

    Public holidays come from the `holidays` package for 2015-2019; a holiday
    name is used only if it has at least five dates in that span. Each
    resulting column group is created through `add_holiday`.
    """
    years = [2015, 2016, 2017, 2018, 2019]
    calendars = (
        (holidays.SE(years=years, include_sundays=False), 'se_hol', 'Sweden'),
        (holidays.FI(years=years), 'fin_hol', 'Finland'),
        (holidays.NO(years=years, include_sundays=False), 'nor_hol', 'Norway'),
    )
    for calendar, prefix, country in calendars:
        # Group the calendar's dates by holiday name.
        days_by_name = defaultdict(list)
        for day, label in calendar.items():
            days_by_name[label].append(str(day))
        # The running index counts skipped names too, so column names stay stable.
        for i, days in enumerate(days_by_name.values()):
            if len(days) >= 5:
                df = add_holiday(df, f'{prefix}_{i}', days, [country])

    # Hand-picked dates, applied per country with the default day window.
    country_dates = (
        ('1st_apr', '2015-04-01', ['Norway', 'Sweden', 'Finland']),
        ('a1', '2016-12-14', ['Finland']),
        ('a2', '2016-04-05', ['Sweden']),
        ('a3', '2016-04-03', ['Norway']),
        ('a4', '2016-04-07', ['Norway']),
        ('a5', '2016-05-18', ['Norway']),
        ('a6', '2016-05-27', ['Norway']),
        ('a7', '2016-05-30', ['Norway']),
        ('a8', '2016-11-25', ['Norway']),
    )
    for label, day, countries in country_dates:
        df = add_holiday(df, label, day, countries)

    # outliers for norway and KaggleHat
    hat_outliers = (
        ('N1', '2019-12-31', [-5, 1]),
        ('N2', '2019-06-16', [-1, 1]),
        ('N3', '2019-05-05', [-1, 1]),
        ('N4', '2019-04-28', [-5, 1]),
        ('N5', '2019-04-21', [-1, 1]),
    )
    for label, day, window in hat_outliers:
        df = add_holiday(df, label, day, ['Norway'], ['Kaggle Hat'], ['KaggleRama'], window)

    return df


def make_features_adv(_df, gdp):
    """Build the full feature matrix: base features, holiday windows and calendar flags.

    Starts from `make_features_simple` and `add_holidays`, then appends blocks
    of boolean day-indicator columns (fixed calendar days, and day offsets
    around moving dates such as Easter), several of them restricted by country.

    The returned frame is float32 with the raw 'date', 'country', 'store' and
    'product' columns removed. When the input has a 'num_sold' column, a
    datetime 'date' column is attached again after the cast.
    The input frame is not modified (all work happens on copies).
    """
    # Local copy with parsed dates; every indicator below is computed from it.
    df = _df.copy()
    df['date'] = pd.to_datetime(df['date'])
    
    new_df = make_features_simple(_df, gdp)
    new_df = add_holidays(new_df)
    
    # End of December (24th-31st) for all rows and again for Norway only,
    # then the first days of January per country (the plain "jan{d}" columns are Norway's).
    new_df = pd.concat([new_df,
                        pd.DataFrame({f"dec{d}":
                                      (df.date.dt.month == 12) & (df.date.dt.day == d)
                                      for d in range(24, 32)}),
                        pd.DataFrame({f"n-dec{d}":
                                      (df.date.dt.month == 12) & (df.date.dt.day == d) & (df.country == 'Norway')
                                      for d in range(24, 32)}),
                        pd.DataFrame({f"f-jan{d}":
                                      (df.date.dt.month == 1) & (df.date.dt.day == d) & (df.country == 'Finland')
                                      for d in range(1, 14)}),
                        pd.DataFrame({f"jan{d}":
                                      (df.date.dt.month == 1) & (df.date.dt.day == d) & (df.country == 'Norway')
                                      for d in range(1, 10)}),
                        pd.DataFrame({f"s-jan{d}":
                                      (df.date.dt.month == 1) & (df.date.dt.day == d) & (df.country == 'Sweden')
                                      for d in range(1, 15)})],
                       axis=1)
    
    # May: days 1-9 for all rows, days 19-25 for Norway only.
    # Both groups share the "may{d}" prefix; the day ranges do not overlap.
    new_df = pd.concat([new_df,
                        pd.DataFrame({f"may{d}":
                                      (df.date.dt.month == 5) & (df.date.dt.day == d) 
                                      for d in list(range(1, 10))}), #  + list(range(17, 25))
                        pd.DataFrame({f"may{d}":
                                      (df.date.dt.month == 5) & (df.date.dt.day == d) & (df.country == 'Norway')
                                      for d in list(range(19, 26))})],
                       axis=1)
    
    # June 8th-13th, Sweden only.
    new_df = pd.concat([new_df,
                        pd.DataFrame({f"june{d}":
                                      (df.date.dt.month == 6) & (df.date.dt.day == d) & (df.country == 'Sweden')
                                      for d in list(range(8, 14))})
                       ],
                       axis=1)
    
    # Last Wednesday of June: offsets -4..5 days around it, for every country except Norway.
    # Years outside 2015-2019 map to NaT, so they never match.
    wed_june_date = df.date.dt.year.map({2015: pd.Timestamp(('2015-06-24')),
                                         2016: pd.Timestamp(('2016-06-29')),
                                         2017: pd.Timestamp(('2017-06-28')),
                                         2018: pd.Timestamp(('2018-06-27')),
                                         2019: pd.Timestamp(('2019-06-26'))})
    new_df = pd.concat([new_df,
                        pd.DataFrame({f"wed_june{d}": 
                                      (df.date - wed_june_date == np.timedelta64(d, "D")) & (df.country != 'Norway')
                                      for d in list(range(-4, 6))})],
                       axis=1)
    
    # True for January through June.
    new_df['first_half'] = df.date.dt.month <= 6
    
    # Swedish rock festival: one hand-entered date per year, offsets -3..2 days, Sweden only.
    swed_rock_fest  = df.date.dt.year.map({2015: pd.Timestamp(('2015-06-6')),
                                         2016: pd.Timestamp(('2016-06-11')),
                                         2017: pd.Timestamp(('2017-06-10')),
                                         2018: pd.Timestamp(('2018-06-10')),
                                         2019: pd.Timestamp(('2019-06-8'))})


    new_df = pd.concat([new_df, pd.DataFrame({f"swed_rock_fest{d}":
                                      (df.date - swed_rock_fest == np.timedelta64(d, "D")) & (df.country == 'Sweden')
                                      for d in list(range(-3, 3))})], axis=1)
    
    # First Sunday of November and the 8 days after it, for every country except Norway.
    # NOTE(review): the original comment tied this to the end of Daylight Saving Time -- confirm.
    sun_nov_date = df.date.dt.year.map({2015: pd.Timestamp(('2015-11-1')),
                                         2016: pd.Timestamp(('2016-11-6')),
                                         2017: pd.Timestamp(('2017-11-5')),
                                         2018: pd.Timestamp(('2018-11-4')),
                                         2019: pd.Timestamp(('2019-11-3'))})
    new_df = pd.concat([new_df,
                        pd.DataFrame({f"sun_nov{d}": 
                                      (df.date - sun_nov_date == np.timedelta64(d, "D")) & (df.country != 'Norway')
                                      for d in list(range(0, 9))})],
                       axis=1)
    
    # December 6th-13th, Finland only (Independence Day of Finland is the 6th of December).
    # Shares the "dec{d}" prefix with the 24th-31st block above; the day ranges do not overlap.
    new_df = pd.concat([new_df,
                        pd.DataFrame({f"dec{d}":
                                      (df.date.dt.month == 12) & (df.date.dt.day == d) & (df.country == 'Finland')
                                      for d in list(range(6, 14))})],
                       axis=1)

    # Easter: day offsets -2..10, 40..47 and 50..58 from that year's Easter date
    # (as computed by dateutil's easter.easter), for all countries.
    easter_date = df.date.apply(lambda date: pd.Timestamp(easter.easter(date.year)))
    new_df = pd.concat([new_df,
                        pd.DataFrame({f"easter{d}": 
                                      (df.date - easter_date == np.timedelta64(d, "D"))
                                      for d in list(range(-2, 11)) + list(range(40, 48)) + list(range(50, 59))})],
                       axis=1)
    
    # Drop the raw identifier columns so that everything left can be cast to float32.
    new_df.drop(['date','country','store','product'],axis='columns', inplace=True)
    new_df = new_df.astype(np.float32)
    # Only frames that carry the target get the (datetime) 'date' column back.
    if 'num_sold' in df.columns:
        new_df['date'] = pd.to_datetime(df['date'])

    return new_df

# Build the feature frames for both splits with the same pipeline.
train_df, test_df = (make_features_adv(frame, gdp) for frame in (train, test))

# Column counts of the two frames.
print(train_df.shape[1], test_df.shape[1])


In [None]:
def fit_model(X_tr, X_va=None,features=None):
    """Standardise the features, fit a RidgeCV on log(num_sold) and optionally validate.

    X_tr and X_va are frames containing the `features` columns plus 'num_sold'.
    When X_va is given, this function also writes to the module-level `oof`
    Series and `score_list`, reads the module-level `fold` for its log line,
    and shows a true-vs-predicted scatter plot.

    Returns the fitted (scaler, model) pair.
    """
    started = datetime.now()

    # Fit the scaler on the training features only.
    preproc = StandardScaler()
    train_matrix = preproc.fit_transform(X_tr[features])
    train_target = X_tr.num_sold.values.reshape(-1, 1)

    # The model is trained on the log of the target.
    model = RidgeCV(alphas=(0.1,0.2,0.4,0.5,0.7,1,5,10))
    model.fit(train_matrix, np.log(train_target).ravel())

    if X_va is None:
        return preproc, model

    # Validation: reuse the fitted scaler and undo the log on the predictions.
    valid_matrix = preproc.transform(X_va[features])
    y_va = X_va.num_sold.values.reshape(-1, 1)
    y_va_pred = np.exp(model.predict(valid_matrix)).reshape(-1, 1)
    oof.update(pd.Series(y_va_pred.ravel(), index=X_va.index))

    # Report elapsed time (mm:ss slice of the timedelta string) and SMAPE.
    smape = np.mean(smape_loss(y_va, y_va_pred))
    print(f"Fold {fold} | {str(datetime.now() - started)[-12:-7]}"
          f" | SMAPE: {smape:.5f}")
    score_list.append(smape)

    # True vs. predicted scatter with a dashed identity line.
    _, ax = plt.subplots(figsize=(10, 10))
    ax.scatter(y_va, y_va_pred, s=1, color='r')
    x_low, x_high = ax.get_xlim()
    ax.plot([x_low, x_high], [x_low, x_high], '--', color='k')
    ax.set_aspect('equal')
    ax.set_xlabel('y_true')
    ax.set_ylabel('y_pred')
    ax.set_title('OOF Predictions')
    plt.show()

    return preproc, model


In [None]:
# Seed NumPy's global RNG.
# NOTE(review): nothing visible in this cell draws from it -- confirm it is still needed.
np.random.seed(202100)

total_start_time = datetime.now()
# Out-of-fold predictions and per-fold scores. fit_model() updates both as
# module-level globals, so these names must exist before the loop runs.
oof = pd.Series(0.0, index=train_df.index)
score_list = []
# Folds are grouped by calendar year, so rows from one year never land on
# both sides of a split.
kf = GroupKFold(n_splits=4)
# `fold` is also read as a global inside fit_model() for its log line.
for fold, (train_idx, val_idx) in enumerate(kf.split(train_df, groups=train_df.date.dt.year)):
    X_tr = train_df.iloc[train_idx]
    X_va = train_df.iloc[val_idx]
    # The test frame's columns define the feature set (no 'date', no 'num_sold').
    preproc, model = fit_model(X_tr, X_va, features=test_df.columns)

In [None]:
# The test frame's columns define the model's feature set.
features = test_df.columns

# --- Hold-out check: fit on years up to 2017, evaluate on 2018 ---------------
X_tr = train_df[train_df.date.dt.year <= 2017]
holdout_mask = (train_df.date.dt.year > 2017) & (train_df.date.dt.year < 2019)
y_t = train_df[holdout_mask].reset_index()
y = y_t[features]
preproc, model = fit_model(X_tr, None, features=features)

# Unscaled hold-out predictions, rounded to whole units.
sub = pd.DataFrame({'num_sold': np.exp(model.predict(preproc.transform(y[features])))})
sub['num_sold'] = sub['num_sold'].round()
print("SMAPE SUB: ",np.mean(smape_loss(y_t['num_sold'], sub['num_sold'])))
y_t['diff'] = (y_t['num_sold'] - sub['num_sold']).abs()
y_t.plot(x='num_sold', y='diff', kind='scatter')
plt.show()

# Search for the multiplicative factor that gives the lowest hold-out SMAPE.
_, scale = find_scale(y, y_t, model, preproc, correction=1)

# Same plot again with the factor applied.
sub = pd.DataFrame({'num_sold': np.exp(model.predict(preproc.transform(y[features]))) * scale})
sub['num_sold'] = sub['num_sold'].round()
y_t['diff'] = (y_t['num_sold'] - sub['num_sold']).abs()
y_t.plot(x='num_sold', y='diff', kind='scatter')
plt.show()

# --- Final fit on all training rows and submission file ----------------------
X_tr = train_df
y = test_df[features]
preproc, model = fit_model(X_tr, None, features=features)
sub = test[['row_id']].copy()
res = np.exp(model.predict(preproc.transform(y[features]))) * scale
sub['num_sold'] = res
sub['num_sold'] = sub['num_sold'].round()
sub.to_csv('submission.csv', index=False)

