**Model Version 1 - Linear Regression**
1. Fourier Features + Trend
2. Fourier Features x [Country,Product,Store] wise

**Model Version 2 - Linear Regression**
1. Year & Monthly bias feature added

**Model Version 3 - Linear Regression**
1. GDP-based pseudo num_sold included as a separate feature
(feature added)

**Model Version 4 - Linear Regression + Gradient Boost**
1. Feature Set for Gradient boosting is smaller subset of that used for Linear Regression 

**Model Version 5 - Linear Regression + Gradient Boost**
1. Added Holiday Feature from @Ambrosm
2. Removed 29th Feb in 2016

**Importing Libraries**

In [None]:


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib
from matplotlib import pyplot as plt
import matplotlib.ticker as ticker
from sklearn.preprocessing import OneHotEncoder
from datetime import date
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression


import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for entry in names_in_folder:
        print(os.path.join(folder, entry))



**Import data**
- Train & Test data
- GDP data 
- Holiday data (@AMBROSM)

In [None]:
# Competition data. `train` carries the `num_sold` target, which is dropped later
# before the train and test rows are stacked for shared feature engineering.
train = pd.read_csv('/kaggle/input/tabular-playground-series-jan-2022/train.csv')
test = pd.read_csv('/kaggle/input/tabular-playground-series-jan-2022/test.csv')
# GDP table: read below through a `year` column and one column per country.
GDP_data = pd.read_csv('/kaggle/input/tps-jan22/GDP_per_capita_2015_to_2019_Finland_Norway_Sweden.csv')
# NOTE(review): `holiday_data` is not referenced again in the visible notebook;
# the holiday indicators used by the models come from get_holidays() instead.
holiday_data = pd.read_csv('/kaggle/input/tps-jan22/Holidays_Finland_Norway_Sweden_2015-2019.csv')

**Pre-processing data**

In [None]:
# Remove the rows dated 29 Feb 2016 (leap day) from the training data, in place.
leap_day_rows = train.index[train['date'] == '2016-02-29']
train.drop(index=leap_day_rows, inplace=True)

In [None]:
# Stack the training rows (target removed) on top of the test rows so that
# feature engineering runs once over both sets.
train_features = train.drop(columns='num_sold')
train_test_df = pd.concat([train_features, test], axis=0)
train_test_df.tail()

In [None]:
# Number of days elapsed since the first training date (lets the model learn a trend).
train_start_date = date.fromisoformat(train.loc[0, 'date'])
train_test_df['date_index'] = [
    (date.fromisoformat(day) - train_start_date).days
    for day in train_test_df.date
]

In [None]:
# One-hot encode the categorical columns.
# The indicator column names are taken from the fitted encoder (`categories_`) rather
# than typed by hand, so each name is guaranteed to match the column it labels even
# if the category values or their sort order ever differ from what was assumed.
categorical_columns = ['country', 'store', 'product']
hot_encode = OneHotEncoder()
hot_encode.fit(train_test_df[categorical_columns])
encoded_column_names = [
    category for feature_categories in hot_encode.categories_ for category in feature_categories
]
train_test_df[encoded_column_names] = hot_encode.transform(train_test_df[categorical_columns]).toarray()

In [None]:
# Parse the date strings, move them into the index and express the index as daily periods.
train_test_df = (
    train_test_df
    .assign(date=pd.to_datetime(train_test_df.date))
    .set_index('date')
    .to_period('D')
)

In [None]:
## Validation function
def SMAPE(y_true, y_pred):
    """Symmetric mean absolute percentage error, on a 0-200 scale.

    Parameters
    ----------
    y_true, y_pred : array-like of numbers with the same length
        Actual and predicted values. They are compared position by position.

    Returns
    -------
    float
        Mean of ``200 * |y_true - y_pred| / (|y_true| + |y_pred|)``.
        Positions where both values are zero contribute 0.
    """
    y_true = np.atleast_1d(np.asarray(y_true, dtype=float))
    y_pred = np.atleast_1d(np.asarray(y_pred, dtype=float))
    # Both magnitudes go into the denominator; using the signed y_true would let a
    # negative actual cancel the prediction and report a spurious perfect score.
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 200.0
    diff = np.zeros_like(denominator)
    # Divide only where the denominator is non-zero, so no inf/nan is ever produced.
    nonzero = denominator != 0
    diff[nonzero] = np.abs(y_true - y_pred)[nonzero] / denominator[nonzero]
    return np.mean(diff)

**Fourier Features to capture Seasonality**

In [None]:
fourier = CalendarFourier(freq="A", order=16)  # 16 sin/cos pairs for "A"nnual seasonality

# The trend and the weekly indicator columns produced by DeterministicProcess are
# generated from row position, not from the calendar value of each index entry.
# train_test_df holds many rows per day and has one day removed (2016-02-29), so the
# terms are built on a gap-free index with exactly one entry per calendar day and
# then mapped back onto the rows by date.
calendar_index = pd.period_range(
    start=train_test_df.index.min(),
    end=train_test_df.index.max(),
    freq="D",
)

dp = DeterministicProcess(
    index=calendar_index,
    constant=True,               # dummy feature for bias (y-intercept)
    order=1,                     # trend (order 1 means linear), one step per day
    seasonal=True,               # weekly seasonality (indicators)
    additional_terms=[fourier],  # annual seasonality (fourier)
    drop=True,                   # drop terms to avoid collinearity
)

# One row of deterministic features per day, repeated for every row of that day.
fourier_features_df = dp.in_sample().reindex(train_test_df.index)
fourier_features_df.head(2)

In [None]:
# Base design matrix: deterministic (trend/seasonal/Fourier) terms next to the
# remaining non-identifier columns of train_test_df. The target comes from train.
temp_df = train_test_df.drop(columns=['row_id', 'country', 'store', 'product'])
feature_parts = [fourier_features_df, temp_df]
X_Season = pd.concat(feature_parts, axis=1)
y = train['num_sold']

**Fourier Feature for each [Country, Product, Store] pair** 

In [None]:

# Give every (country, product, store) series its own copy of each deterministic
# feature: the feature value on rows of that series, 0 everywhere else.
interaction_columns = {}
for country in train.country.unique():
    for product in train['product'].unique():
        for store in train['store'].unique():
            # 0/1 row mask of this series; computed once per series, not once per feature.
            series_mask = (train_test_df[country] * train_test_df[product] * train_test_df[store]).to_numpy()
            series_tag = country[0:3] + ' ' + product.split(' ')[1] + ' ' + store[-4:]
            for fourier_feature in fourier_features_df.columns:
                feature_name = series_tag + ' ' + fourier_feature
                interaction_columns[feature_name] = series_mask * fourier_features_df[fourier_feature].to_numpy()

# Attach all new columns in one concat instead of hundreds of single-column inserts
# (which fragments the frame). Columns left by an earlier run of this cell are
# replaced rather than duplicated.
interactions_df = pd.DataFrame(interaction_columns, index=X_Season.index)
X_Season = pd.concat(
    [X_Season.drop(columns=interactions_df.columns, errors='ignore'), interactions_df],
    axis=1,
)
                
       

In [None]:
# Drop one indicator per categorical variable (the reference level), then split by
# date: everything up to 2018 is training data, 2019 is the test period.
reference_levels = ['Finland', 'KaggleMart', 'Kaggle Hat']
X_Season = X_Season.drop(columns=reference_levels)
X = X_Season.loc[:'2018-12-31']
X_test = X_Season.loc['2019-01-01':]
# shuffle=False keeps chronological order: the last 10% of rows form the validation set.
X_train, X_val, y_train, y_val = train_test_split(X, y, shuffle=False, test_size=0.1)
X_train.tail(1)

In [None]:
# Fit the version-1 linear model (the design matrix already holds a constant column,
# hence fit_intercept=False) and store its predictions on train and test.
y = train['num_sold']
linear_model_ver1 = LinearRegression(fit_intercept=False).fit(X_train, y_train)
val_y_pred = linear_model_ver1.predict(X_val)
train_y_pred = linear_model_ver1.predict(X)
test_y_pred = linear_model_ver1.predict(X_test)
test['linear_model_ver1'] = test_y_pred
train['linear_model_ver1'] = train_y_pred

In [None]:
# Validation SMAPE of the version-1 model.
score_ver1 = SMAPE(y_true=y_val, y_pred=val_y_pred)
print('score-', score_ver1)

**Output Plots from model version 1**

In [None]:
# One panel per (country, store, product) series showing the version-1 residual
# (actual num_sold minus prediction) against row_id.
grouped = train.groupby(['country','store','product'])
ncols = 1
nrows = int(np.ceil(grouped.ngroups / ncols))
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(20, 50), sharey=True)
# np.ravel also copes with the single-Axes case; zip pairs each series with its panel.
for ax, (key, grp) in zip(np.ravel(axes), grouped):
    # The label must be a string: a tuple is treated as a sequence of labels.
    ax.plot(grp['row_id'], grp['num_sold'] - grp['linear_model_ver1'], label=str(key))
    ax.legend()

**Version 2**

In [None]:
# Year counter (0 for 2015) and running month counter, plus one copy of each per
# (country, product, store) series.
X_Season['year_bias'] = (X_Season.index.year - 2015).tolist()
X_Season['month_bias'] = X_Season.index.month + 12*X_Season['year_bias']
for country in train.country.unique():
    for product in train['product'].unique():
        for store in train['store'].unique():
            # 0/1 mask selecting the rows of this series
            series_mask = train_test_df[country] * train_test_df[product] * train_test_df[store]
            series_tag = country[0:3] + ' ' + product.split(' ')[1] + ' ' + store[-4:]
            for bias_name in ('year_bias', 'month_bias'):
                X_Season[series_tag + ' ' + bias_name] = list(series_mask * X_Season[bias_name])

In [None]:
# Re-split after adding the bias features: up to 2018 is training data, 2019 is test.
X = X_Season.loc[:'2018-12-31']
X_test = X_Season.loc['2019-01-01':]
# shuffle=False keeps chronological order: the last 10% of rows form the validation set.
X_train, X_val, y_train, y_val = train_test_split(X, y, shuffle=False, test_size=0.1)
X_train.tail(1)

In [None]:
# Fit the version-2 linear model and store its predictions on train and test.
y = train['num_sold']
linear_model_ver2 = LinearRegression(fit_intercept=False).fit(X_train, y_train)
val_y_pred = linear_model_ver2.predict(X_val)
train_y_pred = linear_model_ver2.predict(X)
test_y_pred = linear_model_ver2.predict(X_test)
test['linear_model_ver2'] = test_y_pred
train['linear_model_ver2'] = train_y_pred

In [None]:
# Validation SMAPE of version 2 and its relative improvement over version 1 (in %).
score_ver2 = SMAPE(y_val, val_y_pred)
improvement_pct = -(score_ver2-score_ver1)/score_ver1*100
print('previous_score-', score_ver1, 'new_score-', score_ver2, 'improvement-', improvement_pct, '%')

In [None]:
# One panel per (country, store, product) series comparing the residuals
# (actual num_sold minus prediction) of versions 1 and 2.
grouped = train.groupby(['country','store','product'])
ncols = 1
nrows = int(np.ceil(grouped.ngroups / ncols))
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(20, 50), sharey=True)
for ax, (key, grp) in zip(np.ravel(axes), grouped):
    # Labels are plain strings: a list is treated as a sequence of per-dataset labels.
    ax.plot(grp['row_id'], grp['num_sold'] - grp['linear_model_ver1'], label=f'{key} ver1')
    ax.plot(grp['row_id'], grp['num_sold'] - grp['linear_model_ver2'], label=f'{key} ver2')
    ax.legend()

**Version 3**

In [None]:

# Express each country's GDP column as the relative change from a fixed baseline.
# NOTE(review): the baselines are hard-coded, presumably the first-year values of
# each column - confirm against the CSV. Re-running this cell normalises again.
GDP_data.Finland = (GDP_data.Finland - 42802) / 42802
GDP_data.Norway = (GDP_data.Norway - 74356) / 74356
GDP_data.Sweden = (GDP_data.Sweden - 51545) / 51545

# log(GDP) didnt give good results for me; kept here as comments (not as a string
# literal) so that the preview below is what the cell displays:
#   GDP_data.Finland = GDP_data.Finland.map(lambda x: np.log(x))
#   GDP_data.Norway = GDP_data.Norway.map(lambda x: np.log(x))
#   GDP_data.Sweden = GDP_data.Sweden.map(lambda x: np.log(x))
GDP_data.head(5)

In [None]:
# For every row, look up the GDP value of that row's country in that row's year.
X_Season['GDP'] = [
    GDP_data[GDP_data['year'] == day.year][row_country].values[0]
    for day, row_country in zip(train_test_df.index, train_test_df['country'])
]
train_test_df['GDP'] = X_Season['GDP']

In [None]:
# Re-split after adding the GDP column: up to 2018 is training data, 2019 is test.
X = X_Season.loc[:'2018-12-31']
X_test = X_Season.loc['2019-01-01':]
# shuffle=False keeps chronological order: the last 10% of rows form the validation set.
X_train, X_val, y_train, y_val = train_test_split(X, y, shuffle=False, test_size=0.1)
X_train.tail(1)

In [None]:
# Mean yearly sales of every (country, store, product) series, regressed on GDP.
train['year'] = train.date.map(lambda x: x.split('-')[0])
num_sold_grp = train.groupby(['country','store','product','year'])['num_sold'].mean()

train_years = ['2015','2016','2017','2018']
all_years = train_years + ['2019']
GDP_based_num_sold_pred = pd.DataFrame(index=all_years)
for country in train.country.unique():
    for store in train['store'].unique():
        for product in train['product'].unique():
            # Column vectors: yearly mean sales (training years) and GDP (all years).
            y_dummy_true = np.array(list(num_sold_grp[country][store][product]), ndmin=2).transpose()
            X_dummy_test = np.array(list(GDP_data[country]), ndmin=2).transpose()
            # Fit on all but the last training year; the last training year and the
            # test year are only predicted, so the plot shows how well GDP extrapolates.
            y_dummy = y_dummy_true[:-1]
            X_dummy = X_dummy_test[:-2]
            #X_dummy = [np.log(i) for i in X_dummy]
            num_sold_mean_model = LinearRegression(fit_intercept=True)
            num_sold_mean_model.fit(X_dummy, y_dummy)
            y_dummy_pred = num_sold_mean_model.predict(X_dummy_test)
            feature_name = country[0:3]+' ' + store[-4:] +' ' + product.split(' ')[1]
            # predict returns an (n, 1) array for a 2-D target; flatten it into a column.
            GDP_based_num_sold_pred[feature_name] = y_dummy_pred.ravel()
            plt.figure(figsize=(10,10))
            # One string label per line: a list is treated as a sequence of per-dataset labels.
            plt.plot(train_years, y_dummy_true.ravel(), label=f'{country} {store} {product} True mean of yearly sales')
            plt.plot(all_years, y_dummy_pred.ravel(), label='Predicted mean yearly sales from GDP')
            plt.legend()
            plt.grid()
            plt.show()

In [None]:
# Attach to every row the GDP-based yearly sales estimate of its own series and
# year; it replaces the raw GDP column in the design matrix.
train_test_df['year'] = train_test_df.index.year
train_test_df['GDP_based_num_sold'] = [
    GDP_based_num_sold_pred[row_country[0:3]+' ' + row_store[-4:] +' ' + row_product.split(' ')[1]][str(row_year)]
    for row_country, row_store, row_product, row_year in zip(
        train_test_df['country'], train_test_df['store'], train_test_df['product'], train_test_df['year']
    )
]
X_Season['GDP_based_num_sold'] = train_test_df['GDP_based_num_sold']
X_Season = X_Season.drop(columns='GDP')

In [None]:

# Re-split with the GDP-based feature in place: up to 2018 is training data, 2019 is test.
X = X_Season.loc[:'2018-12-31']
X_test = X_Season.loc['2019-01-01':]
# shuffle=False keeps chronological order: the last 10% of rows form the validation set.
X_train, X_val, y_train, y_val = train_test_split(X, y, shuffle=False, test_size=0.1)
X_train.tail(1)

In [None]:
# Fit the version-3 linear model and store its predictions on train and test.
y = train['num_sold']
linear_model_ver3 = LinearRegression(fit_intercept=False).fit(X_train, y_train)
val_y_pred = linear_model_ver3.predict(X_val)
train_y_pred = linear_model_ver3.predict(X)
test_y_pred = linear_model_ver3.predict(X_test)
test['linear_model_ver3'] = test_y_pred
train['linear_model_ver3'] = train_y_pred

In [None]:
# One panel per (country, store, product) series comparing the residuals
# (actual num_sold minus prediction) of versions 1, 3 and 2.
grouped = train.groupby(['country','store','product'])
ncols = 1
nrows = int(np.ceil(grouped.ngroups / ncols))
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(20, 50), sharey=True)
for ax, (key, grp) in zip(np.ravel(axes), grouped):
    # Labels are plain strings: a list is treated as a sequence of per-dataset labels.
    for version in ('ver1', 'ver3', 'ver2'):
        ax.plot(grp['row_id'], grp['num_sold'] - grp[f'linear_model_{version}'], label=f'{key} {version}')
    ax.legend()

In [None]:
# Validation SMAPE of version 3 and its relative improvement over version 2 (in %).
score_ver3 = SMAPE(y_val, val_y_pred)
improvement_pct = -(score_ver3-score_ver2)/score_ver2*100
print('previous_score-', score_ver2, 'new_score-', score_ver3, 'improvement-', improvement_pct, '%')

**Holidays**
1. The dip just before Christmas, around the 360th day, shows an increase in sales compared to normal sales (the zero line).
2. From Christmas until New Year (days 360 to 365), the peak in the error represents a drop in sales compared to normal sales (the zero line) during this period.
3. It is therefore important to model the effects before and after holidays, which is why most of the top-scoring notebooks use days-to-holiday features.

In [None]:

# Calendar parts of the date string ("YYYY-MM-DD") for slicing the residuals.
train['year'] = train.date.map(lambda x: int(x.split('-')[0]))
test['year'] = test.date.map(lambda x: int(x.split('-')[0]))
train['month'] = train.date.map(lambda x: int(x.split('-')[1]))
train['day'] = train.date.map(lambda x: int(x.split('-')[2]))
train['dayofyear'] = train.date.map(lambda x: pd.to_datetime(x).dayofyear)

# 2015 only: one row of panels per series, December on the left and January on the right.
grouped = train[train['year']==2015].groupby(['country','store','product'])
ncols = 1
nrows = int(np.ceil(grouped.ngroups / ncols))
fig, axes = plt.subplots(nrows=nrows, ncols=ncols*2, figsize=(20, 50), sharey=True)

# Plot the version-3 residual per month, scaled by the largest December residual.
for (ax_dec, ax_jan), (key, grp) in zip(np.reshape(axes, (-1, 2)), grouped):
    december = grp[grp['month'] == 12]
    january = grp[grp['month'] == 1]
    max_value = np.max(december['num_sold'] - december['linear_model_ver3'])
    # Labels are plain strings: a list is treated as a sequence of per-dataset labels.
    ax_dec.plot(december['dayofyear'], (december['num_sold'] - december['linear_model_ver3']) / max_value, label=f'{key} December')
    ax_dec.legend()
    ax_jan.plot(january['dayofyear'], (january['num_sold'] - january['linear_model_ver3']) / max_value, label=f'{key} January')
    ax_jan.legend()


**Features to capture the holiday effect, including the before and after impact on sales**

In [None]:

def gaussian(x, mu, sig):
    """Unnormalised Gaussian bump: 1 at ``x == mu``, decaying with width ``sig``.

    Evaluates ``exp(-(x - mu)**2 / (2 * sig**2))``; accepts scalars or arrays.
    """
    squared_distance = np.power(x - mu, 2.)
    twice_variance = 2 * np.power(sig, 2.)
    return np.exp(-squared_distance / twice_variance)

# Year-end features, all functions of the day of the year.
# Bump centred on day 363 for days after 355, bump centred on day 0 for days before 10.
year_end_bump = lambda x: gaussian(x, 363, 5/3) if x>355  else(gaussian(x, 0, 5/3) if x<10  else 0)
# Linear ramp over days 351-359.
pre_xmas_ramp = lambda x: x-350 if ((x>350) & (x<360))  else 0

X_Season['dec_jan_holdy'] = X_Season.index.day_of_year.map(year_end_bump)
train['dec_jan_holdy'] = train.dayofyear.map(year_end_bump)
X_Season['xmas_pre'] = X_Season.index.day_of_year.map(pre_xmas_ramp)
# Linear ramp over days 360-363.
X_Season['xmas_post'] = X_Season.index.day_of_year.map(lambda x: x-359 if ((x>359) & (x<364))  else 0 )
# Ramp that runs from day 364 across the year boundary to day 6.
X_Season['newyear'] = X_Season.index.day_of_year.map(lambda x: x-363 if (x>363) else (x+2 if x<7  else 0) )
train['dec_jan_pre_holdy'] = train.dayofyear.map(pre_xmas_ramp)

# One copy of the year-end bump per (country, product, store) series.
for country in train.country.unique():
    for product in train['product'].unique():
        for store in train['store'].unique():
            series_mask = train_test_df[country] * train_test_df[product] * train_test_df[store]
            series_tag = country[0:3] + ' ' + product.split(' ')[1] + ' ' + store[-4:]
            X_Season[series_tag + ' ' + 'dec_holdy'] = list(series_mask * X_Season['dec_jan_holdy'])


In [None]:
# Per series: normalised version-3 residual against the dec_jan_holdy feature,
# December on the left panel and January on the right panel.
grouped = train.groupby(['country','store','product'])
ncols = 1
nrows = int(np.ceil(grouped.ngroups / ncols))
fig, axes = plt.subplots(nrows=nrows, ncols=ncols*2, figsize=(20, 50), sharey=True)

for (ax_dec, ax_jan), (key, grp) in zip(np.reshape(axes, (-1, 2)), grouped):
    december = grp[grp['month'] == 12]
    january = grp[grp['month'] == 1]
    # Both months are scaled by the largest December residual of the series.
    max_value = np.max(december['num_sold'] - december['linear_model_ver3'])

    # Each curve gets its own string label and both panels get a legend.
    ax_dec.scatter(december['dayofyear'], (december['num_sold'] - december['linear_model_ver3']) / max_value, label=f'{key} December residual')
    ax_dec.scatter(december['dayofyear'], december['dec_jan_holdy'], label=f'{key} December dec_jan_holdy')
    ax_dec.legend()

    ax_jan.scatter(january['dayofyear'], (january['num_sold'] - january['linear_model_ver3']) / max_value, label=f'{key} January residual')
    ax_jan.scatter(january['dayofyear'], january['dec_jan_holdy'], label=f'{key} January dec_jan_holdy')
    ax_jan.legend()

In [None]:
import math
import dateutil.easter as easter

def get_holidays(df):
    """Append boolean holiday-indicator columns to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a datetime-typed ``date`` column (the ``.dt`` accessor is used)
        and a ``country`` column holding 'Finland', 'Norway' or 'Sweden'.

    Returns
    -------
    pandas.DataFrame
        A new frame: the input columns followed by one boolean column per
        (holiday window, day offset). A column is True on rows whose date, and
        where stated whose country, fall on that day. The input is not modified.

    Notes
    -----
    The movable dates are listed for 2015-2019 only; in any other year the year
    lookup yields no date and the corresponding columns are all False.
    """
    # End of year: 24-31 Dec for all countries, an extra Norway-only set for the same
    # days, then country-specific sets for the first days of January.
    df = pd.concat([df, pd.DataFrame({f"dec{d}":
                      (df.date.dt.month == 12) & (df.date.dt.day == d)
                      for d in range(24, 32)}),
        pd.DataFrame({f"n-dec{d}":
                      (df.date.dt.month == 12) & (df.date.dt.day == d) & (df.country == 'Norway')
                      for d in range(24, 32)}),
        pd.DataFrame({f"f-jan{d}":
                      (df.date.dt.month == 1) & (df.date.dt.day == d) & (df.country == 'Finland')
                      for d in range(1, 14)}),
        pd.DataFrame({f"jan{d}":
                      (df.date.dt.month == 1) & (df.date.dt.day == d) & (df.country == 'Norway')
                      for d in range(1, 10)}),
        pd.DataFrame({f"s-jan{d}":
                      (df.date.dt.month == 1) & (df.date.dt.day == d) & (df.country == 'Sweden')
                      for d in range(1, 15)})], axis=1)
    
    # May: 1-9 May for all countries, 19-25 May for Norway only
    # (both sets share the "may{d}" prefix; the day ranges do not overlap).
    df = pd.concat([df, pd.DataFrame({f"may{d}":
                      (df.date.dt.month == 5) & (df.date.dt.day == d) 
                      for d in list(range(1, 10))}),
        pd.DataFrame({f"may{d}":
                      (df.date.dt.month == 5) & (df.date.dt.day == d) & (df.country == 'Norway')
                      for d in list(range(19, 26))})], axis=1)
    
    # June: 8-13 June, Sweden only
    df = pd.concat([df, pd.DataFrame({f"june{d}":
                   (df.date.dt.month == 6) & (df.date.dt.day == d) & (df.country == 'Sweden')
                   for d in list(range(8, 14))})], axis=1)
    
    #Swedish Rock Concert
    #Jun 3, 2015 – Jun 6, 2015
    #Jun 8, 2016 – Jun 11, 2016
    #Jun 7, 2017 – Jun 10, 2017
    #Jun 6, 2018 – Jun 10, 2018
    #Jun 5, 2019 – Jun 8, 2019
    # The anchor below is the last day of the festival in each year.
    swed_rock_fest  = df.date.dt.year.map({2015: pd.Timestamp(('2015-06-6')),
                                         2016: pd.Timestamp(('2016-06-11')),
                                         2017: pd.Timestamp(('2017-06-10')),
                                         2018: pd.Timestamp(('2018-06-10')),
                                         2019: pd.Timestamp(('2019-06-8'))})

    # Offsets -3..+2 days around the anchor, Sweden only.
    df = pd.concat([df, pd.DataFrame({f"swed_rock_fest{d}":
                                      (df.date - swed_rock_fest == np.timedelta64(d, "D")) & (df.country == 'Sweden')
                                      for d in list(range(-3, 3))})], axis=1)
    
    # Last Wednesday of June
    wed_june_date = df.date.dt.year.map({2015: pd.Timestamp(('2015-06-24')),
                                         2016: pd.Timestamp(('2016-06-29')),
                                         2017: pd.Timestamp(('2017-06-28')),
                                         2018: pd.Timestamp(('2018-06-27')),
                                         2019: pd.Timestamp(('2019-06-26'))})
    
    # Offsets -4..+5 days around that Wednesday, for every country except Norway.
    df = pd.concat([df, pd.DataFrame({f"wed_june{d}": 
                   (df.date - wed_june_date == np.timedelta64(d, "D")) & (df.country != 'Norway')
                   for d in list(range(-4, 6))})], axis=1)
    
    # First Sunday of November
    sun_nov_date = df.date.dt.year.map({2015: pd.Timestamp(('2015-11-1')),
                                         2016: pd.Timestamp(('2016-11-6')),
                                         2017: pd.Timestamp(('2017-11-5')),
                                         2018: pd.Timestamp(('2018-11-4')),
                                         2019: pd.Timestamp(('2019-11-3'))})
    
    # Offsets 0..+8 days from that Sunday, for every country except Norway.
    df = pd.concat([df, pd.DataFrame({f"sun_nov{d}": 
                   (df.date - sun_nov_date == np.timedelta64(d, "D")) & (df.country != 'Norway')
                   for d in list(range(0, 9))})], axis=1)
    
    # First half of December (Independence Day of Finland, 6th of December):
    # 6-13 Dec, Finland only.
    df = pd.concat([df, pd.DataFrame({f"dec{d}":
                   (df.date.dt.month == 12) & (df.date.dt.day == d) & (df.country == 'Finland')
                   for d in list(range(6, 14))})], axis=1)

    # Easter: day offsets from Easter Sunday of the row's year, in three windows
    # (-2..10, 40..47 and 50..58), for all countries.
    easter_date = df.date.apply(lambda date: pd.Timestamp(easter.easter(date.year)))
    df = pd.concat([df, pd.DataFrame({f"easter{d}":
                   (df.date - easter_date == np.timedelta64(d, "D"))
                   for d in list(range(-2, 11)) + list(range(40, 48)) + list(range(50, 59))})], axis=1)
    
    return df

# Build the holiday indicator frames for train and test.
# get_holidays appends its indicator columns to the frame it is given, so dropping
# exactly the columns that went in leaves only the holiday indicators. Unlike a
# hand-written drop list, this stays correct whatever helper or prediction columns
# `train` and `test` carry when the cell is (re-)run; otherwise such a column would
# silently become a model feature.
train_copy = train.copy()
train_copy['date'] = pd.to_datetime(train_copy.date)
holidays_train_df = get_holidays(train_copy).drop(columns=train_copy.columns)

test_copy = test.copy()
test_copy['date'] = pd.to_datetime(test_copy.date)
holidays_test_df = get_holidays(test_copy).drop(columns=test_copy.columns)

In [None]:
# Inspect the names of the holiday indicator columns generated for the test set.
holidays_test_df.columns

In [None]:
'''
def unofficial_hol(df):
    countries = {'Finland': 1, 'Norway': 2, 'Sweden': 3}
    stores = {'KaggleMart': 1, 'KaggleRama': 2}
    products = {'Kaggle Mug': 1,'Kaggle Hat': 2, 'Kaggle Sticker': 3}
    
    # load holiday info.
    hol_path = '../input/public-and-unofficial-holidays-nor-fin-swe-201519/holidays.csv'
    holiday = pd.read_csv(hol_path)
    
    fin_holiday = holiday.loc[holiday.country == 'Finland']
    swe_holiday = holiday.loc[holiday.country == 'Sweden']
    nor_holiday = holiday.loc[holiday.country == 'Norway']
    df['fin holiday'] = df.date.isin(fin_holiday.date).astype(int)
    df['swe holiday'] = df.date.isin(swe_holiday.date).astype(int)
    df['nor holiday'] = df.date.isin(nor_holiday.date).astype(int)
    df['holiday'] = np.zeros(df.shape[0]).astype(int)
    df.loc[df.country == 'Finland', 'holiday'] = df.loc[df.country == 'Finland', 'fin holiday']
    df.loc[df.country == 'Sweden', 'holiday'] = df.loc[df.country == 'Sweden', 'swe holiday']
    df.loc[df.country == 'Norway', 'holiday'] = df.loc[df.country == 'Norway', 'nor holiday']
    df.drop(['fin holiday', 'swe holiday', 'nor holiday'], axis=1, inplace=True)
    
    return df

train_copy = train.copy()
train_copy['date'] = pd.to_datetime(train_copy.date)
train_unofficial_hol = unofficial_hol(train_copy)
train_unofficial_hol = get_interactions(train_unofficial_hol)
train_unofficial_hol = get_GDP(train_unofficial_hol)


#holidays_train_df = holidays_train_df.set_index('date')

test_copy = test.copy()
test_copy['date'] = pd.to_datetime(test_copy.date)
test_unofficial_hol = unofficial_hol(test_copy)
test_unofficial_hol = get_interactions(test_unofficial_hol)
test_unofficial_hol = get_GDP(test_unofficial_hol)
#holidays_test_df = holidays_test_df.set_index('date')


train_unofficial_hol = train_unofficial_hol.drop(['row_id', 'date', 'country',  'store',  'product',  'num_sold',
 'linear_model_ver1',  'linear_model_ver2',  'year',  'linear_model_ver3',  'month',
 'day',  'dayofyear',  'dec_jan_holdy',  'dec_jan_pre_holdy'],axis=1)
test_unofficial_hol = test_unofficial_hol.drop(['row_id',  'date', 'country',  'store',  'product',  'linear_model_ver1',
 'linear_model_ver2',  'linear_model_ver3','year'],axis=1)
 
'''


In [None]:
#list(train_unofficial_hol.columns)

In [None]:
# Version-4 design matrix: the seasonal features plus the holiday indicators.
X_test = X_Season.loc['2019-01-01':]
X = X_Season.loc[:'2018-12-31']

# Put the holiday indicators on the same date index as X and convert bool -> float32.
# astype returns a new frame, so its result has to be assigned to take effect.
holidays_train_df = holidays_train_df.set_index(X.index).astype(np.float32)
X = pd.concat([X, holidays_train_df], axis=1)
# shuffle=False keeps chronological order: the last 10% of rows form the validation set.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, shuffle=False)

holidays_test_df = holidays_test_df.set_index(X_test.index).astype(np.float32)
X_test = pd.concat([X_test, holidays_test_df], axis=1)

# Fit the linear model on the extended features and store its predictions.
y = train['num_sold']
linear_model_ver4 = LinearRegression(fit_intercept=False)
linear_model_ver4.fit(X_train, y_train)
train_y_pred = linear_model_ver4.predict(X)
test_y_pred = linear_model_ver4.predict(X_test)
val_y_pred = linear_model_ver4.predict(X_val)
train['linear_model_ver4'] = train_y_pred
test['linear_model_ver4'] = test_y_pred

In [None]:
# Preview the first rows of the training design matrix used for linear_model_ver4.
X_train.head()

In [None]:
# 2015 only: per series, residuals of versions 4 and 3 (actual minus prediction),
# December on the left panel and January on the right panel.
grouped = train[train['year']==2015].groupby(['country','store','product'])
ncols = 1
nrows = int(np.ceil(grouped.ngroups / ncols))
fig, axes = plt.subplots(nrows=nrows, ncols=ncols*2, figsize=(20, 50), sharey=True)

# Residuals are plotted unscaled (the original divided by a constant 1).
for (ax_dec, ax_jan), (key, grp) in zip(np.reshape(axes, (-1, 2)), grouped):
    panels = ((ax_dec, grp[grp['month'] == 12], 'December'),
              (ax_jan, grp[grp['month'] == 1], 'January'))
    for ax, month_rows, month_name in panels:
        for version in ('ver4', 'ver3'):
            # String labels; both panels get a legend and a grid.
            ax.plot(month_rows['dayofyear'],
                    month_rows['num_sold'] - month_rows[f'linear_model_{version}'],
                    label=f'{key} {month_name} {version}')
        ax.legend()
        ax.grid()

In [None]:
# Validation SMAPE of version 4 and its relative improvement over version 3 (in %).
score_ver4 = SMAPE(y_val, val_y_pred)
improvement_pct = -(score_ver4-score_ver3)/score_ver3*100
print('previous_score-', score_ver3, 'new_score-', score_ver4, 'improvement-', improvement_pct, '%')

**Version 4 - Gradient Boosting Algorithm to learn the residual errors**

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor

# Predictions of the version-4 linear model on each split.
# (linear_model_ver3 could be substituted here to boost the version-3 residuals instead.)
lin_reg_pred_X_train, lin_reg_pred_X_val, lin_reg_pred_X_test = (
    linear_model_ver4.predict(split) for split in (X_train, X_val, X_test)
)

# residuals ==> (true_value - linear_model_prediction); these are the boosting targets
residual_y_val = y_val - lin_reg_pred_X_val
residual_y_train = y_train - lin_reg_pred_X_train

In [None]:
X_Season['month'] = X_Season.index.month
X_Season['day'] = X_Season.index.day
X_Season['year'] = X_Season.index.year

# Smaller feature set for the boosting stage: the one-hot indicators plus plain
# calendar parts. .copy() makes X_boosting an independent frame, so adding columns
# neither triggers SettingWithCopyWarning nor depends on view/copy behaviour.
X_boosting = train_test_df[['Finland','Norway', 'Sweden', 'KaggleMart', 'KaggleRama', 'Kaggle Hat','Kaggle Mug', 'Kaggle Sticker', 'year']].copy()
X_boosting['day'] = X_Season.index.day
X_boosting['month'] = X_Season.index.month
X_boosting['day_of_year'] = X_Season.index.day_of_year
X_boosting['day_of_week'] = X_Season.index.day_of_week
# Week of the year (1 to 53), stored as a plain integer column.
X_boosting['week'] = X_Season.index.weekofyear.astype('int')

X_boosting.tail(1)







In [None]:
# Give the boosting features the year-end holiday bump computed for X_Season.
X_boosting['dec_hol'] = X_Season.dec_jan_holdy

In [None]:
# Split the boosting feature set by date: up to 2018 is training data, 2019 is test.
# The holiday indicators are deliberately not appended to the boosting features here.
X_test = X_boosting.loc['2019-01-01':]
X = X_boosting.loc[:'2018-12-31']
# shuffle=False keeps chronological order: the last 10% of rows form the validation set.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, shuffle=False)

# Keep the holiday frames on the date index and as float32 for later use.
# astype returns a new frame, so its result has to be assigned to take effect.
holidays_train_df = holidays_train_df.set_index(X.index).astype(np.float32)
holidays_test_df = holidays_test_df.set_index(X_test.index).astype(np.float32)




In [None]:
# Preview the first rows of the boosting-stage test features.
X_test.head()

In [None]:
# Fit a large ensemble on the linear-model residuals, then pick the number of trees
# that minimises the validation error and refit with exactly that many.
gbrt_ver1 = GradientBoostingRegressor(max_depth=10, n_estimators=150, learning_rate=0.1)
gbrt_ver1.fit(X_train, residual_y_train)
residual_y_val_pred = gbrt_ver1.predict(X_val)
# staged_predict yields the prediction after 1, 2, ... trees, so errors[i] is the
# validation MSE of an ensemble with i + 1 trees.
errors = [mean_squared_error(residual_y_val, staged_pred) for staged_pred in gbrt_ver1.staged_predict(X_val)]
# argmin is a 0-based position: add 1 to turn it into a tree count (this also
# avoids requesting 0 estimators when the first stage is already the best).
bst_n_estimators = int(np.argmin(errors)) + 1
# Same learning rate as the search model, so the stage count carries over.
gbrt_best = GradientBoostingRegressor(max_depth=10, n_estimators=bst_n_estimators, learning_rate=0.1)
gbrt_best.fit(X_train, residual_y_train)

In [None]:
# Validation MSE of the residual model after each boosting stage.
plt.plot(errors, 'r-', label='Error change with n_estimators')
plt.xlabel('boosting stage (0-based)')
plt.ylabel('validation MSE of the residual model')
plt.legend()  # the label above is only shown once a legend is drawn
plt.grid()

In [None]:
# Residual predictions of the tuned gradient-boosting model on each split.
# (The extra, discarded predict call on X was removed; each split is predicted once.)
train_y_pred = gbrt_best.predict(X)
test_y_pred = gbrt_best.predict(X_test)
val_y_pred = gbrt_best.predict(X_val)
train['residual_for_ver1'] = train_y_pred


In [None]:
# Validation SMAPE of linear model + gradient-boosted residual, and the relative
# improvement over the linear model alone (in %).
hybrid_val_pred = lin_reg_pred_X_val+val_y_pred
score_ver5 = SMAPE(y_val, hybrid_val_pred)
improvement_pct = -(score_ver5-score_ver4)/score_ver4*100
print('previous_score-', score_ver4, 'new_score-', score_ver5, 'improvement-', improvement_pct, '%')

In [None]:
'''
output = pd.DataFrame()
output['row_id'] = test.row_id
output['num_sold'] = test.linear_model_ver4+test_y_pred
output.to_csv('/kaggle/working/submission.csv',index=False)
'''

In [None]:

# Histogram of what is left after the linear model and the boosted residual model.
remaining_error = train['num_sold']-train['linear_model_ver4']-train_y_pred
ax = plt.hist(remaining_error, bins=[-200,-100,-75,-50,-25,0,25,50,75,100,200])


In [None]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
# Grid-search an XGBoost regressor on the linear-model residuals (3-fold CV).
parameters_grid = [{'max_depth':[12,10,15], 'n_estimators':[100], 'learning_rate':[0.15,0.2,0.25]}]
xgboost_reg = XGBRegressor()
xboost_grid_search = GridSearchCV(estimator=xgboost_reg, param_grid=parameters_grid, cv=3)
xboost_grid_search.fit(X_train, np.array(residual_y_train))


In [None]:
# Keep the estimator refitted with the best parameter combination and display it.
xgb = xboost_grid_search.best_estimator_
xgb

In [None]:
# Residual predictions of the tuned XGBoost model on each split.
# (The extra, discarded predict call on X was removed; each split is predicted once.)
train_y_pred = xgb.predict(X)
test_y_pred = xgb.predict(X_test)
val_y_pred = xgb.predict(X_val)
train['residual_for_ver2'] = train_y_pred

In [None]:
# Validation SMAPE of linear model + XGBoost residual model.
score_ver6 = SMAPE(y_val, lin_reg_pred_X_val+val_y_pred)
# Improvement of this version (ver6) over the previous one (ver5), in percent;
# the original printed the ver5-over-ver4 figure again.
improvement_pct = -(score_ver6-score_ver5)/score_ver5*100
print('previous_score-',score_ver5,'new_score-',score_ver6,'improvement-',improvement_pct,'%')

In [None]:
# Histogram of what is left after the linear model and the XGBoost residual model.
remaining_error = train['num_sold']-train['linear_model_ver4']-train_y_pred
ax = plt.hist(remaining_error, bins=[-200,-100,-75,-50,-25,0,25,50,75,100,200])

In [None]:
'''
output = pd.DataFrame()
output['row_id'] = test.row_id
output['num_sold'] = test.linear_model_ver4+test_y_pred
output.to_csv('/kaggle/working/submission.csv',index=False)
'''

In [None]:
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
import time
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, TimeSeriesSplit
class HybridModel:
    """Two-stage regressor: ``model_1`` fits the target, ``model_2`` fits what is left.

    Parameters
    ----------
    model_1 : estimator with ``fit(X, y)`` / ``predict(X)``
        First-stage model, trained on the target itself.
    model_2 : estimator with ``fit(X, y)`` / ``predict(X)``
        Second-stage model, trained on ``y - model_1.predict(X)``.
    grid : dict or None
        When given (and non-empty), ``model_2`` is tuned with ``GridSearchCV`` over
        this parameter grid using a 3-split ``TimeSeriesSplit``.

    Attributes set by ``fit``: ``y_train_trend``, ``y_train_resid`` and, when a grid is
    used, ``grid_model``. Attributes set by ``predict``: ``y_test_trend``, ``y_test_resid``.
    """

    def __init__(self, model_1, model_2, grid=None):
        self.model_1, self.model_2, self.grid = model_1, model_2, grid

    def fit(self, X_train_1, X_train_2, y):
        """Fit both stages; ``X_train_1`` feeds model 1 and ``X_train_2`` feeds model 2."""
        # Stage 1: fit on the raw target and keep its in-sample prediction.
        self.model_1.fit(X_train_1, y)
        trend_fit = self.model_1.predict(X_train_1)

        # Stage 2 learns what stage 1 could not explain.
        leftover = y - trend_fit

        if not self.grid:
            # Plain fit of the second-stage model.
            self.model_2.fit(X_train_2, leftover)
            resid_fit = self.model_2.predict(X_train_2)
        else:
            # Tune the second-stage model with time-ordered cross-validation.
            search = GridSearchCV(
                estimator=self.model_2,
                cv=TimeSeriesSplit(n_splits=3),
                param_grid=self.grid,
            )
            search.fit(X_train_2, leftover)
            resid_fit = search.predict(X_train_2)
            self.grid_model = search

        # Kept for residual analysis of the training fit.
        self.y_train_trend = trend_fit
        self.y_train_resid = resid_fit

    def predict(self, X_test_1, X_test_2):
        """Return stage-1 prediction plus stage-2 residual prediction."""
        trend_pred = self.model_1.predict(X_test_1)

        # Use the tuned search object when a grid was supplied, else the raw model.
        second_stage = self.grid_model if self.grid else self.model_2
        resid_pred = second_stage.predict(X_test_2)

        combined = trend_pred + resid_pred

        # Kept for inspection of the two components.
        self.y_test_trend = trend_pred
        self.y_test_resid = resid_pred

        return combined

In [None]:
# Feature matrices for the hybrid model.
# Stage 1 (linear): seasonal features + holiday indicators.
X = X_Season.loc[:'2018-12-31']
X_test = X_Season.loc['2019-01-01':]
X_train_1 = pd.concat([X, holidays_train_df], axis=1)
X_test_1 = pd.concat([X_test, holidays_test_df], axis=1)

## boosting
# Stage 2: the smaller boosting feature set + holiday indicators.
X_train_2 = pd.concat([X_boosting.loc[:'2018-12-31'], holidays_train_df], axis=1)
X_test_2 = pd.concat([X_boosting.loc['2019-01-01':], holidays_test_df], axis=1)


In [None]:
# Preview the first rows of the stage-1 (linear) training features of the hybrid model.
X_train_1.head()

In [None]:
# Ensemble of three hybrid models: the same linear first stage combined with a
# LightGBM, a CatBoost and an XGBoost second stage; their predictions are averaged.
model_1=LinearRegression()
models_2=[LGBMRegressor(random_state=0), CatBoostRegressor(random_state=0, verbose=False), XGBRegressor(random_state=0)]

# Parameter grid
# 6 x 6 x 5 = 180 combinations, searched separately for each second-stage model.
param_grid = {'n_estimators': [10,30,50,75,100,200],
        'max_depth': [2,3,4, 5,10,20],
        'learning_rate': [0.04,0.06,0.08,0.09,0.15]}

# Initialise output vectors
# (running sums over the models, turned into averages after the loop)
y_pred=np.zeros(len(X_test_1))
train_preds=np.zeros(len(y))

# Ensemble predictions
for model_2 in models_2:
    # Start timer
    start = time.time()
    
    # Construct hybrid model
    model = HybridModel(model_1, model_2, grid=param_grid)

    # Train model
    # The target is modelled in log space; predictions are mapped back with exp below.
    model.fit(X_train_1, X_train_2, np.log(y))

    # Save predictions
    y_pred += np.exp(model.predict(X_test_1,X_test_2))
    
    # Training set predictions (for residual analysis)
    train_preds += np.exp(model.y_train_trend+model.y_train_resid)
    
    # Stop timer
    stop = time.time()
    
    print(f'Model_2:{model_2} -- time:{round((stop-start)/60,2)} mins')
    
    if model.grid:
        print('Best parameters:',model.grid_model.best_params_,'\n')
    
# Scale
# Divide the summed predictions by the number of models to get the ensemble mean.
y_pred = y_pred/len(models_2)
train_preds = train_preds/len(models_2)

In [None]:
# SMAPE of the ensemble on the full training set (in-sample, not a validation score).
score_ver7 = SMAPE(y_true=y, y_pred=train_preds)
score_ver7

In [None]:
# Mean (printed) and standard deviation (displayed) of the ensemble's training residual.
ensemble_residual = train['num_sold']-train_preds
print(ensemble_residual.mean())
ensemble_residual.std()

In [None]:
# Same summary as the previous cell: mean (printed) and standard deviation
# (displayed) of the ensemble's training residual.
ensemble_residual = train['num_sold']-train_preds
print(ensemble_residual.mean())
ensemble_residual.std()

In [None]:

# Mean (printed) and standard deviation (displayed) of the training residual left by
# the version-4 linear model plus the most recent residual model (train_y_pred).
boosted_residual = train['num_sold']-train['linear_model_ver4']-train_y_pred
print(boosted_residual.mean())
boosted_residual.std()

In [None]:
# Histogram of the ensemble's training residual.
ensemble_residual = train['num_sold']-train_preds
ax = plt.hist(ensemble_residual, bins=[-200,-100,-75,-50,-25,0,25,50,75,100,200])

In [None]:
# Same figure as the previous cell: histogram of the ensemble's training residual.
ensemble_residual = train['num_sold']-train_preds
ax = plt.hist(ensemble_residual, bins=[-200,-100,-75,-50,-25,0,25,50,75,100,200])

In [None]:
# Per series: training residual of the hybrid ensemble ('pred') against the residual
# of linear model ver4 + gradient-boosting residual model ('pred gb').
train['ensemble_ver_1'] = train_preds
grouped = train.groupby(['country','store','product'])
ncols = 1
nrows = int(np.ceil(grouped.ngroups / ncols))
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(20, 50), sharey=True)
# np.ravel also copes with the single-Axes case; zip pairs each series with its panel.
for ax, (key, grp) in zip(np.ravel(axes), grouped):
    ax.plot(grp['row_id'], grp['num_sold'] - grp['ensemble_ver_1'], label='pred')
    ax.plot(grp['row_id'], grp['num_sold'] - grp['linear_model_ver4'] - grp['residual_for_ver1'], label='pred gb')
    ax.set_title(str(key))  # the legend entries do not say which series a panel shows
    ax.legend()

In [None]:
# Write the averaged ensemble predictions for the test rows as the submission file.
output = pd.DataFrame({'row_id': test.row_id, 'num_sold': y_pred})
output.to_csv('/kaggle/working/submission.csv', index=False)