In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import optuna
from sklearn.metrics import mean_squared_error
import seaborn as sns
import warnings
from sklearn.linear_model import Ridge
warnings.filterwarnings("ignore")
from sklearn.preprocessing import MinMaxScaler

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

This is my first entry into a TPS competition. From my experience with time series data, and from what I gathered by going through the discussion forum and public notebooks, I can make some key points:
1. GDP data of the countries should help capture the year-on-year increasing trend in the data
2. Decision trees (XGBoost, LGBM etc.) alone are not likely to give sound predictions, at least not without some ingenious feature engineering. This is because tree-based models do not capture macro trends and are consequently poor at extrapolation. In our case, for instance, a tree model will branch on some condition such as GDP>X and will treat all data instances satisfying that condition similarly. Hence, when I used only an LGB Regressor, my test predictions and validation predictions (2018 data) were the same.
3. Linear models are likely to perform better at capturing this macro trend of yearly increment in sales.
4. A custom objective function that penalizes underpredictions more than overpredictions should help, given that the evaluation metric (SMAPE) behaves similarly. Refer to [this discussion](https://www.kaggle.com/c/tabular-playground-series-jan-2022/discussion/300611) to understand more.

In [None]:
# Competition train/test tables
train=pd.read_csv('../input/tabular-playground-series-jan-2022/train.csv') 
test=pd.read_csv('../input/tabular-playground-series-jan-2022/test.csv')
# External data sets: holidays per country, and GDP indexed by year (the per-country GDP_* columns are renamed in the preprocessing cell)
hol=pd.read_csv('../input/holidays/holidays.csv') #List of holidays in Nordic countries
gdp=pd.read_csv('../input/gdp-nordic1/GDP_data_2015_to_2019_Finland_Norway_Sweden.csv',index_col='year')

In [None]:
# Preprocessing: normalise product names, align the holiday/GDP column names with the
# sales tables, parse all dates, and attach the holiday name (if any) to every sales row.

# "Kaggle Hat" -> "Kaggle_Hat": join the first two space-separated words with an underscore
product_words=train['product'].str.split(' ')
train['product']=product_words.str[0]+'_'+product_words.str[1]
product_words=test['product'].str.split(' ')
test['product']=product_words.str[0]+'_'+product_words.str[1]

# Use the same key names as train/test so the holiday table can be merged on them
hol=hol.rename(columns={'Date':'date','Country':'country'})
hol['date']=pd.to_datetime(hol['date'])
train['date']=pd.to_datetime(train['date'])
test['date']=pd.to_datetime(test['date'])

# Left merge keeps every sales row; 'Name' is NaN on days without a holiday in that country
holiday_names=hol[['country','Name','date']]
train=pd.merge(train,holiday_names,on=['country','date'],how='left')
test=pd.merge(test,holiday_names,on=['country','date'],how='left')

# GDP columns are renamed to the bare country names so they can be looked up by country
gdp=gdp.rename(columns={'GDP_Finland':'Finland','GDP_Norway':'Norway','GDP_Sweden':'Sweden'})

In [None]:
#Extracting necessary information from date field
def date_extraction(train,gdp):
    """Add calendar features and the yearly GDP of each row's country.

    Parameters
    ----------
    train : DataFrame with a datetime64 ``date`` column and a ``country`` column.
    gdp : DataFrame indexed by year with one column per country name.

    Returns
    -------
    A copy of ``train`` with the added columns ``weekday``, ``month``, ``year``,
    ``quarter``, ``day_of_year``, ``week_of_year``, ``day``, ``week_of_mon`` and ``gdp``.
    The input frame is left untouched so the cell can be re-run safely.
    """
    train=train.copy()
    dates=train['date'].dt
    train['weekday']=dates.weekday
    train['month']=dates.month
    train['year']=dates.year
    train['quarter']=dates.quarter
    train['day_of_year']=dates.dayofyear
    train['week_of_year']=dates.isocalendar().week.astype(int)
    train['day']=dates.day
    # Week of month as described in the notebook text: days 1-7 -> 1, 8-14 -> 2, ...,
    # 29-31 -> 5.  (The previous np.round(day/7)+1 put days 1-3 in week 1 and
    # days 25-31 in week 5.)
    train['week_of_mon']=(train['day']-1)//7+1
    # gdp.unstack() yields a Series keyed by (country, year); map every row's pair to its GDP.
    # Pairs missing from the GDP table map to None/NaN.
    gdp_lookup=gdp.unstack().to_dict()
    train['gdp']=train.set_index(['country','year']).index.map(gdp_lookup.get)
    return train
# Apply the same date/GDP feature extraction to both tables (train first, then test)
train,test=[date_extraction(frame,gdp) for frame in (train,test)]

In [None]:
# Side-by-side histograms of the raw target and its natural log
fig,(raw_ax,log_ax)=plt.subplots(1,2,figsize=(25,8))
raw_ax.hist(train['num_sold'])
raw_ax.set_title('num_sold')
log_ax.hist(np.log(train['num_sold']))
log_ax.set_title('log(num_sold)')
plt.show()

The log transformation of num_sold is reasonably close to a normal distribution. Linear regression works best when the residuals are approximately normal, so it might be better to predict the log of the variable. Besides, doing so also fits well with the SMAPE loss function.
Check this [discussion](https://www.kaggle.com/c/tabular-playground-series-jan-2022/discussion/300611) to know more about that.

# BASIC EDA

In [None]:
def country_week(train,prod):
    """Plot, for one product, the mean ``num_sold`` per week of month (1-5).

    One panel per (store, country) pair on a 2x3 grid, with one line per year 2015-2018.
    ``train`` must carry the ``week_of_mon`` and ``year`` columns.
    """
    fig,axes=plt.subplots(2,3,figsize=(20,10))
    weeks=np.arange(1,6)
    by_series=train.groupby(['country','store','product'])
    for row,store in enumerate(train['store'].unique()):
        for col,country in enumerate(train['country'].unique()):
            panel=axes[row][col]
            # All rows of this country/store/product, regrouped by (week, year) once per panel
            by_week_year=by_series.get_group((country,store,prod)).groupby(['week_of_mon','year'])
            for year in [2015,2016,2017,2018]:
                means=[by_week_year.get_group((week,year))['num_sold'].mean() for week in range(1,6)]
                panel.plot(weeks,means,label=year)
                panel.legend(loc='upper right')
            panel.set_title(country+' '+store)
            fig.suptitle(prod+' '+'weekly average',size=30)

Before moving to modeling, I performed some elementary EDA to extract some micro trends. Let's start with averaging the sales value for weeks(1-5) of every month. So, if the average for week 1 in 2015 is X, then in any given month, on average, the product was sold X times in week 1.

In [None]:
# One weekly-average figure per product
for product_name in ('Kaggle_Hat','Kaggle_Mug','Kaggle_Sticker'):
    country_week(train,product_name)

Here, I have defined week 1 as the first 7 days of the month and so on. Week 5 thus consists of only 2-3 days.

Observations:

1. Product sale trend is country dependent. The variation between products among different stores seem minor and erratic. 
2. On average,sales increase significantly towards the end of the month (week 5,last 2-3 days)
3. I'll create interaction features between week and country
4. Sales reduced in 2016 in Norway. Can be attributed to drop in GDP

In [None]:
def country_mon(train,prod):
    """Plot, for one product, the mean ``num_sold`` per calendar month (1-12).

    One panel per (store, country) pair on a 2x3 grid, with one line per year 2015-2018.
    """
    fig,axes=plt.subplots(2,3,figsize=(20,10))
    months=np.arange(1,13)
    by_series=train.groupby(['country','store','product'])
    for row,store in enumerate(train['store'].unique()):
        for col,country in enumerate(train['country'].unique()):
            panel=axes[row][col]
            # Regroup this country/store/product slice by (month, year) once per panel
            by_month_year=by_series.get_group((country,store,prod)).groupby(['month','year'])
            for year in [2015,2016,2017,2018]:
                means=[by_month_year.get_group((month,year))['num_sold'].mean() for month in range(1,13)]
                panel.plot(months,means,label=year)
                panel.legend(loc='upper right')
            panel.set_title(country+' '+store)
        fig.suptitle(prod+' '+'monthly average',size=30)

In [None]:
# One monthly-average figure per product
for product_name in ('Kaggle_Hat','Kaggle_Mug','Kaggle_Sticker'):
    country_mon(train,product_name)

Observations:
1. Sales increase during the last 2-3 months for all the products, but otherwise the trends differ.
2. Mug:
    2.1 Sales drop consistently till month 7 before beginning to rise again
    2.2 There's an increase, however, in month 4 which might be associated with Easter
3. Hat:
    3.1 Sales increase consistently from month 2-4 following which there's a drop till month 10
4. Sticker:
    4.1 Sales increase generally from month 2-5, but there's variation between countries
    4.2 In Norway, the sales peak in month 5. However, Sweden and Finland witness a second peak in month 6. Again this too varies
        across the 4 years
5. An interaction term between month product and country should capture these trends. For now, I'll avoid adding store to this interaction term, as the variation seems minor

In [None]:
def country_mons(train,prod,months=(5,6)):
    """Plot the mean daily ``num_sold`` of one product within selected months.

    Parameters
    ----------
    train : DataFrame with ``country``, ``product``, ``month``, ``day``, ``year``, ``num_sold``.
    prod : product name to plot.
    months : calendar month numbers to show, one column of panels per month.
        Defaults to ``(5, 6)``, the months the original version plotted.

    One row of panels per country, one line per year 2015-2018.  Every day of the
    month up to the largest day present must exist in each year, otherwise
    ``get_group`` raises ``KeyError``.
    """
    import calendar  # stdlib; only needed to spell out month names in the title

    countries=train['country'].unique()
    # squeeze=False keeps ax two-dimensional even for a single country or month
    f,ax=plt.subplots(len(countries),len(months),figsize=(20,10),squeeze=False)
    grouped=train.groupby(['month','product','country'])
    for i,con in enumerate(countries):
        for j,m in enumerate(months):
            db=grouped.get_group((m,prod,con))
            last_day=db['day'].max()
            x=np.arange(1,last_day+1)
            by_day_year=db.groupby(['day','year'])
            for y in [2015,2016,2017,2018]:
                l=[by_day_year.get_group((d,y))['num_sold'].mean() for d in range(1,last_day+1)]
                ax[i][j].plot(x,l,label=y)
                ax[i][j].legend(loc='upper left')
            ax[i][j].set_title(con+' '+'month'+' '+str(m))
    # The title used to be hard-coded to "April and December" although months 5 and 6
    # were plotted; it is now derived from the months actually shown.
    month_names=' and '.join(calendar.month_name[m] for m in months)
    f.suptitle(prod+' '+'daily average for '+month_names,size=20)

In [None]:
# One daily-average figure per product
for product_name in ('Kaggle_Hat','Kaggle_Mug','Kaggle_Sticker'):
    country_mons(train,product_name)

Observations:
1. Contrary to expectations, the peak of sales does not coincide with the holidays (Easter in April, and Christmas and NYE in December)
2. Sales in the last week of December surge after Christmas and then begin to decline around New year's eve. 30th December is generally the peak
3. In April, the sales peak in the week post Easter(7-8 days from Easter). 

I'll add features to address these trends

In [None]:
def country_weekday(train,prod):
    """Plot, for one product, the mean ``num_sold`` per weekday.

    One panel per (store, country) pair on a 2x3 grid, one line per year 2015-2018.
    The x axis runs 1-7, corresponding to ``weekday`` values 0-6.
    """
    fig,axes=plt.subplots(2,3,figsize=(20,10))
    positions=np.arange(1,8)
    by_series=train.groupby(['store','product','country'])
    for row,store in enumerate(train['store'].unique()):
        for col,country in enumerate(train['country'].unique()):
            panel=axes[row][col]
            by_weekday_year=by_series.get_group((store,prod,country)).groupby(['weekday','year'])
            for year in [2015,2016,2017,2018]:
                means=[by_weekday_year.get_group((wd,year))['num_sold'].mean() for wd in range(0,7)]
                panel.plot(positions,means,label=year)
                panel.legend(loc='lower right')
            panel.set_title(country+' '+store)
    fig.suptitle(prod+' '+'weekday average',size=20)

In [None]:
# One weekday-average figure per product
for product_name in ('Kaggle_Hat','Kaggle_Mug','Kaggle_Sticker'):
    country_weekday(train,product_name)

1. Sales increase significantly from Thursday to Saturday and then remain constant till Sunday. Ergo, I'll keep weekday as a ternary variable: 0 (Mon-Thu), 1 (Fri), 2 (Sat-Sun). Perhaps Sunday can be considered separately as well.
2. The trend between Saturday and Sunday is non-uniform
3. An interaction feature between weekday,country,product and store might help

In [None]:
def country_day_of_mon(train,prod):
    """Plot, for one product, the mean ``num_sold`` per day of month (1-31).

    One panel per (store, country) pair on a 2x3 grid, one line per year 2015-2018.
    """
    fig,axes=plt.subplots(2,3,figsize=(20,10))
    days=np.arange(1,32)
    by_series=train.groupby(['store','product','country'])
    for row,store in enumerate(train['store'].unique()):
        for col,country in enumerate(train['country'].unique()):
            panel=axes[row][col]
            by_day_year=by_series.get_group((store,prod,country)).groupby(['day','year'])
            for year in [2015,2016,2017,2018]:
                means=[by_day_year.get_group((day,year))['num_sold'].mean() for day in range(1,32)]
                panel.plot(days,means,label=year)
                panel.legend(loc='upper right')
            panel.set_title(country+' '+store)
        fig.suptitle(prod+' '+'day wise average',size=20)

In [None]:
# One day-of-month figure per product
for product_name in ('Kaggle_Hat','Kaggle_Mug','Kaggle_Sticker'):
    country_day_of_mon(train,product_name)

Nothing new here. Sales increase sharply towards the last 2-3 days of any month. I believe the week-of-month feature ('week_of_mon') discussed above should capture this trend. Otherwise, I'll include a day of the month (1-31) feature and see the performance.

**HOLIDAYS**

I will now try to gauge the impact of holidays on sales. How are sales affected on a holiday, the day before a holiday, and the day after a holiday.

In [None]:
# Flag "special" holidays: holidays whose mean num_sold (over all training years) exceeds
# the mean num_sold of that country's non-holiday days.
hol['is_special']=0
# NOTE: the loop variable keeps the name `con` and this country order because a later
# cell of the original notebook reads the leaked global `con`.
for con in ['Sweden','Norway','Finland']:
    db=train[train['country']==con]
    # 'Name' was merged on (country, date), so it is non-null exactly on this country's
    # holiday dates.  The previous version collected the holiday dates with
    # b.append(array), producing a list of arrays that isin() could not reliably match,
    # so holiday days were not dependably excluded from the baseline.
    on_holiday=db['Name'].notna()
    holiday_means=db[on_holiday].groupby('Name')['num_sold'].mean()
    baseline=db.loc[~on_holiday,'num_sold'].mean()
    special_names=holiday_means[holiday_means>baseline].index
    hol.loc[(hol['country']==con)&hol['Name'].isin(special_names),'is_special']=1

Assuming that certain holidays have more impact on sales, I have created an is_special feature to identify holidays whose average sales over the four years exceed the four-year average sales of the non-holiday days.

In [None]:
def holiday_feat(train,hol):
    """Add holiday-proximity flags and the ``is_special`` marker to a sales frame.

    For each flag the sales date is shifted by an offset and matched against the
    holiday table on (country, shifted date):

    * ``is_prev_holiday``  - offset -1: the previous day is a holiday
    * ``is_holiday``       - offset  0: the day itself is a holiday
    * ``is_foll_holiday``  - offset +1: the following day is a holiday

    Afterwards ``is_special`` is merged in from ``hol`` on (country, Name, date);
    missing ``is_special`` values become 0 and missing ``Name`` values become 'NONE'.

    Side effects: ``hol`` temporarily receives the columns 'temp' and the three flag
    names, all of which are dropped again before returning.  The frame passed in as
    ``train`` gets a 'temp' column assigned before the first merge rebinds the local
    name - callers should use the returned frame.
    """
    c=-1  # day offset applied to the sales date: -1, 0, +1 for the three flags in turn
    l=['is_prev_holiday','is_holiday','is_foll_holiday']
    for i,j in enumerate(l):
        hol[j]=1
        hol['temp']=hol['date']
        train['temp']=train['date']+pd.Timedelta(c,unit='D')
        # Left merge: rows whose shifted date is a holiday in their country get flag 1, others NaN
        train=pd.merge(train,hol[['temp','country',j]],how='left',on=('country','temp'))
        c+=1
        # A shifted date of 2014-12-31 is always flagged
        # (presumably so 1 Jan 2015 counts as following New Year's Eve - TODO confirm)
        train[j]=np.where(train['temp']==pd.to_datetime('2014-12-31'),1,train[j])
        train[j]=train[j].fillna(0)
    # Remove the helper columns again so hol is left with its original columns
    train.drop(columns=['temp'],inplace=True)
    hol.drop(columns=['temp'],inplace=True)
    hol.drop(columns=l,inplace=True)
    train=pd.merge(train,hol[['country','is_special','Name','date']],on=('country','Name','date'),how='left')
    train['is_special']=train['is_special'].fillna(0)
    train['Name']=train['Name'].fillna('NONE')
    return train
# Same holiday features for both tables
train=holiday_feat(train,hol)
test=holiday_feat(test,hol)

Three more features to identify whether a day is a holiday and to mark the days preceding and following a holiday. 

In [None]:
def hol_dist(df,con,y):
    """Plot KDEs of ``num_sold`` by holiday status for one country and year.

    Parameters
    ----------
    df : DataFrame with ``country``, ``store``, ``product``, ``year``, ``num_sold`` and
        the flags ``is_holiday``, ``is_prev_holiday``, ``is_foll_holiday``.
    con : country to plot.
    y : year to plot.

    One panel per (store, product) on a 2x3 grid; each panel overlays the distributions
    for holidays, days flagged is_prev_holiday, days flagged is_foll_holiday, and days
    with none of the three flags.
    """
    # Stores/products are taken from the frame that was passed in (the previous version
    # read them from the global `train`, ignoring the argument).
    stores=df['store'].unique()
    products=df['product'].unique()
    f,ax=plt.subplots(2,3,figsize=(20,10))
    grouped=df.groupby(['country','store','product','year'])
    for i,s in enumerate(stores):
        for j,p in enumerate(products):
            d1=grouped.get_group((con,s,p,y))
            unflagged=(d1['is_prev_holiday']==0)&(d1['is_foll_holiday']==0)&(d1['is_holiday']==0)
            curves=[
                (d1.loc[d1['is_holiday']==1,'num_sold'],'darkblue','is_holiday'),
                (d1.loc[d1['is_prev_holiday']==1,'num_sold'],'orange','is_prev_holiday'),
                (d1.loc[d1['is_foll_holiday']==1,'num_sold'],'red','is_foll_holiday'),
                (d1.loc[unflagged,'num_sold'],'green','none'),
            ]
            for values,colour,label in curves:
                # kdeplot replaces the deprecated distplot(hist=False, kde=True)
                sns.kdeplot(values,ax=ax[i][j],color=colour,label=label)
            ax[i][j].legend(loc='upper right')
            ax[i][j].set_title(s+' '+p)
    f.suptitle(con+' '+'holiday num_sold distributions for year'+' '+str(y),size=20)

In [None]:
# Holiday distributions for Finland, one figure per training year
for plot_year in (2015,2016,2017,2018):
    hol_dist(train,'Finland',plot_year)

Observation:

For some reason, the average sales on days following a holiday are generally higher than in other cases. I'll include two of these variables (is_prev_holiday and is_holiday in the model). Both are binary features.

In [None]:
def final_engineering(d,hol):
    """Final feature engineering on a frame produced by the earlier feature steps.

    Parameters
    ----------
    d : DataFrame with ``date``, ``day``, ``month``, ``weekday``, ``week_of_mon``,
        ``day_of_year``, ``Name``, ``is_holiday``, ``is_prev_holiday``, ``is_special``.
    hol : holiday table with ``Name`` and ``date`` columns.

    Returns
    -------
    A modified copy of ``d`` (the input is not changed) in which:

    * 24 Dec and 31 Dec count as holidays (``is_holiday`` and ``is_special`` set to 1),
      25 Dec and 1 Jan get ``is_prev_holiday`` = 1;
    * ``week_day`` (0 Mon-Thu, 1 Fri, 2 Sat-Sun) replaces ``weekday``;
    * ``is_last_week`` flags 26-30 Dec, and ``is_special`` is set to 4 on those days;
    * ``month``, ``week_day``, ``week_of_mon``, ``day``, ``day_of_year`` are replaced by
      ``cos_*`` / ``sin_*`` encodings;
    * ``easter_eff`` flags the days 2-7 days after any holiday whose name contains 'Easter';
    * ``is_special`` is set to 2 for Easter/New Year/Christmas holidays and finally has
      ``is_holiday`` added to it;
    * ``days_from_easter``, ``days_from_wed_jun``, ``days_from_sun_nov`` hold clipped day
      distances to those anchor dates.
    """
    import dateutil.easter as easter

    df=d.copy()
    in_december=df['month']==12
    christmas_eve=(df['day']==24)&in_december
    new_years_eve=(df['day']==31)&in_december

    df['is_holiday']=np.where(christmas_eve|new_years_eve,1,df['is_holiday'])
    df['is_prev_holiday']=np.where((df['day']==25)&in_december,1,df['is_prev_holiday'])
    df['is_prev_holiday']=np.where((df['day']==1)&(df['month']==1),1,df['is_prev_holiday'])
    # Both eves are marked special.  The previous version assigned the 31 Dec result to
    # `is_holiday` with `is_special` as the fallback, which overwrote the holiday flag
    # of every other row with its is_special value.
    df['is_special']=np.where(christmas_eve|new_years_eve,1,df['is_special'])

    # Three-level weekday: 0 = Mon-Thu, 1 = Fri, 2 = Sat/Sun
    df['week_day']=np.select([df['weekday']==4,df['weekday'].isin([5,6])],[1,2],default=0)

    # Days 26-30 Dec.  The previous loop reassigned is_last_week on every iteration,
    # so only 30 Dec ended up flagged.
    last_week=df['day'].between(26,30)&in_december
    df['is_last_week']=last_week.astype(int)
    df['is_special']=np.where(last_week,4,df['is_special'])
    df.drop(columns=['weekday'],inplace=True)

    # Cyclic encoding, scaled by the maximum present in *this* frame.
    # NOTE(review): train and test can have different maxima (e.g. day_of_year 366 vs 365),
    # which would make the encodings slightly inconsistent - confirm whether that matters.
    for col in ['month','week_day','week_of_mon','day','day_of_year']:
        angle=2*np.pi*df[col]/df[col].max()
        df['cos'+'_'+col]=np.cos(angle)
        df['sin'+'_'+col]=np.sin(angle)
        df.drop(columns=[col],inplace=True)

    # Days 2..7 after every 'Easter' holiday date (for any country) get easter_eff = 1
    spec=hol[hol['Name'].str.contains('Easter',na=False)]
    easter_window=[day+pd.Timedelta(i,unit='D') for day in spec['date'] for i in range(2,8)]
    df['easter_eff']=np.where(df['date'].isin(easter_window),1,0)

    for keyword in ['Easter','New','Christmas']:
        # na=False: a missing name must not be treated as a match by np.where
        df['is_special']=np.where(df['Name'].str.contains(keyword,na=False),2,df['is_special'])
    df['is_special']=df['is_special']+df['is_holiday']

    easter_timestamp = df.date.apply(lambda date: pd.Timestamp(easter.easter(date.year)))
    df['days_from_easter'] = (df.date - easter_timestamp).dt.days.clip(-3, 59)
    df.loc[df['days_from_easter'].isin(range(12, 39)), 'days_from_easter'] = 12 # reduce overfitting

    # Last Wednesday of June (years outside 2015-2019 map to NaT, giving NaN distances)
    wed_june_timestamp = df.date.dt.year.map({2015: pd.Timestamp(('2015-06-24')),
                                         2016: pd.Timestamp(('2016-06-29')),
                                         2017: pd.Timestamp(('2017-06-28')),
                                         2018: pd.Timestamp(('2018-06-27')),
                                         2019: pd.Timestamp(('2019-06-26'))})
    df['days_from_wed_jun'] = (df.date - wed_june_timestamp).dt.days.clip(-5, 5)

    # First Sunday of November (second Sunday is Father's Day)
    sun_nov_timestamp = df.date.dt.year.map({2015: pd.Timestamp(('2015-11-1')),
                                         2016: pd.Timestamp(('2016-11-6')),
                                         2017: pd.Timestamp(('2017-11-5')),
                                         2018: pd.Timestamp(('2018-11-4')),
                                         2019: pd.Timestamp(('2019-11-3'))})
    df['days_from_sun_nov'] = (df.date - sun_nov_timestamp).dt.days.clip(-1, 9)

    return df

Final feature engineering includes:
1. Converting ordinal date features to cyclic date features
2. Including a last_week feature to capture the surge in sales during the last week of the year
3. An Easter effect feature for the sales peak in the week following Easter
4. Combining is_holiday and is_special into a single feature

In [None]:
# Run the final feature engineering on both tables (train first, then test)
train,test=[final_engineering(frame,hol) for frame in (train,test)]

In [None]:
def interaction(df):
    """Log-transform GDP and add interaction features for the linear model.

    Parameters
    ----------
    df : DataFrame with ``country``, ``product``, ``gdp``, ``cos_month``, ``sin_month``,
        ``cos_week_of_mon`` and ``sin_week_of_mon``.

    Returns
    -------
    A copy of ``df`` (the input is not modified, so re-running does not take the log
    twice) with:

    * ``gdp`` replaced by its natural log;
    * ``{country}_{product}_month_{cos|sin}`` for Sweden/Norway x Mug/Sticker, equal to the
      month encoding on rows of that country and product and 0 elsewhere;
    * ``cos_week*mon`` and ``sin_week*mon``, products of the week-of-month and month encodings.
    """
    df=df.copy()
    df['gdp']=np.log(df['gdp'])
    for c in ['Sweden','Norway']:
        for p in ['Kaggle_Mug','Kaggle_Sticker']:
            # The previous version compared against `con`, an undefined local that resolved
            # to a leftover global, so every feature was built for the same wrong country.
            selected=(df['country']==c)&(df['product']==p)
            for func in ['cos','sin']:
                df[f'{c}_{p}_month_{func}']=selected*df[f'{func}_month']
    df['cos_week*mon']=df['cos_week_of_mon']*df['cos_month']
    df['sin_week*mon']=df['sin_week_of_mon']*df['sin_month']
    return df

In [None]:
# Add the interaction features to both tables
train=interaction(train)
test=interaction(test)
# Snapshot taken before get_dummy removes the country/store/product columns;
# gdp_regression receives it as `t` and groups by those columns for its plots
reg_train=train.copy()

In [None]:
def get_dummy(df):
    """One-hot encode ``country``, ``store`` and ``product`` (first level of each dropped).

    Returns a new frame: the remaining original columns in their original order,
    followed by the dummy columns for country, then store, then product.  Dummy columns
    are named after the category values themselves (no prefix).
    """
    categorical=['country','store','product']
    encoded=[pd.get_dummies(df[name],drop_first=True) for name in categorical]
    return pd.concat([df.drop(columns=categorical)]+encoded,axis=1)
# One-hot encode the categorical columns of both tables (train first, then test)
train,test=[get_dummy(frame) for frame in (train,test)]

# MODELING

I'll start with a linear model without the holiday features to capture the macro trend. I'll then train a tree model on the residuals of the linear model.

In [None]:
def gdp_regression(data,t,p,con,st,TEST,al,show=True):
    """Fit a Ridge model on log(num_sold) to capture the macro trend.

    Rows dated before 2018-01-01 are used for fitting, rows from 2018-01-01 onward for
    validation.  All non-dummy features are min-max scaled to [1, 2] with the minima and
    maxima of the fitting rows.

    Parameters
    ----------
    data : one-hot encoded training frame with ``date``, ``num_sold`` and the feature columns.
    t : frame with the same rows as ``data`` that still has ``country``, ``store`` and
        ``product`` columns; used only for plotting.
    p, st : unused; kept so existing calls keep working (every product and store is plotted).
    con : country whose validation fit is plotted when ``show`` is true.
    TEST : one-hot encoded test frame with the feature columns.
    al : Ridge ``alpha``.
    show : if true, plot true vs. predicted log sales for 2018, one panel per store/product.

    Returns
    -------
    (coef, df, te, preds, tr, sc) where ``coef`` is ``reg.coef_``; ``df`` is a copy of
    ``data`` with a ``res`` column (log(num_sold) minus the fitted value); ``te`` is a
    copy of ``TEST`` with scaled features and the predicted log sales in ``num_sold``;
    ``preds`` / ``tr`` are the raw ``reg.predict`` outputs for the validation / fitting
    rows; ``sc`` is the validation MSE on the log scale.
    """
    dummy_cols=['Norway', 'Sweden', 'KaggleRama', 'Kaggle_Mug', 'Kaggle_Sticker']
    features=['cos_month', 'sin_month', 'cos_week_day', 'sin_week_day',
       'cos_week_of_mon', 'sin_week_of_mon','Sweden_Kaggle_Mug_month_cos',
       'Sweden_Kaggle_Mug_month_sin', 'Sweden_Kaggle_Sticker_month_cos',
       'Sweden_Kaggle_Sticker_month_sin', 'Norway_Kaggle_Mug_month_cos',
       'Norway_Kaggle_Mug_month_sin', 'Norway_Kaggle_Sticker_month_cos',
       'Norway_Kaggle_Sticker_month_sin', 'cos_week*mon', 'sin_week*mon',
       'Norway', 'Sweden', 'KaggleRama', 'Kaggle_Mug', 'Kaggle_Sticker','gdp','cos_day','sin_day']
    # Everything except the one-hot columns is rescaled
    scaled_cols=[c for c in features if c not in dummy_cols]
    split_date=pd.to_datetime('2018-01-01')

    df=data.copy()
    te=TEST.copy()
    in_train=(df['date']<split_date).to_numpy()
    in_valid=(df['date']>=split_date).to_numpy()
    # Explicit copies so the assignments below never write into a view of the inputs
    temp=t[t['date']>=split_date].copy()
    x_tr=df.loc[in_train,features].copy()
    x_ts=df.loc[in_valid,features].copy()
    y_tr=np.log(df.loc[in_train,['num_sold']])
    y_ts=np.log(df.loc[in_valid,['num_sold']])

    # Min-max scale to [1, 2] using the fitting rows' statistics only (no validation/test leakage)
    mi=x_tr[scaled_cols].min()
    span=x_tr[scaled_cols].max()-mi
    for frame in (x_tr,x_ts,te):
        frame[scaled_cols]=((frame[scaled_cols]-mi)/span)+1

    reg=Ridge(alpha=al)
    reg.fit(x_tr,y_tr)
    preds=reg.predict(x_ts)
    tr=reg.predict(x_tr)
    temp['preds']=np.ravel(preds)
    te['num_sold']=np.ravel(reg.predict(te[features]))

    if show==True:
        f,ax=plt.subplots(2,3,figsize=(20,10))
        by_series=temp.groupby(['country','store','product'])
        for i,s in enumerate(temp['store'].unique()):
            for j,prod in enumerate(temp['product'].unique()):
                # Plot the store this panel is titled with (the previous version always
                # plotted `st`, so both rows showed the same store).
                panel=by_series.get_group((con,s,prod))
                ax[i][j].plot(np.log(panel['num_sold'].values),label='True')
                ax[i][j].plot(panel['preds'].values,label='Predicted')
                ax[i][j].legend(loc='upper right')
                ax[i][j].set_title(s+' '+prod)

    # Place fitted values by row mask so residuals stay aligned even if the frame is not
    # sorted with all pre-2018 rows first.
    fitted=np.full(len(df),np.nan)
    fitted[in_train]=np.ravel(tr)
    fitted[in_valid]=np.ravel(preds)
    df['res']=np.log(df['num_sold'])-fitted
    sc=mean_squared_error(y_ts,preds)
    return reg.coef_,df,te,preds,tr,sc

In [None]:
# Fit the Ridge trend model with alpha=1 and plot the 2018 validation fit for Norway.
# Returned: coefficients, train frame with residuals (df), test frame with predictions (te),
# validation predictions (va), fitted values on the fitting rows (tr) and the validation MSE (sc).
gdp_slope,df,te,va,tr,sc=gdp_regression(train,reg_train,'Kaggle_Mug','Norway','KaggleRama',test,1,show=True)

In [None]:
# Sweep Ridge alpha from 0.1 to 1.9 and print the validation MSE for each value.
# The sweep results go into a throwaway variable: unpacking into gdp_slope/df/te/va/tr/sc
# (as before) silently replaced the alpha=1 fit from the previous cell with the last
# sweep value (alpha=1.9) for every downstream cell.
for i in range(1,20):
    sweep_result=gdp_regression(train,reg_train,'Kaggle_Mug','Norway','KaggleRama',test,i/10,show=False)
    print(i/10,sweep_result[-1])

The residuals, I believe, correspond to holidays and other seasonality factors.

In [None]:
# Copy the linear-model residuals (log scale) onto train; the assignment aligns on the row index
train['res']=df['res']

In [None]:
# Time-based split: rows before 2018 are used for fitting (X), 2018 rows for validation (Y)
split_date=pd.to_datetime('2018-01-01')
X=train[train['date']<split_date]
Y=train[train['date']>=split_date]

In [None]:
# Features for the tree model that is trained on the residuals of the linear model
tree_features=['cos_day','sin_day','cos_month','sin_month','cos_week_of_mon','sin_week_of_mon',
               'sin_week_day','cos_week_day','is_special','Sweden','Norway','KaggleRama',
               'Kaggle_Mug','Kaggle_Sticker','days_from_easter', 'days_from_wed_jun', 'days_from_sun_nov']
x_train,x_test=X[tree_features],Y[tree_features]
y_test=Y[['res']]
y_train=X[['res']]

In [None]:
# Validation-set predictions of the linear model, flattened to 1-D.
# np.ravel replaces the hard-coded reshape(6570,), which broke whenever the number of
# validation rows changed.
va=np.ravel(va)
grid = {'n_estimators':150,
        'max_depth': 6,
        'learning_rate': 0.1}
# Baseline LightGBM fitted on the residuals of the linear model
model=LGBMRegressor(**grid)
#model=LGBMRegressor(lambda_l1=0, lambda_l2=40, learning_rate=0.2823128119188338,
             # max_depth=20, n_estimators=1000, num_leaves=27)
model.fit(x_train,y_train)
lgb_pred=model.predict(x_test)
# Final log-scale prediction = linear trend + predicted residual
pred=lgb_pred+va

In [None]:
# Horizontal bar chart of the residual model's feature importances, one bar per feature column
plt.barh(x_train.columns,model.feature_importances_)

In [None]:
def SMAPE(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent (0-200).

    SMAPE = mean( 200 * |y_true - y_pred| / (|y_true| + |y_pred|) )

    Terms where both values are 0 contribute 0 instead of NaN.  Inputs are converted to
    float arrays and compared position by position.
    """
    y_true = np.atleast_1d(np.asarray(y_true, dtype=float))
    y_pred = np.atleast_1d(np.asarray(y_pred, dtype=float))
    # |y_true| was missing before; it only matters for negative targets
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 200.0
    diff = np.abs(y_true - y_pred)
    # Divide only where the denominator is non-zero; the remaining entries stay 0
    ratio = np.divide(diff, denominator, out=np.zeros_like(diff), where=denominator != 0)
    return np.mean(ratio)

def objective(trial,x_train,y_train,x_test,y_test,Y,va):
    """Optuna objective: validation SMAPE of linear trend + LightGBM residual model.

    Parameters
    ----------
    trial : Optuna trial used to sample the LightGBM hyper-parameters.
    x_train, y_train : features and residual targets of the fitting rows.
    x_test, y_test : features and residual targets of the validation rows (also used
        for early stopping).
    Y : validation frame holding the true ``num_sold``.
    va : log-scale predictions of the linear model for the validation rows.

    Returns the SMAPE between the true sales and exp(residual prediction + va).

    NOTE(review): the validation rows drive both early stopping and the reported score,
    so the tuned score is optimistic.
    """
    # lightgbm is already a dependency of this notebook; imported here to keep the cell self-contained
    from lightgbm import early_stopping, log_evaluation

    params = {
        'max_depth': trial.suggest_int('max_depth',3, 20),
        'num_leaves':trial.suggest_int('num_leaves',10,1500),
        'n_estimators':trial.suggest_int('n_estimators',150,1000,step=50),
        'learning_rate':trial.suggest_float('learning_rate',0.01,0.3),
        'lambda_l1':trial.suggest_int('lambda_l1',0,100,step=5),
        'lambda_l2':trial.suggest_int('lambda_l2',0,100,step=5)
    }
    model = LGBMRegressor(**params,
                         random_state=42)
    # The fit() keywords early_stopping_rounds/verbose were removed in LightGBM 4;
    # the callbacks below give the same behaviour (stop after 20 stale rounds, no logging).
    model.fit(x_train, y_train, eval_set=[(x_test, y_test)],
              callbacks=[early_stopping(20, verbose=False), log_evaluation(0)])
    preds = model.predict(x_test)+np.ravel(va)
    # Flatten without assuming a fixed number of validation rows (was reshape(6570,))
    score = SMAPE(Y['num_sold'].to_numpy(), np.exp(preds))

    return score
# Seeded sampler so the hyper-parameter search gives the same result on every run
study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
func = lambda trial: objective(trial, x_train,y_train,x_test,y_test,Y,va)
study.optimize(func, n_trials=30)
print('Number of finished trials:', len(study.trials))
print('Best trial parameter:', study.best_trial.params)
# Refit with the best parameters, using the same random_state as during tuning
model=LGBMRegressor(**study.best_trial.params, random_state=42)
model.fit(x_train,y_train)

In [None]:
# Validation prediction on the log scale: residual model output plus the linear-model prediction
lgb_pred=model.predict(x_test)
pred=lgb_pred+va

In [None]:
# Validation SMAPE on the original scale (no hard-coded row count)
score=SMAPE(Y['num_sold'].to_numpy(),np.exp(pred))
score

In [None]:
#Training score: linear-model fit (tr) plus residual-model prediction, back on the original scale.
#np.ravel / to_numpy replace the hard-coded reshape(19728,).
SMAPE(X['num_sold'].to_numpy(),np.exp(np.ravel(tr)+model.predict(x_train)))

In [None]:
# Index the test frame by row_id; guarded so that re-running the cell does not raise
# a KeyError once row_id has already become the index
if 'row_id' in test.columns:
    test.set_index('row_id',drop=True,inplace=True)
t=np.ravel(te['num_sold'].to_numpy()) #test set predictions from the linear model

In [None]:
# Predict the residuals for the test set, add the linear-model prediction, and map back
# from the log scale; predictions are rounded up to whole units
submission_features=['cos_day','sin_day','cos_month','sin_month','cos_week_of_mon','sin_week_of_mon',
                     'sin_week_day','cos_week_day','is_special','Sweden','Norway','KaggleRama',
                     'Kaggle_Mug','Kaggle_Sticker','days_from_easter', 'days_from_wed_jun', 'days_from_sun_nov']
residual_test=model.predict(test[submission_features])
preds_test=np.ceil(np.exp(residual_test+t))
output=pd.DataFrame({'row_id':test.index,'num_sold':preds_test})
output.to_csv('submission.csv',index=False)
output.head()