In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime 
import warnings
warnings.filterwarnings("ignore")

In [None]:
df = pd.read_csv("../input/tabular-playground-series-jan-2022/train.csv", index_col = 'row_id')

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
# Checking variance of the numeric features.
# The frame still holds string columns (date, country, store, product), so the
# reduction is restricted to numeric columns explicitly instead of relying on
# pandas silently dropping the non-numeric ones.
df.var(numeric_only=True)

In [None]:
# Checking skewness of the numeric features.
# numeric_only=True keeps the string columns (date, country, store, product)
# out of the reduction, which would otherwise raise on recent pandas versions.
df.skew(numeric_only=True)

## EDA

In [None]:
sns.histplot(df['num_sold'], kde = True)

In [None]:
# Checking outliers.
# The vector is passed by keyword so the plot is drawn along the x axis
# regardless of how the installed seaborn interprets a positional argument.
sns.boxplot(x=df['num_sold'])

In [None]:
# Distribution of sales per country, split by store
fig, ax = plt.subplots(figsize=(10, 8))
sns.boxplot(data=df, x='country', y='num_sold', hue='store', ax=ax)

### Norway has the highest `num_sold` for both stores.
### Finland has the lowest `num_sold`.
### The KaggleRama store sold more products than KaggleMart in every country.

In [None]:
df['product'].value_counts()

In [None]:
# Distribution of sales per country, split by product
fig, ax = plt.subplots(figsize=(10, 8))
sns.boxplot(data=df, x='country', y='num_sold', hue='product', ax=ax)

### Norway has the highest `num_sold` for all the products.
### Finland has the lowest `num_sold`.
### Kaggle Hat is the best-selling product in every country.

In [None]:
# Distribution of sales per store, split by product
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(data=df, x='store', y='num_sold', hue='product', ax=ax)

### Kaggle Hat is the best-selling product in both stores, and Kaggle Sticker is the worst-selling.

In [None]:
df['date'].dtype

In [None]:
# Date time conversion
df['date'] = pd.to_datetime(df['date'])

In [None]:
# Calendar features derived from the parsed `date` column.
# Maps each new column name to the `.dt` accessor attribute it is read from.
calendar_attrs = {
    'day': 'day',
    'month': 'month',
    'year': 'year',
    'weekday': 'dayofweek',
    'quarter': 'quarter',
    'yearday': 'dayofyear',
}
for column_name, dt_attr in calendar_attrs.items():
    df[column_name] = getattr(df['date'].dt, dt_attr)

In [None]:
df.head(-10)

In [None]:
# Distribution of sales per day of the week
fig, ax = plt.subplots(figsize=(15, 7))
sns.boxplot(data=df, x='weekday', y='num_sold', ax=ax)
plt.show()

### Sales increase on weekends (weekday 5 = Saturday and weekday 6 = Sunday).

In [None]:
# Distribution of sales per month, split by country
fig, ax = plt.subplots(figsize=(18, 8))
sns.boxplot(data=df, x='month', y='num_sold', hue='country', ax=ax)
plt.show()

### January, April and December saw an increase in sales, with Kaggle Hat leading the way.
### The reason could be festive seasons and national holidays.

In [None]:
# fig = plt.figure(figsize = (20,5))
# sns.lineplot(x = df['date'], y = df['num_sold'], hue = df['country'])
# plt.show()

In [None]:
df['num_sold'].groupby(df['country']).mean()

In [None]:
# fig = plt.figure(figsize = (20,5))
# sns.lineplot(x = df['date'], y = df['num_sold'], hue = df['store'])
# plt.show()

In [None]:
# Distribution of sales per year, split by country
fig, ax = plt.subplots(figsize=(15, 8))
sns.boxplot(data=df, x='year', y='num_sold', hue='country', ax=ax)
plt.show()

### A slow increase in sales can be seen year after year.

## Data Cleaning and feature engineering

In [None]:
df['num_sold'].var()

In [None]:
# Variation too high, Log transformation on num_sold
# df['num_sold'] = np.log1p(df['num_sold'])

In [None]:
from scipy.stats import boxcox

# boxcox returns (transformed values, fitted lambda); the lambda is kept in
# `lam` because it is needed later to invert the transform on predictions.
out = boxcox(df['num_sold'])
df['num_sold'], lam = out

In [None]:
sns.histplot(df['num_sold'], kde = True)

In [None]:
# Re-check the target after the Box-Cox transform: no rows were removed, the
# transform only rescales the values, so look at the boxplot, variance and
# skewness again to see how the spread and tails changed.
sns.boxplot(x=df['num_sold'])  # keyword form: same orientation on every seaborn version
print(df['num_sold'].var())
df['num_sold'].skew()

In [None]:
df.head()

In [None]:
# Using Holidays data.
# The `date` column is parsed on load so that the later `isin` checks compare
# datetimes with datetimes instead of relying on pandas casting strings.
holiday = pd.read_csv(
    "../input/public-and-unofficial-holidays-nor-fin-swe-201519/holidays.csv",
    parse_dates=['date'],
)
holiday

In [None]:
# Start with "no holiday" everywhere; the per-country value is filled in later.
df['holiday'] = 0

# One holiday table per country
fin_holiday, swe_holiday, nor_holiday = (
    holiday.loc[holiday['country'] == country_name]
    for country_name in ('Finland', 'Sweden', 'Norway')
)

# Flag every row whose date appears in the given country's holiday table
# (independent of the row's own country at this stage).
per_country_tables = (
    ('fin holiday', fin_holiday),
    ('swe holiday', swe_holiday),
    ('nor holiday', nor_holiday),
)
for flag_column, country_table in per_country_tables:
    df[flag_column] = df['date'].isin(country_table['date']).astype(int)

In [None]:
df.head()

In [None]:
# For each row, copy the flag that belongs to the row's own country into `holiday`
country_flag_columns = (
    ('Finland', 'fin holiday'),
    ('Sweden', 'swe holiday'),
    ('Norway', 'nor holiday'),
)
for country_name, flag_column in country_flag_columns:
    in_country = df['country'] == country_name
    df.loc[in_country, 'holiday'] = df.loc[in_country, flag_column]

In [None]:
# df[(df['holiday']==1) | (df['weekday'] == 5) | (df['weekday'] == 6)]['holiday'] = 1

In [None]:
df.head()

In [None]:
# Using Per capita GDP as well 
gdpc = pd.read_csv("../input/gdp-per-capita-finland-norway-sweden-201519/GDP_per_capita_2015_to_2019_Finland_Norway_Sweden.csv")
gdpc

In [None]:
# Joining the data.
# `gdpc` is wide (one row per year, one column per country). It is reshaped to
# long form and merged on (year, country) in one vectorized step, replacing the
# per-row chained assignment `df['gdpc'].iloc[i] = ...`, which is slow and does
# not write back to `df` when pandas Copy-on-Write is active.
gdpc_long = gdpc.melt(id_vars='year', var_name='country', value_name='gdpc')
df['gdpc'] = (
    df[['year', 'country']]
    # how='left' keeps the row order of df; validate guards against duplicate
    # (year, country) pairs in the GDP table silently multiplying rows.
    .merge(gdpc_long, on=['year', 'country'], how='left', validate='many_to_one')['gdpc']
    .to_numpy()  # positional assignment: the merge result has a fresh RangeIndex
)

In [None]:
df.head()

In [None]:
# Checking seasonality using FFT

from scipy.fftpack import fft

# The result is stored under its own name: assigning it back to `fft` would
# shadow the imported function and make this cell fail when it is run again.
# `np` and `plt` come from the import cell at the top of the notebook.
spectrum = fft((df['num_sold'] - df['num_sold'].mean()).values)
plt.plot(np.abs(spectrum))

In [None]:
df.columns

In [None]:
import math
def fourier(df, n_harmonics=1, period=365):
    """Add product-specific yearly Fourier features.

    For every harmonic k in 1..n_harmonics, the columns ``mug_sin{k}``,
    ``mug_cos{k}``, ``hat_sin{k}`` and ``hat_cos{k}`` are appended (in that
    order). Each holds sin/cos of ``dayofyear / period * 2 * pi * k`` for rows
    of the matching product and 0 for all other rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime ``date`` column and a ``product`` column.
        The frame is not modified.
    n_harmonics : int, default 1
        Number of harmonics to generate. The default reproduces the single
        harmonic that was previously hard-coded.
    period : float, default 365
        Length of one seasonal cycle in days. Note that day 366 of a leap
        year falls slightly past one full cycle with the default.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the new feature columns appended.
    """
    # Work on a copy so no temporary or feature column leaks into the caller's frame.
    out = df.copy()
    dayofyear = out['date'].dt.dayofyear

    # The products have different seasonal patterns, so each gets its own
    # sin/cos pair; rows of other products receive 0 via the boolean mask.
    product_masks = {
        'mug': out['product'] == 'Kaggle Mug',
        'hat': out['product'] == 'Kaggle Hat',
    }

    for k in range(1, n_harmonics + 1):
        sin_k = np.sin(dayofyear / period * 2 * math.pi * k)
        cos_k = np.cos(dayofyear / period * 2 * math.pi * k)
        for prefix, mask in product_masks.items():
            out[f'{prefix}_sin{k}'] = sin_k * mask
            out[f'{prefix}_cos{k}'] = cos_k * mask

    return out
df = fourier(df)

In [None]:
def get_interactions(df):
    """Add one boolean indicator column per (store, country, product) combination.

    Columns are named ``{KR|KM}_{country}_{Mug|Hat|Sticker}`` and are True only
    for rows matching all three values. The columns are written into ``df``
    itself, which is also returned.
    """
    store_codes = (('KR', 'KaggleRama'), ('KM', 'KaggleMart'))
    country_names = ('Sweden', 'Norway', 'Finland')
    product_codes = (
        ('Mug', 'Kaggle Mug'),
        ('Hat', 'Kaggle Hat'),
        ('Sticker', 'Kaggle Sticker'),
    )

    # Nesting order (store -> country -> product) fixes the column order.
    for store_code, store_name in store_codes:
        for country_name in country_names:
            for product_code, product_name in product_codes:
                column_name = f'{store_code}_{country_name}_{product_code}'
                df[column_name] = (
                    (df.country == country_name)
                    * (df['product'] == product_name)
                    * (df.store == store_name)
                )

    return df
df = get_interactions(df)

In [None]:
# Dropping cols
df.drop(['date'], axis = 1, inplace = True)
df.head()

In [None]:
# One hot encoding
col = ['country', 'store', 'product']
df = pd.get_dummies(df, columns = col, drop_first = True)
df.head()

In [None]:
# Checking correlations
sns.heatmap(df[['num_sold', 'gdpc']].corr(), annot = True)

In [None]:
# scaling numeric features
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
df[['year','gdpc']] = sc.fit_transform(df[['year','gdpc']])

In [None]:
# Split target and features without mutating `df`: `pop` removed the column in
# place, so re-running the cell raised a KeyError, and `X` was just another
# name for `df`.
y = df['num_sold']
X = df.drop(columns=['num_sold'])

In [None]:
X.head()

In [None]:
X.columns

In [None]:
y.head()

## Modeling 

In [None]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from catboost import CatBoostRegressor

In [None]:
# from sklearn.model_selection import TimeSeriesSplit

# folds = TimeSeriesSplit(n_splits=5)


In [None]:
# # Parameter grid
# grid = {'iterations': [2000],
#         'learning_rate': [0.05]
        
#         }

# # CatBoostRegressor model
# model = CatBoostRegressor(  bootstrap_type='Bayesian',
#                             boosting_type='Plain',
#                             loss_function='MAE',
#                             eval_metric='SMAPE',
#                             l2_leaf_reg = 5,

#                           verbose = 1000
# #                           plot = True
                          
#                             )

# # Grid Search with n-fold cross validation
# grid_model1 = GridSearchCV(model,grid,cv=folds)

# # Train regressor with optimal parameters
# grid_model1.fit(X,y)

# print("The best parameters across searched params:\n",grid_model1.best_params_)
# print("The best score across searched params:\n",grid_model1.best_score_) #MAE

In [None]:
X.shape

In [None]:
model = CatBoostRegressor(iterations=5000,
                            learning_rate=0.04,
                            bootstrap_type='Bayesian',
                            boosting_type='Plain',
                            loss_function='MAE',
                            l2_leaf_reg = 5, # Added as Regularization
                            eval_metric='SMAPE',
#                           plot = True
#                           use_best_model = True
                            )

In [None]:
 # Fit the CatBoost model on the full training set, logging every 1000 iterations.
 # NOTE(review): no eval_set is passed, so early_stopping_rounds presumably has
 # no validation metric to monitor — confirm against the CatBoost documentation.
 model.fit(
        X, y, 
        early_stopping_rounds=1000,
        verbose=1000
    )

In [None]:
model.get_feature_importance(prettified=True)

In [None]:
yp = model.predict(X)

In [None]:
y.plot(figsize =(25, 8))
pd.Series(yp).plot(figsize = (25, 8), alpha=0.5)

In [None]:
# SMAPE
def smape(yp, y):
    """Symmetric mean absolute percentage error, in percent, rounded to 5 decimals.

    Parameters
    ----------
    yp : array-like
        Predicted values.
    y : array-like
        True values, broadcastable against ``yp``. Values are compared by
        position.

    Returns
    -------
    float
        ``mean(|yp - y| / ((|yp| + |y|) / 2)) * 100``. Terms where prediction
        and target are both zero contribute 0 instead of NaN (0/0).
    """
    yp = np.asarray(yp, dtype=float)
    y = np.asarray(y, dtype=float)
    abs_error = np.abs(yp - y)
    scale = (np.abs(yp) + np.abs(y)) / 2
    # Only divide where the denominator is non-zero; the rest stays at 0.
    ratio = np.divide(abs_error, scale, out=np.zeros_like(abs_error), where=scale != 0)
    return round(np.mean(ratio) * 100, 5)

In [None]:
from scipy.special import inv_boxcox
y = inv_boxcox(y, lam)
yp = inv_boxcox(yp, lam)

In [None]:
# Actual vs. predicted sales on the original scale.
# x and y are passed by keyword: recent seaborn versions accept only `data`
# positionally, so two positional vectors raise a TypeError there.
sns.scatterplot(x=y, y=yp)

In [None]:
# Distribution of the residuals on the original scale.
# `distplot` is deprecated in seaborn; `histplot` with a KDE overlay on a
# density scale is its replacement and is already used earlier in this notebook.
sns.histplot(y - yp, kde=True, stat='density')

In [None]:
df_test = pd.read_csv('../input/tabular-playground-series-jan-2022/test.csv', index_col = 'row_id')
df_test.head()

In [None]:
# Parse the date column, then derive the same calendar features as for training.
df_test['date'] = pd.to_datetime(df_test['date'])

# Maps each new column name to the `.dt` accessor attribute it is read from.
test_calendar_attrs = {
    'day': 'day',
    'month': 'month',
    'year': 'year',
    'weekday': 'dayofweek',
    'quarter': 'quarter',
    'yearday': 'dayofyear',
}
for column_name, dt_attr in test_calendar_attrs.items():
    df_test[column_name] = getattr(df_test['date'].dt, dt_attr)

In [None]:
holiday.head()

In [None]:
# Start with "no holiday" everywhere
df_test['holiday'] = 0

# One holiday table per country
fin_holiday, swe_holiday, nor_holiday = (
    holiday.loc[holiday['country'] == country_name]
    for country_name in ('Finland', 'Sweden', 'Norway')
)

# Flag every row whose date appears in the given country's holiday table
test_flag_tables = (
    ('fin holiday', fin_holiday),
    ('swe holiday', swe_holiday),
    ('nor holiday', nor_holiday),
)
for flag_column, country_table in test_flag_tables:
    df_test[flag_column] = df_test['date'].isin(country_table['date']).astype(int)

# For each row, copy the flag that belongs to the row's own country into `holiday`
test_country_flags = (
    ('Finland', 'fin holiday'),
    ('Sweden', 'swe holiday'),
    ('Norway', 'nor holiday'),
)
for country_name, flag_column in test_country_flags:
    in_country = df_test['country'] == country_name
    df_test.loc[in_country, 'holiday'] = df_test.loc[in_country, flag_column]


In [None]:
# df_test[(df_test['holiday']==1) | (df_test['weekday'] == 5) | (df_test['weekday'] == 6)]['holiday'] = 1

In [None]:
df_test.head()

In [None]:
# Attach GDP per capita to the test rows.
# `gdpc` is wide (one row per year, one column per country). It is reshaped to
# long form and merged on (year, country) in one vectorized step, replacing the
# per-row chained assignment `df_test['gdpc'].iloc[i] = ...`, which is slow and
# does not write back to `df_test` when pandas Copy-on-Write is active.
gdpc_long = gdpc.melt(id_vars='year', var_name='country', value_name='gdpc')
df_test['gdpc'] = (
    df_test[['year', 'country']]
    # how='left' keeps the row order of df_test; validate guards against
    # duplicate (year, country) pairs in the GDP table multiplying rows.
    .merge(gdpc_long, on=['year', 'country'], how='left', validate='many_to_one')['gdpc']
    .to_numpy()  # positional assignment: the merge result has a fresh RangeIndex
)

In [None]:
df_test[['year','gdpc']] = sc.transform(df_test[['year', 'gdpc']])

In [None]:
df_test = fourier(df_test)

In [None]:
df_test = get_interactions(df_test)

In [None]:
df_test = pd.get_dummies(df_test, columns = col, drop_first = True)

In [None]:
df_test.drop(['date'], axis = 1, inplace = True)
df_test.head()

In [None]:
df_test.columns

In [None]:
y_pred = model.predict(df_test)

In [None]:
df_test.shape

In [None]:
output = np.ceil(inv_boxcox(y_pred, lam))

In [None]:
data1 = pd.DataFrame({'row_id': df_test.index,
                       'num_sold': output})

In [None]:
data1.head()

In [None]:
data1.to_csv('submission.csv', index = False)