# TPS January

In [None]:
# Core
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(style='darkgrid', font_scale=1.4)
import matplotlib.pyplot as plt
%matplotlib inline
from itertools import combinations
import statistics
import time
from datetime import datetime
import matplotlib.dates as mdates

# Sklearn
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression

# Models
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

# Tensorflow
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import callbacks

**Loading the data**

In [None]:
# Read the competition CSVs, using row_id as the index of each frame
train_data = pd.read_csv(
    '../input/tabular-playground-series-jan-2022/train.csv',
    index_col='row_id',
)
test_data = pd.read_csv(
    '../input/tabular-playground-series-jan-2022/test.csv',
    index_col='row_id',
)

# Report (rows, columns) of both frames, then preview the first training rows
for shape_label, loaded_frame in (('Training data df shape:', train_data),
                                  ('Test data df shape:', test_data)):
    print(shape_label, loaded_frame.shape)
train_data.head()

In [None]:
# concise summary of dataset
# Prints the index range, each column's non-null count and dtype, and memory usage
train_data.info()

In [None]:
#LOOK AT THE COLUMNS OF TRAIN DATASET AND TEST DATASET.
# Column labels of the training frame, then of the test frame
for column_source in (train_data, test_data):
    print(column_source.columns)

**Missing Values**

In [None]:
# Number of missing values in each column of the training data
train_data.isnull().sum()

**Timeframe of test and train data**

In [None]:
# First and last date of each data set, with a blank line between the two reports
for report_position, (report_title, dated_frame) in enumerate(
        (('Training data:', train_data), ('Test data:', test_data))):
    if report_position > 0:
        print('')
    print(report_title)
    print('Min date', dated_frame['date'].min())
    print('Max date', dated_frame['date'].max())

# Exploratory Data Analysis

**Plotting the data**

In [None]:
# Distribution of the num_sold column across all training rows
sns.displot(data=train_data,x='num_sold')

In [None]:
# Number of training rows for each country
sns.countplot(x="country",data=train_data)

In [None]:
# Number of training rows for each store
sns.countplot(x="store",data=train_data)

In [None]:
# Number of training rows for each product
sns.countplot(x="product",data=train_data)

**Sales by country**

In [None]:
# KaggleMart: total units sold per day, one frame per country
kagglemart_rows = train_data[train_data['store'] == 'KaggleMart']
train_groupby_finland_KM = (
    kagglemart_rows[kagglemart_rows['country'] == 'Finland']
    .groupby('date')
    .agg(num_sold=('num_sold', 'sum'))
)
train_groupby_norway_KM = (
    kagglemart_rows[kagglemart_rows['country'] == 'Norway']
    .groupby('date')
    .agg(num_sold=('num_sold', 'sum'))
)
train_groupby_sweden_KM = (
    kagglemart_rows[kagglemart_rows['country'] == 'Sweden']
    .groupby('date')
    .agg(num_sold=('num_sold', 'sum'))
)

# KaggleRama: total units sold per day, one frame per country
kagglerama_rows = train_data[train_data['store'] == 'KaggleRama']
train_groupby_finland_KR = (
    kagglerama_rows[kagglerama_rows['country'] == 'Finland']
    .groupby('date')
    .agg(num_sold=('num_sold', 'sum'))
)
train_groupby_norway_KR = (
    kagglerama_rows[kagglerama_rows['country'] == 'Norway']
    .groupby('date')
    .agg(num_sold=('num_sold', 'sum'))
)
train_groupby_sweden_KR = (
    kagglerama_rows[kagglerama_rows['country'] == 'Sweden']
    .groupby('date')
    .agg(num_sold=('num_sold', 'sum'))
)

In [None]:
# Figure with 2 stacked subplots: one panel per store
fig, axes = plt.subplots(2, 1, figsize=(12, 10))

# Overall title
fig.suptitle('Total sales according to country')

# One entry per panel: (axes, panel title, [(daily totals, legend label), ...])
country_panels = [
    (axes[0], '\n Kaggle Mart', [(train_groupby_finland_KM, 'Finland'),
                                 (train_groupby_norway_KM, 'Norway'),
                                 (train_groupby_sweden_KM, 'Sweden')]),
    (axes[1], '\n Kaggle Rama', [(train_groupby_finland_KR, 'Finland'),
                                 (train_groupby_norway_KR, 'Norway'),
                                 (train_groupby_sweden_KR, 'Sweden')]),
]

for panel_ax, panel_title, panel_series in country_panels:
    for daily_totals, legend_label in panel_series:
        sns.lineplot(ax=panel_ax, data=daily_totals, x='date', y='num_sold', label=legend_label)
    panel_ax.set_title(panel_title)
    # Configure the panel through its own Axes object. plt.gca()/plt.legend()
    # act on the *current* axes, which after plt.subplots(2, 1) is the last one
    # created (axes[1]), so the top panel would never get its locator or legend.
    panel_ax.xaxis.set_major_locator(mdates.YearLocator())
    panel_ax.legend()

**Sales by Product Type**

In [None]:
# KaggleMart: total units sold per day, one frame per product
kagglemart_rows = train_data[train_data['store'] == 'KaggleMart']
train_groupby_mug_KM = (
    kagglemart_rows[kagglemart_rows['product'] == 'Kaggle Mug']
    .groupby('date')
    .agg(num_sold=('num_sold', 'sum'))
)
train_groupby_hat_KM = (
    kagglemart_rows[kagglemart_rows['product'] == 'Kaggle Hat']
    .groupby('date')
    .agg(num_sold=('num_sold', 'sum'))
)
train_groupby_sticker_KM = (
    kagglemart_rows[kagglemart_rows['product'] == 'Kaggle Sticker']
    .groupby('date')
    .agg(num_sold=('num_sold', 'sum'))
)

# KaggleRama: total units sold per day, one frame per product
kagglerama_rows = train_data[train_data['store'] == 'KaggleRama']
train_groupby_mug_KR = (
    kagglerama_rows[kagglerama_rows['product'] == 'Kaggle Mug']
    .groupby('date')
    .agg(num_sold=('num_sold', 'sum'))
)
train_groupby_hat_KR = (
    kagglerama_rows[kagglerama_rows['product'] == 'Kaggle Hat']
    .groupby('date')
    .agg(num_sold=('num_sold', 'sum'))
)
train_groupby_sticker_KR = (
    kagglerama_rows[kagglerama_rows['product'] == 'Kaggle Sticker']
    .groupby('date')
    .agg(num_sold=('num_sold', 'sum'))
)

In [None]:
# Figure with 2 stacked subplots: one panel per store
fig, axes = plt.subplots(2, 1, figsize=(12, 10))

# Overall title
fig.suptitle('Total sales according to product type', fontsize=15)

# One entry per panel: (axes, panel title, [(daily totals, legend label), ...])
product_panels = [
    (axes[0], '\n Kaggle Mart', [(train_groupby_mug_KM, 'Mug'),
                                 (train_groupby_hat_KM, 'Hat'),
                                 (train_groupby_sticker_KM, 'Sticker')]),
    (axes[1], '\n Kaggle Rama', [(train_groupby_mug_KR, 'Mug'),
                                 (train_groupby_hat_KR, 'Hat'),
                                 (train_groupby_sticker_KR, 'Sticker')]),
]

for panel_ax, panel_title, panel_series in product_panels:
    for daily_totals, legend_label in panel_series:
        sns.lineplot(ax=panel_ax, data=daily_totals, x='date', y='num_sold', label=legend_label)
    panel_ax.set_title(panel_title)
    # Configure the panel through its own Axes object. plt.gca()/plt.legend()
    # act on the *current* axes, which after plt.subplots(2, 1) is the last one
    # created (axes[1]), so the top panel would never get its locator or legend.
    panel_ax.xaxis.set_major_locator(mdates.YearLocator())
    panel_ax.legend()

# Feature Engineering

In [None]:
# Target: the num_sold column
y = train_data['num_sold']

# Features: every other training column
X = train_data.drop(columns='num_sold')

# Parse the date strings of both feature frames into real datetimes
X['date'] = pd.to_datetime(X['date'])
test_data['date'] = pd.to_datetime(test_data['date'])

**Finding the public holidays of Finland, Norway, and Sweden**

In [None]:
holiday_path = '../input/holiday-and-special-day/Holidays_Finland_Norway_Sweden_2015-2019_edit.csv'

def GetHoliday(holiday_path, df):
    """
    Add a 0/1 'holiday' column flagging rows dated on a holiday of their own country.

    Parameters
    ----------
    holiday_path : path of a CSV file with a 'Country' and a 'Date' column.
    df : DataFrame with a 'country' and a 'date' column. It is modified in place.

    Returns
    -------
    The same df object, with the integer 'holiday' column added (or overwritten).
    Rows whose country is not Finland, Sweden or Norway are flagged 0.
    """
    holiday = pd.read_csv(holiday_path)

    # Cast both sides to datetime explicitly. Matching a datetime column against
    # the raw date strings read from the CSV relies on isin() parsing the strings
    # implicitly, which pandas deprecates; without that parsing nothing matches.
    holiday_dates = pd.to_datetime(holiday['Date'])
    row_dates = pd.to_datetime(df['date'])

    is_holiday = pd.Series(False, index=df.index)
    for country in ('Finland', 'Sweden', 'Norway'):
        country_dates = holiday_dates[holiday['Country'] == country]
        # A row counts only when the date is a holiday in the row's own country
        is_holiday = is_holiday | ((df['country'] == country) & row_dates.isin(country_dates))

    df['holiday'] = is_holiday.astype(int)
    return df

#X = GetHoliday(holiday_path, X)
#test_data = GetHoliday(holiday_path, test_data)

**All Holidays including public and unofficial**

In [None]:
hol_path = '../input/public-and-unofficial-holidays-nor-fin-swe-201519/holidays.csv'

def unofficial_hol(hol_path, df):
    """
    Add a 0/1 'holiday' column flagging rows dated on a holiday of their own country.

    Parameters
    ----------
    hol_path : path of a CSV file with a 'country' and a 'date' column.
    df : DataFrame with a 'country' and a 'date' column. It is modified in place.

    Returns
    -------
    The same df object, with the integer 'holiday' column added (or overwritten).
    Rows whose country is not Finland, Sweden or Norway are flagged 0.
    """
    # load holiday info.
    holiday = pd.read_csv(hol_path)

    # Cast both sides to datetime explicitly. Matching a datetime column against
    # the raw date strings read from the CSV relies on isin() parsing the strings
    # implicitly, which pandas deprecates; without that parsing nothing matches.
    holiday_dates = pd.to_datetime(holiday['date'])
    row_dates = pd.to_datetime(df['date'])

    is_holiday = pd.Series(False, index=df.index)
    for country in ('Finland', 'Sweden', 'Norway'):
        country_dates = holiday_dates[holiday['country'] == country]
        # A row counts only when the date is a holiday in the row's own country
        is_holiday = is_holiday | ((df['country'] == country) & row_dates.isin(country_dates))

    df['holiday'] = is_holiday.astype(int)

    return df

# Add the 'holiday' flag to the training features and to the test set.
# unofficial_hol modifies the frame it receives and returns that same object.
X = unofficial_hol(hol_path, X)
test_data = unofficial_hol(hol_path, test_data)

In [None]:
def date_feat_eng(df):
    """
    Replace the datetime 'date' column with integer calendar features.

    Adds day_of_week, day_of_month, weekend, week, month, quarter and year
    (in that order), then drops 'date'.

    Parameters
    ----------
    df : DataFrame with a datetime-typed 'date' column. It is modified in place.

    Returns
    -------
    The same df object.
    """
    dates = df['date'].dt
    df['day_of_week'] = dates.dayofweek                     # 0 (Monday) to 6 (Sunday)
    df['day_of_month'] = dates.day                          # 1 to 31
    df['weekend'] = (df['day_of_week'] >= 5).astype('int')  # 1 on Saturday/Sunday, else 0
    # ISO week 53 is folded into week 52. clip() replaces the chained assignment
    # df['week'][mask] = 52, which triggers SettingWithCopyWarning and has no
    # effect once pandas Copy-on-Write is active.
    df['week'] = dates.isocalendar().week.clip(upper=52).astype('int')  # 1 to 52
    df['month'] = dates.month                               # 1 to 12
    df['quarter'] = dates.quarter                           # 1 to 4
    df['year'] = dates.year
    df.drop('date', axis=1, inplace=True)                   # raw date is no longer needed
    return df

# Swap the 'date' column for calendar features in both frames.
# Not re-runnable on the same frames: the function drops 'date', which it needs as input.
X= date_feat_eng(X)
test_data=date_feat_eng(test_data)


**Gross Domestic Product (GDP)**

In [None]:
# Read the GDP table, one row per year
GDP_data = pd.read_csv(
    "../input/gdp-20152019-finland-norway-and-sweden/GDP_data_2015_to_2019_Finland_Norway_Sweden.csv",
    index_col="year",
)

# Label the three value columns with the country names
GDP_data.columns = ['Finland', 'Norway', 'Sweden']

# Annotated heatmap of the table, drawn on an explicitly created axes
gdp_fig, gdp_ax = plt.subplots(figsize=(8, 5))
sns.heatmap(GDP_data, annot=True, fmt='g', cmap='Blues', ax=gdp_ax)

# Aesthetics
gdp_ax.set_title('Heatmap of GDP in nordic countries')

In [None]:
# Flatten the year-by-country table into a dictionary keyed by (country, year) tuples
GDP_dictionary = GDP_data.unstack().to_dict()

# Create new GDP column by looking up each row's (country, year) pair.
# Currently disabled; the lookup is left here for reference.
#X['GDP'] = X.set_index(['country', 'year']).index.map(GDP_dictionary.get)
#test_data['GDP'] = test_data.set_index(['country', 'year']).index.map(GDP_dictionary.get)

**GDP per capita**

In [None]:
# Read the GDP-per-capita table, one row per year
GDP_PC = pd.read_csv(
    '../input/gdp-per-capita-finland-norway-sweden-201519/GDP_per_capita_2015_to_2019_Finland_Norway_Sweden.csv',
    index_col="year",
)

# Flatten to a lookup keyed by (column label, year) tuples
# NOTE(review): the columns are not renamed here, so the lookup below presumably
# relies on the CSV columns already being exactly the country names - confirm.
GDP_PC_dictionary = GDP_PC.unstack().to_dict()

# Look up every row's (country, year) pair in both frames to create GDP_PC
for gdp_target in (X, test_data):
    gdp_target['GDP_PC'] = gdp_target.set_index(['country', 'year']).index.map(GDP_PC_dictionary.get)

X.head()

**Encode categorical values**

In [None]:
# One-hot encode the same three categorical columns in both frames
categorical_columns = ['store', 'country', 'product']
X = pd.get_dummies(X, columns=categorical_columns)
test_data = pd.get_dummies(test_data, columns=categorical_columns)

# Modelling

In [None]:
# Disabled baseline experiment kept as a string literal: nothing inside the
# quotes is executed, and running this cell only echoes the text as its output.
'''
# Break off a validation set (in time-series-split style)
X_train=X.iloc[:3*len(X)//4,:]
X_valid=X.iloc[3*len(X)//4:,:]
y_train=y.iloc[:3*len(X)//4]
y_valid=y.iloc[3*len(X)//4:]

# Base model
model=LGBMRegressor(random_state=0, n_estimators=200, max_depth=6)

# Train model
model.fit(X_train,y_train)

# Predict
preds = model.predict(X_valid)

# Calcaculate smape
def smape(A, F):
    return 100/len(A) * np.sum(2 * np.abs(F - A) / (np.abs(A) + np.abs(F)))

# Evaluate smape
smape(preds,y_valid)
'''

In [None]:
# SMAPE noted for each experiment, one (method, score) pair per run
experiment_scores = [
    ('base', 16.52),
    ('include holidays', 16.46),
    ('date feat. eng. (FE)', 9.06),
    ('holidays + date FE', 8.94),
    ('prev. row + GDP (model A)', 9.02),
    ('model A + weekend', 9.02),
    ('model A + day dummy', 21.97),
    ('model A + unofficial holidays', 9.00),
    ('prev. row + GDP per capita', 8.97),
    ('GDP per capita instead of GDP', 7.82),
]
smape_results = pd.DataFrame(experiment_scores, columns=['Method', 'SMAPE'])
smape_results

In [None]:
# Hyper-parameter grid: every combination of the values below is evaluated
grid = dict(
    n_estimators=list(range(50, 251, 25)),   # 50, 75, ..., 250
    max_depth=list(range(2, 13, 2)),         # 2, 4, ..., 12
    learning_rate=[0.01, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15],
)

# LightGBM regressor with a fixed seed
model = LGBMRegressor(random_state=0)

# Exhaustive grid search with 5-fold cross validation
grid_model = GridSearchCV(estimator=model, param_grid=grid, cv=5)

# Run the search on the full training set
grid_model.fit(X, y)

**Results from grid search**

In [None]:
# Report the winning parameter combination and its cross-validated score
# (no scoring was passed to the search, so the score is the regressor's r^2)
for report_heading, report_value in (
        ("The best parameters across ALL searched params:\n", grid_model.best_params_),
        ("\n The best score across ALL searched params:\n", grid_model.best_score_)):
    print(report_heading, report_value)

# Prediction

In [None]:
# Predict on the test set, then round every prediction up (ceil suggested by Carl)
raw_predictions = grid_model.predict(test_data)
preds_test = np.ceil(raw_predictions)

# Assemble the submission table: one row per test row_id
output = pd.DataFrame(dict(row_id=test_data.index, num_sold=preds_test))

# Check format
output.head()

In [None]:
# Write the submission file; index=False because row_id is already a regular column
output.to_csv('submission.csv', index=False)