## Imports

In [None]:
import numpy as np
import pandas as pd
import os
import holidays

import optuna
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import matplotlib
from matplotlib import pyplot as plt

# IPython magic: show matplotlib figures inside the notebook output.
%matplotlib inline
# Default figure size applied to every plot created afterwards.
matplotlib.rcParams['figure.figsize'] = (12, 6)

## Constants for data paths

In [None]:
# Directory holding the competition files; the trailing slash is part of the value.
DATA_PATH = '../input/tabular-playground-series-jan-2022/'
TRAIN_PATH = f'{DATA_PATH}train.csv'
TEST_PATH = f'{DATA_PATH}test.csv'

# External indicator tables (each is later joined on year and country).
# Purchasing power index.
PURCHASING_POWER_PATH = '../input/purchasing-power-of-norway-sweden-finland/purchasing_power.csv'
# Cost of living index.
COST_OF_LIVING_PATH = '../input/cost-of-living-index-norway-sweden-finland/cost_of_living.csv'
# Quality of life index.
QUALITY_OF_LIFE_PATH = '../input/quality-of-life-norway-sweden-finland/quality_of_life.csv'
# Gross domestic product.
GDP_PATH = '../input/gdp-20152019-finland-sweden-norway/gdp.csv'

# Name of the column the models are trained to predict.
TARGET_COLUMN = 'num_sold'

## SMAPE loss function

In [None]:
def SMAPE(y_true, y_pred):
    """Return the symmetric mean absolute percentage error, in percent.

    Computes ``200 * mean(|y_true - y_pred| / (|y_true| + |y_pred|))``.
    Inputs are compared element by element (positionally).

    Args:
        y_true: Array-like of actual values.
        y_pred: Array-like of predicted values, same shape as ``y_true``.

    Returns:
        The SMAPE as a float in the range [0, 200]. Pairs where both the
        actual and the predicted value are zero contribute an error of 0
        instead of ``nan``.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    abs_error = np.abs(y_true - y_pred)
    # Use |y_true| so negative actuals cannot cancel the denominator out.
    scale = np.abs(y_true) + np.abs(y_pred)

    # A zero denominator means both values are zero, i.e. a perfect
    # prediction: leave the pre-filled 0 in place rather than dividing 0 by 0.
    ratio = np.divide(abs_error, scale, out=np.zeros_like(abs_error), where=scale != 0)
    return np.mean(ratio) * 200

## Adding extra features

In [None]:
# Read the four external indicator tables that preprocessing joins onto the sales data.
purchasing_power, cost_of_living, quality_of_life, gdp = (
    pd.read_csv(csv_path)
    for csv_path in (
        PURCHASING_POWER_PATH,
        COST_OF_LIVING_PATH,
        QUALITY_OF_LIFE_PATH,
        GDP_PATH,
    )
)

In [None]:
def preprocess_dataset(dataset):
    """Return a copy of *dataset* enriched with calendar and indicator features.

    The ``date`` column is parsed to datetimes and ``day``, ``month``,
    ``weekday`` and ``year`` columns are derived from it. The module-level
    ``purchasing_power``, ``quality_of_life``, ``cost_of_living`` and ``gdp``
    tables are then left-joined on ``['year', 'country']``.

    Args:
        dataset: DataFrame with at least ``date`` and ``country`` columns.

    Returns:
        A new DataFrame; the DataFrame passed in is not modified.
    """
    dates = pd.to_datetime(dataset['date'])

    # assign() builds a new frame, so the caller's DataFrame is left untouched
    # (the previous in-place column writes mutated the argument).
    dataset = dataset.assign(
        date=dates,
        day=dates.dt.day,
        month=dates.dt.month,
        weekday=dates.dt.weekday,
        year=dates.dt.year,
    )

    # Left joins keep every sales row even when an indicator is missing for
    # its (year, country); the merge order fixes the resulting column order.
    for indicator in (purchasing_power, quality_of_life, cost_of_living, gdp):
        dataset = pd.merge(dataset, indicator, on=['year', 'country'], how='left')

    return dataset

In [None]:
# The training rows do not need their identifier; the test rows keep 'row_id'
# because it is read again later when the submission file is built.
train_data = preprocess_dataset(pd.read_csv(TRAIN_PATH).drop(columns=['row_id']))
test_data = preprocess_dataset(pd.read_csv(TEST_PATH))

In [None]:
# Preview the first rows of the preprocessed training data.
train_data.head()

## Defining DataFrame with holidays

In [None]:
# Date range over which holiday flags are generated for each country.
HOLIDAYS_START = '2015-01-01'
HOLIDAYS_END = '2019-12-31'


def _build_holidays_df(holiday_dates, country):
    """Return one row per date in *holiday_dates* for *country*, with ``holiday=1``."""
    return pd.DataFrame({
        'date': pd.to_datetime(holiday_dates),
        'country': country,
        'holiday': 1,
    })


finland_holidays = holidays.Finland()[HOLIDAYS_START:HOLIDAYS_END]
finland_holidays_df = _build_holidays_df(finland_holidays, 'Finland')

norway_holidays = holidays.Norway()[HOLIDAYS_START:HOLIDAYS_END]
norway_holidays_df = _build_holidays_df(norway_holidays, 'Norway')

sweden_holidays = holidays.Sweden()[HOLIDAYS_START:HOLIDAYS_END]
sweden_holidays_df = _build_holidays_df(sweden_holidays, 'Sweden')

# Stack the per-country frames into one (date, country, holiday) lookup table.
holidays_df = pd.concat([finland_holidays_df, norway_holidays_df, sweden_holidays_df])

In [None]:
# Preview the first rows of the combined holidays table.
holidays_df.head()

In [None]:
# Split the preprocessed training data into features and target.
X_public = train_data.drop([TARGET_COLUMN], axis=1)
y_public = train_data[TARGET_COLUMN]

# X_public and y_public are matched by position further on, so the join must
# return exactly the training rows in their original order. A left join does
# that; an outer join additionally emits holiday-only rows and may reorder
# rows by the join keys. validate= raises if holidays_df ever holds a
# duplicated (date, country) pair, which would otherwise duplicate rows.
X_public = pd.merge(
    X_public,
    holidays_df,
    on=['date', 'country'],
    how='left',
    validate='many_to_one',
)

X_public = X_public.drop(['date'], axis=1)
X_public = pd.get_dummies(X_public)
# Non-holiday rows have NaN in 'holiday' after the join; fillna(0) turns that
# into 0 (and also zero-fills any other missing value in the frame).
X_public = X_public.fillna(0)

print(X_public.shape)
X_public.tail()

## Optuna objective function for XGBoost hyperparameter optimization

In [None]:
def objective(trial):
    """Optuna objective: mean 5-fold cross-validated SMAPE of an XGBoost model.

    Hyperparameters are sampled from *trial*. For each fold an
    ``XGBRegressor`` is fit on the log-transformed target, and its
    predictions are mapped back with ``exp`` before scoring.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The mean SMAPE over the five folds, computed on the original
        (non-log) target scale.
    """
    kfold = KFold(n_splits=5)

    params = {
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-1),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 5),
        'subsample': trial.suggest_float('subsample', 0.5, 1),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 2),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 2),
    }

    loss_history = []
    for train_index, test_index in kfold.split(X_public):
        X_train, y_train = X_public.iloc[train_index], y_public.iloc[train_index]
        X_test, y_test = X_public.iloc[test_index], y_public.iloc[test_index]

        xgb_regressor = XGBRegressor(objective='reg:squarederror', **params)
        xgb_regressor.fit(X_train, np.log(y_train))

        # Undo the log transform before scoring: SMAPE of log-values is not
        # the SMAPE of the sales predictions themselves. Arguments follow the
        # SMAPE(y_true, y_pred) signature.
        fold_prediction = np.exp(xgb_regressor.predict(X_test))
        loss_history.append(SMAPE(y_test, fold_prediction))

    return np.mean(loss_history)

In [None]:
# The objective returns an error measure, so the study searches for its minimum.
study = optuna.create_study(direction='minimize')
study.optimize(func=objective, n_trials=100)

## Training the model with the best hyperparameters

In [None]:
# Hold out 20% of the rows (fixed seed) to score the tuned model.
X_train, X_test, y_train, y_test = train_test_split(
    X_public,
    y_public,
    test_size=0.2,
    random_state=42,
)

# Refit with the best hyperparameters found by the study, on the log-transformed target.
best_hyperparameters = study.best_params
xgb_regressor = XGBRegressor(**best_hyperparameters)
xgb_regressor.fit(X_train, np.log(y_train))

In [None]:
# Hold-out SMAPE on the original scale: the model was fit on log(target), so
# predictions are exponentiated first. Arguments follow SMAPE(y_true, y_pred).
SMAPE(y_test, np.exp(xgb_regressor.predict(X_test)))

## Inference

In [None]:
# Keep the identifiers aside; they label the rows of the submission file.
X_private = test_data.drop(['row_id'], axis=1)
ids = test_data['row_id']

# Predictions are paired with `ids` by position, so the join must return
# exactly the test rows in their original order. A left join does that; an
# outer join additionally emits holiday-only rows and may reorder rows by the
# join keys. validate= raises if holidays_df ever holds a duplicated
# (date, country) pair, which would otherwise duplicate rows.
X_private = pd.merge(
    X_private,
    holidays_df,
    on=['date', 'country'],
    how='left',
    validate='many_to_one',
)

X_private = X_private.drop(['date'], axis=1)
X_private = pd.get_dummies(X_private)
# Non-holiday rows have NaN in 'holiday' after the join; fillna(0) turns that
# into 0 (and also zero-fills any other missing value in the frame).
X_private = X_private.fillna(0)

print(X_private.shape)
X_private.sample(5)

In [None]:
# The regressor was fit on log(target): exponentiate its output, then round.
log_scale_prediction = xgb_regressor.predict(X_private)
boosting_prediction = np.round(np.exp(log_scale_prediction))

predictions = boosting_prediction

# Write the submission with 'row_id' as the index column.
submission = pd.DataFrame({'row_id': ids, 'num_sold': predictions})
submission.set_index('row_id').to_csv('predictions.csv')