In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Basic EDA

In [None]:
train = pd.read_csv('../input/tabular-playground-series-jan-2022/train.csv')
test = pd.read_csv('../input/tabular-playground-series-jan-2022/test.csv')
sample_submit = pd.read_csv('../input/tabular-playground-series-jan-2022/sample_submission.csv')

In [None]:
train.head()

In [None]:
train.shape

In [None]:
train.isnull().sum()

In [None]:
train['country'].value_counts()

In [None]:
train['store'].value_counts()

In [None]:
train['product'].value_counts()

In [None]:
train.describe()

In [None]:
test.head()

In [None]:
test.shape

In [None]:
test.isnull().sum()

In [None]:
test['country'].value_counts()

In [None]:
test['store'].value_counts()

In [None]:
test['product'].value_counts()

In [None]:
test.describe()

# Feature Engineering

In [None]:
data = pd.concat([train, test], axis=0, ignore_index=True)
data

In [None]:
from datetime import datetime, date

# Parse the date strings into datetime values.
# The former explicit format '%Y/%m/%d' only matches slash-separated dates;
# recent pandas releases enforce the format strictly, so dash-separated ISO
# dates would raise. Letting pandas infer the year-first format accepts both
# spellings and is a no-op if the column is already datetime (cell re-run).
data['date'] = pd.to_datetime(data['date'])
data

## Holidays features

In [None]:
import holidays

holiday_FI = holidays.CountryHoliday('FI', years=[2015, 2016, 2017, 2018, 2019])

holiday_FI

In [None]:
# Fill holiday_name for the Finnish rows by mapping each row's date through
# the holiday_FI calendar.
finland_rows = data['country'] == 'Finland'
data.loc[finland_rows, 'holiday_name'] = data.loc[finland_rows, 'date'].map(holiday_FI)

data

In [None]:
holiday_NO = holidays.CountryHoliday('NO', years=[2015, 2016, 2017, 2018, 2019])
holiday_NO

In [None]:
# Fill holiday_name for the Norwegian rows by mapping each row's date through
# the holiday_NO calendar.
norway_rows = data['country'] == 'Norway'
data.loc[norway_rows, 'holiday_name'] = data.loc[norway_rows, 'date'].map(holiday_NO)

data

In [None]:
holiday_SE = holidays.CountryHoliday('SE', years=[2015, 2016, 2017, 2018, 2019])
holiday_SE

In [None]:
# Fill holiday_name for the Swedish rows by mapping each row's date through
# the holiday_SE calendar.
sweden_rows = data['country'] == 'Sweden'
data.loc[sweden_rows, 'holiday_name'] = data.loc[sweden_rows, 'date'].map(holiday_SE)

data

In [None]:
# Binary flag: 1 where a holiday name was assigned to the row, 0 otherwise.
has_holiday_name = data['holiday_name'].notnull()
data['is_holiday'] = np.where(has_holiday_name, 1, 0)
data['is_holiday'].value_counts()

In [None]:
# Yearly GDP table, indexed by year; get_gdp reads its 'GDP_<country>' columns.
gdp_df = pd.read_csv(
    '../input/gdp-20152019-finland-norway-and-sweden/GDP_data_2015_to_2019_Finland_Norway_Sweden.csv',
    index_col='year',
)

gdp_exponent = 1.2121103201489674 # see https://www.kaggle.com/ambrosm/tpsjan22-03-linear-model for an explanation


def get_gdp(row):
    """Look up the GDP of row.country for the year of row.date, raised to gdp_exponent."""
    year = row.date.year
    column = 'GDP_' + row.country
    raw_gdp = gdp_df.loc[year, column]
    return raw_gdp ** gdp_exponent


# Evaluate get_gdp once per row and wrap the result in a one-column frame.
per_row_gdp = data.apply(get_gdp, axis=1)
temp = pd.DataFrame(per_row_gdp)

In [None]:
data['GDP'] = temp
temp

In [None]:
data

In [None]:
import dateutil.easter as easter


def _last_wednesday_of_june(year):
    """Return the Timestamp of the last Wednesday of June in `year`."""
    june_30 = pd.Timestamp(year=year, month=6, day=30)
    # weekday(): Monday == 0, so Wednesday == 2; step back to the nearest one.
    return june_30 - pd.Timedelta(days=(june_30.weekday() - 2) % 7)


def _first_sunday_of_november(year):
    """Return the Timestamp of the first Sunday of November in `year`."""
    nov_1 = pd.Timestamp(year=year, month=11, day=1)
    # weekday(): Sunday == 6; step forward to the nearest one.
    return nov_1 + pd.Timedelta(days=(6 - nov_1.weekday()) % 7)


def engineer(df):
    """Return a new dataframe with the engineered date-distance features.

    The input frame is left untouched; the returned copy gains three integer
    columns, each the (clipped) number of days between `date` and an anchor
    day of the same calendar year:

    * ``days_from_easter``  - Easter Sunday, clipped to [-5, 65]
    * ``days_from_wed_jun`` - last Wednesday of June, clipped to [-5, 5]
    * ``days_from_sun_nov`` - first Sunday of November (the second Sunday is
      Father's Day), clipped to [-1, 9]

    Anchor days are computed once per distinct year, so any year is supported
    (for 2015-2019 they equal the dates that used to be hard-coded here).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime-typed ``date`` column.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the three feature columns appended.
    """
    new_df = df.copy()
    dates = new_df['date']
    years = dates.dt.year
    distinct_years = [int(year) for year in years.dropna().unique()]

    def days_from(anchor_for_year):
        # Build {year: anchor Timestamp} once, then broadcast it to the rows.
        anchor_by_year = {year: anchor_for_year(year) for year in distinct_years}
        anchors = pd.to_datetime(years.map(anchor_by_year))
        return (dates - anchors).dt.days

    # Easter
    new_df['days_from_easter'] = days_from(
        lambda year: pd.Timestamp(easter.easter(year))
    ).clip(-5, 65)

    # The "last Sunday of May" (Mother's Day) feature was already disabled in
    # this notebook, so its anchor dates are no longer computed.

    # Last Wednesday of June
    new_df['days_from_wed_jun'] = days_from(_last_wednesday_of_june).clip(-5, 5)

    # First Sunday of November (second Sunday is Father's Day)
    new_df['days_from_sun_nov'] = days_from(_first_sunday_of_november).clip(-1, 9)

    return new_df

In [None]:
data = engineer(data)
data

## Encode the categorical variables

In [None]:
from sklearn import preprocessing

# (2) Integer-encode the categorical columns.
# Each column is cast to str first and then gets its own freshly fitted
# LabelEncoder, so the integer codes are independent between columns.
cat = ['country', 'store', 'product', 'holiday_name']
for feature in cat:
    encoder = preprocessing.LabelEncoder()
    text_values = data[feature].astype('str')
    data[feature] = encoder.fit_transform(text_values)
data

## Time features

In [None]:
# Calendar features derived from the parsed 'date' column.
# NOTE(review): some columns carry identical values ('week' == 'weekofyear',
# 'weekday' == 'dayofweek'); 'dayofmonth' holds the number of days in the
# month (the day number itself is 'day'); and 'is_Firday' is spelled this way
# on purpose here because it is the column name the feature list picks up.
date_parts = data['date'].dt
iso_week = date_parts.isocalendar().week.astype('int64')

data['year'] = date_parts.year
data['quarter'] = date_parts.quarter
data['month'] = date_parts.month
data['week'] = iso_week
data['day'] = date_parts.day
data['weekday'] = date_parts.weekday
data['dayofweek'] = date_parts.dayofweek
data['dayofyear'] = date_parts.dayofyear
data['weekofyear'] = iso_week
data['dayofmonth'] = date_parts.days_in_month

# weekday counts from Monday == 0: Saturday/Sunday are 5/6, Friday is 4.
data['is_weekend'] = np.where(data['weekday'].isin([5, 6]), 1, 0)
data['is_Firday'] = np.where(data['weekday'] == 4, 1, 0)
data

## Show features

In [None]:
labels = list(data.columns.values)
labels

# Data split

In [None]:
# Model inputs: every column except the row identifier, the raw date and the
# target. The list is filtered instead of using list.remove() so that
# re-running this cell is harmless (remove() raises ValueError once the
# names are already gone).
excluded_columns = {'row_id', 'date', 'num_sold'}
labels = [name for name in labels if name not in excluded_columns]
feature = labels

In [None]:
# Training fold: all rows dated up to and including 2017-12-31.
# The regression target is log(num_sold / GDP).
in_train_period = data['date'] <= '2017-12-31'
train = data[in_train_period]
x_train = train[feature]
sales_per_gdp_train = train['num_sold'] / train['GDP']
y_train = np.log(sales_per_gdp_train)
x_train

In [None]:
# Validation fold: rows dated after 2017-12-31 up to and including 2018-12-31,
# with the same log(num_sold / GDP) target as the training fold.
after_train_period = data['date'] > '2017-12-31'
up_to_end_of_2018 = data['date'] <= '2018-12-31'
val = data[after_train_period & up_to_end_of_2018]
x_val = val[feature]
sales_per_gdp_val = val['num_sold'] / val['GDP']
y_val = np.log(sales_per_gdp_val)
x_val

In [None]:
# Test fold: every row dated after 2018-12-31 (features only, no target).
after_2018 = data['date'] > '2018-12-31'
test = data[after_2018]
x_test = test[feature]
x_test

In [None]:
x_train['weekofyear'].dtypes

# XGBoost / LightGBM

In [None]:
def SMAPE(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent (0 to 200).

    Computes mean(200 * |y_true - y_pred| / (|y_true| + |y_pred|)) over all
    elements; a term whose denominator is zero (both values are 0) counts
    as 0.

    Parameters
    ----------
    y_true, y_pred : array-like of numbers
        Actual and predicted values. They are converted to NumPy arrays and
        compared element by element (positionally).

    Returns
    -------
    float
        The mean SMAPE over all elements.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # The symmetric denominator uses the magnitude of BOTH values; without
    # abs() on y_true a negative actual value could cancel the prediction.
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 200.0
    abs_error = np.abs(y_true - y_pred)

    # Divide only where the denominator is non-zero; the remaining entries
    # keep their initial 0.0, which avoids divide-by-zero warnings.
    diff = np.zeros_like(denominator)
    np.divide(abs_error, denominator, out=diff, where=denominator != 0)
    return np.mean(diff)

## XGBoost

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

# Wrap the training and validation folds in XGBoost's DMatrix container.
data_train = xgb.DMatrix(x_train, y_train)
data_val = xgb.DMatrix(x_val, y_val)

# NOTE(review): 'gpu_hist' needs a GPU-enabled runtime, and newer XGBoost
# releases appear to replace it with tree_method='hist' plus device='cuda' -
# confirm against the installed version before changing it.
paras = {
        'tree_method':'gpu_hist',
        'grow_policy' : 'lossguide',
        'learning_rate': 0.03399878704233446,
        'max_depth': 5,
        'reg_alpha': 0.7814373604498039,
        'reg_lambda': 0.00018093104956619317,
        'max_delta_step': 2,
        'min_child_weight': 14,
        'colsample_bytree': 0.6489299778623602,
        'subsample': 0.6033298718112065,
        'max_leaves': 187,
        }
# With several evaluation sets, early stopping monitors the last one ('val').
watchlist = [(data_train,'train'),(data_val,'val')]
n_round = 20000
model = xgb.train(dict(paras),data_train,num_boost_round = n_round,evals = watchlist,
                                 early_stopping_rounds=8000)

# xgb.train() returns the booster from the LAST boosting round, which can be
# up to 8000 rounds past the best validation score. Restrict prediction to the
# trees up to the best round; (0, 0) means "use every tree" and is the fallback
# when no best_iteration was recorded.
best_iteration = getattr(model, 'best_iteration', None)
best_range = (0, best_iteration + 1) if best_iteration is not None else (0, 0)

data_test = xgb.DMatrix(x_test)
y_sub_1 = model.predict(data_test, iteration_range=best_range)
data_val_test = xgb.DMatrix(x_val)
y_val_hat_1 = model.predict(data_val_test, iteration_range=best_range)

# Validation metrics: MSE on the log(num_sold / GDP) target, and SMAPE after
# mapping predictions back to sales counts.
mean_squared_error_all = mean_squared_error(y_val, y_val_hat_1)
print(f'mean_squared_error_all: {mean_squared_error_all}')
print(f'SMAPE: {SMAPE(np.exp(y_val)* x_val.GDP.values, np.exp(y_val_hat_1)* x_val.GDP.values)}')

# Feature importance table (get_score() default importance type), read once
# so that names and scores are guaranteed to come from the same dict.
importance_by_feature = model.get_score()
fold_importance_df = pd.DataFrame({
    "feature": list(importance_by_feature.keys()),
    "importance": list(importance_by_feature.values()),
})

## XGBoost - plot feature importance

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

N = 30  # number of top-ranked features to plot

# Rank features by mean importance and keep the N highest.
mean_importance = fold_importance_df[["feature", "importance"]].groupby("feature").mean()
cols = mean_importance.sort_values(by="importance", ascending=False).index[:N]
in_top_n = fold_importance_df["feature"].isin(cols)
best_features = fold_importance_df.loc[in_top_n].sort_values(by='importance', ascending=False)

# Horizontal bar chart, most important feature first; also saved as a PNG.
plt.figure(figsize=(8, 15))
sns.barplot(
    y="feature",
    x="importance",
    data=best_features.sort_values(by="importance", ascending=False),
)
plt.title('XGB Features (avg over folds)')
plt.tight_layout()
plt.savefig('Xgb_importances_weight.png')
plt.show()

## LightGBM

In [None]:
import lightgbm as lgb

# Wrap the training and validation folds in LightGBM Dataset objects.
data_train = lgb.Dataset(x_train, y_train)
data_val = lgb.Dataset(x_val, y_val)

param = {
    'objective': 'regression',
    'force_row_wise': True,
    'max_bin': 400,
    'verbosity': -1,
    'seed': 1,
    'bagging_seed': 10,
    'feature_fraction_seed': 2,
    'learning_rate': 0.019157894736842106,
    'lambda_l1': 0,
    'lambda_l2': 0.01,
    'num_leaves': 22,
    'feature_fraction': 0.6689655172413793,
    'bagging_fraction': 0.4275862068965517,
    'bagging_freq': 6,
    'min_child_samples': 18,
}
watchlist = [data_train, data_val]
n_round = 20000
# Early stopping is requested through the callback API: the
# `early_stopping_rounds=` keyword of lgb.train() is not accepted by newer
# LightGBM releases, whereas the callback works on old and new versions alike.
model = lgb.train(
    dict(param),
    data_train,
    num_boost_round=n_round,
    valid_sets=watchlist,
    callbacks=[lgb.early_stopping(stopping_rounds=8000)],
)

# Predict on the raw feature frames for the test and validation folds.
data_test = x_test
y_sub_2 = model.predict(data_test)
data_val_test = x_val
y_val_hat_2 = model.predict(data_val_test)

# Validation metrics: MSE on the log(num_sold / GDP) target, and SMAPE after
# mapping predictions back to sales counts.
mean_squared_error_all = mean_squared_error(y_val, y_val_hat_2)
print(f'mean_squared_error_all: {mean_squared_error_all}')
print(f'SMAPE: {SMAPE(np.exp(y_val)* x_val.GDP.values, np.exp(y_val_hat_2)* x_val.GDP.values)}')

# Feature importance table as reported by the trained booster.
fold_importance_df2 = pd.DataFrame({
    "feature": model.feature_name(),
    "importance": model.feature_importance(),
})

## LightGBM - plot feature importance

In [None]:
N = 30  # number of top-ranked features to plot

# Rank features by mean importance and keep the N highest.
mean_importance2 = fold_importance_df2[["feature", "importance"]].groupby("feature").mean()
cols = mean_importance2.sort_values(by="importance", ascending=False).index[:N]
in_top_n2 = fold_importance_df2["feature"].isin(cols)
best_features = fold_importance_df2.loc[in_top_n2].sort_values(by='importance', ascending=False)

# Horizontal bar chart, most important feature first; also saved as a PNG.
plt.figure(figsize=(8, 15))
sns.barplot(
    y="feature",
    x="importance",
    data=best_features.sort_values(by="importance", ascending=False),
)
plt.title('lgb Features (avg over folds)')
plt.tight_layout()
plt.savefig('Lgb_importances_weight.png')
plt.show()

In [None]:
# Map the log(num_sold / GDP) predictions of both models back to sales counts.
test_gdp = x_test['GDP'].values
y_sub_1_all = np.exp(y_sub_1) * test_gdp
y_sub_2_all = np.exp(y_sub_2) * test_gdp

# Three weighted blends of the first (y_sub_1) and second (y_sub_2) model.
sub1 = y_sub_1_all * 0.3 + y_sub_2_all *  0.7
sub2 = y_sub_1_all * 0.2 + y_sub_2_all *  0.8
sub3 = y_sub_1_all * 0.4 + y_sub_2_all *  0.6

# Write each blend, rounded to whole units, to its own submission file.
blend_outputs = (
    (sub1, 'result_STACKING_PubV20307.csv'),
    (sub2, 'result_STACKING_PubV20208.csv'),
    (sub3, 'result_STACKING_PubV20406.csv'),
)
for blended, csv_name in blend_outputs:
    sample_submit['num_sold'] = blended
    sample_submit['num_sold'] = np.round(sample_submit['num_sold']).astype(int)
    sample_submit.to_csv(csv_name, index=False)

In [None]:
# Single-model submission built from y_sub_1_all only. The values are written
# as-is (no rounding is applied in this cell).
sample_submit['num_sold'] = y_sub_1_all
sample_submit.to_csv('result_PubV2_xgb.csv', index=False) 

In [None]:
# Single-model submission built from y_sub_2_all only. The values are written
# as-is (no rounding is applied in this cell).
sample_submit['num_sold'] = y_sub_2_all
sample_submit.to_csv('result_PubV2_Lgb.csv', index=False)  