In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

sns.set(style='darkgrid', font_scale=1.4)

import matplotlib.pyplot as plt
%matplotlib inline
from itertools import combinations
import math
import statistics
import scipy.stats
from scipy.stats import pearsonr
import time
from datetime import datetime
import matplotlib.dates as mdates
import dateutil.easter as easter
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression, Ridge

from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import callbacks

In [None]:
# Read the competition training table; the row_id column becomes the index.
train_data = pd.read_csv(
    '../input/tabular-playground-series-jan-2022/train.csv',
    index_col='row_id',
)
# Schema check: per-column dtypes, the dtype of `date` as read from the CSV,
# then the info() summary. The frame itself is the cell's displayed output.
print(train_data.dtypes)
print(train_data['date'].dtype)
train_data.info()
train_data

In [None]:
# Read the competition test table with the same row_id index as the train set.
test_data = pd.read_csv(
    '../input/tabular-playground-series-jan-2022/test.csv',
    index_col='row_id',
)
print(test_data['date'].dtype)
test_data

In [None]:
# Convert the train `date` column to datetimes, printing the dtype before
# and after so the conversion is visible in the cell output.
print(train_data['date'].dtype)
train_data['date'] = pd.to_datetime(train_data['date'])
print(train_data['date'].dtype)
train_data

In [None]:
# Same datetime conversion for the test set's `date` column.
test_data['date'] = pd.to_datetime(test_data['date'])
print(test_data['date'].dtype)

In [None]:
# List every training row dated 29 February.
print(train_data[(train_data['date'].dt.day == 29) & (train_data['date'].dt.month == 2)])

In [None]:
# Move every 29 February row onto the following day (1 March).
# The rows are located by their date rather than by hard-coded positions, so
# the cell no longer depends on the file's row order or on `date` being the
# first column. The mask is computed before the assignment so the same rows
# can be displayed afterwards.
leap_day_rows = (train_data['date'].dt.month == 2) & (train_data['date'].dt.day == 29)
train_data.loc[leap_day_rows, 'date'] = train_data.loc[leap_day_rows, 'date'] + pd.Timedelta(days=1)
train_data.loc[leap_day_rows, 'date']

In [None]:
# List every test row dated 29 February.
print(test_data[(test_data['date'].dt.day == 29) & (test_data['date'].dt.month == 2)])

In [None]:
# train_data.drop(train_data[(train_data.date.dt.month==2) & (train_data.date.dt.day==29)].index, axis=0, inplace=True)

In [None]:
# Total units sold per (date, store), drawn as one line per store.
aa = train_data.groupby(['date', 'store'])[['num_sold']].sum()

plt.figure(figsize=(12, 5))
sns.lineplot(x='date', y='num_sold', hue='store', data=aa)

plt.title('NUM_SOLD BY STORE')

In [None]:
# Split the training rows by store; KR and KM are reused by the next cell.
KR = train_data.loc[train_data['store'] == 'KaggleRama']
KM = train_data.loc[train_data['store'] == 'KaggleMart']

# Total units sold per (date, product) within each store.
bb = KR.groupby(['date', 'product'])[['num_sold']].sum()
cc = KM.groupby(['date', 'product'])[['num_sold']].sum()

# One panel per store, one line per product.
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(12, 10))
ax1 = sns.lineplot(x='date', y='num_sold', hue='product', data=bb, ax=axes[0])
ax2 = sns.lineplot(x='date', y='num_sold', hue='product', data=cc, ax=axes[1])

ax1.title.set_text('KAGGLE RAMA')
ax2.title.set_text('KAGGLE MART')

In [None]:
# Total units sold per (date, country) within each store (KR / KM come from
# the previous cell).
dd = KR.groupby(['date', 'country'])[['num_sold']].sum()
ee = KM.groupby(['date', 'country'])[['num_sold']].sum()

# One panel per store, one line per country.
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(12, 10))
ax1 = sns.lineplot(x='date', y='num_sold', hue='country', data=dd, ax=axes[0])
ax2 = sns.lineplot(x='date', y='num_sold', hue='country', data=ee, ax=axes[1])

ax1.title.set_text('KAGGLE RAMA')
ax2.title.set_text('KAGGLE MART')

In [None]:
# Separate the target (num_sold) from the predictor columns.
X = train_data.drop(columns='num_sold')
y = train_data['num_sold']

In [None]:
def unofficial_hol(df):
    """Add a 0/1 ``holiday`` column marking holidays of each row's own country.

    The holiday calendar is read from the external ``holidays.csv`` file, of
    which the ``date`` and ``country`` columns are used. A row is flagged
    when its ``date`` appears among the calendar entries for that row's
    ``country`` (Finland, Sweden or Norway); rows of any other country stay 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime ``date`` column and a ``country`` column.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with ``holiday`` appended (modified in place).
    """
    hol_path = '../input/public-and-unofficial-holidays-nor-fin-swe-201519/holidays.csv'
    holiday = pd.read_csv(hol_path)
    # Parse the calendar explicitly so the membership test compares datetimes
    # with datetimes instead of relying on pandas coercing the CSV's values
    # inside `isin`.
    holiday_dates = pd.to_datetime(holiday['date'])

    df['holiday'] = np.zeros(df.shape[0]).astype(int)
    for country in ('Finland', 'Sweden', 'Norway'):
        rows = df.country == country
        country_dates = holiday_dates[holiday.country == country]
        df.loc[rows, 'holiday'] = df.loc[rows, 'date'].isin(country_dates).astype(int)

    return df

In [None]:
def get_holidays(df):
    """Append boolean calendar-window indicator columns to ``df``.

    Two kinds of flags are produced:

    * fixed-date flags ``<prefix><day>``: true when the row's date falls on
      that day of a given month, optionally only for rows of one country;
    * relative flags ``<prefix><offset>``: true when the row's date is exactly
      ``offset`` days after a per-year anchor date (a hard-coded table keyed
      by year, or the Easter date computed from the row's year).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime ``date`` column and a ``country`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame: the input columns followed by the indicator columns.
    """
    month = df.date.dt.month
    day = df.date.dt.day
    year = df.date.dt.year

    finland = df.country == 'Finland'
    norway = df.country == 'Norway'
    sweden = df.country == 'Sweden'
    not_norway = df.country != 'Norway'

    # Insertion order of this dict is the column order of the result.
    flags = {}

    def add_fixed(prefix, month_no, days, where=None):
        # One flag per listed day of `month_no`, optionally limited to `where`.
        for d in days:
            hit = (month == month_no) & (day == d)
            flags[f'{prefix}{d}'] = hit if where is None else hit & where

    def add_relative(prefix, anchor, offsets, where=None):
        # One flag per day offset from the row's anchor date.
        gap = df.date - anchor
        for d in offsets:
            hit = gap == np.timedelta64(d, 'D')
            flags[f'{prefix}{d}'] = hit if where is None else hit & where

    def per_year(dates_by_year):
        # Anchor date for each row, looked up by the row's year; years
        # missing from the table map to NaT and therefore never match.
        return year.map({y: pd.Timestamp(s) for y, s in dates_by_year.items()})

    add_fixed('dec', 12, range(24, 32))
    add_fixed('n-dec', 12, range(24, 32), norway)
    add_fixed('f-jan', 1, range(1, 14), finland)
    add_fixed('jan', 1, range(1, 10), norway)
    add_fixed('s-jan', 1, range(1, 15), sweden)

    add_fixed('may', 5, range(1, 10))
    add_fixed('may', 5, range(19, 26), norway)

    add_fixed('june', 6, range(8, 14), sweden)

    swed_rock_fest = per_year({
        2015: '2015-06-6',
        2016: '2016-06-11',
        2017: '2017-06-10',
        2018: '2018-06-10',
        2019: '2019-06-8',
    })
    add_relative('swed_rock_fest', swed_rock_fest, range(-3, 3), sweden)

    wed_june_date = per_year({
        2015: '2015-06-24',
        2016: '2016-06-29',
        2017: '2017-06-28',
        2018: '2018-06-27',
        2019: '2019-06-26',
    })
    add_relative('wed_june', wed_june_date, range(-4, 6), not_norway)

    sun_nov_date = per_year({
        2015: '2015-11-1',
        2016: '2016-11-6',
        2017: '2017-11-5',
        2018: '2018-11-4',
        2019: '2019-11-3',
    })
    add_relative('sun_nov', sun_nov_date, range(0, 9), not_norway)

    add_fixed('dec', 12, range(6, 14), finland)

    easter_date = df.date.apply(lambda date: pd.Timestamp(easter.easter(date.year)))
    easter_offsets = list(range(-2, 11)) + list(range(40, 48)) + list(range(50, 59))
    add_relative('easter', easter_date, easter_offsets)

    return pd.concat([df, pd.DataFrame(flags)], axis=1)

In [None]:
def date_feat_eng_X1(df):
    """Add a ``year`` column taken from ``date``.

    The column is written onto ``df`` itself, and that same frame is returned.
    """
    df['year'] = df.date.dt.year
    return df

def date_feat_eng_X2(df):
    """Add calendar-position columns derived from ``date``.

    Columns added (in this order): ``day_of_week``, ``day_of_month``,
    ``dayofyear``, ``week`` (ISO week number as int) and ``month``.

    In leap years, days after day 60 (29 February) are shifted back by one so
    that dates from 1 March onwards get the same ``dayofyear`` as in a
    non-leap year. The original applied this to 2016 only; it now applies to
    any leap year, which is identical for data containing no other leap year.

    The columns are written onto ``df`` itself, and that frame is returned.
    """
    dates = df['date']
    df['day_of_week'] = dates.dt.dayofweek
    df['day_of_month'] = dates.dt.day
    df['dayofyear'] = dates.dt.dayofyear
    after_leap_day = dates.dt.is_leap_year & (df['dayofyear'] > 60)
    df.loc[after_leap_day, 'dayofyear'] -= 1
    df['week'] = dates.dt.isocalendar().week.astype('int')
    df['month'] = dates.dt.month
    return df

In [None]:
def get_GDP(df):
    """Replace ``year`` with three per-country log-GDP columns.

    The GDP table is read from CSV (indexed by year, its three columns renamed
    to Finland / Norway / Sweden) and looked up by each row's
    ``(country, year)`` pair. ``GDP_<country>`` holds log(GDP) on rows of that
    country and 0 elsewhere.

    Note that the intermediate ``GDP`` and ``GDP_<country>`` columns are
    written onto the input frame; the returned frame is a new object without
    ``GDP`` and ``year``.
    """
    gdp_table = pd.read_csv(
        '../input/gdp-20152019-finland-norway-and-sweden/GDP_data_2015_to_2019_Finland_Norway_Sweden.csv',
        index_col='year',
    )
    gdp_table.columns = ['Finland', 'Norway', 'Sweden']
    # Keys of this dict are (country, year) tuples.
    gdp_by_country_year = gdp_table.unstack().to_dict()

    row_keys = df.set_index(['country', 'year']).index
    df['GDP'] = row_keys.map(gdp_by_country_year.get)
    df['GDP'] = np.log(df['GDP'])
    for nation in ('Finland', 'Norway', 'Sweden'):
        df[f'GDP_{nation}'] = df['GDP'] * (df['country'] == nation)

    return df.drop(['GDP', 'year'], axis=1)

In [None]:
def GDP_PC(df):
    """Add a ``GDP_PC`` column looked up from the GDP-per-capita CSV.

    The table is read with ``year`` as its index and looked up by each row's
    ``(country, year)`` pair, so ``df`` needs ``country`` and ``year``
    columns. The column is written onto ``df`` itself, and that frame is
    returned.

    NOTE(review): the table's column labels are used as-is as country keys;
    this presumably expects them to match the values in ``df.country`` -
    confirm against the CSV header.
    """
    # index_col must be the column *name*; the bare identifier `year` that was
    # here before is undefined and raised NameError.
    GDP_PC_data = pd.read_csv('../input/gdp-per-capita-finland-norway-sweden-201519/GDP_per_capita_2015_to_2019_Finland_Norway_Sweden.csv', index_col='year')
    GDP_PC_dictionary = GDP_PC_data.unstack().to_dict()
    df['GDP_PC'] = df.set_index(['country', 'year']).index.map(GDP_PC_dictionary.get)

    return df

In [None]:
def GDP_corr(df):
    """Correlate yearly sales with GDP and GDP per capita for each segment.

    For every (store, country, product) combination, sales are summed per
    year and correlated (Pearson r) with that country's yearly GDP and GDP
    per capita, both read from CSV.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``date`` (datetime), ``store``, ``country``, ``product`` and
        ``num_sold`` columns. It is not modified: the helper columns are
        added to a private copy, where previously they were written onto the
        caller's frame.

    Returns
    -------
    pandas.DataFrame
        One row per segment with columns ``Features``, ``GDP_corr`` and
        ``CDP_PC_corr`` (label spelling kept as in the original output).
    """
    GDP_data = pd.read_csv('../input/gdp-20152019-finland-norway-and-sweden/GDP_data_2015_to_2019_Finland_Norway_Sweden.csv', index_col='year')
    GDP_PC_data = pd.read_csv('../input/gdp-per-capita-finland-norway-sweden-201519/GDP_per_capita_2015_to_2019_Finland_Norway_Sweden.csv', index_col='year')

    GDP_data.columns = ['Finland', 'Norway', 'Sweden']
    # Both dicts are keyed by (country, year) tuples.
    GDP_dictionary = GDP_data.unstack().to_dict()
    GDP_PC_dictionary = GDP_PC_data.unstack().to_dict()

    # Work on a copy so the caller's frame does not gain year/GDP/GDP_PC columns.
    work = df[['date', 'store', 'country', 'product', 'num_sold']].copy()
    work['year'] = work.date.dt.year
    row_keys = work.set_index(['country', 'year']).index
    work['GDP'] = row_keys.map(GDP_dictionary.get)
    work['GDP_PC'] = row_keys.map(GDP_PC_dictionary.get)

    feat_corr = []

    for SS in ['KaggleMart', 'KaggleRama']:
        for CC in ['Finland', 'Norway', 'Sweden']:
            for PP in ['Kaggle Mug', 'Kaggle Hat', 'Kaggle Sticker']:
                segment = work[(work.store == SS) & (work.country == CC) & (work['product'] == PP)]
                yearly = segment.groupby(['year']).agg(
                    num_sold=('num_sold', 'sum'),
                    GDP=('GDP', 'mean'),
                    GDP_PC=('GDP_PC', 'mean'),
                )

                r1, _ = pearsonr(yearly.num_sold, yearly.GDP)
                r2, _ = pearsonr(yearly.num_sold, yearly.GDP_PC)

                feat_corr.append([f'{SS}, {CC}, {PP}', r1, r2])

    return pd.DataFrame(feat_corr, columns=['Features', 'GDP_corr', 'CDP_PC_corr'])

# Per-segment correlation of yearly sales with GDP and GDP per capita,
# shown as this cell's output.
corr_df = GDP_corr(df=train_data)
corr_df

In [None]:
def FourierFeatures(df):
    """Add first-harmonic yearly sin/cos features for mugs and hats.

    A one-year-period angle is computed from the day of year
    (``dayofyear / 365 * 2 * pi``). Its sine and cosine are multiplied by the
    "is Kaggle Mug" and "is Kaggle Hat" indicators, giving ``mug_sin``,
    ``mug_cos``, ``hat_sin`` and ``hat_cos`` (zero on rows of other products).

    The working columns (``Kaggle Mug``, ``Kaggle Hat``, ``sin1``, ``cos1``)
    and the four features are written onto the input frame; the returned
    frame is a new object with the four working columns removed.
    """
    k = 1  # only the first harmonic is generated

    df['Kaggle Mug'] = df['product'] == 'Kaggle Mug'
    df['Kaggle Hat'] = df['product'] == 'Kaggle Hat'

    day_number = df.date.dt.dayofyear
    df[f'sin{k}'] = np.sin(day_number / 365 * 2 * math.pi * k)
    df[f'cos{k}'] = np.cos(day_number / 365 * 2 * math.pi * k)

    df['mug_sin'] = df[f'sin{k}'] * df['Kaggle Mug']
    df['mug_cos'] = df[f'cos{k}'] * df['Kaggle Mug']
    df['hat_sin'] = df[f'sin{k}'] * df['Kaggle Hat']
    df['hat_cos'] = df[f'cos{k}'] * df['Kaggle Hat']

    without_waves = df.drop([f'sin{k}', f'cos{k}'], axis=1)
    return without_waves.drop(['Kaggle Mug', 'Kaggle Hat'], axis=1)

In [None]:
def get_interactions(df):
    """Add one boolean flag per (store, country, product) combination.

    Columns are named ``<KR|KM>_<country>_<Mug|Hat|Sticker>``, where ``KR``
    stands for KaggleRama and ``KM`` for KaggleMart. Each flag is true
    exactly on the rows of that store, country and product.

    Two defects of the previous version are fixed here:

    * ``df.product`` resolves to the ``DataFrame.product`` method rather than
      the ``product`` column, so every comparison was False and all flags
      were constant; the column is now read with ``df['product']``.
    * the KaggleMart flags reused the ``KR_`` names and overwrote the
      KaggleRama ones; they now get their own ``KM_`` prefix.

    The columns are written onto ``df`` itself, and that frame is returned.
    """
    store_tags = (('KR', 'KaggleRama'), ('KM', 'KaggleMart'))

    for tag, store in store_tags:
        for country in ('Sweden', 'Norway', 'Finland'):
            for item in ('Mug', 'Hat', 'Sticker'):
                df[f'{tag}_{country}_{item}'] = (
                    (df['country'] == country)
                    & (df['product'] == f'Kaggle {item}')
                    & (df['store'] == store)
                )

    return df

In [None]:
def dropdate(df):
    """Return a copy of ``df`` without its ``date`` column."""
    return df.drop(columns='date')

def onehot(df, columns):
    """One-hot encode the listed columns and return the new frame.

    ``columns`` must be passed to ``pd.get_dummies`` by keyword: its second
    positional parameter is ``prefix``, so the previous positional call used
    the list as prefixes for whatever object columns were found, labelling
    the dummies with the wrong column names whenever the frame's column order
    differed from the list's order.
    """
    return pd.get_dummies(df, columns=columns)

In [None]:
def FeatEng_X1(df):
    """Build the feature matrix for the first-stage (trend) model.

    Steps, in order: add ``year``, swap it for per-country log-GDP columns,
    add the mug/hat Fourier features and the store x country x product
    interaction flags, drop ``date``, then one-hot encode ``store``,
    ``product`` and ``country``.

    The input frame is left untouched. Several helpers write columns onto the
    frame they receive, so without the copy the ``year`` / ``GDP*`` columns
    would stay on the caller's frame and end up in whatever is built from it
    next.
    """
    df = df.copy()
    df = date_feat_eng_X1(df)
    df = get_GDP(df)
    df = FourierFeatures(df)
    df = get_interactions(df)
    df = dropdate(df)
    df = onehot(df, ['store', 'product', 'country'])

    return df

def FeatEng_X2(df):
    """Build the feature matrix for the second-stage (residual) model.

    Steps, in order: add the calendar-position columns, the per-country
    ``holiday`` flag and the calendar-window indicators, drop ``date``, then
    one-hot encode ``store``, ``product`` and ``country``.

    The input frame is left untouched: the helpers that add the calendar and
    holiday columns write onto the frame they receive, so they are given a
    private copy.
    """
    df = df.copy()
    df = date_feat_eng_X2(df)
    df = unofficial_hol(df)
    df = get_holidays(df)
    df = dropdate(df)
    df = onehot(df, ['store', 'product', 'country'])

    return df

# Two feature views per dataset: *_1 feeds the first-stage model, *_2 the
# second-stage model. Evaluation order (X1 then X2, train then test) is kept.
X_train_1, X_train_2 = FeatEng_X1(X), FeatEng_X2(X)
X_test_1, X_test_2 = FeatEng_X1(test_data), FeatEng_X2(test_data)

In [None]:
class HybridModel:
    """Two-stage regressor: ``model_1`` fits the target, ``model_2`` its residual.

    ``model_1`` is fitted on the first feature set. ``model_2`` is fitted on
    the second feature set against ``y - model_1.predict(...)``; when ``grid``
    is given, ``model_2`` is tuned with ``GridSearchCV`` over that parameter
    grid using a 3-split ``TimeSeriesSplit``. Predictions are the sum of the
    two stages.

    Attributes set by ``fit``: ``y_train_trend``, ``y_train_resid`` and, when
    a grid is used, ``grid_model``. Attributes set by ``predict``:
    ``y_test_trend`` and ``y_test_resid``.
    """

    def __init__(self, model_1, model_2, grid=None):
        self.model_1, self.model_2, self.grid = model_1, model_2, grid

    def fit(self, X_train_1, X_train_2, y):
        """Fit both stages and store their in-sample predictions."""
        self.model_1.fit(X_train_1, y)
        trend_fit = self.model_1.predict(X_train_1)

        if not self.grid:
            # No grid: fit the second stage as given.
            self.model_2.fit(X_train_2, y - trend_fit)
            resid_fit = self.model_2.predict(X_train_2)
        else:
            search = GridSearchCV(
                estimator=self.model_2,
                cv=TimeSeriesSplit(n_splits=3),
                param_grid=self.grid,
            )
            search.fit(X_train_2, y - trend_fit)

            resid_fit = search.predict(X_train_2)
            self.grid_model = search

        self.y_train_trend = trend_fit
        self.y_train_resid = resid_fit

    def predict(self, X_test_1, X_test_2):
        """Return first-stage plus second-stage predictions; keep both parts."""
        trend_part = self.model_1.predict(X_test_1)
        # The tuned search object stands in for model_2 when a grid was used.
        second_stage = self.grid_model if self.grid else self.model_2
        resid_part = second_stage.predict(X_test_2)

        combined = trend_part + resid_part

        self.y_test_trend = trend_part
        self.y_test_resid = resid_part

        return combined

In [None]:
# First stage: one linear model shared by every hybrid (refitted each time).
# Second stage: three gradient-boosting regressors, each tuned over the same grid.
model_1 = LinearRegression()
models_2 = [
    LGBMRegressor(random_state=0),
    CatBoostRegressor(random_state=0, verbose=False),
    XGBRegressor(random_state=0),
]

param_grid = dict(
    n_estimators=[100, 150, 200, 225, 250, 275, 300],
    max_depth=[4, 5, 6, 7],
    learning_rate=[0.1, 0.12, 0.13, 0.14, 0.15],
)

# Running sums of the back-transformed predictions; divided by the number of
# hybrids after the loop to give the ensemble average.
y_pred = np.zeros(len(test_data))
train_preds = np.zeros(len(y))

for model_2 in models_2:
    start = time.time()

    # The hybrids are trained on log(num_sold); np.exp undoes that below.
    model = HybridModel(model_1, model_2, grid=param_grid)
    model.fit(X_train_1, X_train_2, np.log(y))

    y_pred += np.exp(model.predict(X_test_1, X_test_2))
    train_preds += np.exp(model.y_train_trend + model.y_train_resid)

    stop = time.time()

    print(f'MODEL_2:{model_2} -- TIME:{round((stop-start)/60, 2)} mins')

    if model.grid:
        print('BEST PARAMETERS:', model.grid_model.best_params_, '\n')

n_hybrids = len(models_2)
y_pred /= n_hybrids
train_preds /= n_hybrids

In [None]:
def geometric_round(arr):
    """Round each value down or up around the geometric mean of its neighbours.

    For a value between integers ``lo = floor(x)`` and ``hi = ceil(x)``, the
    cut-off is ``sqrt(lo * hi)``: values below it become ``lo``, values at or
    above it become ``hi``. Whole numbers are returned unchanged.
    """
    lower = np.floor(arr)
    upper = np.ceil(arr)
    cutoff = np.sqrt(lower * upper)
    return np.where(arr < cutoff, lower, upper)

# Round the averaged predictions to whole units and pair them with the test
# row ids to form the submission table.
y_pred = geometric_round(y_pred)
output = pd.DataFrame(dict(row_id=test_data.index, num_sold=y_pred))

output

In [None]:
# Write the submission file without the frame's own index.
output.to_csv(path_or_buf='submission_hybrid_01.csv', index=False)

In [None]:
def plot_predictions(SS, CC, PP, series=None):
    """Plot training sales followed by predictions for one segment.

    Parameters
    ----------
    SS, CC, PP : str
        Store, country and product that select the segment.
    series : pandas.DataFrame, optional
        Frame with ``row_id`` and ``num_sold`` columns holding the values to
        plot after the training curve. Defaults to the notebook-level
        ``output`` frame, looked up when the function is called. The previous
        ``series=output`` default was bound once, when the ``def`` ran, and
        so kept pointing at an old frame if ``output`` was rebuilt afterwards.

    The x axis is a running position: training points first, then the
    predictions continuing from where the training points end.
    """
    if series is None:
        series = output

    train_subset = train_data[(train_data.store == SS) & (train_data.country == CC) & (train_data['product'] == PP)]
    plot_index = test_data[(test_data.store == SS) & (test_data.country == CC) & (test_data['product'] == PP)].index

    # Keep only the predictions whose row_id belongs to this segment.
    pred_subset = series[series.row_id.isin(plot_index)].reset_index(drop=True)

    plt.figure(figsize=(12, 5))
    n1 = len(train_subset['num_sold'])
    n2 = len(pred_subset['num_sold'])
    plt.plot(np.arange(n1), train_subset['num_sold'], label='TRAINING')
    plt.plot(np.arange(n1, n1 + n2), pred_subset['num_sold'], label='PREDICTIONS')
    plt.title('\n' + f'STORE:{SS}, COUNTRY:{CC}, PRODUCT:{PP}')
    plt.legend()
    plt.xlabel('DAYS SINCE 2015-01-01')
    plt.ylabel('NUM_SOLD')

In [None]:
# Back-transformed test-set outputs of `model` (the hybrid left over from the
# training loop): first stage only, second stage only, and their combination.
y_trend = pd.DataFrame(dict(
    row_id=test_data.index,
    num_sold=np.exp(model.y_test_trend),
))
y_resid = pd.DataFrame(dict(
    row_id=test_data.index,
    num_sold=np.exp(model.y_test_resid),
))
y_pred = pd.DataFrame(dict(
    row_id=test_data.index,
    num_sold=np.exp(model.y_test_trend + model.y_test_resid),
))

# First-stage component for each product of one store/country pair.
SS, CC = 'KaggleMart', 'Norway'

plot_predictions(SS, CC, 'Kaggle Hat', series=y_trend)
plot_predictions(SS, CC, 'Kaggle Mug', series=y_trend)
plot_predictions(SS, CC, 'Kaggle Sticker', series=y_trend)

In [None]:
# One prediction plot per store / country / product segment (18 in total),
# in the same store -> country -> product order as before.
for SS, CC, PP in (
    (store, country, item)
    for store in ['KaggleMart', 'KaggleRama']
    for country in ['Finland', 'Norway', 'Sweden']
    for item in ['Kaggle Mug', 'Kaggle Hat', 'Kaggle Sticker']
):
    plot_predictions(SS, CC, PP)

In [None]:
# `train_preds` already holds the ensemble-averaged training predictions
# accumulated in the training loop. It used to be overwritten here with the
# predictions of `model` alone (the last hybrid fitted), so the diagnostics
# below described one member instead of the blend that is submitted.
# NOTE(review): restore the overwrite if single-model diagnostics were intended.

# Signed per-row SMAPE contribution, in percent.
residuals = 200 * (train_preds - y) / (train_preds + y)
plt.figure(figsize=(12, 4))
plt.scatter(np.arange(len(residuals)), residuals, s=1)
plt.hlines([0], 0, residuals.index.max(), color='k')
plt.title('RESIDUALS ON TRAINING SET')
plt.xlabel('SAMPLE')
plt.ylabel('SMAPE')

In [None]:
# Fit a normal distribution to the residuals and overlay its density on the
# histogram.
mu, std = scipy.stats.norm.fit(residuals)

plt.figure(figsize=(12, 4))
plt.hist(residuals, bins=100, density=True)
x = np.linspace(plt.xlim()[0], plt.xlim()[1], 200)
plt.plot(x, scipy.stats.norm.pdf(x, mu, std), 'r', linewidth=2)
# The second fragment is a raw f-string so that `\sigma` reaches matplotlib's
# mathtext unchanged; in a plain f-string `\s` is an invalid escape sequence.
plt.title(f'HISTOGRAM OF RESIDUALS: MEAN = {residuals.mean():.4f}, '
          rf'$\sigma = {residuals.std():.1f}$, SMAPE = {residuals.abs().mean():.5f}')
# Parentheses instead of braces: in a plain string the braces were printed as-is.
plt.xlabel('RESIDUAL (percent)')
plt.ylabel('DENSITY')
plt.show()