## Although I've made many improvements to this notebook and improved the score, much of the credit should go to:  

#### Credit to [@teckmengwong](https://www.kaggle.com/teckmengwong/tps2201-hybrid-time-series) for a fun notebook to work with.  and:

#### Feature engineering and Linear model based on excellent: https://www.kaggle.com/ambrosm/tpsjan22-03-linear-model @ambrosm  and:

#### Hybrid model from Time series course: https://www.kaggle.com/learn/time-series




## Imports and Configuration ##

In [None]:
from scipy import stats
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator, FormatStrFormatter, PercentFormatter
import seaborn as sns
from catboost import CatBoostRegressor

import ipywidgets as widgets
from learntools.time_series.style import *  # plot style settings

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess
from xgboost import XGBRegressor
import lightgbm as lgb
from pyearth import Earth

from datetime import date
import holidays
import calendar
import dateutil.easter as easter

from collections import defaultdict
le = defaultdict(LabelEncoder)

from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor, StackingRegressor, VotingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.linear_model import ElasticNet, Lasso, Ridge, HuberRegressor, RidgeCV

# Set Matplotlib defaults
plt.style.use("seaborn-whitegrid")
plt.rc("figure", autolayout=True, figsize=(12, 8))
plt.rc("axes",labelweight="bold",labelsize="large",titleweight="bold",titlesize=16,titlepad=10,)
plot_params = dict(color="0.75",style=".-",markeredgecolor="0.25",markerfacecolor="0.25",legend=False,)
%config InlineBackend.figure_format = 'retina'

import gc
import os
import math
# List every file under the Kaggle input directory so the dataset paths used
# below can be checked at a glance (the joined path was previously discarded).
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
PRODUCTION = True  # True: submission run. False: fast trial run (shorter training window when PSEUDO_LABEL is off)
RANDOM_STATE = 42
VERBOSE = 0

# Admin
ID = "row_id"            # name of the row-identifier column, used as the DataFrame index when reading the CSVs
INPUT = "../input/tabular-playground-series-jan-2022"
FEATURE_ENGINEERING = True  # when False, process_data returns the frames unchanged
PSEUDO_LABEL = True  # pseudo-labels are not ground truth and will not help long term; only used for the final push
BLEND = True  # when True, the final prediction is the median over this model and several external submissions

PSEUDO_DIR = "../input/pseuodo-labels/pseudo_labels_v0.csv"  

# Column names of the common time-series features
DATE = "date"
YEAR = "year"
MONTH = "month"
WEEK = "week"
DAY = "day"
DAYOFYEAR = "dayofyear"
DAYOFMONTH = "dayofMonth"
DAYOFWEEK = "dayofweek"
WEEKDAY = "weekday"


# Loss function SMAPE
$$\mathrm{SMAPE} = \frac{100}{\sum_{i=1}^{N} w_i} \sum_{i=1}^{N} \frac{w_i \, |a_i - t_i|}{(|t_i| + |a_i|)/2}$$

In [None]:
# https://www.kaggle.com/c/web-traffic-time-series-forecasting/discussion/36414
def smape_loss(y_true, y_pred):
    """Return the mean symmetric absolute percentage error on a 0-200 scale.

    Each element contributes ``|y_true - y_pred| / ((|y_true| + |y_pred|) / 200)``;
    elements where both values are zero contribute 0 instead of NaN.

    Parameters
    ----------
    y_true, y_pred : array-like or scalar
        Actual and predicted values. They are converted to float arrays and
        combined positionally with NumPy broadcasting (no index alignment).

    Returns
    -------
    float
        Mean of the element-wise SMAPE terms.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # atleast_1d lets scalar inputs go through the same masked division.
    denominator = np.atleast_1d((np.abs(y_true) + np.abs(y_pred)) / 200.0)
    abs_error = np.atleast_1d(np.abs(y_true - y_pred))

    # Divide only where the denominator is non-zero; other slots keep the
    # pre-filled 0.0, so no divide-by-zero warning is raised.
    diff = np.divide(abs_error, denominator,
                     out=np.zeros_like(abs_error), where=denominator != 0)
    return np.mean(diff)


# Data/Feature Engineering

In [None]:
def get_basic_ts_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add GDP, indicator, seasonal and holiday features to a sales frame.

    Reads the columns ``date`` (datetime-like), ``country``, ``store`` and
    ``product`` and returns a frame with the extra feature columns appended.

    The columns created before the first ``pd.concat`` are assigned on the
    frame that was passed in, so the caller's object is modified as well;
    after that point ``df`` is rebound to a new frame, which is what is
    returned.

    The GDP table is read from a fixed Kaggle input path on every call.
    """
    
    gdp_df = pd.read_csv('../input/gdp-20152019-finland-norway-and-sweden/GDP_data_2015_to_2019_Finland_Norway_Sweden.csv')
    gdp_df.set_index('year', inplace=True)
    # NOTE(review): gdp_exponent is never used in this function; the feature below is log1p(GDP).
    gdp_exponent = 1.2121103201489674 # see https://www.kaggle.com/ambrosm/tpsjan22-03-linear-model for an explanation
    def get_gdp(row):
        # Look up the GDP of the row's country for the row's year.
        country = 'GDP_' + row.country
        return gdp_df.loc[row.date.year, country]
    
    df['gdp'] = np.log1p(df.apply(get_gdp, axis=1))
    
    # Boolean indicators; the category without its own indicator in each group is the baseline.
    for country in ['Finland', 'Norway']:
        df[country] = df.country == country
    for store in ['KaggleMart']:
        df[store] = df['store'] == store
    for product in ['Kaggle Mug', 'Kaggle Sticker']:
        df[product] = df['product'] == product
    
    df[MONTH] = df[DATE].dt.month
    # 4 seasons, labelled by the initials of their months (DJF = Dec/Jan/Feb, ...)
    df['season'] = ((df[DATE].dt.month % 12 + 3) // 3).map({1:'DJF', 2: 'MAM', 3:'JJA', 4:'SON'})
    df['wd4'] = df[DATE].dt.weekday == 4   # weekday 4 (Friday in pandas' Monday=0 numbering)
    df['wd56'] = df[DATE].dt.weekday >= 5  # weekdays 5 and 6 (Saturday/Sunday)
    
    # Annual Fourier terms: harmonics k = 1, 5, 9, ..., 29 of a 365-day year,
    # plus their interactions with the country/store/product indicators.
    dayofyear = df.date.dt.dayofyear
    for k in range(1, 32, 4):
        df[f'sin{k}'] = np.sin(dayofyear / 365 * 2 * math.pi * k)
        df[f'cos{k}'] = np.cos(dayofyear / 365 * 2 * math.pi * k)
        df[f'Finland_sin{k}'] = df[f'sin{k}'] * df['Finland']
        df[f'Finland_cos{k}'] = df[f'cos{k}'] * df['Finland']
        df[f'Norway_sin{k}'] = df[f'sin{k}'] * df['Norway']
        df[f'Norway_cos{k}'] = df[f'cos{k}'] * df['Norway']
        df[f'store_sin{k}'] = df[f'sin{k}'] * df['KaggleMart']
        df[f'store_cos{k}'] = df[f'cos{k}'] * df['KaggleMart']
        df[f'mug_sin{k}'] = df[f'sin{k}'] * df['Kaggle Mug']
        df[f'mug_cos{k}'] = df[f'cos{k}'] * df['Kaggle Mug']
        df[f'sticker_sin{k}'] = df[f'sin{k}'] * df['Kaggle Sticker']
        df[f'sticker_cos{k}'] = df[f'cos{k}'] * df['Kaggle Sticker']
    
    # End of year
    # Dec 24-31: one flag per day for all countries, plus a Norway-only flag
    for d in range(24, 32):
        df[f"dec{d}"] = (df.date.dt.month == 12) & (df.date.dt.day == d)
    for d in range(24, 32):
        df[f"n-dec{d}"] = (df.date.dt.month == 12) & (df.date.dt.day == d) & (df.country == 'Norway')
    # Jan: per-country day flags (Finland 1-13, Norway 1-9, Sweden 1-14)
    for d in range(1, 14):
        df[f"f-jan{d}"] = (df.date.dt.month == 1) & (df.date.dt.day == d) & (df.country == 'Finland')
    for d in range(1, 10):
        df[f"n-jan{d}"] = (df.date.dt.month == 1) & (df.date.dt.day == d) & (df.country == 'Norway')
    for d in range(1, 15):
        df[f"s-jan{d}"] = (df.date.dt.month == 1) & (df.date.dt.day == d) & (df.country == 'Sweden')
    # May: days 1-9 for all countries, days 19-25 for Norway only
    for d in list(range(1, 10)):
        df[f"may{d}"] = (df.date.dt.month == 5) & (df.date.dt.day == d)
    for d in list(range(19, 26)):
        df[f"may{d}"] = (df.date.dt.month == 5) & (df.date.dt.day == d) & (df.country == 'Norway')
    # June: days 8-13, Sweden only
    for d in list(range(8, 14)):
        df[f"june{d}"] = (df.date.dt.month == 6) & (df.date.dt.day == d) & (df.country == 'Sweden')
    
    # Per-year reference date for the "swed_rock_fest" flags (only 2015-2019 are mapped)
    swed_rock_fest  = df.date.dt.year.map({2015: pd.Timestamp(('2015-06-6')),2016: pd.Timestamp(('2016-06-11')),
                                           2017: pd.Timestamp(('2017-06-10')),2018: pd.Timestamp(('2018-06-10')),
                                           2019: pd.Timestamp(('2019-06-8'))})

    # Flags for day offsets -3..2 around the reference date, Sweden only
    df = pd.concat([df, pd.DataFrame({f"swed_rock_fest{d}":
                                      (df.date - swed_rock_fest == np.timedelta64(d, "D")) & (df.country == 'Sweden')
                                      for d in list(range(-3, 3))})], axis=1)

    # Last Wednesday of June
    wed_june_date = df.date.dt.year.map({2015: pd.Timestamp(('2015-06-24')),2016: pd.Timestamp(('2016-06-29')),
                                         2017: pd.Timestamp(('2017-06-28')),2018: pd.Timestamp(('2018-06-27')),
                                         2019: pd.Timestamp(('2019-06-26'))})
    
    # Flags for day offsets -4..5 around that Wednesday, every country except Norway
    for d in list(range(-4, 6)):
        df[f"wed_june{d}"] = (df.date - wed_june_date == np.timedelta64(d, "D")) & (df.country != 'Norway')
        
    # First Sunday of November
    sun_nov_date = df.date.dt.year.map({2015: pd.Timestamp(('2015-11-1')),2016: pd.Timestamp(('2016-11-6')),
                                        2017: pd.Timestamp(('2017-11-5')),2018: pd.Timestamp(('2018-11-4')),
                                        2019: pd.Timestamp(('2019-11-3'))})
    
    # Flags for day offsets 0..8 after that Sunday, Norway only
    df = pd.concat([df, pd.DataFrame({f"sun_nov{d}":
                                      (df.date - sun_nov_date == np.timedelta64(d, "D")) & (df.country == 'Norway')
                                      for d in list(range(0, 9))})], axis=1)
    
    # First half of December (Independence Day of Finland, 6th of December): days 6-13, Finland only
    df = pd.concat([df, pd.DataFrame({f"dec{d}":
                                      (df.date.dt.month == 12) & (df.date.dt.day == d) & (df.country == 'Finland')
                                      for d in list(range(6, 14))})], axis=1)
    # Easter: flags for day offsets -2..10, 40..47 and 50..58 relative to Easter Sunday of the row's year
    easter_date = df.date.apply(lambda date: pd.Timestamp(easter.easter(date.year)))
    df = pd.concat([df, pd.DataFrame({f"easter{d}":(df.date - easter_date == np.timedelta64(d, "D"))
                                      for d in list(range(-2, 11)) + list(range(40, 48)) + list(range(50, 59))})], axis=1)
    
    return df  


### Holiday generator

In [None]:
# Show the (date, name) pairs the `holidays` package knows for Norway in 2019.
norway_2019 = holidays.Norway(years=[2019], observed=True)
for ptr in norway_2019.items():
    print(ptr)

In [None]:
def feature_engineer(df):
    """Run the feature-engineering step on ``df`` and return the enriched frame.

    Currently a thin wrapper around ``get_basic_ts_features``.
    """
    return get_basic_ts_features(df)

from pathlib import Path


def load_data():
    """Read train.csv and test.csv from ``INPUT`` with ``ID`` as the index.

    Returns ``(df_train, df_test, column_y)`` where ``column_y`` is the first
    column name present in the training frame but absent from the test frame.

    Note: a later cell defines ``load_data`` again; that definition replaces
    this one once it has been executed.
    """
    root = Path(INPUT)
    train_frame = pd.read_csv(root / "train.csv", index_col=ID)
    test_frame = pd.read_csv(root / "test.csv", index_col=ID)
    # The target is whatever column train has that test lacks.
    target_name = train_frame.columns.difference(test_frame.columns)[0]
    return train_frame, test_frame, target_name

In [None]:
from pathlib import Path

def load_data():
    """Read the competition CSVs from ``INPUT`` with parsed dates.

    The training file is read without ``row_id`` (only the five listed
    columns), with the categorical columns as ``category`` and the target as
    ``float32``. The test file keeps ``ID`` as its index.

    Returns
    -------
    (df_train, df_test, column_y)
        ``column_y`` is the first column name present in the training frame
        but absent from the test frame.
    """
    # Read data
    data_dir = Path(INPUT)
    # `infer_datetime_format` is deprecated in recent pandas releases and is
    # not needed for `parse_dates` to work, so it is no longer passed.
    df_train = pd.read_csv(data_dir / "train.csv", parse_dates=[DATE],
                    usecols=['date', 'country', 'store', 'product', 'num_sold'],
                    dtype={'country': 'category','store': 'category','product': 'category', 'num_sold': 'float32',},)
    df_test = pd.read_csv(data_dir / "test.csv", index_col=ID, parse_dates=[DATE])
    column_y = df_train.columns.difference(df_test.columns)[0]  # column_y target_col label_col
    # Explicit conversion kept as a guard: it raises if a date could not be
    # parsed, whereas read_csv would leave the column as object dtype.
    df_train[DATE] = pd.to_datetime(df_train[DATE])
    df_test[DATE] = pd.to_datetime(df_test[DATE])
    
    return df_train, df_test, column_y


In [None]:
def process_data(df_train, df_test):
    """Apply ``feature_engineer`` to both frames when FEATURE_ENGINEERING is set.

    Returns the (possibly enriched) ``(df_train, df_test)`` pair; with the
    flag off, the inputs are returned untouched.
    """
    if not FEATURE_ENGINEERING:
        return df_train, df_test

    engineered_train = feature_engineer(df_train)
    engineered_test = feature_engineer(df_test)
    return engineered_train, engineered_test

# Load Data #

And now we can call the data loader and get the processed data splits:

In [None]:
%%time
# Read the raw train/test frames and the name of the target column.
train_df, test_df, column_y = load_data()

In [None]:
%%time
# Add the engineered features to both frames (no-op when FEATURE_ENGINEERING is False).
train_df, test_df = process_data(train_df, test_df)

# Data Pipeline

In [None]:
# Modelling copies whose date column is a daily Period; train_df / test_df
# themselves keep their datetime column.
train_data = train_df.assign(**{DATE: train_df[DATE].dt.to_period('D')})
test_data = test_df.assign(**{DATE: test_df[DATE].dt.to_period('D')})

## Pseudolabeling

In [None]:
# Load the pseudo-labels for the test period (indexed by row id), attach the
# test dates and keep a copy next to the notebook outputs.
df_pseudolabels = pd.read_csv(PSEUDO_DIR, index_col=ID)
df_pseudolabels[DATE] = pd.to_datetime(test_df[DATE])
df_pseudolabels.to_csv("pseudo_labels_v0.csv", index=True)

# Use the pseudo-labels as the target of the test rows and append those rows
# to the training data.
test_data[column_y] = df_pseudolabels[column_y].astype(np.float32)
train_data = pd.concat([train_data, test_data], axis=0)

# train_df holds datetime64 dates, while test_data holds daily Periods.
# Appending test_data directly would leave a mixed-type date column, so the
# pseudo-labelled rows are built from test_df, which still has datetime dates.
train_df = pd.concat([train_df, test_df.assign(**{column_y: test_data[column_y]})], axis=0)

In [None]:
# Date-indexed, chronologically sorted views used for label-based slicing.
X = train_data.set_index(DATE).sort_index()
X_test = test_data.set_index(DATE).sort_index()

## Removal of 2016 test

In [None]:
# Index the training rows by (date, country, store, product), then sort by that index.
train_data = train_data.set_index(['date', 'country', 'store', 'product'])
train_data = train_data.sort_index()

In [None]:
kaggle_sales_2015 = (train_data.groupby(['country', 'store', 'product', 'date']).mean()
                     .unstack(['country', 'store', 'product']).loc['2015'])

In [None]:
kaggle_sales_2016 = (train_data.groupby(['country', 'store', 'product', 'date']).mean()
                     .unstack(['country', 'store', 'product']).loc['2016'])

In [None]:
kaggle_sales_2017 = (train_data.groupby(['country', 'store', 'product', 'date']).mean()
                     .unstack(['country', 'store', 'product']).loc['2017'])

In [None]:
kaggle_sales_2018 = (train_data.groupby(['country', 'store', 'product', 'date']).mean()
                     .unstack(['country', 'store', 'product']).loc['2018'])

In [None]:
frames = [kaggle_sales_2015, kaggle_sales_2016, kaggle_sales_2017, kaggle_sales_2018]
kaggle_sales = pd.concat(frames)

In [None]:
kaggle_sales

In [None]:
gc.collect()

In [None]:
# Report the columns of X that contain missing values, with their counts.
missing_val = X.isna().sum()
has_missing = missing_val > 0
print(missing_val[has_missing])

In [None]:
# Display up to five randomly sampled rows for every distinct value of the target column.
train_data.groupby(column_y).apply(lambda s: s.sample(min(len(s), 5)))

In [None]:
train_data['month']

In [None]:
# One line per country/store/product series over 2015-2018.
fig_dims = (50, 30)
ax = kaggle_sales.num_sold.plot(figsize=fig_dims, title='Sales Trends')
_ = ax.set_ylabel("Numbers sold")

In [None]:
# Plot all num_sold_true and num_sold_pred (five years) for one country-store-product combination
def plot_five_years_combination(engineer, country='Norway', store='KaggleMart', product='Kaggle Hat'):
    """Plot model predictions against known sales for one series, 2015-2019.

    Builds a daily frame for the given country/store/product, runs it through
    ``engineer``, encodes the ``GROUP_INDEX`` columns with the fitted label
    encoders in ``le``, scales with ``preproc`` and predicts with ``model``
    (predictions are mapped back with ``expm1``). The rows of ``train_df`` for
    the same combination are drawn as red dots.

    Relies on the notebook globals ``GROUP_INDEX``, ``le``, ``model``,
    ``preproc``, ``features`` and ``train_df``.

    Returns the predicted ``num_sold`` series.
    """
    all_days = pd.date_range('2015-01-01', '2019-12-31', freq='D')
    demo_df = pd.DataFrame({'row_id': 0, 'date': all_days,
                            'country': country, 'store': store, 'product': product})
    # Keep 'date' both as index and as a column: the feature code reads the column.
    demo_df.set_index('date', inplace=True, drop=False)
    demo_df = engineer(demo_df)
    demo_df[GROUP_INDEX] = demo_df[GROUP_INDEX].apply(lambda col: le[col.name].transform(col))

    scaled_features = preproc.transform(demo_df[features])
    demo_df['num_sold'] = np.expm1(model.predict(scaled_features))

    same_series = ((train_df.country == country)
                   & (train_df.store == store)
                   & (train_df['product'] == product))
    train_subset = train_df[same_series]

    plt.figure(figsize=(24, 8))
    plt.plot(demo_df[DATE], demo_df.num_sold, label='prediction', alpha=0.5)
    plt.scatter(train_subset[DATE], train_subset.num_sold, label='true', alpha=0.5, color='red', s=2)
    plt.grid(True)
    plt.grid(which='minor', linestyle=':', linewidth='0.5', color='black')
    plt.legend()
    plt.title(f'{country} {store} {product} Predictions and true for five years')
    plt.show()

    return demo_df['num_sold']

In [None]:
def find_min_SMAPE(y_true, y_predict):
    """Grid-search the multiplicative factor on ``y_predict`` that minimises SMAPE.

    Factors from 0.97 up to (but excluding) 1.02 in steps of 0.0001 are tried.
    The minimum and the chosen factor are printed and the SMAPE curve is
    plotted with the minimum highlighted.

    Parameters
    ----------
    y_true : array-like
        Reference values.
    y_predict : numpy.ndarray
        Predictions; not modified (each factor is applied to a copy).

    Returns
    -------
    float
        The factor with the lowest SMAPE (the last one if several tie), or 1
        if no factor matches the minimum (e.g. the scores are NaN).
    """
    scores = []
    for WEIGHT in np.arange(0.97, 1.02, 0.0001):
        y_hat = y_predict.copy()
        y_hat *= WEIGHT
        # smape_loss already returns the mean, so no extra np.mean is needed.
        scores.append([WEIGHT, smape_loss(y_true, y_hat)])
    scores = np.array(scores)

    min_SMAPE = np.min(scores[:, 1])
    print(f'min SMAPE {min_SMAPE:.5f}')

    loss_correction = 1
    # Every factor attaining the minimum is reported; the last one is kept.
    for best_weight in scores[scores[:, 1] == min_SMAPE, 0]:
        loss_correction = best_weight
        print(f'loss_correction: {best_weight:.5f}')

    plt.figure(figsize=(5, 3))
    # Label both artists so that plt.legend() has entries to show.
    plt.plot(scores[:, 0], scores[:, 1], label='SMAPE')
    plt.scatter([loss_correction], [min_SMAPE], color='g', label='minimum')
    plt.ylabel(f'SMAPE')
    plt.xlabel(f'loss_correction: {loss_correction:.5f}')
    plt.legend()
    plt.title(f'min SMAPE:{min_SMAPE:.5f} scaling')
    plt.show()

    return loss_correction

In [None]:
def plot_true_vs_prediction(df_true, df_hat):
    """Scatter ``log1p`` of predictions (blue) and of reference values (red) by row position."""
    plt.figure(figsize=(20, 13))
    layers = (
        (df_hat, 'prediction', 'blue', 3),
        (df_true, 'Pseudo/true', 'red', 7),
    )
    for values, tag, colour, size in layers:
        plt.scatter(np.arange(len(values)), np.log1p(values), label=tag, alpha=0.5, color=colour, s=size)
    plt.legend()
    plt.title(f'Predictions VS Pseudo-label {column_y} (LOG)')
    plt.show()

In [None]:
def plot_residuals(y_residuals):
    """Scatter ``log1p`` of the given residuals against their row position.

    NOTE(review): ``log1p`` is undefined for values below -1, so such
    residuals would show up as NaN - confirm the residual range.
    """
    positions = np.arange(len(y_residuals))
    log_residuals = np.log1p(y_residuals)

    plt.figure(figsize=(13, 3))
    plt.scatter(positions, log_residuals, label='residuals', alpha=0.1, color='blue', s=5)
    plt.legend()
    plt.title(f'Linear Model residuals {column_y} (LOG)')
    plt.tight_layout()
    plt.show()

In [None]:
def plot_oof(y_true, y_predict):
    """Scatter predictions against true values with a dashed identity line."""
    plt.figure(figsize=(5, 5))
    plt.scatter(y_true, y_predict, s=1, color='r', alpha=0.5)

    # Diagonal spanning the current x-range: points on it are perfect predictions.
    x_low, x_high = plt.xlim()
    plt.plot([x_low, x_high], [x_low, x_high], '--', color='k')

    plt.gca().set_aspect('equal')
    plt.xlabel('y_true')
    plt.ylabel('y_pred')
    plt.title('OOF Predictions')
    plt.show()

In [None]:
def evaluate_SMAPE(y_va, y_va_pred):
    """Print SMAPE before and after the best multiplicative correction.

    Parameters
    ----------
    y_va : array-like
        Reference values.
    y_va_pred : numpy.ndarray
        Predictions. NOTE: scaled IN PLACE by the correction factor, so the
        caller's array holds the corrected predictions afterwards
        (``model_fit_eval`` plots that array after this call).

    Returns
    -------
    float
        The correction factor found by ``find_min_SMAPE``.
    """
    # Evaluate once; the previous duplicate computation was never used.
    smape_before_correction = smape_loss(y_va, y_va_pred)

    loss_correction = find_min_SMAPE(y_va, y_va_pred)
    y_va_pred *= loss_correction
    print(f"SMAPE (before correction: {smape_before_correction:.5f})")
    print(f'Min SMAPE: {smape_loss(y_va, y_va_pred)}')

    return loss_correction

## Data preprocessing X_2 X_test y

In [None]:
# Categorical columns that are label-encoded before being fed to the models
GROUP_INDEX = ['country', 'store', 'product', 'month', 'season']

# Target series
y = X.loc[:, column_y]

# X_1: deterministic time features (constant, linear trend, seasonal and annual Fourier terms)
fourier = CalendarFourier(freq="A", order=10)  # 10 sin/cos pairs for "A"nnual seasonality

dp = DeterministicProcess(index=X.index,constant=True,order=1,seasonal=True,additional_terms=[fourier],drop=True,)

X_1 = dp.in_sample()  # create features for the dates in X.index

# X_2: every column of X except the target
X_2 = X.drop(column_y, axis=1)

# Label-encode the categorical columns; one encoder per column is kept in `le`
X_2[GROUP_INDEX] = X_2[GROUP_INDEX].apply(lambda x: le[x.name].fit_transform(x))

# Reuse the fitted encoders on the test frame
X_test[GROUP_INDEX] = X_test[GROUP_INDEX].apply(lambda x: le[x.name].transform(x))

In [None]:
# Feature columns, in the order used for scaling, fitting and prediction.
features = X_2.columns

In [None]:
# Date windows. With pseudo-labels, training runs through 2019 and the
# "validation" window (2015-2018) lies inside the training window; otherwise
# 2018 is validated, and the training window also covers 2018 when PRODUCTION is set.
if PSEUDO_LABEL:
    TRAIN_END_DATE, VALID_START_DATE, VALID_END_DATE = "2019-12-31", "2015-01-01", "2018-12-31"
else:
    TRAIN_END_DATE = "2018-12-31" if PRODUCTION else "2017-12-31"
    VALID_START_DATE, VALID_END_DATE = "2018-01-01", "2018-12-31"

y_train = y[:TRAIN_END_DATE]
y_valid = y[VALID_START_DATE:VALID_END_DATE]
X1_train = X_1[:TRAIN_END_DATE]
X1_valid = X_1[VALID_START_DATE:VALID_END_DATE]
X2_train = X_2.loc[:TRAIN_END_DATE]
X2_valid = X_2.loc[VALID_START_DATE:VALID_END_DATE]

In [None]:
# You'll add fit and predict methods to this minimal class
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted

class BoostedHybrid(BaseEstimator, RegressorMixin):
    """Two-stage regressor: ``model_2`` is fitted on the residuals of ``model_1``.

    Both stages receive the same feature matrix; the final prediction is the
    sum of the two stage predictions.

    Parameters
    ----------
    model_1 : estimator
        First-stage regressor, fitted on ``(X, y)``.
    model_2 : estimator
        Second-stage regressor, fitted on ``(X, y - model_1.predict(X))``.
    scaler : object
        Stored as an attribute only; ``fit`` and ``predict`` do not use it,
        so callers must pass already-scaled data.

    After ``fit`` the training targets, stage fits and residuals are kept on
    the instance (``y``, ``y_fit``, ``y_resid``, ``y_fit2``, ``y_resid2``)
    for inspection and plotting.
    """

    def __init__(self, model_1, model_2, scaler):
        self.model_1 = model_1
        self.model_2 = model_2
        self.scaler = scaler
        self.y_columns = None  # store column names from fit method

    def fit(self, X, y):
        """Fit ``model_1`` on ``y`` and ``model_2`` on its residuals; return ``self``."""
        X, y = check_X_y(X, y, accept_sparse=True)

        # Stage 1: fit and predict on the training data.
        self.model_1.fit(X, y)
        # y is 1-D after check_X_y; flatten the predictions so that an (n, 1)
        # output cannot silently broadcast the residuals to (n, n).
        y_fit = np.ravel(self.model_1.predict(X))
        y_resid = y - y_fit

        # Stage 2: fit on what stage 1 could not explain.
        self.model_2.fit(X, y_resid)
        y_fit2 = np.ravel(self.model_2.predict(X))
        # What is left after both stages.
        y_resid2 = y_resid - y_fit2

        # Keep the intermediate results for later inspection.
        self.y = y
        self.y_fit = y_fit
        self.y_resid = y_resid
        self.y_fit2 = y_fit2
        self.y_resid2 = y_resid2

        self.is_fitted_ = True

        return self

    def predict(self, X):
        """Return ``model_1.predict(X) + model_2.predict(X)`` as a 1-D array."""
        X = check_array(X, accept_sparse=True)
        check_is_fitted(self, 'is_fitted_')

        stage_1 = np.ravel(self.model_1.predict(X))
        stage_2 = np.ravel(self.model_2.predict(X))
        # Out-of-place sum: no in-place casting or shape constraints on stage_1.
        return stage_1 + stage_2



# Training

## Using StandardScaler

In [None]:
# Scaler shared by training, validation and inference; fitted on the training features further below.
preproc = StandardScaler()

In [None]:
def model_fit_eval(hybrid_model, X_train, y_train, X_valid, y_valid, scaler, loss_correction):
    """Fit ``hybrid_model`` and evaluate it on validation data and on the pseudo-labelled test set.

    Parameters
    ----------
    hybrid_model : estimator
        Model exposing ``fit`` / ``predict``; trained on log1p-transformed targets.
    X_train, y_train : array-like
        Scaled training features and log1p targets.
    X_valid, y_valid : array-like
        Scaled validation features and log1p targets.
    scaler : fitted transformer
        Applied to ``X_test[features]`` before test inference.
    loss_correction : float
        Ignored: the value is always recomputed. Kept for interface compatibility.

    Returns
    -------
    (hybrid_model, test_pred_list, loss_correction)
        The fitted model, a one-element list with the uncorrected test
        predictions (original scale), and the SMAPE-minimising factor found
        against the pseudo-labels.

    Relies on the notebook globals ``df_pseudolabels``, ``column_y``,
    ``X_test`` and ``features``.
    """
    hybrid_model.fit(X_train, y_train)

    ###### Validation: back-transform targets and predictions to the original scale
    y_va = np.expm1(y_valid.copy())
    # A single predict call; the earlier extra call was overwritten unused.
    y_va_pred = np.expm1(hybrid_model.predict(X_valid))
    # Prints the scores and rescales y_va_pred in place, so the plots below
    # show the corrected predictions. The returned factor is not needed here.
    evaluate_SMAPE(y_va, y_va_pred)

    ###### Visualize and evaluate
    plot_oof(y_va, y_va_pred)
    plot_true_vs_prediction(y_va, y_va_pred)

    ###### Validate against the 2019 pseudo-labels #######
    # NOTE(review): assumes X_test rows are in the same order as df_pseudolabels - confirm.
    y_va = df_pseudolabels[column_y].values.reshape(-1, 1)
    y_va_pred = np.expm1(hybrid_model.predict(scaler.transform(X_test[features])))
    y_va_pred_col = y_va_pred.reshape(-1, 1)

    smape_before_correction = smape_loss(y_va, y_va_pred_col)
    print(f'***********Test Data*****************')
    loss_correction = find_min_SMAPE(y_va, y_va_pred_col)

    # Uncorrected test predictions, returned for later averaging/blending.
    test_pred_list = [y_va_pred]

    print(f'SMAPE (before correction: {smape_before_correction:.5f})')
    print(f'Min SMAPE: {smape_loss(y_va, y_va_pred_col * loss_correction)}')

    return hybrid_model, test_pred_list, loss_correction

In [None]:
# XGBoost hyper-parameters. The numeric values match the Optuna "Trial 21" log
# quoted near the end of this notebook; the commented-out entries are search
# dimensions that were not tuned.
xgb_params = dict(booster= 'gbtree',
                  tree_method = 'exact',
                  objective='reg:pseudohubererror',
                  n_jobs = -1,
                  max_depth=5,
                  learning_rate=0.5406089095129346,
                  n_estimators=5918,
                  min_child_weight=3,
                  colsample_bytree=0.2595406744619732,
                  # subsample=trial.suggest_float("subsample", 0.3, .8),
                  reg_alpha=97.39174536138904,
                  reg_lambda=0.5348869112742457,
                  # colsample_bylevel=trial.suggest_float("colsample_bylevel",.3,.5),
                  gamma=0.1024158842570319,
                  # max_delta_step=trial.suggest_float("max_delta_step",0,1),
                  num_parallel_tree=1,
                  random_state=35,
                 )

# LightGBM hyper-parameters. The tuned values match the Optuna "Trial 73" log
# quoted further down in this notebook.
# NOTE(review): 'reg_alpha'/'lambda_l1' and 'reg_lambda'/'lambda_l2' appear to
# be aliases of the same settings. The LightGBM warnings quoted in that log
# ("lambda_l1 is set=..., reg_alpha=... will be ignored") suggest the lambda_*
# values are the ones used - confirm and drop the ignored keys.
# NOTE(review): 'seed' and 'random_state' are presumably aliases as well and
# carry different values (21 vs RANDOM_STATE) - confirm which one takes effect.
lgbm_params = {
              'learning_rate':0.025292895772398984,
              "objective": "regression",
              "metric": "rmse",
              'boosting_type': "gbdt",
              'verbosity': -1,
              'n_jobs': -1, 
              'seed': 21,
              'reg_alpha': 0.0029751624135773416,
              'reg_lambda': 0.650014120724397,
              'lambda_l1': 1.1096023419303558, 
              'lambda_l2': 1.996527963987735, 
              'num_leaves': 109, 
               # 'feature_fraction': 0.6259927292757151, 
               # 'bagging_fraction': 0.9782210574588895, 
               # 'bagging_freq': 1, 
              'n_estimators': 2606, 
              'max_depth': 1, 
              'max_bin': 244, 
              'min_data_in_leaf': 366,
              'random_state' : RANDOM_STATE,
              }


In [None]:
# xgb_params = dict(booster= 'gbtree',
#                   tree_method = 'exact',
#                   objective='reg:pseudohubererror',
#                   n_jobs = -1,
#                   max_depth=trial.suggest_int("max_depth",2,5),
#                   learning_rate=trial.suggest_float('learning_rate',.001,1),
#                   n_estimators=trial.suggest_int('n_estimators',1000,10000),
#                   min_child_weight=trial.suggest_int('min_child_weight',1,3),
#                   colsample_bytree=trial.suggest_float("colsample_bytree", 0.2, 1.0),
#                   # subsample=trial.suggest_float("subsample", 0.3, .8),
#                   reg_alpha=trial.suggest_float("reg_alpha", 1e-4, 1e2, log=True),
#                   reg_lambda=trial.suggest_float("reg_lambda", 1e-4, 1e2, log=True),
#                   # colsample_bylevel=trial.suggest_float("colsample_bylevel",.3,.5),
#                   gamma=trial.suggest_float("gamma", 0,1),
#                   # max_delta_step=trial.suggest_float("max_delta_step",0,1),
#                   num_parallel_tree=1,
#                   random_state=35,
#                   )

In [None]:
# import optuna
# import lightgbm as lgb
# import sklearn.datasets
# import sklearn.metrics

# LOSS_CORRECTION = 1
# estimator_stack = []

# def objective(trial):
#     LOSS_CORRECTION = 1
#     xgb_params = dict(booster= 'gbtree',
#                       tree_method = 'exact',
#                       objective='reg:pseudohubererror',
#                       n_jobs = -1,
#                       max_depth=trial.suggest_int("max_depth",2,5),
#                       learning_rate=trial.suggest_float('learning_rate',.001,1),
#                       n_estimators=trial.suggest_int('n_estimators',1000,10000),
#                       min_child_weight=trial.suggest_int('min_child_weight',1,3),
#                       colsample_bytree=trial.suggest_float("colsample_bytree", 0.2, 1.0),
#                       # subsample=trial.suggest_float("subsample", 0.3, .8),
#                       reg_alpha=trial.suggest_float("reg_alpha", 1e-4, 1e2, log=True),
#                       reg_lambda=trial.suggest_float("reg_lambda", 1e-4, 1e2, log=True),
#                       # colsample_bylevel=trial.suggest_float("colsample_bylevel",.3,.5),
#                       gamma=trial.suggest_float("gamma", 0,1),
#                       # max_delta_step=trial.suggest_float("max_delta_step",0,1),
#                       num_parallel_tree=1,
#                       random_state=35,
#                      )
    
#     X2 = preproc.fit_transform(X2_train[features])
#     model = BoostedHybrid(model_1 = Ridge(alpha=.85),model_2 = XGBRegressor(**xgb_params),scaler = preproc)
#     model, test_pred_list, LOSS_CORRECTION = model_fit_eval(model, X2, np.log1p(y_train),preproc.transform(X2_valid[features]),
#                                                             np.log1p(y_valid),preproc, LOSS_CORRECTION)
    
#     return LOSS_CORRECTION  

# study = optuna.create_study(direction="minimize")
# study.optimize(objective, n_trials=500)
# xgb_params = study.best_params
# print('best params',xgb_params)


# def objective(trial):
#     LOSS_CORRECTION=1
#     lgbm_params = {
#                   'learning_rate': trial.suggest_float('learning_rate', .001, 1.0),
#                   "objective": "regression",
#                   "metric": "rmse",
#                   'boosting_type': "gbdt",
#                   'verbosity': -1,
#                   'n_jobs': -1, 
#                   'seed': 21,
#                   'lambda_l1': trial.suggest_float('lambda_l1',.1,2.0),
#                   'lambda_l2': trial.suggest_float('lambda_l2', .1,2.0),
#                   'num_leaves': trial.suggest_int('num_leaves', 100, 500),
#                    #'feature_fraction': trial.suggest_float('feature_fraction',.01,1.0),
#                    # 'bagging_fraction': trial.suggest_float('bagging_fraction',.01,1.0),
#                    # 'bagging_freq': trial.suggest_int('bagging_freq',1,2), 
#                   'n_estimators': trial.suggest_int('n_estimators',100,5000), 
#                   'max_depth': trial.suggest_int('max_depth', 1,7), 
#                   'max_bin': trial.suggest_int('max_bin', 10,500),
#                   # 'min_data_in_leaf': trial.suggest_int('min_data_in_leaf',300,5000),
#                   }
    
    
#     X2 = preproc.fit_transform(X2_train[features])
#     model = BoostedHybrid(model_1 = Ridge(alpha=.85),model_2 = lgb.LGBMRegressor(**lgbm_params),scaler = preproc)
#     model, test_pred_list, LOSS_CORRECTION = model_fit_eval(model, X2, np.log1p(y_train),preproc.transform(X2_valid[features]),
#                                                             np.log1p(y_valid),preproc, LOSS_CORRECTION)
    
#     return LOSS_CORRECTION  
    
# study = optuna.create_study(direction="minimize")
# study.optimize(objective, n_trials=500)
# lgbm_params = study.best_params
# print('best params',lgbm_params)


###########################################################################################
# ***********Test Data*****************
# min SMAPE 0.57772
# loss_correction: 0.99910
# SMAPE (before correction: 0.58343)
# Min SMAPE: 0.5777192140317523
# [LightGBM] [Warning] lambda_l2 is set=1.996527963987735, reg_lambda=0.0 will be ignored. Current value: lambda_l2=1.996527963987735
# [LightGBM] [Warning] lambda_l1 is set=1.1096023419303558, reg_alpha=0.0 will be ignored. Current value: lambda_l1=1.1096023419303558
# min SMAPE 4.19017
# loss_correction: 1.00080
# SMAPE (before correction: 4.19074)
# Min SMAPE: 4.190172283130202
# [I 2022-01-22 22:00:03,571] Trial 73 finished with value: 0.9984999999999968 and parameters: {'learning_rate': 0.025292895772398984, 'lambda_l1': 1.1096023419303558, 'lambda_l2': 1.996527963987735, 'num_leaves': 109, 'n_estimators': 2606, 'max_depth': 1, 'max_bin': 244}. Best is trial 73 with value: 0.9984999999999968.
# ***********Test Data*****************
# min SMAPE 0.63428
# loss_correction: 0.99850
# SMAPE (before correction: 0.64643)
# Min SMAPE: 0.6342847324685896
# [LightGBM] [Warning] lambda_l2 is set=1.9365125171332647, reg_lambda=0.0 will be ignored. Current value: lambda_l2=1.9365125171332647
# [LightGBM] [Warning] lambda_l1 is set=1.1349580383858455, reg_alpha=0.0 will be ignored. Current value: lambda_l1=1.1349580383858455
# min SMAPE 4.15496
# loss_correction: 1.00090
# SMAPE (before correction: 4.15572)
# Min SMAPE: 4.154962358508988
#################################################

# param1 = {'loss_function': 'MultiRMSE','eval_metric': 'MultiRMSE','n_estimators': 1000,'od_type' : 'Iter',
#             'od_wait' : 20,'random_state': RANDOM_STATE,'verbose': VERBOSE}

# # Try different combinations of the algorithms above KNeighborsRegressor
# models_1 = [Earth(verbose=VERBOSE), Ridge(alpha=.85), HuberRegressor(epsilon=1.20, max_iter=500),
#             MLPRegressor(  hidden_layer_sizes=(256, 128),learning_rate_init=0.01,early_stopping=True,
#                             random_state=RANDOM_STATE ),]

# models_2 = [XGBRegressor(objective='reg:pseudohubererror', tree_method='hist', n_estimators=1000),
#             lgb.LGBMRegressor(objective='regression', n_estimators=1000, random_state=RANDOM_STATE),
#             CatBoostRegressor(**param1),]

# for model_1 in models_1:
#     for model_2 in models_2:
#         model1_name = type(model_1).__name__
#         model2_name = type(model_2).__name__
#         hybrid_model = BoostedHybrid(model_1 = model_1,model_2 = model_2,scaler = preproc)
#         print(f'******************Stacking {model1_name:>15} with {model2_name:<18}*************************')
#         estimator_stack.append((f'model_{model1_name}_{model2_name}', hybrid_model))
        
# 
# model = StackingRegressor(estimators=estimator_stack, final_estimator=RidgeCV(alphas=[.7,.8,.85,.9,1,1.1,1.2]),
#                           n_jobs=-1, verbose=VERBOSE)

# model, test_pred_list, LOSS_CORRECTION = model_fit_eval(model, X2, np.log1p(y_train),preproc.transform(X2_valid[features]),
#                                                         np.log1p(y_valid),preproc, LOSS_CORRECTION)

# model = XGBRegressor(**xgb_params)



In [None]:
%%time
LOSS_CORRECTION = 1
estimator_stack = []

param1 = {'loss_function': 'MultiRMSE','eval_metric': 'MultiRMSE','n_estimators': 1000,'od_type' : 'Iter',
          'od_wait' : 20,'random_state': RANDOM_STATE,'verbose': VERBOSE}

# Try different combinations of the algorithms above KNeighborsRegressor
models_1 = [Earth(verbose=VERBOSE), Ridge(alpha=.85), HuberRegressor(epsilon=1.20, max_iter=500),
            MLPRegressor(hidden_layer_sizes=(256, 128),learning_rate_init=0.01,early_stopping=True,
                         random_state=RANDOM_STATE ),]

models_2 = [XGBRegressor(**xgb_params),lgb.LGBMRegressor(**lgbm_params),CatBoostRegressor(**param1),]

for model_1 in models_1:
    for model_2 in models_2:
        model1_name = type(model_1).__name__
        model2_name = type(model_2).__name__
        hybrid_model = BoostedHybrid(model_1 = model_1,model_2 = model_2,scaler = preproc)
        print(f'******************Stacking {model1_name:>15} with {model2_name:<18}*************************')
        estimator_stack.append((f'model_{model1_name}_{model2_name}', hybrid_model))
        
X2 = preproc.fit_transform(X2_train[features])

model = StackingRegressor(estimators=estimator_stack,final_estimator=RidgeCV(), n_jobs=-1, verbose=VERBOSE)

model,test_pred_list,LOSS_CORRECTION=model_fit_eval(model,X2,np.log1p(y_train),preproc.transform(X2_valid[features]),
                                                    np.log1p(y_valid),preproc,LOSS_CORRECTION)

# Inference validation

In [None]:
# Plot prediction vs. truth for every country / product / store combination.
all_countries = np.unique(train_df['country'])
all_products = np.unique(train_df['product'])
all_stores = np.unique(train_df['store'])

for country in all_countries:
    for product in all_products:
        for store in all_stores:
            y_fit = plot_five_years_combination(feature_engineer, country=country, product=product, store=store)

# Inference year 2019 test data

In [None]:
# Element-wise average of the test predictions collected during training.
y_pred = sum(test_pred_list) / len(test_pred_list) #model.predict(X_test[features])

In [None]:
%%time
LOSS_CORRECTION = 1

###### Preprocess the validation data
y_va = df_pseudolabels[column_y].values.reshape(-1, 1)

# Inference for validation
y_va_pred = y_pred.copy().reshape(-1, 1) #model.predict(X_test[features])

# Evaluation: Execution time and SMAPE
smape_before_correction = np.mean(smape_loss(y_va, y_va_pred))
smape = np.mean(smape_loss(y_va, y_va_pred))
LOSS_CORRECTION = find_min_SMAPE(y_va, y_va_pred)
y_va_pred *= LOSS_CORRECTION

print(f" SMAPE: {smape:.5f} (before correction: {smape_before_correction:.5f})")
print(np.mean(smape_loss(y_va, y_va_pred)))

# [I 2022-01-21 19:08:22,256] Trial 21 finished with value: 0.996799999999997 and parameters: {'max_depth': 5, 
# 'learning_rate': 0.5406089095129346, 'n_estimators': 5918, 'min_child_weight': 3, 'colsample_bytree': 
# 0.2595406744619732, 'reg_alpha': 97.39174536138904, 'reg_lambda': 0.5348869112742457, 'gamma': 
# 0.1024158842570319}. Best is trial 12 with value: 0.996799999999997.
# ***********Test Data*****************
# min SMAPE 0.78355
# loss_correction: 0.99680
# SMAPE (before correction: 0.82951)
# Min SMAPE: 0.7835522368396695
# min SMAPE 4.13588
# loss_correction: 1.00080
# SMAPE (before correction: 4.13639)
# Min SMAPE: 4.1358802070879515

In [None]:
# Diagnostics on the corrected test predictions vs. the pseudo-labels ...
plot_oof(y_va, y_va_pred)
plot_true_vs_prediction(y_va, y_va_pred)
# ... and, for the first fitted hybrid in the stack, the residuals left after
# its first stage (y_resid) and after both stages (y_resid2).
plot_residuals(model.estimators_[0].y_resid)
plot_residuals(model.estimators_[0].y_resid2)

# Submission
Once you're satisfied with everything, it's time to create your final predictions! This cell will:

- use the best trained model to make predictions from the test set
- save the predictions to a CSV file


In [None]:
from math import ceil, floor, sqrt
# from https://www.kaggle.com/fergusfindley/ensembling-and-rounding-techniques-comparison
def geometric_round(arr):
    """Round values to a neighbouring integer using a geometric-mean cutoff.

    Each value x is rounded down when x < sqrt(floor(x) * ceil(x)) and
    rounded up otherwise. Exact integers come back unchanged.

    Args:
        arr: Array-like of numbers (e.g. a NumPy array or pandas Series).

    Returns:
        The rounded values as produced by ``np.where`` (floating point).
    """
    lower = np.floor(arr)
    upper = np.ceil(arr)
    # Geometric mean of the two surrounding integers is the rounding cutoff.
    cutoff = np.sqrt(lower * upper)
    return np.where(arr < cutoff, lower, upper)

In [None]:
# Load the sample submission file; later cells take the ID column from it.
SAMPLE_SUBMISSION_CSV = '../input/tabular-playground-series-jan-2022/sample_submission.csv'
sub = pd.read_csv(SAMPLE_SUBMISSION_CSV)

In [None]:
# Inference for test
test_prediction_list = []
test_prediction_list.append(y_pred) # * LOSS_CORRECTION)
df_pseudolabels1 = pd.DataFrame()
df_pseudolabels2 = pd.DataFrame()
df_pseudolabels3 = pd.DataFrame()

if BLEND:
    
    test_prediction_list.append(df_pseudolabels[column_y].values) #blender 1
    
    df_pseudolabels1 = pd.read_csv('../input/tell-me-the-magic-number/submission.csv', index_col=ID)    
    
    test_prediction_list.append(df_pseudolabels1[column_y].values) #blender 2
    
    df_pseudolabels2 = pd.read_csv('../input/tps-01-2022/submission.csv', index_col=ID)
    
    test_prediction_list.append(df_pseudolabels2[column_y].values)
    
    df_pseudolabels3 = pd.read_csv('../input/tpsjan22-03-linear-model/submission_linear_model.csv', index_col=ID)
    
    test_prediction_list.append(df_pseudolabels3[column_y].values)

test_prediction_list = np.median(test_prediction_list, axis=0) 


if len(test_prediction_list) > 0:
    # Create the submission file
    submission = pd.DataFrame(data=np.zeros((sub.shape[0],2)),index = sub.index.tolist(),columns=[ID,column_y])
    submission[ID] = sub[ID]
    submission[column_y] = test_prediction_list
    
    #https://www.kaggle.com/c/tabular-playground-series-jan-2022/discussion/299162
    submission[column_y] = geometric_round(submission[column_y]).astype(int) 
    
    submission.to_csv('submission.csv', index=False)

    # Plot the distribution of the test predictions
    plt.figure(figsize=(16,3))
    plt.hist(train_df[column_y], bins=np.linspace(0, 3000, 201),density=True, label='Training')
    plt.hist(submission[column_y], bins=np.linspace(0, 3000, 201),density=True, rwidth=0.5, label='Test predictions')
    plt.xlabel(column_y)
    plt.ylabel('Frequency')
    plt.legend()
    plt.show()

In [None]:
# Show the first and the last 30 rows of the submission as a sanity check.
for preview in (submission.head(30), submission.tail(30)):
    display(preview)