## Introduction


The data preprocessing and most of the code are copied from https://www.kaggle.com/ambrosm/tpsjan22-03-linear-model/notebook

This notebook
* Explores and compares several models from the sklearn library
* Creates an ensemble that has no overfitting risk, since the test data is only used for predictions

Updates
* V2: the models' hyperparameters are tuned to get the best validation score

In [None]:
import pandas as pd
import numpy as np
import pickle
import itertools
import gc
import math
import matplotlib.pyplot as plt
import dateutil.easter as easter
from matplotlib.ticker import MaxNLocator, FormatStrFormatter, PercentFormatter
from datetime import datetime, date, timedelta
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupKFold, train_test_split
from sklearn.linear_model import LinearRegression, HuberRegressor, Ridge, Lasso, ElasticNet, BayesianRidge, PassiveAggressiveRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
np.random.seed(42)

## Data Preparation

In [None]:
# Competition train and test tables
original_train_df = pd.read_csv('../input/tabular-playground-series-jan-2022/train.csv')
original_test_df = pd.read_csv('../input/tabular-playground-series-jan-2022/test.csv')

# External GDP table, indexed by year so that it can be read as gdp_df.loc[year, column]
gdp_df = pd.read_csv(
    '../input/gdp-20152019-finland-norway-and-sweden/GDP_data_2015_to_2019_Finland_Norway_Sweden.csv'
).set_index('year')

# read_csv leaves the 'date' column as strings; convert it to datetimes in place
for df in (original_train_df, original_test_df):
    df['date'] = pd.to_datetime(df['date'])
original_train_df.head(2)

In [None]:
# Feature engineering
def engineer(df):
    """Build the base feature table for ``df`` and return it as a new dataframe.

    Columns, in order: log GDP of the row's country and year, Friday and
    weekend flags, one-hot flags for country / store / product (the last
    category of each is left out), then for k = 1, 2 the yearly sine and
    cosine terms together with their products with the mug and hat flags.

    ``df`` must provide ``date`` (datetime), ``country``, ``store`` and
    ``product``; GDP values are looked up in the module-level ``gdp_df``.
    """
    weekday = df.date.dt.weekday

    # GDP lookup: row label = year of the date, column label = 'GDP_<country>'
    gdp = df.apply(lambda row: gdp_df.loc[row.date.year, 'GDP_' + row.country], axis=1)

    new_df = pd.DataFrame({
        'gdp': np.log(gdp),
        'wd4': weekday == 4,    # Friday
        'wd56': weekday >= 5,   # Saturday and Sunday
    })

    # One-hot encoding (no need to encode the last categories)
    new_df['Finland'] = df.country == 'Finland'
    new_df['Norway'] = df.country == 'Norway'
    new_df['KaggleRama'] = df.store == 'KaggleRama'
    new_df['Kaggle Mug'] = df['product'] == 'Kaggle Mug'
    new_df['Kaggle Hat'] = df['product'] == 'Kaggle Hat'

    # Seasonal variations (Fourier series). Multiplying by the product flags
    # gives the mug and the hat their own seasonal terms.
    dayofyear = df.date.dt.dayofyear
    for k in (1, 2):
        angle = dayofyear / 365 * 2 * math.pi * k
        new_df[f'sin{k}'] = np.sin(angle)
        new_df[f'cos{k}'] = np.cos(angle)
        for prefix, flag in (('mug', 'Kaggle Mug'), ('hat', 'Kaggle Hat')):
            for wave in ('sin', 'cos'):
                new_df[f'{prefix}_{wave}{k}'] = new_df[f'{wave}{k}'] * new_df[flag]

    return new_df

# Build the base feature tables. The training frame additionally keeps the
# date and the float32 target, so `features` is taken from the test frame,
# which holds feature columns only.
train_df = engineer(original_train_df)
train_df['date'] = original_train_df.date
train_df['num_sold'] = original_train_df.num_sold.astype(np.float32)
test_df = engineer(original_test_df)

features = test_df.columns

# Cast the feature columns (boolean flags included) to float32
for df in [train_df, test_df]:
    df[features] = df[features].astype(np.float32)
print(list(features))

In [None]:
# Feature engineering for holidays
def engineer_more(df):
    """Return a new float32 dataframe: the ``engineer(df)`` features plus holiday indicators.

    Every added column is a flag (0/1 after the final cast) for one calendar
    day, or for one day offset from a moving holiday; several groups are
    restricted to a single country. The order in which the groups are
    concatenated determines the column order of the result.
    """
    new_df = engineer(df)

    # End of year
    # dec24..dec31 apply to every country; n-dec* and jan* are Norway only,
    # f-jan* is Finland only and s-jan* is Sweden only.
    new_df = pd.concat([new_df,
                        pd.DataFrame({f"dec{d}":
                                      (df.date.dt.month == 12) & (df.date.dt.day == d)
                                      for d in range(24, 32)}),
                        pd.DataFrame({f"n-dec{d}":
                                      (df.date.dt.month == 12) & (df.date.dt.day == d) & (df.country == 'Norway')
                                      for d in range(24, 32)}),
                        pd.DataFrame({f"f-jan{d}":
                                      (df.date.dt.month == 1) & (df.date.dt.day == d) & (df.country == 'Finland')
                                      for d in range(1, 14)}),
                        pd.DataFrame({f"jan{d}":
                                      (df.date.dt.month == 1) & (df.date.dt.day == d) & (df.country == 'Norway')
                                      for d in range(1, 10)}),
                        pd.DataFrame({f"s-jan{d}":
                                      (df.date.dt.month == 1) & (df.date.dt.day == d) & (df.country == 'Sweden')
                                      for d in range(1, 15)})],
                       axis=1)
    
    # May
    # Both groups use the "may{d}" prefix: days 1-9 for every country and
    # days 19-25 for Norway only. The day ranges do not overlap, so the
    # column names stay unique.
    new_df = pd.concat([new_df,
                        pd.DataFrame({f"may{d}":
                                      (df.date.dt.month == 5) & (df.date.dt.day == d) 
                                      for d in list(range(1, 10))}), #  + list(range(17, 25))
                        pd.DataFrame({f"may{d}":
                                      (df.date.dt.month == 5) & (df.date.dt.day == d) & (df.country == 'Norway')
                                      for d in list(range(19, 26))})],
                       axis=1)
    
    # June and July
    # Only the Sweden group (June 8-13) is active; the Norway groups below are disabled.
    new_df = pd.concat([new_df,
                        pd.DataFrame({f"june{d}":
                                      (df.date.dt.month == 6) & (df.date.dt.day == d) & (df.country == 'Sweden')
                                      for d in list(range(8, 14))}),
                        #pd.DataFrame({f"june{d}":
                        #              (df.date.dt.month == 6) & (df.date.dt.day == d) & (df.country == 'Norway')
                        #              for d in list(range(22, 31))}),
                        #pd.DataFrame({f"july{d}":
                        #              (df.date.dt.month == 7) & (df.date.dt.day == d) & (df.country == 'Norway')
                        #              for d in list(range(1, 3))})],
                       ],
                       axis=1)
    
    # Last Wednesday of June
    # The reference dates are hard-coded per year; years outside 2015-2019 are
    # not in the mapping. Flags cover offsets of -4..5 days from the reference
    # date, for every country except Norway.
    wed_june_date = df.date.dt.year.map({2015: pd.Timestamp(('2015-06-24')),
                                         2016: pd.Timestamp(('2016-06-29')),
                                         2017: pd.Timestamp(('2017-06-28')),
                                         2018: pd.Timestamp(('2018-06-27')),
                                         2019: pd.Timestamp(('2019-06-26'))})
    new_df = pd.concat([new_df,
                        pd.DataFrame({f"wed_june{d}": 
                                      (df.date - wed_june_date == np.timedelta64(d, "D")) & (df.country != 'Norway')
                                      for d in list(range(-4, 6))})],
                       axis=1)
    
    # First Sunday of November
    # Same scheme as above: hard-coded dates for 2015-2019, offsets of 0..8 days,
    # every country except Norway.
    sun_nov_date = df.date.dt.year.map({2015: pd.Timestamp(('2015-11-1')),
                                         2016: pd.Timestamp(('2016-11-6')),
                                         2017: pd.Timestamp(('2017-11-5')),
                                         2018: pd.Timestamp(('2018-11-4')),
                                         2019: pd.Timestamp(('2019-11-3'))})
    new_df = pd.concat([new_df,
                        pd.DataFrame({f"sun_nov{d}": 
                                      (df.date - sun_nov_date == np.timedelta64(d, "D")) & (df.country != 'Norway')
                                      for d in list(range(0, 9))})],
                       axis=1)
    
    # First half of December (Independence Day of Finland, 6th of December)
    # Finland only, days 6-13; these names do not clash with dec24..dec31 above.
    new_df = pd.concat([new_df,
                        pd.DataFrame({f"dec{d}":
                                      (df.date.dt.month == 12) & (df.date.dt.day == d) & (df.country == 'Finland')
                                      for d in list(range(6, 14))})],
                       axis=1)

    # Easter
    # easter_date is the date dateutil's easter() returns for the row's year;
    # flags cover offsets of -2..10, 40..47 and 50..58 days from it, for all countries.
    easter_date = df.date.apply(lambda date: pd.Timestamp(easter.easter(date.year)))
    new_df = pd.concat([new_df,
                        pd.DataFrame({f"easter{d}": 
                                      (df.date - easter_date == np.timedelta64(d, "D"))
                                      for d in list(range(-2, 11)) + list(range(40, 48)) + list(range(50, 59))})],
                       axis=1)
    
    # Cast everything (boolean flags included) to float32
    return new_df.astype(np.float32)

# Rebuild the train/test tables with the holiday features; this replaces the
# train_df / test_df / features that were built from engineer() alone.
train_df = engineer_more(original_train_df)
train_df['date'] = original_train_df.date
train_df['num_sold'] = original_train_df.num_sold.astype(np.float32)
test_df = engineer_more(original_test_df)

features = list(test_df.columns)
print(list(features))

# prepare single scaler
# NOTE: the scaler is fitted on every training row here, i.e. before the
# validation rows are split off in the next cell; all models reuse it.
scaler = StandardScaler()
X = scaler.fit_transform(train_df[features])

Let's set aside 10% of the training dataset; it will be used for validation and for fitting the ensemble

In [None]:
# Random row-wise split: a row stays in the training set when its uniform draw
# is below 0.9; the remaining rows form the validation set.
mask = np.random.rand(len(train_df)) < 0.9
valid_df = train_df[~mask]
train_df = train_df[mask]
valid_df.head()

In [None]:
# Number of columns in the validation frame (features plus date and num_sold)
valid_df.shape[1]

## Models

I pick models from the sklearn library that solve regression tasks. 
You can find more information about them at https://scikit-learn.org/stable/supervised_learning.html

Note that I don't tune the hyperparameters here, so the models would perform better if more time were spent on them

In [None]:
# Candidate models with mostly default settings. lasso, elastic_net, perceptron,
# par and gbr are re-created with other hyperparameters in the training cells below.
linear = LinearRegression()                                              # plain least-squares baseline
huber = HuberRegressor()                                                 # uses a more complicated loss that makes it robust to outliers
ridge = Ridge()                                                          # adds a penalty on the weights to the loss function, so the weights are kept small and overfitting is limited
lasso = Lasso(max_iter=200, alpha=0.2)                                   # uses a different (L1) regularization
elastic_net = ElasticNet(max_iter=200)                                   # combination of lasso and ridge
bayesian = BayesianRidge()                                               # ridge regression with a statistical (Bayesian) treatment of the weights

perceptron = MLPRegressor(hidden_layer_sizes=(128, 32, 16),              # several fully connected layers with a nonlinear activation; it may not be that useful on linearized data
                    max_iter=200,
                    activation='tanh', # tanh performs better than relu
                    solver='adam')
# something more complex
par = PassiveAggressiveRegressor()                                       
gbr = GradientBoostingRegressor()

## Train

Train all of them independently on the whole training dataset and see their scores on the validation data

In [None]:
def smape_loss(y_true, y_pred):
    """Return the element-wise SMAPE between ``y_true`` and ``y_pred``, in percent.

    Each element is ``200 * |y_true - y_pred| / (|y_true| + |y_pred|)``, so the
    result lies in [0, 200]. The absolute value of ``y_true`` is used in the
    denominator so that a negative target cannot cancel the prediction, and an
    element where both values are 0 scores 0 instead of NaN.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values; they are broadcast against each other.

    Returns
    -------
    numpy.ndarray
        Float array of per-element losses (callers average it themselves).
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    abs_error = np.abs(y_true - y_pred)
    scale = np.abs(y_true) + np.abs(y_pred)

    # Divide only where the denominator is non-zero; the pre-filled zeros cover
    # the "both values are 0" case, which is a perfect prediction.
    ratio = np.divide(abs_error, scale, out=np.zeros_like(abs_error), where=scale != 0)
    return ratio * 200

def train_model(model, scaler, X_tr, X_va=None):
    """Fit ``model`` on the log of the target and optionally report validation SMAPE.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``.
    scaler : already-fitted transformer; only ``transform`` is called on it.
    X_tr : dataframe holding the module-level ``features`` columns and ``num_sold``.
    X_va : optional dataframe with the same columns. When given, the mean SMAPE
        of the back-transformed predictions is printed and a scatter plot of
        actual versus predicted values is shown.

    Returns
    -------
    The fitted ``model`` (the same object that was passed in).
    """
    start_time = datetime.now()

    X = scaler.transform(X_tr[features])

    # The models are trained on log(num_sold); predictions are mapped back with exp
    model.fit(X, np.log(X_tr.num_sold.values))

    if X_va is not None:
        X_v = scaler.transform(X_va[features])
        y_v = X_va.num_sold.values.reshape(-1, 1)
        y_v_pred = np.exp(model.predict(X_v)).reshape(-1, 1)

        smape = np.mean(smape_loss(y_v, y_v_pred))

        # Elapsed time as MM:SS, computed from the timedelta itself rather than
        # sliced out of its string form (which has no fractional part when the
        # microseconds happen to be 0).
        elapsed_seconds = int((datetime.now() - start_time).total_seconds())
        minutes, seconds = divmod(elapsed_seconds, 60)
        minutes %= 60
        print(f"Model {model} | {minutes:02d}:{seconds:02d}"
              f" | SMAPE: {smape:.5f}")

        plt.figure(figsize=(10, 10))
        plt.scatter(y_v, y_v_pred, s=1, color='r')
        # Diagonal reference line: points on it are perfect predictions
        plt.plot([plt.xlim()[0], plt.xlim()[1]], [plt.xlim()[0], plt.xlim()[1]], '--', color='k')
        plt.gca().set_aspect('equal')
        plt.xlabel('y_true')
        plt.ylabel('y_pred')
        plt.title('OOF Predictions')
        plt.show()

    return model

In [None]:
# Fit the plain linear regression; train_model prints its validation SMAPE and shows a scatter plot
linear = train_model(linear, scaler, train_df, valid_df)

In [None]:
# Fit the Huber regressor on the same training / validation split
huber = train_model(huber, scaler, train_df, valid_df)

In [None]:
# Fit the ridge regression on the same training / validation split
ridge = train_model(ridge, scaler, train_df, valid_df)

In [None]:
# Re-create the lasso with a much smaller alpha than in the model-definition cell
# (0.000002 instead of 0.2), then fit it
lasso = Lasso(max_iter=200, alpha=0.000002)
lasso = train_model(lasso, scaler, train_df, valid_df)

In [None]:
# Re-create the elastic net with an explicit, very small alpha, then fit it
elastic_net = ElasticNet(max_iter=200, alpha=0.0000002)
elastic_net = train_model(elastic_net, scaler, train_df, valid_df)

In [None]:
# Fit the Bayesian ridge regression on the same training / validation split
bayesian = train_model(bayesian, scaler, train_df, valid_df)

In [None]:
# Re-create the perceptron with smaller hidden layers and fewer iterations than
# in the model-definition cell, then fit it
perceptron = MLPRegressor(hidden_layer_sizes=(64, 16, 8),              # several fully connected layers with a nonlinear activation; it may not be that useful on linearized data
                    max_iter=100,
                    activation='tanh', 
                    solver='adam')
perceptron = train_model(perceptron, scaler, train_df, valid_df)

In [None]:
# Re-create the passive-aggressive regressor with early stopping enabled, then fit it
par = PassiveAggressiveRegressor(early_stopping=True)                                       

par = train_model(par, scaler, train_df, valid_df)

In [None]:
# Re-create the gradient boosting regressor with max_depth=7, then fit it
gbr = GradientBoostingRegressor(max_depth=7)
gbr = train_model(gbr, scaler, train_df, valid_df)

### Observations
Simple regressions perform well, while the multilayer perceptron and other complicated models perform worse. 
The complexity of this problem is too low for complex models to pay off. It would probably be better to extend the problem with more feature engineering or an autoencoder to make the data more complex
Interestingly, Lasso and Elastic Net failed completely. Maybe they need to be tuned, or the problem is that they want to access the internal weights of the model. I don't know

Update: Hyperparameters needed to be tuned well to achieve good results

## Ensemble
Now let each model predict on the validation dataset, and use these predictions to find the corresponding weights of the final ensemble. 
Instead of setting them manually, I will use lasso regression to find the weights. The restrictions are that the regression has no intercept (zero bias) and that the weights are positive. The Lasso type is used because it is the only model in the library that has a parameter for allowing only positive weights

In [None]:
def predict(model, scaler, df):
    """Return a one-element list holding ``model``'s predictions for ``df``.

    The module-level ``features`` columns of ``df`` are scaled with ``scaler``
    and the model output is passed through ``exp``. Callers read the array as
    ``predict(...)[0]``.
    """
    scaled = scaler.transform(df[features])
    return [np.exp(model.predict(scaled))]

In [None]:
# Predictions of every fitted model on the validation rows.
# predict() returns a one-element list, so each array is read later as <name>_valid[0].
linear_valid = predict(linear, scaler, valid_df)
huber_valid = predict(huber, scaler, valid_df)
ridge_valid = predict(ridge, scaler, valid_df)
lasso_valid = predict(lasso, scaler, valid_df)
elastic_net_valid = predict(elastic_net, scaler, valid_df)
bayesian_valid = predict(bayesian, scaler, valid_df)
perceptron_valid = predict(perceptron, scaler, valid_df)
par_valid = predict(par, scaler, valid_df)
gbr_valid = predict(gbr, scaler, valid_df)

In [None]:
# One column of validation predictions per model plus the true target in 'actual'.
# The order of `names` must match the order of the arrays passed to zip() below.
names = ['linear', 'huber', 'ridge', 'lasso', 'elastic_net', 'bayesian', 'perceptron', 'par', 'gbr', 'actual']
#names = ['linear', 'huber', 'ridge', 'lasso', 'elastic_net', 'bayesian', 'actual']
actual = valid_df['num_sold'].values
data = pd.DataFrame(list(zip(linear_valid[0], huber_valid[0], ridge_valid[0], lasso_valid[0], elastic_net_valid[0], bayesian_valid[0], perceptron_valid[0], par_valid[0], gbr_valid[0], actual)), columns=names)
#data = pd.DataFrame(list(zip(linear_valid[0], huber_valid[0], ridge_valid[0], lasso_valid[0], elastic_net_valid[0], bayesian_valid[0], actual)), columns=names)
data.head()

In [None]:
# train ensemble
# No intercept and non-negative weights: the ensemble output is a weighted sum
# of the model predictions (names[:-1] drops the 'actual' column).
# NOTE(review): alpha=0 switches the L1 penalty off. scikit-learn's Lasso
# documentation advises against alpha=0 and points to LinearRegression instead;
# recent versions of LinearRegression also accept positive=True. Worth confirming
# against the installed version.
ensemble = Lasso(fit_intercept=False, positive=True, alpha=0)
ensemble.fit(data[names[:-1]], data['actual'])
print('Weight of ensemble are', ensemble.coef_)
# Shown as the cell output: the total weight given to the models
ensemble.coef_.sum()

PassiveAggressiveRegressor has a nonzero weight. This means it is still useful even if it is too complex

In [None]:
# Predictions of every fitted model on the test rows
linear_test = predict(linear, scaler, test_df)
huber_test = predict(huber, scaler, test_df)
ridge_test = predict(ridge, scaler, test_df)
lasso_test = predict(lasso, scaler, test_df)
elastic_net_test = predict(elastic_net, scaler, test_df)
bayesian_test = predict(bayesian, scaler, test_df)
perceptron_test = predict(perceptron, scaler, test_df)
par_test = predict(par, scaler, test_df)
gbr_test = predict(gbr, scaler, test_df)


# Rebinds `data`: same model columns as the validation frame, in the same
# order, but built from the test predictions and without the 'actual' column.
data = pd.DataFrame(list(zip(linear_test[0], huber_test[0], ridge_test[0], lasso_test[0], elastic_net_test[0], bayesian_test[0], perceptron_test[0], par_test[0], gbr_test[0])), columns=names[:-1])
#data = pd.DataFrame(list(zip(linear_test[0], huber_test[0], ridge_test[0], lasso_test[0], elastic_net_test[0], bayesian_test[0])), columns=names[:-1])
data.head()

In [None]:
# predict
# `data` now holds the per-model test predictions, in the column order the ensemble was fitted on
test_prediction = ensemble.predict(data)

# Submission file: row_id from the original test table plus the ensemble prediction
sub = original_test_df[['row_id']].copy()
sub['num_sold'] = test_prediction
sub.to_csv('submission.csv', index=False)
sub.head()

In [None]:
# Plot the distribution of the test predictions
# Both histograms share the same 200 bins over [0, 3000] and are normalized
# (density=True), so the training targets and the predictions are comparable
# despite the different row counts.
plt.figure(figsize=(16,3))
plt.hist(train_df['num_sold'], bins=np.linspace(0, 3000, 201),
         density=True, label='Training')
# rwidth=0.5 draws narrower bars so the training histogram stays visible behind them
plt.hist(sub['num_sold'], bins=np.linspace(0, 3000, 201),
         density=True, rwidth=0.5, label='Test predictions')
plt.xlabel('num_sold')
plt.ylabel('Frequency')
plt.legend()
plt.show()

In [None]:
# Second submission file with the predictions rounded to whole numbers
# (the author labels this step "cheating")
sub_rounded = sub.assign(num_sold=sub['num_sold'].round())
sub_rounded.to_csv('submission_rounded.csv', index=False)
sub_rounded.head()