### Averaging the models from my other notebooks: XGB+LGBM+LSTM provides the best score.
#### Here, I will try to find the differences between the LSTM, XGB, LGBM and averaged models.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_absolute_error, mean_squared_error

In [None]:
# Competition files: training data, test data and the sample submission.
# NOTE(review): the paths mix the absolute '/kaggle/input' prefix with the
# relative '../input' prefix; presumably both resolve to the same directory
# on Kaggle — confirm before running elsewhere.
train = pd.read_csv('/kaggle/input/tabular-playground-series-jan-2022/train.csv')
test = pd.read_csv('/kaggle/input/tabular-playground-series-jan-2022/test.csv')
sub = pd.read_csv('../input/tabular-playground-series-jan-2022/sample_submission.csv')

# Submission files written by the single-model notebooks (LGBM, XGB, LSTM).
lgbm = pd.read_csv('../input/tps-jan2022-lgbm-optuna/submission.csv')
xgbm = pd.read_csv('../input/tps-jan2022-xgb-optuna/submission.csv')
lstm = pd.read_csv('../input/tps-jan-2022-lstm/submission.csv')

# Submission files from the blending/tuning notebooks; judging by the file
# names these are presumably already-averaged predictions — TODO confirm.
avg1 = pd.read_csv('../input/tps-jan2022-blending/submission-avg-10.csv')
avg2 = pd.read_csv('../input/k/vladlee/tps-jan2022-tuning/submission-avg-10-rnd1.csv')
avg3 = pd.read_csv('../input/k/vladlee/tps-jan2022-tuning/submission-avg-10_rnd0.csv')

In [None]:
def process_dates(df):
    """Parse the ``date`` column and add calendar feature columns.

    The frame is modified in place and also returned, so both
    ``process_dates(df)`` and ``df = process_dates(df)`` work.

    Args:
        df: DataFrame with a ``date`` column that ``pd.to_datetime`` can parse.

    Returns:
        The same DataFrame, with ``date`` converted to datetime and the
        columns ``year``, ``month``, ``week``, ``weekday``, ``dayofweek``,
        ``dayofyear`` and ``day`` added.
    """
    df['date'] = pd.to_datetime(df['date'])
    dates = df['date'].dt

    df['year'] = dates.year
    df['month'] = dates.month

    # ``Series.dt.week`` was deprecated in pandas 1.1 and removed in 2.0;
    # ``isocalendar().week`` gives the same ISO week number. It comes back as
    # a nullable UInt32, so cast to a plain dtype like the other features
    # (float when missing dates are present, since int64 cannot hold NaN).
    iso_week = dates.isocalendar().week
    df['week'] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')

    # ``weekday`` and ``dayofweek`` are the same quantity (Monday == 0);
    # both columns are kept for backward compatibility.
    df['weekday'] = dates.weekday
    df['dayofweek'] = dates.dayofweek
    df['dayofyear'] = dates.dayofyear
    df['day'] = dates.day
    return df

# Convert ``date`` to datetime and add the calendar feature columns
# (year, month, week, ...) to both frames.
train = process_dates(train)
test = process_dates(test)

In [None]:
# Per-source predictions, paired with the test-frame column that stores them.
prediction_columns = [
    ('num_sold_lstm', lstm.num_sold),
    ('num_sold_lgbm', lgbm.num_sold),
    ('num_sold_xgbm', xgbm.num_sold),
    ('num_sold_avg1', avg1.num_sold),
    ('num_sold_avg2', avg2.num_sold),
    ('num_sold_avg3', avg3.num_sold),
]

# Row-wise mean over all six submissions becomes the blended prediction.
avg_num_sold = np.mean([preds for _, preds in prediction_columns], axis=0)
test['num_sold'] = avg_num_sold

# Keep every individual prediction next to the blend for later comparison.
for column_name, preds in prediction_columns:
    test[column_name] = preds

In [None]:
## This submission scored 4.74.

# NOTE(review): astype(int) truncates the averaged predictions toward zero
# instead of rounding them — confirm that this is intended.
sub.num_sold = test.num_sold.astype(int)
# Write the submission file without the index column.
sub.to_csv('submission.csv', index=False, float_format='%.6f')
# Notebook display: first 20 rows of the submission.
sub.head(20)

### Display value counts of the averaged predictions vs. train, by product

In [None]:
# Stacked histograms of num_sold per product: last training year (2018) on
# top, averaged test predictions (2019) below.
f, (ax1, ax2) = plt.subplots(2, 1, figsize=(16, 12))

train_2018 = train[train.date.dt.year == 2018]
test_2019 = test[test.date.dt.year == 2019]

g1 = sns.histplot(x="num_sold", palette="rainbow", hue='product', multiple="stack", data=train_2018, ax=ax1)
g1.set_title("Train - 2018")

g2 = sns.histplot(x="num_sold", palette="rainbow", hue='product', multiple="stack", data=test_2019, ax=ax2)
# The panel shows the 2019 test predictions; the old title said 2018.
g2.set_title("Test AVG - 2019")

plt.show()

### Display avg vs model by country

In [None]:
# One scatter panel per prediction source, coloured by country, restricted
# to the 2019 rows of the test frame.
f, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(16, 16))

test_2019 = test[test.date.dt.year == 2019]

# (axes, prediction column, panel title)
country_panels = [
    (ax1, "num_sold", "AVG ALL"),
    (ax2, "num_sold_xgbm", "XGBM"),
    (ax3, "num_sold_lstm", "LSTM"),
]
for axis, column, title in country_panels:
    panel = sns.scatterplot(x="date", y=column, palette="rainbow", hue='country', data=test_2019, ax=axis)
    panel.set_title(title)

plt.show()

### Display avg diff (error) by model

In [None]:
# Difference between the overall average and each individual source over
# time, one panel per source, coloured by month.
f, (ax1, ax2, ax3, ax4) = plt.subplots(4, 1, figsize=(16, 24))

# (column suffix of the source, label used in the title, axes)
diff_panels = [
    ('avg1', 'AVG', ax1),
    ('xgbm', 'XGB', ax2),
    ('lstm', 'LSTM', ax3),
    ('lgbm', 'LGBM', ax4),
]
for suffix, label, axis in diff_panels:
    diff_column = f'num_sold-num_sold_{suffix}'
    # Store the difference on the test frame so seaborn can plot it by name.
    test[diff_column] = test.num_sold - test[f'num_sold_{suffix}']
    panel = sns.scatterplot(x="date", y=diff_column, palette="rainbow", hue='month', data=test, ax=axis)
    # Every panel plots the test frame; three titles used to say "Train".
    panel.set_title(f"Test - 2019 - {label} err")

plt.show()

### Display avg diff (error) vs model by product

In [None]:
# Overall average (x) against each individual source (y), coloured by
# product, on a 2x2 grid.
f, ax = plt.subplots(2, 2, figsize=(20, 12))

# (prediction column, label used in the title), in row-major grid order.
# The last panel used to plot num_sold_lstm again under the LGBM title.
product_panels = [
    ('num_sold_avg1', 'AVG1'),
    ('num_sold_xgbm', 'XGB'),
    ('num_sold_lstm', 'LSTM'),
    ('num_sold_lgbm', 'LGBM'),
]
for axis, (column, label) in zip(ax.flat, product_panels):
    panel = sns.scatterplot(x=test.num_sold, y=test[column], palette="rainbow", data=test, hue='product', ax=axis)
    # All panels show test-set predictions, so they are labelled "Test".
    panel.set_title(f"Test - 2019 - AVG/{label} by product")

plt.show()

In [None]:
## Display error by month: RMSE between the overall average and each source.
# Rows: 0 = AVG1, 1 = XGB, 2 = LSTM; columns: month - 1.
errs_by_month = np.zeros((3, 12))

for month in test.month.unique():
    # Filter once per month instead of once per metric argument.
    month_rows = test[test.month == month]
    # mean_squared_error returns the MSE; take the square root so the values
    # really are the RMSE that the printed labels announce.
    err1 = np.sqrt(mean_squared_error(month_rows.num_sold, month_rows.num_sold_avg1))
    err2 = np.sqrt(mean_squared_error(month_rows.num_sold, month_rows.num_sold_xgbm))
    err3 = np.sqrt(mean_squared_error(month_rows.num_sold, month_rows.num_sold_lstm))
    print(f'Month {month}, AVG1 RMSE: {err1:.2f}, XGB RMSE: {err2:.2f}, LSTM RMSE: {err3:.2f}')    
    errs_by_month[0][month-1] = err1
    errs_by_month[1][month-1] = err2
    errs_by_month[2][month-1] = err3

In [None]:
# Monthly error of each source against the overall average, one panel each.
f, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 6))

# errs_by_month is indexed by month - 1, so plot it against 1..12 directly
# rather than test.month.unique(), whose order follows the row order of the
# test frame and is not guaranteed to be ascending.
months = np.arange(1, errs_by_month.shape[1] + 1)

# ``palette`` is omitted: it only applies when a ``hue`` variable is mapped.
g1 = sns.lineplot(x=months, y=errs_by_month[0], ax=ax1)
g1.set_title("2019 - err AVG/AVG1 by month")

g2 = sns.lineplot(x=months, y=errs_by_month[1], ax=ax2)
g2.set_title("2019 - err AVG/XGB by month")

g3 = sns.lineplot(x=months, y=errs_by_month[2], ax=ax3)
g3.set_title("2019 - err AVG/LSTM by month")

plt.show()