# Many thanks to these amazing notebooks created by:

@ambrosm 
- the nice EDA for this dataset: https://www.kaggle.com/ambrosm/tpsjan22-01-eda-which-makes-sense
- the feature engineering routines introduced in https://www.kaggle.com/ambrosm/tpsjan22-03-linear-model

@gvyshnya
- uses featurewiz to select 200 of the ~625 features: https://www.kaggle.com/gvyshnya/jan22-tpc-feature-importance-with-featurewiz/comments

In [None]:
import pandas as pd
import numpy as np
import datetime as dt
import pickle
import itertools
import gc
import math
from typing import Tuple, List, Dict
import matplotlib.dates as md
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import host_subplot
import mpl_toolkits.axisartist as AA
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.covariance import EllipticEnvelope
import dateutil.easter as easter


In [None]:
# Locations of the competition CSV files inside the Kaggle input directory.
train_path = '../input/tabular-playground-series-jan-2022/train.csv'
test_path = '../input/tabular-playground-series-jan-2022/test.csv'
sample_submission_path = '../input/tabular-playground-series-jan-2022/sample_submission.csv'

# Read the three files in the same order as before: train, test, submission.
original_train_df, original_test_df, subm = (
    pd.read_csv(csv_path)
    for csv_path in (train_path, test_path, sample_submission_path)
)

# Quick sanity check of the (rows, columns) shapes just loaded.
print(original_train_df.shape, original_test_df.shape)

In [None]:
!pip install xlrd

In [None]:
!pip install featurewiz --ignore-installed --no-deps

In [None]:
import featurewiz as FW

In [None]:
# main flow
# Record the current wall-clock time as the start of the run and print it.
start_time = dt.datetime.now()
print("Started at ", start_time)

In [None]:
%%time
def smape_loss(y_true, y_pred):
    """Return the element-wise SMAPE loss in percent.

    Computes ``200 * |y_true - y_pred| / (|y_true| + |y_pred|)`` for every
    element, so each value lies between 0 and 200. Average the result to get
    the usual SMAPE score.

    Parameters
    ----------
    y_true, y_pred
        Actual and predicted values (scalars, NumPy arrays or pandas Series
        that broadcast against each other).

    Returns
    -------
    Same container kind as ``y_true - y_pred``, holding the per-element loss.
    Pairs where both values are zero score 0 instead of NaN.
    """
    abs_error = np.abs(y_true - y_pred)
    # Absolute values on both sides keep the loss non-negative even when the
    # actual value is negative.
    scale = np.abs(y_true) + np.abs(y_pred)
    # The scale is zero only when both values are zero, in which case the
    # error is zero as well; dividing by 1 there yields 0 rather than 0/0.
    safe_scale = np.where(scale == 0, 1, scale)
    return abs_error / safe_scale * 200

In [None]:
%%time
# The CSV reader yields the dates as strings: parse them to datetimes and
# index each frame by date, keeping `date` available as a regular column too.
for df in (original_train_df, original_test_df):
    df['date'] = pd.to_datetime(df['date'])
    df.set_index('date', drop=False, inplace=True)

In [None]:
%%time
# Feature engineering
def engineer(df):
    """Return a new dataframe with the engineered features.

    Parameters
    ----------
    df : pandas.DataFrame
        Input rows. Must provide a datetime ``date`` column and the columns
        ``country``, ``store`` and ``product``.

    Returns
    -------
    pandas.DataFrame
        A frame on ``df``'s index with, in this column order:

        * calendar features ``year``, ``dayofyear``, ``wd4``, ``wd56``,
          ``dec29``, ``dec30``;
        * ``easter_week``: True for the seven days after Easter Sunday of
          any year present in ``df``;
        * one-hot flags ``Finland``, ``Norway``, ``KaggleRama``,
          ``Kaggle Mug``, ``Kaggle Sticker``;
        * for every harmonic ``k`` in 1..99: ``sin{k}``, ``cos{k}``,
          ``mug_sin{k}``, ``mug_cos{k}``, ``sticker_sin{k}``,
          ``sticker_cos{k}``.
    """
    dates = df.date
    years = dates.dt.year
    weekday = dates.dt.weekday
    day = dates.dt.day
    dayofyear = dates.dt.dayofyear
    is_december = dates.dt.month == 12

    # All columns are collected here and turned into a DataFrame once at the
    # end; inserting ~600 columns one by one fragments the frame and makes
    # pandas emit a PerformanceWarning.
    columns = {
        'year': years,  # This feature makes it possible to fit an annual growth rate
        'dayofyear': dayofyear,
        'wd4': weekday == 4,  # Friday
        'wd56': weekday >= 5,  # Saturday and Sunday
        'dec29': is_december & (day == 29),  # end-of-year peak
        'dec30': is_december & (day == 30),
    }

    # Easter: flag the days strictly after Easter Sunday and less than eight
    # days after it. The years are taken from the data itself so the flag
    # also works outside a fixed year range.
    no_offset = pd.Timedelta(days=0)
    window_end = pd.Timedelta(days=8)
    easter_week = pd.Series(False, index=df.index, dtype=bool)
    for year in years.dropna().unique():
        easter_diff = dates - pd.Timestamp(easter.easter(int(year)))
        easter_week = easter_week | ((easter_diff > no_offset) & (easter_diff < window_end))
    columns['easter_week'] = easter_week

    # NOTE: country-specific growth features ("<country>_year") were tried
    # and are intentionally left out.

    # One-hot encoding (no need to encode the last categories)
    for country in ('Finland', 'Norway'):
        columns[country] = df.country == country
    columns['KaggleRama'] = df.store == 'KaggleRama'
    for product in ('Kaggle Mug', 'Kaggle Sticker'):
        columns[product] = df['product'] == product

    # Seasonal variations (Fourier series)
    # The three products have different seasonal patterns, hence the
    # per-product interaction terms for mugs and stickers.
    is_mug = columns['Kaggle Mug']
    is_sticker = columns['Kaggle Sticker']
    base_angle = dayofyear / 365 * 2 * math.pi  # hoisted: independent of k
    for k in range(1, 100):
        sin_k = np.sin(base_angle * k)
        cos_k = np.cos(base_angle * k)
        columns[f'sin{k}'] = sin_k
        columns[f'cos{k}'] = cos_k
        columns[f'mug_sin{k}'] = sin_k * is_mug
        columns[f'mug_cos{k}'] = cos_k * is_mug
        columns[f'sticker_sin{k}'] = sin_k * is_sticker
        columns[f'sticker_cos{k}'] = cos_k * is_sticker

    return pd.DataFrame(columns)

# Training frame: engineered features plus the raw date and the target,
# with the target stored as float32.
train_df = engineer(original_train_df)
train_df['date'] = original_train_df['date']
train_df['num_sold'] = original_train_df['num_sold'].astype(np.float32)

# Test frame: engineered features with the year pinned to 2018.
test_df = engineer(original_test_df)
# no growth patch, see https://www.kaggle.com/c/tabular-playground-series-jan-2022/discussion/298318
test_df['year'] = 2018
# George: the date column has to be added to test_df as well; without it the
# train and test frames would not share the same feature columns.
test_df['date'] = original_test_df['date']

features = test_df.columns
print(len(features))

In [None]:
# Print the (rows, columns) shapes of the engineered train and test frames.
print(train_df.shape, test_df.shape)

In [None]:
# Name of the target column; it is passed to featurewiz and used to pick the
# label column for the model further down.
target = 'num_sold'

# Let's use Featurewiz to select the best features out of 607 features

In [None]:
# Run featurewiz on the engineered frames. It returns a pair that is unpacked
# into the reduced train frame and the matching reduced test frame.
# nrows is set to the full number of training rows.
train_best, test_best = FW.featurewiz(
    train_df,
    target,
    corr_limit=0.70,
    verbose=2,
    sep=',',
    header=0,
    test_data=test_df,
    feature_engg='',
    category_encoders='',
    dask_xgboost_flag=True,
    nrows=train_df.shape[0],
)


# It took ~4 mins to select features in this dataset. We now have 202 important features

In [None]:
# Print the shape of the feature-selected training frame and display its
# first rows.
print(train_best.shape)
train_best.head()

In [None]:
# Print the shape of the feature-selected test frame and display its first
# two rows.
print(test_best.shape)
test_best.head(2)

In [None]:
# Use the column names of the feature-selected test frame as the list of
# predictors, then display how many there are.
preds = test_best.columns.tolist()
len(preds)

# This simple LightGBM model works wonders since it is highly effective in many competitions

In [None]:
# Fit featurewiz's simple LightGBM regression helper on the selected
# predictors, handing it the test predictors as well.
outputs = FW.simple_lightgbm_model(
    X_XGB=train_best[preds],
    Y_XGB=train_best[target],
    X_XGB_test=test_best[preds],
    modeltype='Regression',
)

# It took less than 1 min to build a model with RMSE average = 207 over 5 folds

In [None]:
# Take the first element of the model outputs (it is what gets written to
# the submission as predictions) and display it.
y_preds = outputs[0]
y_preds

In [None]:
# Store the predictions in the target column of the sample-submission frame
# and preview the first rows.
subm[target] = y_preds
subm.head()

In [None]:
# Plot a histogram of the predicted values.
pd.DataFrame(y_preds).hist()

In [None]:
# Plot a histogram of the training target (presumably to compare its
# distribution with that of the predictions).
train_df[target].hist()

In [None]:
# Write the submission frame to submission.csv without the index column.
subm.to_csv('submission.csv',index=False)