In [None]:
import pandas as pd
from pycaret.regression import *
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline, make_pipeline, make_union, FeatureUnion
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import MinMaxScaler, RobustScaler, OneHotEncoder
from sklearn.linear_model import Ridge, LinearRegression, Lasso, ElasticNet
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn import set_config; set_config(display='diagram')
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, VotingRegressor

## DateTime Features

In [None]:
def transform_data(X):
    """Expand the ``date`` column of ``X`` into numeric calendar features.

    The input frame is copied first, so the caller's DataFrame is never
    modified (the previous version added columns to it in place).

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing a ``date`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``date`` and with these columns appended, in order:
        ``month``, ``hour``, ``year``, ``day``, ``day_of_year``,
        ``week_of_year`` (ISO week number), ``day_of_month``, ``day_of_week``
        and ``weekday``.

    Notes
    -----
    * ``day_of_month`` holds the *number of days in the month* (28-31), not
      the day number; the day number is in ``day``. The name is kept for
      backward compatibility.
    * ``day_of_week`` and ``weekday`` carry identical values (Monday == 0).
    """
    X = X.copy()
    X['date'] = pd.to_datetime(X['date'])
    dates = X['date'].dt

    X['month'] = dates.month
    X['hour'] = dates.hour
    X['year'] = dates.year
    X['day'] = dates.day
    X['day_of_year'] = dates.dayofyear

    # `Series.dt.weekofyear` was deprecated in pandas 1.1 and removed in 2.0.
    # `isocalendar().week` is the replacement; it returns a nullable UInt32,
    # so cast back to the plain dtype the old accessor produced
    # (int64, or float64 when missing dates are present).
    iso_week = dates.isocalendar().week
    X['week_of_year'] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')

    X['day_of_month'] = dates.days_in_month
    X['day_of_week'] = dates.dayofweek
    X['weekday'] = dates.weekday
    return X.drop(columns=['date'])

## Train and test data

In [None]:
# Load the competition train and test CSVs, using each file's `row_id` column as the index.
df = pd.read_csv('../input/tabular-playground-series-jan-2022/train.csv', index_col = 'row_id')
# NOTE: the name `X_test` is later reassigned to its transformed version (date column dropped),
# so this cell has to be re-run before the transformation cell can be executed a second time.
X_test = pd.read_csv('../input/tabular-playground-series-jan-2022/test.csv', index_col = 'row_id')

In [None]:
# Target column first, then the feature frames: the raw features are everything
# except `num_sold`; train and test go through the same date expansion.
y_train = df['num_sold']
X = df.drop(columns=['num_sold'])
X_train = transform_data(X)
X_test = transform_data(X_test)

In [None]:
# Put the engineered features and the `num_sold` target side by side in one frame;
# this combined frame is what `setup(data = train, target = 'num_sold')` receives below.
train = pd.concat([X_train, y_train], axis=1)

## Symmetric Mean Absolute Percentage Error (SMAPE)

In [None]:
# Credit to https://www.kaggle.com/c/web-traffic-time-series-forecasting/discussion/36414
def SMAPE(y_true, y_pred):
    """Symmetric mean absolute percentage error, on a 0-200 scale.

    Computes ``mean(200 * |y_true - y_pred| / (|y_true| + |y_pred|))``.
    Points where both the actual and the predicted value are zero contribute
    0 to the mean instead of producing a division by zero.

    Parameters
    ----------
    y_true : array-like
        Observed values (list, NumPy array or pandas Series).
    y_pred : array-like
        Predicted values, broadcastable against ``y_true``.

    Returns
    -------
    float
        The mean SMAPE; lower is better.
    """
    # Work on plain float arrays so lists work and pandas index alignment
    # can never reorder or NaN-pad the operands.
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)

    # Use |y_true| (not y_true) so negative actuals cannot shrink or zero out
    # the denominator.
    denominator = (np.abs(actual) + np.abs(forecast)) / 200.0

    # Divide only where the denominator is non-zero; the remaining entries keep
    # their initial 0.0, and no divide-by-zero warning is emitted.
    diff = np.zeros_like(denominator)
    np.divide(np.abs(actual - forecast), denominator, out=diff, where=denominator != 0)
    return np.mean(diff)

## Choosing Best Model using Pycaret

In [None]:
# Initialise the PyCaret regression experiment on the combined train frame.
# `session_id` fixes the random seed so that the hold-out split, the CV folds and
# therefore the model ranking are reproducible across kernel restarts.
#
# NOTE(review): per the PyCaret docs `normalize_method` only takes effect together with
# `normalize = True`, which is not set here — confirm whether scaling was intended.
# NOTE(review): `numeric_imputation = 'median'` is presumably unused when
# `imputation_type = 'iterative'` — verify against the PyCaret version in use.
# NOTE(review): `data_split_shuffle = False` keeps rows in order for the hold-out split,
# while `fold_strategy = 'stratifiedkfold'` is not order-aware — confirm this mix is intended.
reg = setup(data = train,
            train_size = 0.7,
            target = 'num_sold',
            session_id = 42,
            data_split_shuffle = False,
            remove_multicollinearity = True,
            imputation_type= 'iterative',
            normalize_method= 'maxabs',
            fold_strategy='stratifiedkfold',
            fold=10,
            numeric_imputation='median',
            create_clusters = True,
            use_gpu = True,
            silent = True,
            n_jobs = -1)

In [None]:
# Register the custom SMAPE scorer with the experiment; lower values are better.
add_metric(
    id='SMAPE',
    name='SMAPE',
    score_func=SMAPE,
    greater_is_better=False,
)

In [None]:
# Rank PyCaret's candidate regressors by the custom SMAPE metric and keep the four best.
# NOTE(review): only `best_models[:3]` is used afterwards, so `n_select = 3` would
# presumably be enough — confirm before changing.
best_models = compare_models(sort = 'SMAPE', n_select = 4)

## Blend the top 3 models

In [None]:
# Combine the three top-ranked models (first three entries of `best_models`) into a
# single blended estimator, with SMAPE as the metric to optimise.
blend_model = blend_models(estimator_list = best_models[:3], optimize = 'SMAPE')

In [None]:
# Show the blended estimator's parameters; the `<estimator>__<param>` names listed here
# are the keys a custom tuning grid has to use.
blend_model.get_params()

## Fine Tuning

In [None]:
# Custom search space for the blended model. Keys follow scikit-learn's
# `<estimator name>__<parameter>` convention and must match the names printed by
# `blend_model.get_params()`.
# NOTE(review): the keys assume the three blended models are catboost, lightgbm and
# xgboost — confirm against the actual `compare_models` ranking.
params_3 = {
          'catboost__loss_function': ['RMSE', 'MAE', 'MAPE'],
          'catboost__border_count': [254, 260, 270, 140],
          'lightgbm__min_child_samples': [20, 30, 40, 50],
          'lightgbm__num_leaves': [31, 71, 101, 131],
          'lightgbm__boosting_type': ['gbdt', 'dart', 'goss'],
          'lightgbm__n_estimators': [100, 115, 120, 130, 135],
          'lightgbm__reg_alpha': [0.0, 0.001, 0.01, 0.1],
          'lightgbm__reg_lambda': [0.0, 0.001, 0.01, 0.1],
          'xgboost__n_estimators': [100, 115, 120, 130, 135],
          'xgboost__reg_alpha': [0.0, 0.001, 0.01, 0.1],
          'xgboost__learning_rate': [0.300000012, 0.1, 0.01, 0.001]
          }

# Pass the grid to the tuner: previously `params_3` was built but never used, so the
# search ran on PyCaret's default grid. `choose_better = True` makes `tune_model`
# return the original blend if no tuned candidate improves on it.
blend_model_tuned = tune_model(blend_model,
                               n_iter = 50,
                               optimize = 'SMAPE',
                               custom_grid = params_3,
                               choose_better = True)

In [None]:
# Inspect the parameters of the *tuned* blend; the untuned `blend_model` was already
# displayed before the tuning step.
blend_model_tuned.get_params()

## Finalize Model

In [None]:
# Finalize the tuned blend. Finalizing the untuned `blend_model` (as before) silently
# discarded the result of the fine-tuning step above.
model_final = finalize_model(blend_model_tuned)

In [None]:
# Score the finalized model with no explicit data argument.
# NOTE(review): per the PyCaret docs `finalize_model` refits on the full dataset including
# the hold-out part, so the metrics shown here are presumably not out-of-sample — confirm.
predict_model(model_final)

## Predictions

In [None]:
# Predict on the transformed test features, move the index into a column and give the
# prediction column ('Label') the competition's target name.
final_data = (
    predict_model(model_final, data=X_test)
    .reset_index()
    .rename(columns={'index':'row_id', 'Label':'num_sold'})
)
final_data['num_sold']

In [None]:
# Display the test set's `row_id` values (the index of `X_test`, moved into a column);
# these are the ids attached to the predictions in the submission cell.
X_test.reset_index()['row_id']

In [None]:
# Pair each prediction with the test set's row_id, index the result by row_id and
# write it out as the submission file.
final_df = (
    pd.concat([X_test.reset_index()['row_id'], final_data['num_sold']], axis=1)
    .set_index('row_id')
)
final_df.to_csv('submission_4.csv')

final_df