In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import seaborn as sb
import matplotlib.pyplot as plt
%matplotlib inline

import array

from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess
from learntools.time_series.style import *  # plot style settings
from learntools.time_series.utils import plot_periodogram, seasonal_plot

**Defining preprocessing function to extract/make features**

In [None]:
# check seasonality of every sub group (country, product, store)
def plot_seasonality(df):
    """Draw seasonality diagnostics for every country / store / product series.

    For each of the 18 combinations of country, store and product one figure
    with three stacked axes is created: a week/day seasonal plot, a
    year/dayofyear seasonal plot and a periodogram of the daily mean of
    ``num_sold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``country``, ``store``, ``product``, ``date`` and
        ``num_sold``. ``date`` may be strings or datetimes. ``df`` is not
        modified.

    Returns
    -------
    None
        The function only produces matplotlib figures.
    """
    # Same 18 groups, in the same order, as the former hand-written dict.
    groups = [
        (country, store, product)
        for country in ('Finland', 'Sweden', 'Norway')
        for store in ('KaggleRama', 'KaggleMart')
        for product in ('Kaggle Hat', 'Kaggle Sticker', 'Kaggle Mug')
    ]

    for country, store, product in groups:
        in_group = (
            (df['country'] == country)
            & (df['store'] == store)
            & (df['product'] == product)
        )
        # .copy() so the date conversion below writes to an independent frame
        # instead of a slice of ``df`` (avoids the chained-assignment warning).
        sales = df.loc[in_group, ['date', 'num_sold']].copy()
        if sales.empty:
            # Nothing to plot for a combination that is absent from ``df``.
            continue

        sales['date'] = pd.to_datetime(sales['date']).dt.to_period('D')

        # Selecting the column keeps the result a Series even when the group
        # has a single day (``.squeeze()`` would collapse that to a scalar).
        average_sales = sales.groupby('date')['num_sold'].mean()

        X = average_sales.to_frame()
        X["week"] = X.index.week
        X["day"] = X.index.dayofweek
        X["dayofyear"] = X.index.dayofyear
        X["year"] = X.index.year

        fig, (ax0, ax1, ax2) = plt.subplots(3, 1, figsize=(11, 8))
        ax0.set_title(f"Country: {country}, Product: {product}, Store: {store}")
        seasonal_plot(X, y="num_sold", period="week", freq="day", ax=ax0)
        seasonal_plot(X, y="num_sold", period="year", freq="dayofyear", ax=ax1)

        plot_periodogram(average_sales, ax=ax2)

In [None]:
# Load the raw training data and draw the seasonality diagnostics for every series.
# `train_df` is read from disk again in the feature-building cell further down.
train_df = pd.read_csv('../input/tabular-playground-series-jan-2022/train.csv')
plot_seasonality(train_df)

**Preparing data and Splitting it into Train & Validation sets**

In [None]:
def get_lags(df, df_test):
    """Add lag features to the train and test frames, one series at a time.

    The data is processed per country / store / product group, so the rows of
    both returned frames are ordered group by group (not in input order).

    Parameters
    ----------
    df : pandas.DataFrame
        Training rows with ``country``, ``store``, ``product`` and
        ``num_sold`` columns.
    df_test : pandas.DataFrame
        Test rows with ``country``, ``store`` and ``product`` columns.

    Returns
    -------
    (X, X_rev) : tuple of pandas.DataFrame
        ``X`` is ``df`` plus ``y_lag_1`` .. ``y_lag_7`` (``num_sold`` shifted
        back by 1..7 rows inside each group).
        ``X_rev`` is ``df_test`` plus ``y_lag_1`` .. ``y_lag_7`` and an
        ``index`` column. For a group with ``n`` test rows the last ``n``
        training values are taken; the j-th test row of the group receives
        the j-th of those values and the six that follow it, and ``index``
        holds the training row label the window starts at. Test rows without
        a complete 7-value window are left as NaN.
    """
    # Same 18 groups, in the same order, as the former hand-written dict.
    groups = [
        (country, store, product)
        for country in ('Finland', 'Sweden', 'Norway')
        for store in ('KaggleRama', 'KaggleMart')
        for product in ('Kaggle Hat', 'Kaggle Sticker', 'Kaggle Mug')
    ]

    def make_lags(ts, lags):
        # y_lag_i is the value i rows earlier.
        return pd.concat(
            {
                f'y_lag_{i}': ts.shift(i)
                for i in range(1, lags + 1)
            },
            axis=1)

    def make_rev_lags(ts, steps):
        # y_lag_(i+1) is the value i rows later (i = 0 is the row itself).
        return pd.concat(
            {
                f'y_lag_{i + 1}': ts.shift(-i)
                for i in range(steps)},
            axis=1
        )

    # Collect the per-group pieces and concatenate once at the end instead of
    # growing the result frames inside the loop.
    train_parts = []
    test_parts = []

    for country, store, product in groups:
        tmp_df = df[(df['country'] == country) & (df['store'] == store)
                    & (df['product'] == product)]
        tmp_df_test = df_test[(df_test['country'] == country) & (df_test['store'] == store)
                              & (df_test['product'] == product)]
        n = tmp_df_test.shape[0]

        lags = make_lags(tmp_df.num_sold, lags=7)
        rev_lags = make_rev_lags(tmp_df['num_sold'].tail(n), steps=7).dropna()

        train_parts.append(pd.concat([tmp_df, lags], axis=1))

        # reset_index() keeps the training labels in an 'index' column.
        rev_lags = rev_lags.reset_index()
        # Line the windows up with this group's test rows by position. The
        # join below matches on index labels, and the 0..m-1 labels left by
        # reset_index() are unrelated to the labels of the test rows.
        rev_lags.index = tmp_df_test.index[:len(rev_lags)]
        test_parts.append(tmp_df_test.join(rev_lags, how='left'))

    X = pd.concat(train_parts, axis=0)
    X_rev = pd.concat(test_parts, axis=0)
    return X, X_rev

In [None]:
def get_features(df, train=True, holidays=None):
    """Build calendar, holiday and indicator features for the sales data.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``row_id``, ``date``, ``country``, ``store`` and ``product``
        columns, plus ``num_sold`` when ``train`` is true. The frame passed in
        is left untouched; a copy is modified.
    train : bool, default True
        When true the returned frame also carries the ``num_sold`` target.
    holidays : pandas.DataFrame, optional
        Holiday table with ``Date``, ``Country`` and ``Fixed`` columns. When
        omitted it is read from the Kaggle holidays CSV.

    Returns
    -------
    pandas.DataFrame
        The columns ``row_id``, ``date`` (as datetime), ``country``,
        ``store``, ``product``, the indicators ``Finland``, ``Norway``,
        ``KaggleRama``, ``KaggleMug``, ``KaggleHat``, the calendar features
        ``weekday``, ``week``, ``year``, ``is_weekend``, ``month``,
        ``day_of_year``, the flag ``is_holiday`` and, for training data,
        ``num_sold``.
    """
    df = df.copy()
    df['date'] = pd.to_datetime(df['date'])

    # add holidays
    if holidays is None:
        holidays = pd.read_csv('../input/holidays-finland-norway-sweden-20152019/Holidays_Finland_Norway_Sweden_2015-2019.csv')

    fixed = holidays[holidays['Fixed'] == True]
    # Parse the dates so they are compared with df['date'] as datetimes
    # rather than relying on implicit string coercion.
    fixed_dates = pd.to_datetime(fixed['Date'])

    # A row is a holiday only when the date is a holiday in the row's OWN
    # country (the Sweden and Norway lists used to be swapped).
    is_holiday = np.zeros(len(df), dtype=bool)
    for country in ('Finland', 'Sweden', 'Norway'):
        country_dates = fixed_dates[fixed['Country'] == country]
        hit = (df['country'] == country) & df['date'].isin(country_dates)
        is_holiday |= hit.to_numpy()
    df['is_holiday'] = is_holiday.astype(int)

    # add month, weekday, is_weekend, dayofyear features
    df['month'] = df['date'].dt.month
    df['weekday'] = df['date'].dt.weekday
    df['is_weekend'] = (df['date'].dt.weekday > 4).astype(int)
    df['day_of_year'] = df['date'].dt.dayofyear
    df["week"] = df['date'].dt.isocalendar().week.astype(int)
    df["year"] = df['date'].dt.year

    # change categorical features into numerical
    # Each indicator is computed from the value it is named after.
    # pd.get_dummies returns its columns in sorted order, so assigning them
    # positionally to a differently ordered name list mislabels them.
    # Only the indicators that are part of the final feature list are built;
    # the remaining category of each variable is the implicit baseline.
    df['Finland'] = (df['country'] == 'Finland').astype(int)
    df['Norway'] = (df['country'] == 'Norway').astype(int)
    df['KaggleRama'] = (df['store'] == 'KaggleRama').astype(int)
    df['KaggleMug'] = (df['product'] == 'Kaggle Mug').astype(int)
    df['KaggleHat'] = (df['product'] == 'Kaggle Hat').astype(int)

    # select final features
    features = ['row_id', 'date', 'country', 'store', 'product', 'Finland', 'Norway', 'KaggleRama',
                'KaggleMug', 'KaggleHat', 'weekday', 'week', 'year',
                'is_weekend', 'month', 'day_of_year', 'is_holiday']

    if train:
        features.append('num_sold')
    return df[features]
    

In [None]:
# preprocess data and add seasonality components based on above charts

# from sklearn.preprocessing import RobustScaler
# robu = RobustScaler()

def compute_seasonality(df, df_test):
    """Append trend and seasonal terms to the train and test feature frames.

    For every country / store / product group a statsmodels
    ``DeterministicProcess`` is fitted on the group's daily mean of
    ``num_sold``. Its in-sample terms are attached to the group's training
    rows and its out-of-sample terms to the group's test rows.

    Which terms a group gets depends on the product:
    'Kaggle Hat' uses annual Fourier terms only, 'Kaggle Sticker' weekly
    indicators only and 'Kaggle Mug' both. Terms a group does not use are
    NaN in its rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Training rows; needs ``country``, ``store``, ``product``,
        ``num_sold`` and a datetime ``date`` column.
    df_test : pandas.DataFrame
        Test rows; needs ``country``, ``store`` and ``product``. The
        out-of-sample terms are attached to each group's test rows by
        position, i.e. the rows are taken to be the consecutive days that
        follow the group's training data, in order.

    Returns
    -------
    (X, X_out) : tuple of pandas.DataFrame
        Both frames have a fresh 0..N-1 index, rows ordered group by group,
        and the expected feature columns first, followed by any additional
        input columns (for example ``index``). ``X_out`` also contains a
        ``num_sold`` column (NaN unless ``df_test`` provides one): earlier
        versions created it by accident and later cells drop it by name, so
        it is kept for compatibility.
    """
    cols = ['row_id', 'date', 'country', 'store', 'product', 'Finland', 'Norway', 'KaggleRama',
            'KaggleMug', 'KaggleHat', 'weekday', 'week', 'year',
            'is_weekend', 'month', 'day_of_year', 'is_holiday',
            'y_lag_1', 'y_lag_2', 'y_lag_3', 'y_lag_4', 'y_lag_5', 'y_lag_6', 'y_lag_7',
            'const', 'trend', 's(2,7)', 's(3,7)', 's(4,7)', 's(5,7)', 's(6,7)',
            's(7,7)', 'sin(1,freq=A-DEC)', 'cos(1,freq=A-DEC)', 'sin(2,freq=A-DEC)',
            'cos(2,freq=A-DEC)', 'sin(3,freq=A-DEC)', 'cos(3,freq=A-DEC)',
            'sin(4,freq=A-DEC)', 'cos(4,freq=A-DEC)', 'sin(5,freq=A-DEC)',
            'cos(5,freq=A-DEC)']
    # list.append() returns None and mutates the list, so it must not be used
    # inline as a constructor argument; build the full column list explicitly.
    all_cols = cols + ['num_sold']

    fourier = CalendarFourier(freq="A", order=5)  # 5 sin/cos pairs for "A"nnual seasonality

    # product -> (weekly indicators?, additional annual terms)
    seasonality_by_product = {
        'Kaggle Hat': (False, [fourier]),
        'Kaggle Sticker': (True, []),
        'Kaggle Mug': (True, [fourier]),
    }

    # Same 18 groups, in the same order, as the former hand-written dict.
    groups = [
        (country, store, product)
        for country in ('Finland', 'Sweden', 'Norway')
        for store in ('KaggleRama', 'KaggleMart')
        for product in ('Kaggle Hat', 'Kaggle Sticker', 'Kaggle Mug')
    ]

    train_parts = []
    test_parts = []

    for country, store, product in groups:
        weekly, annual = seasonality_by_product[product]

        part_df = df[(df['country'] == country) & (df['store'] == store)
                     & (df['product'] == product)]
        part_df_test = df_test[(df_test['country'] == country) & (df_test['store'] == store)
                               & (df_test['product'] == product)]

        # Daily period of every training row of this group.
        train_periods = part_df['date'].dt.to_period('D')

        # Daily mean sales indexed by period; only its index is needed to
        # define the deterministic process. ``assign`` returns a new frame,
        # so no slice of ``df`` is written to.
        daily_sales = (
            part_df[['date', 'num_sold']]
            .assign(date=train_periods)
            .set_index('date')
            .sort_index()
            .groupby('date')['num_sold']
            .mean()
        )

        dp = DeterministicProcess(
            index=daily_sales.index,
            constant=True,               # dummy feature for bias (y-intercept)
            order=1,                     # trend (order 1 means linear)
            seasonal=weekly,             # weekly seasonality (indicators)
            additional_terms=annual,     # annual seasonality (fourier)
            drop=True,                   # drop terms to avoid collinearity
        )

        dp_sample = dp.in_sample()
        dp_out_sample = dp.out_of_sample(steps=part_df_test.shape[0])

        # Training rows: look the terms up by each row's own date, so the
        # result does not depend on the rows being sorted or one-per-day.
        train_terms = dp_sample.reindex(pd.PeriodIndex(train_periods))
        train_parts.append(pd.concat(
            [part_df.reset_index(drop=True), train_terms.reset_index(drop=True)],
            axis=1))

        # Test rows: the forecast steps are attached by position.
        test_parts.append(pd.concat(
            [part_df_test.reset_index(drop=True), dp_out_sample.reset_index(drop=True)],
            axis=1))

    def with_expected_columns(frame):
        # Expected columns first (created as NaN when absent), extras after.
        extras = [c for c in frame.columns if c not in all_cols]
        return frame.reindex(columns=all_cols + extras)

    # One concatenation per frame instead of filling cells one at a time.
    X = with_expected_columns(pd.concat(train_parts, axis=0, ignore_index=True))
    X_out = with_expected_columns(pd.concat(test_parts, axis=0, ignore_index=True))

    return X, X_out

#### Fetch all features using above created functions

In [None]:
# Load the raw competition files.
train_df = pd.read_csv('../input/tabular-playground-series-jan-2022/train.csv')
test_df = pd.read_csv('../input/tabular-playground-series-jan-2022/test.csv')

# Calendar / holiday / indicator features (train keeps the num_sold target).
train_df = get_features(train_df, train=True)
test_df = get_features(test_df, train=False)

# Lag features per series; every remaining NaN in these frames is replaced with 0.0.
lags, rev_lags = get_lags(train_df, test_df)
lags = lags.fillna(0.0)
rev_lags = rev_lags.fillna(0.0)

# Trend and seasonal terms; X is the training matrix, X_out the test matrix.
X, X_out = compute_seasonality(lags, rev_lags)

In [None]:
#Sanity check to ensure everything is okay!
# Show the shapes side by side (engineered frame first, source frame second).
print(X.shape, train_df.shape)
print(X_out.shape, test_df.shape)

# Feature engineering must neither drop nor duplicate rows.
assert len(X) == len(train_df), "train rows were lost or duplicated during feature engineering"
assert len(X_out) == len(test_df), "test rows were lost or duplicated during feature engineering"

#### Prepare data for pycaret

In [None]:
# Drop identifier / raw categorical columns that must not be fed to the model.
# errors='ignore' because 'num_sold' and 'index' are only present in X_out as
# by-products of the earlier steps; their absence should not break this cell.
X_out_1 = X_out.drop(['country', 'store', 'product'
                      , 'row_id', 'num_sold', 'index', 'date'], axis=1, errors='ignore')
# X_out_1 = X_out_1.set_index(['date']).sort_index()

# The training frame keeps 'num_sold' because it is the model target.
X_1 = X.drop(['country', 'store', 'product',
              'row_id', 'date'], axis=1)
# X_1 = X_1.set_index(['date']).sort_index()

#### Prepare data for manual algorithms

In [None]:
# # drop unneeded columns, split data to train and validation sets, and set date as an index
# y_train, y_val = X[X['year']<2018]['num_sold'], X[X['year']==2018]['num_sold']

# X_train, X_val = X[X['year']<2018], X[X['year']==2018]
# X_train.drop(['country', 'product', 'store', 'row_id', 'num_sold'], axis=1, inplace=True)
# X_val.drop(['country', 'product', 'store', 'row_id', 'num_sold'], axis=1, inplace=True)

# X_train = X_train.set_index(['date']).sort_index()
# X_val = X_val.set_index(['date']).sort_index()

# # apply same concept to test data
# X_test = X_out.drop(['country', 'product', 'store', 'row_id', 'num_sold', 'index'], axis=1)
# X_test = X_test.set_index(['date']).sort_index()

In [None]:
# replace NAs with zeros

# X_train.fillna(0.0, inplace=True), X_val.fillna(0.0, inplace=True), X_test.fillna(0.0, inplace=True)

# Seasonal terms a series does not use are NaN; the models get 0.0 instead.
# Plain reassignment rather than a tuple of inplace calls, which would echo
# (None, None) as the cell output.
X_1 = X_1.fillna(0.0)
X_out_1 = X_out_1.fillna(0.0)

In [None]:
# # convert targets to ints
# y_train = y_train.astype(str).astype(int)
# y_val = y_val.astype(str).astype(int)

In [None]:
import re
# X_train = X_train.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
# X_val = X_val.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

# Anything that is not a letter, digit or underscore is stripped from the
# column names, e.g. 's(2,7)' becomes 's27'.
_unsafe_column_chars = re.compile('[^A-Za-z0-9_]+')


def _sanitize_column_name(name):
    """Return ``name`` with every run of non [A-Za-z0-9_] characters removed."""
    return _unsafe_column_chars.sub('', name)


X_1 = X_1.rename(columns=_sanitize_column_name)
X_out_1 = X_out_1.rename(columns=_sanitize_column_name)

In [None]:
# This is an aesthetic choice: it removes the many warnings that some functions and commands produce,
# which significantly declutters the notebook.
# NOTE(review): the filter ignores every warning category from this point on,
# including deprecation and numerical warnings that may point at real problems.
import warnings
warnings.filterwarnings('ignore')

In [None]:
%%capture
!pip install pycaret[full]

In [None]:
from pycaret.regression import *

#setting up the pyCaret regression algorithm
# NOTE(review): the rows of X_1 are ordered group by group (country / store /
# product), as produced by compute_seasonality, not chronologically. With
# data_split_shuffle = False the hold-out 25% is therefore the last groups in
# that order rather than the most recent dates — confirm this is the intended
# validation scheme.
# NOTE(review): session_id is commented out, so runs are not seeded.
reg = setup(data = X_1,
            target = 'num_sold',
            train_size = 0.75, #75:25 train/validation split
            normalize = True, #normalisation helps some algorithms
            normalize_method = 'robust', #resilient to outliers
            transform_target = True, #applies transformation to target column
            data_split_shuffle = False, #so that we do not use "future" observations to predict "past" observations
            #create_clusters = True, #adds additional feature by assigning clusters
            feature_interaction = True, #new features are created by interacting (a * b) all the numeric variables in the dataset
#             use_gpu = True, #use GPU acceleration to train models
            silent = True, #removes need for confirmation step
            fold = 15, #number of cross-fold validation folds
#             pca=True, #apply dimentionality reduction to data
#             session_id = 42, #set random seed
#             feature_selection=True, #a subset of features are selected using a combination of various permutation importance techniques 
            n_jobs = -1); #use all processor threads

In [None]:
# List all models available to the PyCaret regression experiment set up above.
models()

In [None]:
# Credit to https://www.kaggle.com/c/web-traffic-time-series-forecasting/discussion/36414
def SMAPE(y_true, y_pred):
    """Symmetric mean absolute percentage error, on a 0-200 scale.

    Computed as ``mean(200 * |y_true - y_pred| / (|y_true| + |y_pred|))``.
    Points where both the true and the predicted value are zero contribute 0.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values of the same shape. Lists, numpy arrays
        and pandas Series are accepted; values are compared by position.

    Returns
    -------
    float
        The mean of the per-point errors.
    """
    # Plain float arrays: positional comparison regardless of any pandas
    # index, and list / scalar inputs work too.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Absolute value on BOTH terms so a negative value cannot shrink (or
    # flip the sign of) the denominator.
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 200.0
    abs_error = np.abs(y_true - y_pred)

    # Divide only where the denominator is non-zero; the remaining entries
    # keep their initial 0.0, and no divide-by-zero warning is raised.
    diff = np.zeros_like(denominator)
    np.divide(abs_error, denominator, out=diff, where=denominator != 0)
    return np.mean(diff)

In [None]:
# Register the SMAPE function defined above with PyCaret under the id and name 'SMAPE';
# greater_is_better = False marks it as an error metric (lower is better).
add_metric('SMAPE', 'SMAPE', SMAPE, greater_is_better = False)

In [None]:
# Compare all available models, ranked by SMAPE, and keep the top N for blending.
N = 3
top = compare_models(sort = 'SMAPE', n_select = N)

In [None]:
# Blend the predictions of the individual top models into one ensemble (a regression task, so
# this is a voting regressor). It is fitted on the training split and then scored on the
# validation (hold-out) split.
blend_voting = blend_models(top)
predict_model(blend_voting);

In [None]:
# Refit the blended model on the training and validation (hold-out) data together, then
# predict on the validation set again.
# NOTE(review): the model has now seen the validation rows, so the score printed by this
# second predict_model call is no longer an out-of-sample estimate.
final_blend_voting = finalize_model(blend_voting)
predict_model(final_blend_voting);

In [None]:
# Predict on the prepared test features and preview the result.
voting_preds = predict_model(final_blend_voting, data=X_out_1)
voting_preds.head()

In [None]:
# Take the predicted values from the 'Label' column, halve them and round to whole units.
# NOTE(review): nothing in this notebook explains the division by 2 — confirm it is intentional.
preds = (voting_preds['Label']/2).round()

In [None]:
submission = pd.read_csv('../input/tabular-playground-series-jan-2022/sample_submission.csv')

# The prediction rows follow the order of X_out, which is grouped by
# country / store / product, while the submission file is ordered by row_id.
# Assigning by position would attach predictions to the wrong rows, so each
# prediction is looked up through the row_id it was computed for.
pred_by_row_id = pd.Series(preds.to_numpy(), index=X_out['row_id'].astype(int).to_numpy())
submission['num_sold'] = submission['row_id'].map(pred_by_row_id)

assert not submission['num_sold'].isna().any(), "some submission rows have no prediction"

submission.to_csv("submission.csv", index=False)

In [None]:
# try:
#     import xgboost
# except ImportError as ex:
#     print("Error: the xgboost library is not installed.")
#     xgboost = None
    
# blend_stack = stack_models(top, meta_model=xgboost.XGBRegressor())
# predict_model(blend_stack);

In [None]:
# # use test and validation to train model and predicts on validation set
# final_blend_stack = finalize_model(blend_stack)
# predict_model(final_blend_stack);

In [None]:
# stack_preds = predict_model(final_blend_stack, data=X_test)
# stack_preds.head()

**Trying Stacking Ensemble Technique, with Ridge, SVR & Random Forest**

In [None]:
# # Applying Stacking Regression to predict values
# from sklearn.linear_model import RidgeCV, HuberRegressor
# # from sklearn.svm import LinearSVR
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.ensemble import StackingRegressor
# from catboost import CatBoostRegressor

# try:
#     import lightgbm as ltb
# except:
#     print("Error: the lightgbm library is not installed.")
#     ltb = None

# try:
#     import xgboost
# except ImportError as ex:
#     print("Error: the xgboost library is not installed.")
#     xgboost = None

    
# estimators = [
#     ('xgb', xgboost.XGBRegressor(random_state=42, n_estimators=1000, learning_rate=0.15, max_depth=4)),
# #     ('ridge', RidgeCV()),
# #     ('hb', HuberRegressor(fit_intercept=False, epsilon=1.20, max_iter=500)),
# #     ('svr', LinearSVR(random_state=42)),
#     ('lgb', ltb.LGBMRegressor(objective='regression', n_estimators=1000, random_state=42)),
#     ('cat', CatBoostRegressor(silent=True)),
#     ('rf', RandomForestRegressor(n_estimators=500,random_state=42))
# ]
# reg = StackingRegressor(
#     estimators=estimators,
#     final_estimator=RidgeCV()
# )

# reg.fit(X_train, y_train).score(X_val, y_val)

**Trying Voting Ensemble Technique, with Ridge, XGBoost, SVR & Random Forest**

In [None]:
# from sklearn.ensemble import VotingRegressor

# estimators = [
#     ('rf', RandomForestRegressor(n_estimators=500,random_state=42)),
# #     ('hb', HuberRegressor(fit_intercept=False, epsilon=1.20, max_iter=500)),
# #     ('svr', LinearSVR(random_state=42)),
#     ('lgb', ltb.LGBMRegressor(objective='regression', n_estimators=1000, random_state=42)),
#     ('cat', CatBoostRegressor(silent=True)),
#     ('xgb_reg', xgboost.XGBRegressor(random_state=42, n_estimators=1000, learning_rate=0.15, max_depth=4))
# ]

# vot_reg = VotingRegressor(estimators)

# vot_reg.fit(X_train, y_train).score(X_val, y_val)

**From above, XGBoost produced the best score, next, I will try out different hyperparameters**

In [None]:
# # Getting best number of estimators
# from sklearn.model_selection import GridSearchCV

# n_estimators = [500, 1500, 1000, 2000]
# learning_rate = [0.01, 0.1, 0.15, 0.05]

# model = xgboost.XGBRegressor(random_state=42)

# parameters = {'learning_rate': learning_rate,
#               'max_depth': [5, 6, 7],
#               'n_estimators': n_estimators}

# xgb_grid = GridSearchCV(model,
#                         parameters,
#                         cv = 2,
#                         n_jobs = -1,
#                         verbose=True)

# grid_result = xgb_grid.fit(X_train, y_train)

# # summarize results
# print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# # means = grid_result.cv_results_['mean_test_score']
# # stds = grid_result.cv_results_['std_test_score']
# # params = grid_result.cv_results_['params']
# # for mean, stdev, param in zip(means, stds, params):
# #     print("%f (%f) with: %r" % (mean, stdev, param))

#### Trying XGBoost alone

In [None]:
# xgb = xgboost.XGBRegressor(random_state=42, n_estimators=1000, learning_rate=0.15, max_depth=5)
# print(xgb.fit(X_train, y_train).score(X_val, y_val))

In [None]:
# preds = xgb.predict(X_test).round()
# preds

In [None]:
# submission = pd.read_csv('../input/tabular-playground-series-jan-2022/sample_submission.csv')

# submission['num_sold'] = preds

# submission.to_csv("submission.csv", index=False)