# Install requirements (pip install)

In [None]:
!pip install scikit-learn

# Import all libs

In [None]:
import numpy as np
import pandas as pd

import sklearn.metrics as metrics
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor

# Load the data and analyze it

In [None]:
# Training set of the competition; the cells below read its
# country / store / product / date / num_sold columns.
TRAIN_CSV = '/kaggle/input/tabular-playground-series-jan-2022/train.csv'
data = pd.read_csv(TRAIN_CSV)

In [None]:
# Quick look at the first rows of the raw training frame.
data.head(n=5)

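> There are 18 separate time series, one per (country, store, product) combination (3 × 2 × 3). Within each series the **[country, store, product]** columns can be dropped, because they only identify which series a row belongs to.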

> Calendar features are extracted from the date using **pandas.Series.dt**.




# Define helper functions for computing and printing metrics

In [None]:
def regression_results(y_true, y_pred):
    """Print a set of regression metrics for one batch of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values, aligned element-wise with ``y_true``.

    Returns
    -------
    None
        Nothing is returned; each metric is printed rounded to 4 decimals.
        ``mean_squared_log_error`` is printed as ``nan`` when scikit-learn
        rejects the inputs for that metric (e.g. negative predictions).
    """
    explained_variance = metrics.explained_variance_score(y_true, y_pred)
    mae = metrics.mean_absolute_error(y_true, y_pred)
    mse = metrics.mean_squared_error(y_true, y_pred)
    r2 = metrics.r2_score(y_true, y_pred)

    # MSLE is only defined on a restricted value range and scikit-learn raises
    # ValueError outside it. The metrics above already validated shapes, so a
    # ValueError here means the values are out of range; report NaN for this
    # one metric instead of losing the whole report.
    try:
        msle = metrics.mean_squared_log_error(y_true, y_pred)
    except ValueError:
        msle = float('nan')

    print('explained_variance: ', round(explained_variance, 4))
    print('mean_squared_log_error: ', round(msle, 4))
    print('r2: ', round(r2, 4))
    print('MAE: ', round(mae, 4))
    print('MSE: ', round(mse, 4))
    print('RMSE: ', round(np.sqrt(mse), 4))

In [None]:
def rmse(actual, predict):
    """Return the root-mean-squared error between ``actual`` and ``predict``.

    Both arguments are converted to NumPy arrays; the score is the square
    root of the mean of the squared element-wise differences.
    """
    residuals = np.asarray(predict) - np.asarray(actual)
    return np.sqrt(np.mean(residuals ** 2))
# Wrap rmse as a scikit-learn scorer. greater_is_better=False marks it as a
# loss, so the scorer reports the negated RMSE (larger is better in a search).
rmse_score = make_scorer(rmse, greater_is_better=False)

# Train model

I train 18 stacking ensembles (one per time series), each built from:


*   **RandomForestRegressor**
*   **MLPRegressor**
*   **GradientBoostingRegressor**

*Final estimator:* **RidgeCV**

In [None]:
# Train one stacking ensemble per (country, store, product) series.
# Years up to and including 2017 are used for fitting; 2018 is held out and
# scored with regression_results. Fitted models are appended to `models` in
# loop order; the prediction cell relies on that order.
NUM_SEED = 23
models = []
for country in ['Finland', 'Norway', 'Sweden']:
    for store in ['KaggleMart', 'KaggleRama']:
        for product in ['Kaggle Mug', 'Kaggle Hat', 'Kaggle Sticker']:
            series_mask = (
                (data['country'] == country)
                & (data['store'] == store)
                & (data['product'] == product)
            )
            # .copy() gives an independent frame, so adding feature columns
            # neither touches `data` nor triggers SettingWithCopyWarning.
            df = data.loc[series_mask, ['date', 'num_sold']].copy()
            df['date'] = pd.to_datetime(df['date'])
            df['year'] = df.date.dt.year
            df['month'] = df.date.dt.month
            df['day'] = df.date.dt.day
            df['dayofyear'] = df.date.dt.dayofyear
            df['week'] = df.date.dt.isocalendar().week
            df['dayofweek'] = df.date.dt.dayofweek
            # Column name kept as-is: the prediction cell builds the same feature names.
            df['quater'] = df.date.dt.quarter
            df = df.dropna().set_index('date')

            # Partial-string date selection must go through .loc:
            # df['2018'] is a column lookup and raises KeyError on pandas >= 2.0.
            train_part = df.loc[:'2017']
            holdout_part = df.loc['2018']
            X_train = train_part.drop(columns='num_sold')
            y_train = train_part['num_sold']
            X_test = holdout_part.drop(columns='num_sold')
            y_test = holdout_part['num_sold']

            estimators = [
                ('rf', RandomForestRegressor(n_estimators=100, n_jobs=-1, max_depth=10,
                                             random_state=NUM_SEED)),
                ('mlp', MLPRegressor(hidden_layer_sizes=(300,), max_iter=5000,
                                     random_state=NUM_SEED)),
                ('gbr', GradientBoostingRegressor(learning_rate=1e-2, loss='absolute_error',
                                                  max_depth=10, n_estimators=300,
                                                  random_state=NUM_SEED)),
            ]
            model = StackingRegressor(estimators=estimators)
            # Single-point grid: the search is used only to get a time-ordered
            # cross-validation score and a model refit on the full training span.
            param_search = {
                'cv': [5]
            }
            tscv = TimeSeriesSplit(n_splits=10)
            gsearch = GridSearchCV(estimator=model, cv=tscv, param_grid=param_search,
                                   scoring=rmse_score, n_jobs=-1)
            gsearch.fit(X_train, y_train)
            best_model = gsearch.best_estimator_
            models.append(best_model)

            y_true = y_test.values
            y_pred = best_model.predict(X_test)
            print(f'''Res-ts for {country}, {store}, {product}''')
            # rmse_score is negated (greater_is_better=False), so flip the sign back.
            print('CV RMSE: ', round(-gsearch.best_score_, 4))
            regression_results(y_true, y_pred)

# Predict on the test set

In [None]:
# Predict num_sold for every test row with the per-series models and write
# the submission file. `models` must hold one fitted model per series, in the
# same (country, store, product) loop order used during training.
countries = ['Finland', 'Norway', 'Sweden']
stores = ['KaggleMart', 'KaggleRama']
products = ['Kaggle Mug', 'Kaggle Hat', 'Kaggle Sticker']

expected_models = len(countries) * len(stores) * len(products)
assert len(models) == expected_models, (
    f'expected {expected_models} fitted models, found {len(models)}; '
    're-run the training cell'
)

# Same input directory as the training data.
datatest = pd.read_csv('/kaggle/input/tabular-playground-series-jan-2022/test.csv')
model_iter = iter(models)
for country in countries:
    for store in stores:
        for product in products:
            series_mask = (
                (datatest['country'] == country)
                & (datatest['store'] == store)
                & (datatest['product'] == product)
            )
            # .copy() avoids assigning feature columns onto a slice of datatest.
            df = datatest.loc[series_mask, ['date']].copy()
            df['date'] = pd.to_datetime(df['date'])
            df['year'] = df.date.dt.year
            df['month'] = df.date.dt.month
            df['day'] = df.date.dt.day
            df['dayofyear'] = df.date.dt.dayofyear
            df['week'] = df.date.dt.isocalendar().week
            df['dayofweek'] = df.date.dt.dayofweek
            df['quater'] = df.date.dt.quarter
            df = df.dropna()
            # Keep datatest's row index (instead of indexing by date) so the
            # predictions can be written back to exactly the rows they belong to.
            X_test = df.drop(columns='date')
            model = next(model_iter)
            y_pred = model.predict(X_test)
            datatest.loc[X_test.index, 'num_sold'] = y_pred

# Every test row must have received a prediction before the file is written.
assert datatest['num_sold'].notna().all(), 'some test rows have no prediction'
datatest[['row_id', 'num_sold']].to_csv('submission.csv', index=False)

In [None]:
# Preview of the two columns that were written to submission.csv.
datatest.loc[:, ['row_id', 'num_sold']]