## 0-Import Library

KFold cross-validation with CatBoost.
Leaderboard (LB) score: 5.94134

In [None]:
import numpy as np
import pandas as pd

## 1-Feature Engineering

You can start feature engineering quickly with the function `feat_eng(df)`.

`df` is the DataFrame loaded from `train.csv` or `test.csv`.

### Data
* country
* store
* product
* (num_sold)
* holiday (By Country)
* year
* dayofyear
* quarter
* month
* dayofmonth (number of days in the month)
* day
* week (day of the week, 0 = Monday)
* GDP_value
* population



In [None]:
# Load train.csv into `df`; the cells below inspect it before any feature engineering.
df = pd.read_csv('../input/tabular-playground-series-jan-2022/train.csv')

In [None]:
# Preview the first rows only instead of rendering the whole frame.
df.head()

In [None]:
# Mean and standard deviation of the raw (untransformed) target column.
mean, std = df['num_sold'].mean(), df['num_sold'].std()

(mean, std)

In [None]:
def feat_eng(df, holiday=None, GDP=None, population=None):
    """Build the model feature table from a raw train/test frame.

    The input frame is NOT modified; a new DataFrame is returned. Calling the
    function repeatedly on the same frame therefore gives the same result.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame with at least the columns ``row_id``, ``date``, ``country``,
        ``store`` and ``product``. ``date`` is compared against the holiday
        table as-is, so both must use the same representation.
    holiday : pandas.DataFrame, optional
        Holiday table with ``date`` and ``country`` columns. Read from the
        Kaggle input directory when omitted.
    GDP : pandas.DataFrame, optional
        Table indexed by year with exactly three value columns. The columns
        are assumed to be ordered Finland, Norway, Sweden (same order as the
        country codes 0, 1, 2). Read from disk when omitted.
    population : pandas.DataFrame, optional
        Same layout as ``GDP``. Read from disk when omitted.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without ``row_id`` and ``date``, with ``country``,
        ``store`` and ``product`` integer-encoded, plus the columns
        ``holiday``, ``year``, ``dayofyear``, ``quarter``, ``month``,
        ``dayofmonth``, ``day``, ``week``, ``GDP_value`` and ``population``.
        Category values that are not in the encoding tables become NaN.
    """
    countries = {'Finland': 0, 'Norway': 1, 'Sweden': 2}
    stores = {'KaggleMart': 0, 'KaggleRama': 1}
    products = {'Kaggle Mug': 0, 'Kaggle Hat': 1, 'Kaggle Sticker': 2}

    # Load the auxiliary tables only when the caller did not supply them.
    if holiday is None:
        holiday = pd.read_csv('../input/public-and-unofficial-holidays-nor-fin-swe-201519/holidays.csv')
    if GDP is None:
        GDP = pd.read_csv('../input/gdp-20152019-finland-norway-and-sweden/GDP_data_2015_to_2019_Finland_Norway_Sweden.csv', index_col="year")
    if population is None:
        population = pd.read_csv('../input/population-20152019-finland-norway-sweden/population_2015-2019_Finland_Norway_Sweden.csv', index_col='year')

    # Work on a copy so the caller's frame keeps its raw columns.
    out = df.copy()

    # A row is a holiday only if its date is a holiday in the row's own country.
    is_holiday = pd.Series(False, index=out.index)
    for country_name in countries:
        country_dates = holiday.loc[holiday['country'] == country_name, 'date']
        is_holiday |= (out['country'] == country_name) & out['date'].isin(country_dates)
    out['holiday'] = is_holiday.astype(int)

    # Calendar features derived from the parsed date.
    dates = pd.to_datetime(out['date'])
    out['year'] = dates.dt.year
    out['dayofyear'] = dates.dt.dayofyear
    out['quarter'] = dates.dt.quarter
    out['month'] = dates.dt.month
    # NOTE: despite its name this column holds the number of days in the month;
    # the day within the month is stored in 'day'.
    out['dayofmonth'] = dates.dt.days_in_month
    out['day'] = dates.dt.day
    # 'week' holds the weekday (Monday == 0), not the week number.
    out['week'] = dates.dt.weekday

    # Integer-encode the categorical columns. `map` yields numeric columns
    # directly instead of relying on `replace` to downcast object columns.
    out['country'] = out['country'].map(countries)
    out['store'] = out['store'].map(stores)
    out['product'] = out['product'].map(products)
    out = out.drop(columns=['row_id', 'date'])

    # Per-(country, year) lookups. Columns are relabelled with the country
    # codes on a copy so the caller's tables are left untouched.
    country_year = out.set_index(['country', 'year']).index
    for feature_name, table in (('GDP_value', GDP), ('population', population)):
        coded = table.copy()
        coded.columns = [0, 1, 2]
        lookup = coded.unstack().to_dict()  # keys: (country_code, year)
        out[feature_name] = country_year.map(lookup.get)

    return out


In [None]:
# Re-read train.csv so that feat_eng below starts from the raw file contents.
df = pd.read_csv('../input/tabular-playground-series-jan-2022/train.csv')

In [None]:
# Engineer the features, then put the target on a natural-log scale
# (predictions are mapped back with np.exp later on).
df_train = feat_eng(df).assign(num_sold=lambda frame: np.log(frame['num_sold']))
df_train['population'].describe()

## 2-CrossValidation & CatBoost

In [None]:
# Target: the (log-transformed) number of units sold.
train_y = df_train.loc[:, 'num_sold']

# Model inputs: every engineered feature, in the order the model is trained on.
train_x = df_train.loc[:, [
    'country', 'store', 'product',
    'holiday',
    'year', 'dayofyear', 'quarter', 'month', 'dayofmonth', 'day', 'week',
    'GDP_value', 'population',
]]

In [None]:
# Preview the feature matrix without rendering every row.
train_x.head()

In [None]:
def SMAPE(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent (0 to 200).

    Computes ``mean(200 * |y_true - y_pred| / (|y_true| + |y_pred|))``
    element-wise by position. Pairs where both values are zero contribute 0.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values of the same (or broadcastable) shape.
        Lists, NumPy arrays and pandas Series are accepted.

    Returns
    -------
    numpy.float64
        The mean SMAPE over all elements.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Use absolute values on BOTH terms; otherwise negative actuals shrink or
    # cancel the denominator and distort the score.
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 200.0
    abs_error = np.abs(y_true - y_pred)

    # Divide only where the denominator is non-zero; the remaining entries
    # keep the 0.0 they were initialised with (no 0/0 warning, no NaN).
    diff = np.divide(abs_error, denominator,
                     out=np.zeros_like(denominator),
                     where=denominator != 0)
    return np.mean(diff)

In [None]:
from sklearn.metrics import log_loss, mean_squared_error
from sklearn.model_selection import KFold,TimeSeriesSplit
from sklearn.linear_model import LinearRegression
import xgboost as xgb
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor


# 5-fold cross-validation with shuffling.
# NOTE(review): shuffled folds mix earlier and later dates in train/validation;
# confirm this is intended for date-ordered data.
kf = KFold(n_splits = 5, shuffle = True, random_state = 70)

# Hyper-parameters are the same for every fold, so build them once.
params = {'depth': 5,
          'learning_rate': 0.001,
          'l2_leaf_reg': 5.0,
          'random_strength': 3.0,
          'min_data_in_leaf': 2}

models = []       # one fitted model per fold
fold_scores = []  # SMAPE per fold, on the original (non-log) scale

# modeling and training
for fold, (tr_idx, va_idx) in enumerate(kf.split(train_x)):
    print(f'--------fold:{fold}--------')
    tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
    tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]

    # eval_metric is evaluated by CatBoost on the target as passed in
    # (train_y, i.e. log-transformed), so the logged values differ from the
    # SMAPE printed below.
    model = CatBoostRegressor(**params,
                              iterations=20000,
                              bootstrap_type='Bayesian',
                              boosting_type='Plain',
                              loss_function='MAE',
                              eval_metric='SMAPE',
                              random_seed=5)
    # Training the model (fit returns the model itself, so nothing is captured).
    model.fit(tr_x,
              tr_y,
              eval_set=[(va_x, va_y)],
              early_stopping_rounds = 200,
              verbose = 1000)
    val_pred = model.predict(va_x)

    # Convert the target back to non-logarithmic before scoring.
    fold_smape = SMAPE(np.exp(va_y), np.exp(val_pred))
    print(f' SMAPE: {fold_smape}')

    models.append(model)
    fold_scores.append(fold_smape)

# `model` stays bound to the last fold's model after the loop.
print(f'mean SMAPE over {len(fold_scores)} folds: {np.mean(fold_scores)}')


## 3-Submission

In [None]:
# Preview the first rows only instead of rendering the whole frame.
df.head()

In [None]:
# Load the raw test set and run it through the same feature pipeline as train.
test_raw = pd.read_csv('../input/tabular-playground-series-jan-2022/test.csv')
test = feat_eng(test_raw)

# Pass exactly the training feature columns, in training order, so the
# prediction does not depend on the column order feat_eng happens to return.
y = model.predict(test[list(train_x.columns)])

# The model predicts log(num_sold); undo the log before writing the file.
df_submission = pd.read_csv('../input/tabular-playground-series-jan-2022/sample_submission.csv')
df_submission['num_sold'] = np.exp(y)
df_submission.to_csv('./submission.csv', index = False)