In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Feature Engineering

You can get started with feature engineering quickly by calling the function `feat_eng(df)`.

`df` is the DataFrame loaded from `train.csv` or `test.csv`.

## Data
* country
* store
* product
* (num_sold)
* holiday (By Country)
* year
* quarter
* month
* day
* week (day of the week, taken from `dt.weekday`: 0 = Monday … 6 = Sunday)


In [None]:
# Read the raw competition training data into a DataFrame.
train_csv_path = '../input/tabular-playground-series-jan-2022/train.csv'
df = pd.read_csv(train_csv_path)

In [None]:
# Display the raw training frame as loaded from train.csv.
df

In [None]:
def feat_eng(df, holidays=None):
    """Turn a raw competition frame into a purely numeric feature frame.

    The input frame is NOT modified; a new DataFrame is returned. (Mutating the
    caller's frame made a second call on the same frame silently produce an
    all-zero ``holiday`` column, because ``country`` and ``date`` had already
    been re-encoded.)

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data with at least the columns ``row_id``, ``date``, ``country``,
        ``store`` and ``product``. Any other column (e.g. ``num_sold``) is
        passed through untouched.
    holidays : pandas.DataFrame, optional
        Holiday table with ``country`` and ``date`` columns. When omitted it is
        read from the public-holidays CSV of the Kaggle input directory.
        ``date`` values are compared to ``df['date']`` as-is, so both columns
        must use the same representation (e.g. the same string format).

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without ``row_id`` and ``date``, with ``country``,
        ``store`` and ``product`` integer-encoded and the extra columns
        ``holiday`` (1 when the row's date is a holiday in the row's own
        country, else 0), ``year``, ``quarter``, ``month``, ``day`` and
        ``week`` (day of the week from ``dt.weekday``) appended.
        Labels missing from the encoding tables become NaN.
    """
    countries = {'Finland': 1, 'Norway': 2, 'Sweden': 3}
    stores = {'KaggleMart': 1, 'KaggleRama': 2}
    products = {'Kaggle Mug': 1,'Kaggle Hat': 2, 'Kaggle Sticker': 3}

    # load holiday info.
    if holidays is None:
        holidays = pd.read_csv('../input/public-and-unofficial-holidays-nor-fin-swe-201519/holidays.csv')

    # Work on a copy so the caller's frame stays usable (and re-runnable).
    out = df.copy()

    # A row is a holiday only if its date is listed for its *own* country.
    is_holiday = pd.Series(False, index=out.index)
    for country in countries:
        country_dates = holidays.loc[holidays['country'] == country, 'date']
        is_holiday = is_holiday | ((out['country'] == country) & out['date'].isin(country_dates))
    out['holiday'] = is_holiday.astype(int)

    # Calendar features.
    dates = pd.to_datetime(out['date'])
    out['year'] = dates.dt.year
    out['quarter'] = dates.dt.quarter
    out['month'] = dates.dt.month
    out['day'] = dates.dt.day
    out['week'] = dates.dt.weekday

    # Integer-encode the categoricals. ``map`` yields numeric columns directly
    # instead of relying on ``replace`` to downcast an object column.
    out['country'] = out['country'].map(countries)
    out['store'] = out['store'].map(stores)
    out['product'] = out['product'].map(products)

    return out.drop(columns=['row_id', 'date'])

In [None]:
# Re-read the raw training CSV so the next cell starts from a fresh, unmodified frame.
df = pd.read_csv('../input/tabular-playground-series-jan-2022/train.csv')

In [None]:
# Build the engineered training frame (numeric features plus the num_sold target).
df_train = feat_eng(df)

In [None]:
# Display the engineered training frame.
df_train

In [None]:
# Separate the engineered frame into model inputs and the regression target.
feature_columns = [
    'country', 'store', 'product',
    'holiday',
    'year', 'quarter', 'month', 'day', 'week',
]
train_x = df_train[feature_columns]
train_y = df_train['num_sold']

In [None]:
# Quick check: print the Python type of the feature matrix.
print(type(train_x))

In [None]:
# Display the feature matrix that is fed to the model.
train_x

In [None]:
# Quick check: print the Python type of the target vector.
print(type(train_y))

In [None]:
def SMAPE(y_true, y_pred):
    """Symmetric mean absolute percentage error, on a 0-200 scale.

    Computes ``mean(|y_true - y_pred| / ((|y_true| + |y_pred|) / 2)) * 100``.
    Positions where both values are 0 contribute 0 to the mean.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values (lists, NumPy arrays or pandas Series) of
        matching shape. They are compared by position; a Series index is ignored.

    Returns
    -------
    numpy.float64
        The mean SMAPE over all elements.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Absolute value on BOTH terms: without it a negative actual can cancel the
    # prediction and wrongly zero the denominator.
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 200.0
    abs_error = np.abs(y_true - y_pred)

    # Divide only where the denominator is non-zero; the pre-filled zeros cover
    # the 0/0 case without triggering a divide-by-zero RuntimeWarning.
    diff = np.divide(abs_error, denominator,
                     out=np.zeros_like(denominator),
                     where=denominator != 0)
    return np.mean(diff)

In [None]:
# Random seed handed to XGBoost through the 'seed' entry of the parameter dict.
SEED   = 42

In [None]:
from sklearn.metrics import log_loss, mean_squared_error
from sklearn.model_selection import KFold
import xgboost as xgb
from xgboost import XGBRegressor

# The hyper-parameters are the same for every fold, so build the dict once.
xgboost_params = {'eta'              : 0.1,
                  'n_estimators'     : 16384,
                  'max_depth'        : 8,
                  'max_leaves'       : 256,
                  'colsample_bylevel': 0.75,
                  'colsample_bytree' : 0.75,
                  'subsample'        : 0.75, # XGBoost would randomly sample 'subsample_value' of the training data prior to growing trees
                  'min_child_weight' : 512,
                  'min_split_loss'   : 0.002,
                  'alpha'            : 0.08,
                  'lambda'           : 128,
                  'objective'        : 'reg:squarederror',
                  'eval_metric'      : 'rmse', # Originally using RMSE, trying new functions...
                  'seed'             : SEED
                  }

# 4-fold shuffled cross-validation; each fold trains a fresh model with early
# stopping on the held-out part and reports SMAPE on that part.
# Note: `model` keeps pointing at the LAST fold's estimator after the loop.
kf = KFold(n_splits = 4, shuffle = True, random_state = 71)
fold_scores = []
for fold, (tr_idx, va_idx) in enumerate(kf.split(train_x), start=1):
    print(f'--------fold:{fold}--------')
    tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
    tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]

    model = XGBRegressor(**xgboost_params)

    # NOTE(review): recent XGBoost releases expect `early_stopping_rounds` on the
    # estimator constructor rather than on fit() - confirm against the installed version.
    model.fit(tr_x,
              tr_y,
              eval_set=[(va_x, va_y)],
              early_stopping_rounds = 250,
              verbose = 500)
    val_pred = model.predict(va_x)

    fold_score = SMAPE(va_y, val_pred)
    fold_scores.append(fold_score)
    print(f' SMAPE: {fold_score}')

# Summary across folds.
print(f'mean SMAPE over {len(fold_scores)} folds: {np.mean(fold_scores)}')

In [None]:
# Load the raw competition test data.
test = pd.read_csv('../input/tabular-playground-series-jan-2022/test.csv')

In [None]:
# Apply the same feature engineering as for the training data (rebinds `test`).
test = feat_eng(test)

In [None]:
# Display the engineered test frame.
test

In [None]:
# Predict with the model left over from the cross-validation loop. Selecting
# train_x.columns makes the feature set and its order explicitly match what the
# model was fitted on, instead of relying on feat_eng's output layout.
y = model.predict(test[train_x.columns])

In [None]:
# Load the sample submission; it serves as the template for the output file.
a =pd.read_csv('../input/tabular-playground-series-jan-2022/sample_submission.csv') 

In [None]:
# Overwrite the template's target column with the model predictions.
a['num_sold'] = y

In [None]:
# Write the submission file, leaving out the DataFrame index.
a.to_csv('./submission.csv', index = False)