# 二値分類やる
    
## Base Model

LightGBM : https://www.kaggle.com/girmdshinsei/for-japanese-beginner-with-wrmsse-in-lgbm

## Custom Objective

LightGBM : https://www.kaggle.com/girmdshinsei/for-japanese-beginner-with-wrmsse-in-lgbm

## Features

https://www.kaggle.com/kyakovlev/m5-simple-fe  
https://www.kaggle.com/kyakovlev/m5-custom-features  
https://www.kaggle.com/kyakovlev/m5-lags-features

## Consideration of Evaluation Function

### RMSSE
$$
RMSSE = \frac{\sqrt{\frac{1}{h}\sum_{t=n+1}^{n+h}(Y_t - \hat{Y}_t)^2}}{\sqrt{\frac{1}{n-1}\sum_{t=2}^{n}(Y_t - Y_{t-1})^2}} = \frac{RMSE(\mathrm{model})}{RMSE(\mathrm{naive\ model})}
$$
これはナイーブな予測モデル$\hat{Y}_t = Y_{t-1}$のRMSEと、実際の予測モデルのRMSEの比とみなせる。
This can be regarded as the ratio of the RMSE of our prediction model to the RMSE of the naive prediction model $ \hat{Y}_t = Y_{t-1} $.


RMSSEを採用する理由は以下である。
- ゼロが多いため、単にRMSEを指標とするとゼロを予測するモデルのスコアが低くなる。したがって予測モデルがナイーブなモデルと比較し、どれだけ優れているかという指標を用いる。
- scale independent
- ゼロ割の心配がない
- 正方向の誤差と負方向の誤差が対称に見積もられる


RMSSEの代わりに上記の条件を満たす以下を定義し用いる：
We define a new RMSSE that satisfies the conditions above and use it in place of RMSSE:
$$
RMSSE\_AllDAY = \frac{\sqrt{\frac{1}{h}\sum_{day\_in\_validation}(Y_t - \hat{Y}_t)^2}}{\sqrt{\frac{1}{n-1}\sum_{fixed\_section}(Y_t - Y_{t-1})^2}} = \frac{\sqrt{\frac{1}{h}\sum_{day\_in\_validation}(Y_t - \hat{Y}_t)^2}}{\sqrt{\frac{1}{n-1}\sum_{t = 2}^{1913}(Y_t - Y_{t-1})^2}}
$$
RMSSE\_AllDAYはある固定区間（fixed\_section）のデータからナイーブモデルのRMSEを計算する。

## import

In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import dask.dataframe as dd
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import lightgbm as lgb
import optuna.integration.lightgbm as lgb_optuna
#import dask_xgboost as xgb
#import dask.dataframe as dd6
from sklearn import preprocessing, metrics
from sklearn.preprocessing import LabelEncoder
import gc
import os
from tqdm import tqdm_notebook as tqdm
from scipy.sparse import csr_matrix
import pickle

# Print the full path of every file found anywhere under /kaggle/input.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to narrower dtypes, in place.

    Every column whose dtype is one of int16/int32/int64/float16/float32/
    float64 is converted to the narrowest candidate dtype whose representable
    range contains the column's minimum and maximum (bounds inclusive).
    Columns of any other dtype are left untouched.

    The choice is made on the value range only, so float columns moved to
    float16/float32 keep their range but may lose precision.

    Args:
        df: DataFrame to shrink; its columns are reassigned on this object.
        verbose: When true, print the final size in Mb and the percentage saved.

    Returns:
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a value equal to a dtype's limit still fits in it.
            # If min/max are NaN (empty column) no candidate matches and the
            # column keeps its dtype.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out-of-range, infinite or NaN min/max: fall back to float64.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero starting size.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

## Use Files

In [None]:
# Map each store id to the pickle file holding that store's data frame.
PATHS = {
    sid: '/kaggle/input/m5-all-data/df_' + sid + '.pkl'
    for sid in ('CA_1', 'CA_2', 'CA_3', 'CA_4',
                'TX_1', 'TX_2', 'TX_3',
                'WI_1', 'WI_2', 'WI_3')
}

## Define Features

In [None]:
# Name of the label column the binary model is trained on.
TARGET = 'sales_binary'

# Candidate feature groups; which groups are used is decided by the use_* flags below.
basic_features = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 
        'release', 'sell_price', 'price_max', 'price_min', 'price_std',
       'price_mean', 'price_norm', 'price_nunique', 'item_nunique',
       'price_momentum', 'price_momentum_m', 'price_momentum_y',
       'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
       'snap_CA', 'snap_TX', 'snap_WI', 'tm_d', 'tm_w', 'tm_m', 'tm_y',
       'tm_wm', 'tm_dw', 'tm_w_end']

encoding_features = [
       'te_id_60', 'te_item_id_60',
       'te_dept_id_60', 'te_cat_id_60', 'te_store_id_60', 'te_state_id_60',
       'te_id_tm_dw_60', 'te_item_id_tm_dw_60', 'te_dept_id_tm_dw_60',
       'te_cat_id_tm_dw_60', 'te_store_id_tm_dw_60', 'te_state_id_tm_dw_60'
]

lag_features = [
        'sales_lag_28','sales_lag_29', 'sales_lag_30', 'sales_lag_31', 'sales_lag_32',
        'sales_lag_33', 'sales_lag_34', 'sales_lag_35', 'sales_lag_36','sales_lag_37',
        'sales_lag_38', 'sales_lag_39', 'sales_lag_40','sales_lag_41', 'sales_lag_42', 
        'rolling_mean_7', 'rolling_std_7','rolling_mean_14', 'rolling_std_14',
        'rolling_mean_30','rolling_std_30', 'rolling_mean_60', 'rolling_std_60',
        'rolling_mean_180', 'rolling_std_180'
]

recursive_features =[
    'rolling_mean_tmp_1_7','rolling_mean_tmp_1_14',
    'rolling_mean_tmp_1_30','rolling_mean_tmp_1_60',
    'rolling_mean_tmp_7_7','rolling_mean_tmp_7_14',
    'rolling_mean_tmp_7_30','rolling_mean_tmp_7_60',
    'rolling_mean_tmp_14_7','rolling_mean_tmp_14_14',
    'rolling_mean_tmp_14_30','rolling_mean_tmp_14_60'
]

additional_features = ['binary_pred']

# Columns dropped from the final list even when their group is enabled.
remove_features = ['store_id', 'state_id', 'te_store_id_60', 'te_state_id_60', 'te_store_id_tm_dw_60', 'te_state_id_tm_dw_60','snap_CA', 'snap_TX', 'snap_WI']


use_enc_feat = True
use_lag_feat = True
use_rec_feat = False
use_add_feat = False

# Start from a copy: `feature += ...` extends the list in place, so without the
# copy the optional groups would be appended to basic_features itself.
feature = list(basic_features)
if use_enc_feat:
    feature += encoding_features
if use_lag_feat:
    feature += lag_features
if use_rec_feat:
    feature += recursive_features
if use_add_feat:
    feature += additional_features

feature = [name for name in feature if name not in remove_features]

# Notebook cell output: number of features actually used.
len(feature)

## Model

In [None]:
def run_lgb_binary(train, val, features, no_early_stopping = False):
    """Train a LightGBM binary classifier on ``train`` and return the booster.

    The model uses LightGBM's built-in ``binary`` objective with ``auc`` as
    the evaluation metric. The label is read from the column named by the
    module-level ``TARGET``.

    Args:
        train: Frame holding the ``features`` columns and the ``TARGET`` column.
        val: Frame with the same columns, used as the 'Val' evaluation set.
        features: Column names passed to the model as inputs.
        no_early_stopping: When true, train for a fixed 200 rounds and evaluate
            on the training set only. Otherwise train for up to 1500 rounds
            with ``early_stopping_rounds=100``, evaluating on both 'Train' and
            'Val'.

    Returns:
        The object returned by ``lgb.train``.
    """
    lgb_params = {
        'boosting_type': 'gbdt',
        'first_metric_only': True,
        'objective': 'binary',
        'metric': 'auc',
        'n_jobs': -1,
        'seed': 42,
        'learning_rate': 0.1,
        'bagging_fraction': 0.75,
        'bagging_freq': 10,
        'colsample_bytree': 0.75,
    }

    # Build each Dataset, then drop the local reference to its source frame.
    dtrain = lgb.Dataset(train[features], train[TARGET].values)
    del train
    gc.collect()

    dval = lgb.Dataset(val[features], val[TARGET].values)
    del val
    gc.collect()

    if no_early_stopping:
        fit_kwargs = dict(num_boost_round=200,
                          valid_sets=[dtrain], valid_names=['Train'])
    else:
        fit_kwargs = dict(num_boost_round=1500, early_stopping_rounds=100,
                          valid_sets=[dtrain, dval], valid_names=['Train', 'Val'])

    return lgb.train(lgb_params, dtrain, verbose_eval=10, **fit_kwargs)


## Preparation

### Training Parameters

In [None]:
test_start_date = 1970
test_end_date = 1970

train_width_date = 365
val_width_date = 27
shift_width_date = 28
min_train_date = 0

# Build [start_date, split_date, end_date] triples, newest window first.
# Each window ends shift_width_date days before the previous one; generation
# stops at the first window whose start would fall before min_train_date.
slide_list = []
for window_end in range(test_start_date - 1, 1, -shift_width_date):
    window_split = window_end - val_width_date
    window_start = window_split - train_width_date
    if window_start < min_train_date:
        break
    slide_list.append([window_start, window_split, window_end])

In [None]:
slide_list

## Make Parameter Grid

## Training

In [None]:
# Collect the distinct store ids in order of first appearance in the sales file.
# Only the store_id column is used here, so only that column is parsed.
product = pd.read_csv('/kaggle/input/m5-forecasting-accuracy/sales_train_validation.csv',
                      usecols=['store_id'])
STORE_IDS = list(product.store_id.unique())
del product
gc.collect()

In [None]:
# Load the te_60 table; the training loop left-joins it onto each store frame on ['id', 'd'].
# NOTE(review): unpickling can execute arbitrary code, so only load this file from a trusted source.
te2 = pd.read_pickle('../input/m5-target-encoding2/te_60.pkl')

In [None]:
te2.columns

In [None]:
# Column names with the `_28` suffix.
# NOTE(review): not referenced anywhere in the visible part of this notebook.
add_feat = [
    'te_id_28',
    'te_item_id_28',
    'te_dept_id_28',
    'te_cat_id_28',
    'te_store_id_28',
    'te_state_id_28',
    'te_id_tm_dw_28',
    'te_item_id_tm_dw_28',
    'te_dept_id_tm_dw_28',
    'te_cat_id_tm_dw_28',
    'te_store_id_tm_dw_28',
    'te_state_id_tm_dw_28',
]

In [None]:
gc.collect()

In [None]:
# For each of the first two stores: train one binary model per sliding window,
# predict that window's validation rows, and save all predictions to
# 'binary_pred_<store_id>.pkl'.
for store_id in STORE_IDS[0:2]:
    # One (id, d, binary_pred) frame per window; concatenated once after the loop
    # instead of growing a frame with pd.concat on every iteration.
    window_preds = []
    # The first window in slide_list is trained without early stopping.
    first = True
    for start_date, split_date, end_date in tqdm(slide_list):
        print('start_date, split_date, end_date:',start_date, split_date, end_date)
        print('store_id:',store_id)
        print('load dataset')
        # NOTE(review): the store frame is re-read and re-merged for every window and
        # released before training, presumably to cap peak memory. Confirm before hoisting.
        df = pd.read_pickle(PATHS[store_id])
        # Label is 0 where sales == 0 and 1 otherwise. Rows whose sales is NaN
        # compare unequal to 0 and are therefore labelled 1.
        df[TARGET] = np.where(df.sales==0,0,1)
        df = df.merge(te2,on=['id', 'd'],how='left')

        # Training rows: start_date <= d < split_date.
        train = df[(df.d>=start_date)&(df.d<split_date)]
        train_ids = train.id.unique()

        # Validation rows: split_date <= d <= end_date, restricted to ids seen in training.
        val = df[(df.d>=split_date)&(df.d<=end_date)]
        val = val[val.id.isin(train_ids)]
        del df
        gc.collect()

        model = run_lgb_binary(train, val, feature, first)
        pred = model.predict(val[feature])
        # assign() returns a new frame, so nothing is written onto a slice of val.
        window_preds.append(val[['id','d']].assign(binary_pred=pred))
        first = False

    if window_preds:
        df_binary = pd.concat(window_preds)
    else:
        # No windows: keep the empty, correctly-labelled frame.
        df_binary = pd.DataFrame([],columns = ['id','d','binary_pred'])

    df_binary = reduce_mem_usage(df_binary)
    df_binary.to_pickle('binary_pred_' + store_id + '.pkl')
    
    
    