## Credits:
*  We used the notebook: https://www.kaggle.com/rohitsingh9990/m5-lgbm-fe as a baseline for this notebook.
*  WRMSSE evaluation-metric code from: https://www.kaggle.com/girmdshinsei/for-japanese-beginner-with-wrmsse-in-lgbm/notebook

In [None]:
import pandas as pd
import os
import numpy as np
import dask.dataframe as dd
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn import preprocessing, metrics
import gc
import joblib
import warnings
from sklearn.neighbors import KNeighborsRegressor
warnings.filterwarnings('ignore')
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
import datetime
from sklearn.metrics import mean_squared_error
from scipy.sparse import csr_matrix

In [None]:
# Directory holding the pickled feature files (Features1.pkl .. Features5.pkl)
# that are loaded further down; the trailing slash is required because the
# file name is appended by plain string formatting.
INPUT_DIR_PATH = '../input/mlip-daemencloudt-feature-computation-notebook/'

# Functions

In [None]:
def print_dir(path):
    """Print the full path of every file found underneath ``path``.

    The directory tree is walked recursively with ``os.walk``; one path is
    printed per line. Nothing is returned.
    """
    for folder, _subfolders, names in os.walk(path):
        full_paths = (os.path.join(folder, name) for name in names)
        for full_path in full_paths:
            print(full_path)


### Functions for val/train split

In [None]:
def train_test_split(data):
    """Split ``data`` by its ``date`` column into train, validation and test.

    * train:      dates up to and including 2016-03-27
    * validation: dates after 2016-03-27 up to and including 2016-04-24
    * test:       dates after 2016-04-24

    Returns ``(x_train, y_train, x_val, y_val, test)`` where the ``y_*``
    values are the ``demand`` column of the matching ``x_*`` frame.
    """
    dates = data['date']

    x_train = data[dates <= '2016-03-27']
    x_val = data[(dates > '2016-03-27') & (dates <= '2016-04-24')]
    test = data[dates > '2016-04-24']

    y_train = x_train['demand']
    y_val = x_val['demand']

    # Only drops this function's reference; the caller still owns its own.
    del data
    gc.collect()

    return x_train, y_train, x_val, y_val, test

### Functions for running LightGBM

In [None]:
def run_lgb(x_train, y_train, x_val, y_val, test):
    """Train a LightGBM Tweedie model and write its predictions into ``test``.

    Reads two module-level names: ``features`` (the list of columns fed to
    the model) and ``wrmsse`` (the custom evaluation function).

    Parameters
    ----------
    x_train, y_train : training frame and its ``demand`` target.
    x_val, y_val     : validation frame and target, used for early stopping.
    test             : frame to predict; its ``demand`` column is overwritten
                       in place with the model output.

    Returns
    -------
    (test, model) : the same ``test`` frame with ``demand`` filled in, and
                    the trained booster.
    """
    params = {
        # No built-in metric: the score comes from the `feval` callback below.
        'metric': 'custom',
        'objective': 'tweedie',
        'tweedie_variance_power': 1.1,
        'seed': 20,
        'learning_rate': 0.1,
        'num_iterations': 2000,
        'max_bin': 500,
        'min_data_in_leaf': 100,
        # Was spelled 'feature fraction' (with a space), which is not the
        # LightGBM parameter name, so column subsampling was never applied.
        'feature_fraction': 0.5,
        'num_leaves': 100,
        'bagging_fraction': 0.3,
        'bagging_freq': 1,
    }

    evals_result = {}

    train_set = lgb.Dataset(x_train[features], y_train)
    val_set = lgb.Dataset(x_val[features], y_val)

    # Drop the local references to the training frames before training.
    del x_train, y_train

    # NOTE(review): `early_stopping_rounds`, `verbose_eval` and `evals_result`
    # are keyword arguments of older LightGBM releases; newer releases expect
    # the equivalent callbacks instead - confirm against the installed version.
    model = lgb.train(
        params,
        train_set,
        early_stopping_rounds=250,
        valid_sets=[train_set, val_set],
        verbose_eval=10,
        feval=wrmsse,
        evals_result=evals_result,
    )

    # Learning curves of the custom metric on both evaluation sets.
    lgb.plot_metric(evals_result)
    plt.show()

    # Predict with the best iteration found by early stopping.
    test['demand'] = model.predict(test[features], num_iteration=model.best_iteration)
    gc.collect()
    return test, model

In [None]:
def rmse(predictions, y):
    """Return the root mean squared error between ``predictions`` and ``y``."""
    mse = mean_squared_error(y, predictions)
    return np.sqrt(mse)

### Function for the submission

In [None]:
def predict(test, submission):
    """Write ``submission_new.csv`` from the predicted ``demand`` in ``test``.

    ``test`` is pivoted to one row per ``id`` with the dates as columns,
    which are then renamed ``F1`` .. ``F28``. Those rows are joined onto the
    ids of ``submission``; the rows of ``submission`` whose id contains
    ``'evaluation'`` are appended unchanged. Nothing is returned.
    """
    wide = (
        test[['id', 'date', 'demand']]
        .pivot(index='id', columns='date', values='demand')
        .reset_index()
    )
    wide.columns = ['id'] + [f'F{day}' for day in range(1, 29)]

    submission_ids = submission['id']
    evaluation_ids = [sid for sid in submission_ids if 'evaluation' in sid]
    evaluation_part = submission[submission_ids.isin(evaluation_ids)]

    # Inner merge: only ids that have predictions are kept here.
    validation_part = submission[['id']].merge(wide, on='id')

    combined = pd.concat([validation_part, evaluation_part])
    combined.to_csv('submission_new.csv', index=False)


## Global variables

In [None]:
# Columns fed to the model, grouped by kind.
features = [
    # identifiers, calendar events, SNAP flags and price
    "item_id",
    "dept_id",
    "cat_id",
    "store_id",
    "state_id",
    "event_name_1",
    "event_type_1",
    "snap_CA",
    "snap_TX",
    "snap_WI",
    "sell_price",
    # demand features
    "shift_t28",
    "rolling_mean_28_7",
    "rolling_mean_28_28",
    # price features
    "price_change_t1",
    # time features
    "year",
    "month",
    "dayofweek",
    "before_holiday",
    "after_holiday",
]

print(f'length: {len(features)}')
print(features)

# Main part

### Loading the preprocessed data from our feature computation notebook: https://www.kaggle.com/hingencity/mlip-daemencloudt-feature-computation-notebook 

In [None]:
# Load the five pickled feature chunks (Features1.pkl .. Features5.pkl) and
# stack them row-wise. All chunks are read first and concatenated once, so the
# accumulated frame is not re-copied on every iteration.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
feature_parts = [
    pd.read_pickle(f'{INPUT_DIR_PATH}Features{i}.pkl') for i in range(1, 6)
]
data = pd.concat(feature_parts)
del feature_parts  # the chunks are no longer needed once combined
data.shape

# Japanese WRMSSE
The WRMSSE code below is taken from the "for Japanese beginner" notebook (https://www.kaggle.com/girmdshinsei/for-japanese-beginner-with-wrmsse-in-lgbm/notebook) and is used to add a WRMSSE evaluation metric to the LightGBM training.

In [None]:
# Raw competition files: the wide sales table and the sample submission.
sales_train_val = pd.read_csv('/kaggle/input/m5-forecasting-accuracy/sales_train_validation.csv')
submission = pd.read_csv('/kaggle/input/m5-forecasting-accuracy/sample_submission.csv')

# One row of the sales table per series.
NUM_ITEMS = len(sales_train_val)

# Forecast horizon: every submission column except the leading `id`.
DAYS_PRED = len(submission.columns) - 1  # 28

# Identifier columns of each series, used to build the aggregation levels.
product = sales_train_val.loc[
    :, ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
].drop_duplicates()


In [None]:
# Aggregation matrix: one row per aggregated series, one column per bottom
# level series; entry is 1 when the bottom series belongs to the aggregate.
# The blocks are stacked in level order, so the LAST NUM_ITEMS rows are the
# bottom-level series themselves (identity block).
#
# The ones/identity blocks are allocated directly as int8: building them as
# float64 and casting afterwards needs a dense NUM_ITEMS x NUM_ITEMS float64
# temporary for the identity block.
weight_mat = np.c_[np.ones((NUM_ITEMS, 1), dtype=np.int8),  # level 1: everything
                   pd.get_dummies(product.state_id.astype(str), drop_first=False).astype('int8').values,
                   pd.get_dummies(product.store_id.astype(str), drop_first=False).astype('int8').values,
                   pd.get_dummies(product.cat_id.astype(str), drop_first=False).astype('int8').values,
                   pd.get_dummies(product.dept_id.astype(str), drop_first=False).astype('int8').values,
                   pd.get_dummies(product.state_id.astype(str) + product.cat_id.astype(str), drop_first=False).astype('int8').values,
                   pd.get_dummies(product.state_id.astype(str) + product.dept_id.astype(str), drop_first=False).astype('int8').values,
                   pd.get_dummies(product.store_id.astype(str) + product.cat_id.astype(str), drop_first=False).astype('int8').values,
                   pd.get_dummies(product.store_id.astype(str) + product.dept_id.astype(str), drop_first=False).astype('int8').values,
                   pd.get_dummies(product.item_id.astype(str), drop_first=False).astype('int8').values,
                   pd.get_dummies(product.state_id.astype(str) + product.item_id.astype(str), drop_first=False).astype('int8').values,
                   np.identity(NUM_ITEMS, dtype=np.int8)  # level 12: each series on its own
                   ].T

# Keep only the sparse form; the dense matrix is large and mostly zeros.
weight_mat_csr = csr_matrix(weight_mat)
del weight_mat
gc.collect()

def weight_calc(data, product, sales_train_val):
    """Compute the per-series scale and revenue weight used by ``wrmsse``.

    Reads the module-level ``weight_mat_csr`` aggregation matrix; both
    results have one entry per row of that matrix.

    Parameters
    ----------
    data            : long frame with ``id``, ``date``, ``demand`` and
                      ``sell_price`` columns.
    product         : frame whose ``id`` column gives the series order of the
                      columns of ``weight_mat_csr``.
    sales_train_val : wide frame with daily sales columns ``d_1`` .. ``d_1913``.

    Returns
    -------
    weight1 : sum of squared day-to-day sales differences from the first
              sale onwards, divided by ``1913 - start_no`` (RMSSE denominator).
    weight2 : share of ``demand * sell_price`` summed over dates after
              2016-03-27 up to and including 2016-04-24, normalised to sum to 1.
    """
    n_days = 1913
    d_name = ['d_' + str(i + 1) for i in range(n_days)]
    day_numbers = np.arange(1, n_days + 1)

    # Daily sales of every aggregated series (rows follow weight_mat_csr).
    sales = weight_mat_csr * sales_train_val[d_name].values

    # 0-based position of the first day with positive sales; a series with no
    # positive sale at all gets the 9999 sentinel.
    first_sale_day = np.where(sales > 0, day_numbers, 9999).min(axis=1)
    start_no = first_sale_day - 1

    # Mask the days before the first sale. This is an exact integer
    # comparison: the previous float test `day * (1 / first_day) < 1` is true
    # for some days equal to `first_day` (e.g. 49) and so also masked the
    # first sale itself, besides needing a dense rows x rows diagonal matrix.
    before_first_sale = day_numbers[np.newaxis, :] <= start_no[:, np.newaxis]
    sales = np.where(before_first_sale, np.nan, sales)

    # Denominator of RMSSE for every series.
    weight1 = np.nansum(np.diff(sales, axis=1) ** 2, axis=1) / (n_days - start_no)

    # Sales amount (demand * price) per id over the weighting window.
    dates = data['date']
    window = data[(dates > '2016-03-27') & (dates <= '2016-04-24')]
    amount = window['demand'] * window['sell_price']
    amount_per_id = amount.groupby(window['id']).sum()
    # Reorder to the series order of the aggregation matrix columns.
    amount_per_id = amount_per_id.loc[product['id']].values

    weight2 = weight_mat_csr * amount_per_id
    weight2 = weight2 / np.sum(weight2)

    return weight1, weight2

# Module-level weights read by the wrmsse / wrmsse_simple metrics below:
# weight1 is the per-series scale, weight2 the per-series revenue share.
weight1, weight2 = weight_calc(data,product,sales_train_val)

def wrmsse(preds, data):
    """LightGBM ``feval``: weighted RMSSE over the last ``DAYS_PRED`` days.

    Only the trailing ``NUM_ITEMS * DAYS_PRED`` entries of the labels and of
    ``preds`` are scored. They are reshaped as ``(DAYS_PRED, NUM_ITEMS)`` and
    transposed, so the rows are expected to be ordered day by day with
    ``NUM_ITEMS`` entries per day. Reads the module-level ``weight_mat_csr``,
    ``weight1`` and ``weight2``.

    Returns ``('wrmsse', score, False)``; ``False`` means lower is better.
    """
    horizon = DAYS_PRED
    window = NUM_ITEMS * horizon

    # Keep only the scoring window of observed and predicted values.
    observed = data.get_label()[-window:]
    predicted = preds[-window:]

    # One row per series, one column per day.
    predicted_by_item = predicted.reshape(horizon, NUM_ITEMS).T
    observed_by_item = observed.reshape(horizon, NUM_ITEMS).T

    # Aggregate predictions and observations to every level in one product.
    stacked = np.concatenate((predicted_by_item, observed_by_item), axis=1)
    aggregated = weight_mat_csr * stacked

    errors = aggregated[:, :horizon] - aggregated[:, horizon:]
    mean_squared = np.mean(np.square(errors), axis=1)
    scaled_rmse = np.sqrt(mean_squared / weight1)

    return 'wrmsse', np.sum(scaled_rmse * weight2), False

def wrmsse_simple(preds, data):
    """LightGBM ``feval``: weighted RMSSE on the bottom-level series only.

    Same windowing and layout as ``wrmsse`` (trailing
    ``NUM_ITEMS * DAYS_PRED`` entries, ordered day by day), but without
    aggregating to the higher levels. Reads the module-level ``weight1`` and
    ``weight2``.

    The bottom-level series are the LAST ``NUM_ITEMS`` rows of the
    aggregation matrix (its identity block is stacked last), so their scales
    and weights are the trailing ``NUM_ITEMS`` entries of ``weight1`` and
    ``weight2``. Slicing the leading entries would pick up the weights of
    the aggregated levels instead.

    Returns ``('wrmsse', score, False)``; ``False`` means lower is better.
    """
    num_col = DAYS_PRED
    window = NUM_ITEMS * num_col

    # Keep only the scoring window of observed and predicted values.
    y_true = data.get_label()[-window:]
    preds = preds[-window:]

    # One row per series, one column per day.
    reshaped_preds = preds.reshape(num_col, NUM_ITEMS).T
    reshaped_true = y_true.reshape(num_col, NUM_ITEMS).T

    # Item-level scale and weight; the weights are re-normalised to sum to 1.
    item_scale = weight1[-NUM_ITEMS:]
    item_weight = weight2[-NUM_ITEMS:]
    item_weight = item_weight / np.sum(item_weight)

    mean_squared = np.mean(np.square(reshaped_preds - reshaped_true), axis=1)
    score = np.sum(np.sqrt(mean_squared / item_scale) * item_weight)

    return 'wrmsse', score, False

In [None]:
# Date-based split into train / validation / test frames.
x_train, y_train, x_val, y_val, test = train_test_split(data)
# The `del data` inside train_test_split only removes that function's local
# name; this one drops the module-level reference to the full frame.
del data

### Run LightGBM

In [None]:
# Train the model; `test` comes back with its `demand` column filled with
# the predictions and `model` is the trained booster.
test, model = run_lgb(x_train, y_train, x_val, y_val, test)

In [None]:

# Plotting feature importances and one tree of the trained model
# Feature importance, first by number of splits and then by total gain.
for importance_type in ('split', 'gain'):
    ax = lgb.plot_importance(
        model,
        max_num_features=23,
        importance_type=importance_type,
    )
    plt.show()

# Structure of one tree of the model, annotated with the split gains.
ax = lgb.plot_tree(
    model,
    tree_index=53,
    figsize=(15, 15),
    show_info=['split_gain'],
)
plt.show()
    

### Predict and submit

In [None]:
import gc

# Release the training/validation frames and the aggregation matrix before
# building the submission. `model` is intentionally kept alive (its deletion
# is disabled).
del x_train, y_train, x_val, y_val, weight_mat_csr
gc.collect()

In [None]:
# Write submission_new.csv from the predicted `demand` values in `test`.
predict(test, submission)