In [None]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm_notebook as tqdm
import os, sys, gc, time, warnings, pickle, psutil, random
from multiprocessing import Pool
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_squared_error
from scipy.sparse import csr_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb
import mlflow
import mlflow.lightgbm
# Let pandas show up to 500 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 500)
# Point MLflow at the tracking server that receives every log_* call made below.
mlflow.set_tracking_uri('http://mlflow:5000')

In [None]:
# sales = pd.read_csv('/kaggle/input/m5-forecasting-accuracy/sales_train_evaluation.csv').drop([f'd_{i}' for i in range(1, 1886)], axis=1)
# sell_prices = pd.read_csv('/kaggle/input/m5-forecasting-accuracy/sell_prices.csv')
# calendar = pd.read_csv('/kaggle/input/m5-forecasting-accuracy/calendar.csv')

# sales = sales.sort_values('id').reset_index(drop=True)
# df = pd.melt(sales, id_vars = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'], var_name = 'd', value_name = 'sales')
# df = df.sort_values(['d', 'id'])
# df = df.merge(calendar[['d', 'wm_yr_wk']], on=['d']).merge(sell_prices, on=['item_id', 'store_id', 'wm_yr_wk'])
# df = df.drop(['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'wm_yr_wk'], axis=1)
# df['d'] = df['d'].apply(lambda x: x.split('_')[-1]).astype(int)
# df['total_sales'] = df['sales'] * df['sell_price']
# df_valid = df[df['d'] <= 1913].reset_index(drop=True)
# df_eval = df[df['d'] > 1913].reset_index(drop=True)
# del df, calendar, sell_prices; gc.collect()

# total_sales_valid = df_valid.groupby('id').sum()['total_sales'].values
# total_sales_eval = df_eval.groupby('id').sum()['total_sales'].values

# NUM_ITEMS = len(sales)
# WEIGHTS_MAT_BOOL = np.c_[
#     np.ones([NUM_ITEMS, 1]).astype(int), # Level 1
#     pd.get_dummies(sales.state_id.astype(str), drop_first=False).astype(int).values, # Level 2
#     pd.get_dummies(sales.store_id.astype(str), drop_first=False).astype(int).values, # Level 3
#     pd.get_dummies(sales.cat_id.astype(str), drop_first=False).astype(int).values, # Level 4
#     pd.get_dummies(sales.dept_id.astype(str), drop_first=False).astype(int).values, # Level 5
#     pd.get_dummies(sales.state_id.astype(str) + sales.cat_id.astype(str), drop_first=False).astype(int).values, # Level 6
#     pd.get_dummies(sales.state_id.astype(str) + sales.dept_id.astype(str), drop_first=False).astype(int).values, # Level 7
#     pd.get_dummies(sales.store_id.astype(str) + sales.cat_id.astype(str), drop_first=False).astype(int).values, # Level 8
#     pd.get_dummies(sales.store_id.astype(str) + sales.dept_id.astype(str), drop_first=False).astype(int).values, # Level 9
#     pd.get_dummies(sales.item_id.astype(str), drop_first=False).astype(int).values, # Level 10
#     pd.get_dummies(sales.state_id.astype(str) + sales.item_id.astype(str), drop_first=False).astype(int).values, # Level 11
#     np.identity(NUM_ITEMS) # Level 12
# ].T
# WEIGHTS_MAT_BOOL = csr_matrix(WEIGHTS_MAT_BOOL)

# def get_s(WEIGHTS_MAT_BOOL, sales):
#     WEIGHTS_SALES = WEIGHTS_MAT_BOOL * sales
#     weight1 = np.nanmean(
#         np.diff(WEIGHTS_SALES, axis=1)**2, axis=1
#     )
#     return weight1

# def get_w(WEIGHTS_MAT_BOOL, total_sales):
#     WEIGHTS_TOTAL_SALES = WEIGHTS_MAT_BOOL * total_sales
#     weight2 = 12 * WEIGHTS_TOTAL_SALES / np.sum(WEIGHTS_TOTAL_SALES)
#     return weight2

# DAYS_VALID = [f'd_{i}' for i in range(1886, 1914)]

# S = get_s(WEIGHTS_MAT_BOOL, sales[DAYS_VALID].values)
# W = get_w(WEIGHTS_MAT_BOOL, total_sales_valid)
# del sales; gc.collect()

# S_diff = np.full(len(S), 1e-10)
# SW = W / np.sqrt(S + S_diff)

# def wrmsse(preds, data, s=S, w=W, sw=SW, WEIGHTS_MAT_BOOL=WEIGHTS_MAT_BOOL):
#     y_true = data.get_label()
#     score = np.sum(
#         np.sqrt(
#             np.mean(
#                 np.square(
#                     WEIGHTS_MAT_BOOL * (preds - y_true).reshape(WEIGHTS_MAT_BOOL.shape[1], -1)
#                 ), axis=1
#             )
#         ) * sw
#     ) / 12
#     return 'wrmsse', score, False

# del total_sales_valid, total_sales_eval, df_valid, df_eval; gc.collect()

In [None]:
class M5:
    """Constants, input paths and data-loading helpers for the M5 pipeline.

    Instantiating the class performs file I/O: it reads the ``store_id``
    column of the evaluation sales CSV and the lag-feature pickle (to discover
    the lag/rolling column names).
    """

    def __init__(self):
        # Target column and day-number boundaries of the train/valid/test periods.
        self.TARGET = 'sales'
        self.START_TRAIN = 1
        self.END_TRAIN = 1913
        self.START_VALID = 1914
        self.END_VALID = 1941
        self.START_TEST = 1942
        self.END_TEST = 1969
        self.P_HORIZON = 28
        self.SEED = 2020
        self.N_CORES = psutil.cpu_count()

        # Input locations (Kaggle dataset layout).
        self.INPUT = '/kaggle/input'
        self.ORIGINAL = f'{self.INPUT}/m5-forecasting-accuracy'
        self.BASE = f'{self.INPUT}/m5-simple-fe/grid_part_1.pkl'
        self.PRICE = f'{self.INPUT}/m5-simple-fe/grid_part_2.pkl'
        self.CALENDAR = f'{self.INPUT}/m5-simple-fe/grid_part_3.pkl'
        self.LAGS = f'{self.INPUT}/m5-lags-features/lags_df_28.pkl'
        self.MEAN_ENC = f'{self.INPUT}/m5-custom-features/mean_encoding_df.pkl'

        # Only the store_id column is needed here, so do not parse every
        # column of the sales file just to list the distinct stores.
        self.STORES_IDS = list(
            pd.read_csv(
                f'{self.ORIGINAL}/sales_train_evaluation.csv',
                usecols=['store_id'],
            )['store_id'].unique()
        )

        self.SHIFT_DAY = 28
        self.N_LAGS = 15
        self.LAGS_SPLIT = list(range(1, self.SHIFT_DAY + self.N_LAGS))
        self.ROLS_SPLIT = [[i, j] for i in [1, 7, 14] for j in [7, 14, 30, 60]]

        # Mean-encoding columns pulled from MEAN_ENC; they are also model features.
        self.mean_features = [
            'enc_cat_id_mean',
            'enc_cat_id_std',
            'enc_dept_id_mean',
            'enc_dept_id_std',
            'enc_item_id_mean',
            'enc_item_id_std',
        ]
        # Non-lag model features; the mean-encoding columns are appended last.
        self.feature_columns = [
            'item_id',
            'dept_id',
            'cat_id',
            'release',
            'sell_price',
            'price_max',
            'price_min',
            'price_std',
            'price_mean',
            'price_norm',
            'price_nunique',
            'item_nunique',
            'price_momentum',
            'price_momentum_m',
            'price_momentum_y',
            'event_name_1',
            'event_type_1',
            'event_name_2',
            'event_type_2',
            'snap_CA',
            'snap_TX',
            'snap_WI',
            'tm_d',
            'tm_w',
            'tm_m',
            'tm_y',
            'tm_wm',
            'tm_dw',
            'tm_w_end',
        ] + self.mean_features
        # Lag / rolling feature names are discovered from the lag pickle itself.
        self.lag_columns = [
            col for col in pd.read_pickle(self.LAGS).columns
            if 'lag_' in col or 'rolling_' in col
        ]

        # Consecutive day pairs covering the horizon: [1, 2], [3, 4], ..., [27, 28].
        self.pred_series = [[day, day + 1] for day in range(1, 28, 2)]

    def seed_everything(self):
        """Seed Python's ``random`` and NumPy's global RNG with ``self.SEED``."""
        seed = self.SEED
        random.seed(seed)
        np.random.seed(seed)

    def _load_features(self, store_query=None):
        """Build the merged feature frame shared by the public loaders.

        Steps: base grid (``d >= START_TRAIN``) inner-joined with the price and
        calendar grids on ``['id', 'd']``; optional row filter; left joins of
        the mean-encoding columns and of the lag features (without their
        ``sales`` column); finally rows whose ``event_name_1`` is "Christmas"
        are dropped and the index is reset.

        Parameters
        ----------
        store_query : str or None
            ``DataFrame.query`` expression applied after the price/calendar
            joins and before the left joins. ``None`` keeps every row.

        NOTE(review): the inputs are read with ``pd.read_pickle``; only load
        pickles from trusted sources.
        """
        keys = ['id', 'd']
        df = pd.read_pickle(self.BASE).query(f'd >= {self.START_TRAIN}').reset_index(drop=True)
        df = df.merge(
            pd.read_pickle(self.PRICE),
            on=keys
        ).merge(
            pd.read_pickle(self.CALENDAR),
            on=keys
        )
        if store_query is not None:
            df = df.query(store_query)

        df = df.merge(
            pd.read_pickle(self.MEAN_ENC)[keys + self.mean_features],
            how='left',
            on=keys
        )
        df = df.merge(
            pd.read_pickle(self.LAGS).drop('sales', axis=1),
            how='left',
            on=keys
        )

        # Drop the Christmas days.
        return df.query('event_name_1 != "Christmas"').reset_index(drop=True)

    def get_data_by_store(self, store_id):
        """Return the merged feature frame restricted to one ``store_id``."""
        return self._load_features(store_query=f'store_id == "{store_id}"')

    def get_data(self):
        """Return the merged feature frame for all stores, sorted by ``['d', 'id']``."""
        df = self._load_features()
        return df.sort_values(['d', 'id']).reset_index(drop=True)

    def train_test_split(self, df):
        """Split ``df`` by day number into (train, test).

        The frame is first sorted by ``['d', 'id']``. Train holds rows with
        ``d <= END_VALID``; test holds rows with ``d >= START_TEST``. Both
        results get a fresh 0..n-1 index.
        """
        df = df.sort_values(['d', 'id'])
        train_df = df.query(f'd <= {self.END_VALID}').reset_index(drop=True)
        test_df = df.query(f'd >= {self.START_TEST}').reset_index(drop=True)
        return train_df, test_df


In [None]:
class Base_Model(object):
    """Template for a model trained with backward-rolling time-series folds.

    Subclasses provide ``get_params``, ``convert_dataset``, ``train_model`` and
    ``generate_importance_fig``. Constructing an instance immediately runs
    ``fit()`` and stores its results in ``y_pred``, ``score`` and ``models``.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Must contain ``features``; ``train_df`` must also contain ``target``
        and the day-number column ``'d'``.
    features : list of str
        Feature column names.
    target : str
        Target column name.
    seed : int
        Stored as ``self.seed`` before ``get_params()`` is called.
    wrmsse : callable
        Called as ``wrmsse(preds, y_true, is_feval=False)`` for scoring; the
        same callable is stored for subclasses to use during training.
    n_splits : int
        Number of folds.
    verbose : bool
        Stored for subclasses.
    """

    # Fold k validates on the k-th FOLD_DAYS-long window counted back from
    # LAST_VALID_DAY (fold 1: 1914..1941, fold 2: 1886..1913, ...), and trains
    # on every row strictly before that window.
    LAST_VALID_DAY = 1941
    FOLD_DAYS = 28

    def __init__(self, train_df, test_df, features, target, seed, wrmsse, n_splits=3, verbose=True):
        self.train_df = train_df
        self.test_df = test_df
        self.features = features
        self.n_splits = n_splits
        self.verbose = verbose
        self.target = target
        self.seed = seed
        self.params = self.get_params()
        self.wrmsse = wrmsse
        self.y_pred, self.score, self.models = self.fit()

    def train_model(self, train_set, val_set):
        """Train and return one model; must be implemented by subclasses."""
        raise NotImplementedError

    def get_params(self):
        """Return the parameter dict for training; must be implemented by subclasses."""
        raise NotImplementedError

    def convert_dataset(self, x_train, y_train, x_val, y_val):
        """Wrap the fold data as ``(train_set, val_set)``; must be implemented by subclasses."""
        raise NotImplementedError

    def convert_x(self, x):
        """Convert a feature frame into what ``model.predict`` expects (identity here)."""
        return x

    def generate_importance_fig(self, models, fig_path):
        """Produce feature-importance output for ``models``; must be implemented by subclasses.

        The signature matches how ``fit`` calls it: ``(models, fig_path)``.
        """
        raise NotImplementedError

    def fit(self):
        """Train one model per fold, score out-of-fold predictions, average test predictions.

        Returns
        -------
        (y_pred, loss_score, models)
            ``y_pred`` is the mean of the fold models' predictions on
            ``test_df``; ``loss_score`` is ``self.wrmsse`` evaluated on the
            concatenated out-of-fold predictions; ``models`` is the list of
            fold models.

        Side effects: logs parameters and the final metric to MLflow, prints
        progress, and calls ``generate_importance_fig(models, 'importance')``.
        """
        y_pred = np.zeros((len(self.test_df), ))

        mlflow.log_param('train_period', self.train_df['d'].max() - self.train_df['d'].min())
        mlflow.log_params(self.params)
        models = []

        oof_pred = np.array([])
        oof_true = np.array([])
        days = self.train_df['d']
        for fold in range(1, 1 + self.n_splits):
            val_start = self.LAST_VALID_DAY + 1 - self.FOLD_DAYS * fold
            val_end = self.LAST_VALID_DAY - self.FOLD_DAYS * (fold - 1)
            # Boolean masks select the same rows whatever index train_df carries
            # (positional .iloc with index labels is only right on a RangeIndex).
            val_mask = (days >= val_start) & (days <= val_end)
            train_mask = days < val_start

            print('FOLD:', fold)
            x_train = self.train_df.loc[train_mask, self.features]
            x_val = self.train_df.loc[val_mask, self.features]
            y_train = self.train_df.loc[train_mask, self.target]
            y_val = self.train_df.loc[val_mask, self.target]
            train_set, val_set = self.convert_dataset(x_train, y_train, x_val, y_val)
            model = self.train_model(train_set, val_set)
            models.append(model)

            conv_x_val = self.convert_x(x_val)
            fold_pred = model.predict(conv_x_val).reshape(y_val.shape)
            oof_pred = np.concatenate((oof_pred, fold_pred), axis=None)
            oof_true = np.concatenate((oof_true, y_val), axis=None)

            # Each fold contributes an equal share to the averaged test prediction.
            x_test = self.convert_x(self.test_df[self.features])
            y_pred += model.predict(x_test).reshape(y_pred.shape) / self.n_splits

            fold_loss_score = self.wrmsse(fold_pred, y_val.values, is_feval=False)
            print(f'Partial score of fold {fold} is:', fold_loss_score)
            del x_train, x_val, y_train, y_val, train_set, val_set, model, conv_x_val, x_test; gc.collect()

        # Same (predictions, truth) argument order as the per-fold call above.
        loss_score = self.wrmsse(oof_pred, oof_true, is_feval=False)
        print('Our oof wrmsse score is: ', loss_score)
        mlflow.log_metric('wrmsse', loss_score)
        self.generate_importance_fig(models, 'importance')
        return y_pred, loss_score, models

class Lgb_Model(Base_Model):
    """LightGBM implementation of the Base_Model hooks."""

    def train_model(self, train_set, val_set):
        """Train a booster with early stopping, evaluated with ``self.wrmsse``.

        Both the training and the validation dataset are passed as
        ``valid_sets``. Progress is logged every 100 rounds when
        ``self.verbose`` is truthy, otherwise ``verbose_eval`` is 0.
        """
        if self.verbose:
            log_every = 100
        else:
            log_every = 0
        booster = lgb.train(
            self.params,
            train_set,
            num_boost_round=5000,
            early_stopping_rounds=100,
            valid_sets=[train_set, val_set],
            verbose_eval=log_every,
            feval=self.wrmsse
        )
        return booster

    def convert_dataset(self, x_train, y_train, x_val, y_val):
        """Wrap the fold's train and validation data as ``lgb.Dataset`` objects."""
        return lgb.Dataset(x_train, y_train), lgb.Dataset(x_val, y_val)

    def get_params(self):
        """Return the LightGBM parameters (tweedie objective, custom metric) with ``self.seed``."""
        return {
            'boosting_type': 'gbdt',
            'objective': 'tweedie',
            'tweedie_variance_power': 1.1,
            'metric': 'custom',
            'subsample': 0.5,
            'subsample_freq': 1,
            'learning_rate': 0.01,
            'num_leaves': 2**11-1,
            'min_data_in_leaf': 2**12-1,
            'feature_fraction': 0.5,
            'max_bin': 100,
            'boost_from_average': False,
            'verbose': -1,
            'seed': self.seed,
        }

    def generate_importance_fig(self, models, fig_path):
        """Plot summed feature importances and log the images to MLflow.

        For each of the importance types 'gain' and 'split' (in that order),
        the per-model importances are summed, drawn as a horizontal bar plot
        sorted in descending order, saved to ``{fig_path}_{type}.png`` and
        logged as an MLflow artifact. Feature names come from ``models[0]``.
        """
        for importance_type in ('gain', 'split'):
            plt.figure(figsize=(12, 30))
            plt.tight_layout()
            table = pd.DataFrame({
                'column': models[0].feature_name(),
                'importance': sum([booster.feature_importance(importance_type) for booster in models]),
            })
            table = table.sort_values('importance', ascending=False)
            sns.barplot(x='importance', y='column', data=table)
            image_path = f'{fig_path}_{importance_type}.png'
            plt.savefig(image_path, bbox_inches="tight")
            mlflow.log_artifact(image_path)

In [None]:
m5 = M5()

In [None]:
def convert_submit_data(row):
    """Rewrite one row's ``id`` and ``d`` into submission form.

    Days in the validation period get an id with 'evaluation' replaced by
    'validation' and ``d`` becomes ``F<d - END_TRAIN>``; days in the test
    period get an id with 'validation' replaced by 'evaluation' and ``d``
    becomes ``F<d - END_VALID>``. The row is modified in place and returned.

    Raises
    ------
    ValueError
        If ``d`` lies in neither period.
    """
    day = row['d']
    if m5.START_VALID <= day <= m5.END_VALID:
        old_tag, new_tag, origin = 'evaluation', 'validation', m5.END_TRAIN
    elif m5.START_TEST <= day <= m5.END_TEST:
        old_tag, new_tag, origin = 'validation', 'evaluation', m5.END_VALID
    else:
        raise ValueError()

    row['id'] = row['id'].replace(old_tag, new_tag)
    row['d'] = f"F{day - origin}"
    return row

def get_train_id(threshold, id_frequency):
    """Return, as a string array, the index labels of ``id_frequency`` whose value is >= ``threshold``."""
    frequent_enough = id_frequency >= threshold
    kept = id_frequency[frequent_enough]
    return kept.index.values.astype(str)

# Record everything below under the 'WRMSSE' MLflow experiment.
mlflow.set_experiment('WRMSSE')
# Load the merged feature frame for all stores, then open the MLflow run.
df = m5.get_data()
mlflow.start_run()
# `threshold` is used twice: an id is kept only if it has at least this many
# rows, and only days d >= END_TEST + 1 - threshold are retained.
threshold = 1000
mlflow.log_param('threshold', threshold)
id_frequency = df['id'].value_counts()
train_id = get_train_id(threshold, id_frequency)
df = df[df['id'].isin(train_id)].query(f'd >= {m5.END_TEST + 1 - threshold}').reset_index(drop=True)

# Empty frame carrying the sample submission's column layout.
submission = pd.DataFrame(columns=pd.read_csv('/kaggle/input/m5-forecasting-accuracy/sample_submission.csv').columns)

## WRMSSE setup: per-item revenue and the aggregation matrix -------------
# Keep only the id/hierarchy columns and the day columns after d_1913,
# restricted to the ids selected above.
sales = pd.read_csv('/kaggle/input/m5-forecasting-accuracy/sales_train_evaluation.csv').drop([f'd_{i}' for i in range(1, 1914)], axis=1)
sales = sales[sales['id'].isin(train_id)].reset_index(drop=True)
sell_prices = pd.read_csv('/kaggle/input/m5-forecasting-accuracy/sell_prices.csv')
calendar = pd.read_csv('/kaggle/input/m5-forecasting-accuracy/calendar.csv')

# Rows of `sales` are ordered by id from here on; the aggregation matrix below
# is built from this same row order.
sales = sales.sort_values('id').reset_index(drop=True)
# Long format (one row per id and day), joined with the weekly sell price to
# compute revenue = units sold * price.
data = pd.melt(sales, id_vars = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'], var_name = 'd', value_name = 'sales')
data = data.sort_values(['d', 'id'])
data = data.merge(calendar[['d', 'wm_yr_wk']], on=['d']).merge(sell_prices, on=['item_id', 'store_id', 'wm_yr_wk'])
data = data.drop(['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'wm_yr_wk'], axis=1)
# 'd_1914' -> 1914
data['d'] = data['d'].apply(lambda x: x.split('_')[-1]).astype(int)
data['total_sales'] = data['sales'] * data['sell_price']
df_valid = data[data['d'] <= 1941].reset_index(drop=True)
# NOTE(review): presumably empty if the evaluation CSV has no day column after
# d_1941 -- confirm. total_sales_eval is only deleted later, never used.
df_eval = data[data['d'] > 1941].reset_index(drop=True)
del data, calendar, sell_prices; gc.collect()

# Revenue per id; groupby returns the groups sorted by id.
total_sales_valid = df_valid.groupby('id').sum()['total_sales'].values
total_sales_eval = df_eval.groupby('id').sum()['total_sales'].values

# Indicator matrix mapping items to the 12 aggregation levels: one block of
# dummy columns per level is stacked side by side, then transposed so that
# each row is one aggregated series and each column one item.
NUM_ITEMS = len(sales)
WEIGHTS_MAT_BOOL = np.c_[
    np.ones([NUM_ITEMS, 1]).astype(int), # Level 1
    pd.get_dummies(sales.state_id.astype(str), drop_first=False).astype(int).values, # Level 2
    pd.get_dummies(sales.store_id.astype(str), drop_first=False).astype(int).values, # Level 3
    pd.get_dummies(sales.cat_id.astype(str), drop_first=False).astype(int).values, # Level 4
    pd.get_dummies(sales.dept_id.astype(str), drop_first=False).astype(int).values, # Level 5
    pd.get_dummies(sales.state_id.astype(str) + sales.cat_id.astype(str), drop_first=False).astype(int).values, # Level 6
    pd.get_dummies(sales.state_id.astype(str) + sales.dept_id.astype(str), drop_first=False).astype(int).values, # Level 7
    pd.get_dummies(sales.store_id.astype(str) + sales.cat_id.astype(str), drop_first=False).astype(int).values, # Level 8
    pd.get_dummies(sales.store_id.astype(str) + sales.dept_id.astype(str), drop_first=False).astype(int).values, # Level 9
    pd.get_dummies(sales.item_id.astype(str), drop_first=False).astype(int).values, # Level 10
    pd.get_dummies(sales.state_id.astype(str) + sales.item_id.astype(str), drop_first=False).astype(int).values, # Level 11
    np.identity(NUM_ITEMS) # Level 12
].T
# The dense matrix is built first and only then converted to sparse storage.
WEIGHTS_MAT_BOOL = csr_matrix(WEIGHTS_MAT_BOOL)
print('WEIGHTS_MAT_BOOL', WEIGHTS_MAT_BOOL.shape)

def get_s(WEIGHTS_MAT_BOOL, sales):
    """Return, per aggregated series, the NaN-ignoring mean of squared day-to-day differences.

    ``sales`` is multiplied by ``WEIGHTS_MAT_BOOL`` to obtain the aggregated
    series (one per row); differences are taken along axis 1.
    """
    aggregated = WEIGHTS_MAT_BOOL * sales
    daily_change = np.diff(aggregated, axis=1)
    squared_change = daily_change ** 2
    return np.nanmean(squared_change, axis=1)

def get_w(WEIGHTS_MAT_BOOL, total_sales):
    """Return each aggregated series' share of the summed aggregated totals, scaled by 12.

    ``total_sales`` is multiplied by ``WEIGHTS_MAT_BOOL`` to obtain one total
    per aggregated series; each total is then multiplied by 12 and divided by
    the sum over all aggregated series.
    """
    level_totals = WEIGHTS_MAT_BOOL * total_sales
    grand_total = np.sum(level_totals)
    return 12 * level_totals / grand_total

# Day columns used for the scale term: the full 28-day window d_1914..d_1941.
# (range's end is exclusive, so it must be 1942 for d_1941 to be included;
# `sales` was loaded with d_1..d_1913 dropped.)
DAYS_VALID = [f'd_{i}' for i in range(1914, 1942)]

# S: mean squared day-to-day difference per aggregated series.
# W: revenue-based weight per aggregated series.
S = get_s(WEIGHTS_MAT_BOOL, sales[DAYS_VALID].values)
W = get_w(WEIGHTS_MAT_BOOL, total_sales_valid)
del sales; gc.collect()

# Small constant added to S so that the division below never hits sqrt(0).
S_diff = np.full(len(S), 1e-10)
# Combined per-series factor applied to the RMSE in wrmsse().
SW = W / np.sqrt(S + S_diff)

def wrmsse(preds, data, s=S, w=W, sw=SW, WEIGHTS_MAT_BOOL=WEIGHTS_MAT_BOOL, is_feval=True):
    """Weighted RMSSE over all aggregation levels, usable as a LightGBM ``feval``.

    Parameters
    ----------
    preds : array-like
        Flat predictions, one per row, ordered by day and then by id (the
        order produced by sorting on ``['d', 'id']``). Every day must contain
        all ``WEIGHTS_MAT_BOOL.shape[1]`` items.
    data : lightgbm.Dataset or array-like
        With ``is_feval=True`` the labels are read via ``data.get_label()``;
        otherwise ``data`` is the flat array (or Series) of true values.
    s, w : unused; kept so existing callers and the signature stay unchanged.
    sw : array
        Per-aggregated-series factor multiplied with each series' RMSE.
    WEIGHTS_MAT_BOOL : sparse matrix
        Aggregation matrix (aggregated series x items).
    is_feval : bool
        Selects the input handling above and the return type below.

    Returns
    -------
    ``('wrmsse', score, False)`` when ``is_feval`` is true (name, value,
    is_higher_better), otherwise the float ``score``.
    """
    if is_feval:
        y_true = data.get_label()
    else:
        preds = preds.values if isinstance(preds, pd.Series) else preds
        y_true = data.values if isinstance(data, pd.Series) else data

    n_items = WEIGHTS_MAT_BOOL.shape[1]
    # The flat vector is day-major (all items of day 1, then day 2, ...), so it
    # is first viewed as (days, items) and then transposed to (items, days).
    # Reshaping straight to (items, -1) would put a mix of items and days in
    # every row and aggregate the wrong values.
    errors = (preds - y_true).reshape(-1, n_items).T

    per_series_rmse = np.sqrt(
        np.mean(np.square(WEIGHTS_MAT_BOOL * errors), axis=1)
    )
    # 12 aggregation levels contribute equally to the final score.
    score = np.sum(per_series_rmse * sw) / 12

    if is_feval:
        return 'wrmsse', score, False
    else:
        return score

del total_sales_valid, total_sales_eval, df_valid, df_eval; gc.collect()
## end of WRMSSE setup ----------------------------------------------------

train_df, test_df = m5.train_test_split(df=df)

# Exclude every 'sales_lag' column that has missing values in the test period.
remove_features = []
for col in m5.lag_columns:
    if 'sales_lag' in col and test_df[col].isnull().sum() > 0:
        remove_features.append(col)
        print(col)
features = m5.feature_columns + [col for col in m5.lag_columns if col not in remove_features]

# Drop the training rows that have missing values in a used 'sales_lag' or
# 'rolling_mean' feature ('sell_price' is iterated too but never matches the
# name filter, so it does not remove any rows).
print('train_df shape is', train_df.shape)
for c in tqdm(m5.lag_columns + ['sell_price']):
    if c in features and ('sales_lag' in c or 'rolling_mean' in c):
        train_df = train_df[train_df[c].notnull()].reset_index(drop=True)
        print('drop NaN row...', c, train_df.shape)

print(features)

# Latest "first available day" over all ids: from this day on, every id that
# remains in train_df has already started.
lower_limit = train_df[['id', 'd']].groupby('id').min()['d'].max()
print('lower_limit', lower_limit)
mlflow.log_param('lower_limit', lower_limit)
train_df = train_df.query(f'd >= {lower_limit}').reset_index(drop=True)

m5.seed_everything()

# Constructing the model runs the full cross-validated training.
lgb_model = Lgb_Model(
    train_df=train_df,
    test_df=test_df,
    features=features,
    target=m5.TARGET,
    seed=m5.SEED,
    wrmsse=wrmsse,
)

test_df['pred'] = lgb_model.y_pred
test_df = test_df[['id', 'd', 'pred']].apply(convert_submit_data, axis=1)
# One row per id, one column per forecast day (F1..F28).
# NOTE(review): only rows from test_df (d >= START_TEST) are written here; if
# the sample submission also lists other ids, they are not filled -- confirm.
predictions_wide = pd.pivot(test_df, columns='d', index='id', values='pred').reset_index()
# pd.concat instead of DataFrame.append, which is deprecated and removed in pandas 2.0.
submission = pd.concat([submission, predictions_wide])
submission.to_csv('./submission.csv', index=False)

mlflow.log_artifact('./submission.csv')
mlflow.end_run()

In [None]:
# Show the assembled submission frame as this cell's output for a visual check.
submission