In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm_notebook as tqdm
import os, sys, gc, time, warnings, pickle, psutil, random
from multiprocessing import Pool
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_squared_error
from scipy.sparse import csr_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb
import mlflow
import mlflow.lightgbm
pd.set_option('display.max_columns', 500)
mlflow.set_tracking_uri('http://mlflow:5000')

In [2]:
class M5:
    def __init__(self, START_TRAIN):
        self.TARGET = 'sales'
        self.START_TRAIN = START_TRAIN
        self.END_TRAIN = 1913
        self.START_VALID = 1914
        self.END_VALID = 1941
        self.START_TEST = 1942
        self.END_TEST = 1969
        self.P_HORIZON = 28
        self.SEED = 2020
        self.N_CORES = psutil.cpu_count()
        self.INPUT = '/kaggle/input'
        self.ORIGINAL = f'{self.INPUT}/m5-forecasting-accuracy'
        self.BASE = f'{self.INPUT}/m5-simple-fe/grid_part_1.pkl'
        self.PRICE = f'{self.INPUT}/m5-simple-fe/grid_part_2.pkl'
        self.CALENDAR = f'{self.INPUT}/m5-simple-fe/grid_part_3.pkl'
        self.LAGS = f'{self.INPUT}/m5-lags-features/lags_df_28.pkl'
        self.MEAN_ENC = f'{self.INPUT}/m5-custom-features/mean_encoding_df.pkl'
        self.STORES_IDS = list(pd.read_csv(f'{self.ORIGINAL}/sales_train_evaluation.csv')['store_id'].unique())
        self.SHIFT_DAY = 28
        self.N_LAGS = 15
        self.LAGS_SPLIT = [col for col in range(1, self.SHIFT_DAY + self.N_LAGS)]
        self.ROLS_SPLIT = [[i, j] for i in [1, 7, 14] for j in [7, 14, 30, 60]]
        self.mean_features = [
            'enc_cat_id_mean',
            'enc_cat_id_std',
            'enc_dept_id_mean',
            'enc_dept_id_std',
            'enc_item_id_mean',
            'enc_item_id_std',
        ]
        self.feature_columns = [
            'item_id',
            'dept_id',
            'cat_id',
            'release',
            'sell_price',
            'price_max',
            'price_min',
            'price_std',
            'price_mean',
            'price_norm',
            'price_nunique',
            'item_nunique',
            'price_momentum',
            'price_momentum_m',
            'price_momentum_y',
            'event_name_1',
            'event_type_1',
            'event_name_2',
            'event_type_2',
            'snap_CA',
            'snap_TX',
            'snap_WI',
            'tm_d',
            'tm_w',
            'tm_m',
            'tm_y',
            'tm_wm',
            'tm_dw',
            'tm_w_end',
            'enc_cat_id_mean',
            'enc_cat_id_std',
            'enc_dept_id_mean',
            'enc_dept_id_std',
            'enc_item_id_mean',
            'enc_item_id_std',
        ]
        self.lag_columns = [col for col in pd.read_pickle(self.LAGS).columns if 'lag_' in col or 'rolling_' in col]

    def seed_everything(self):
        seed = self.SEED
        random.seed(seed)
        np.random.seed(seed)

    def get_data_by_store(self, store_id):
        df = pd.read_pickle(self.BASE).query(f'd >= {self.START_TRAIN}').reset_index(drop=True)
        
        df = df.merge(
            pd.read_pickle(self.PRICE),
            on=['id', 'd']
        ).merge(
            pd.read_pickle(self.CALENDAR),
            on=['id', 'd']
        ).query(f'store_id == "{store_id}"')
        
        df = df.merge(
            pd.read_pickle(self.MEAN_ENC)[['id', 'd'] + self.mean_features],
            how='left',
            on=['id', 'd']
        )
        
        df = df.merge(
            pd.read_pickle(self.LAGS).drop('sales', axis=1),
            how='left',
            on=['id', 'd']
        )
        
        # Christmasの日を落とす
        df = df.query('event_name_1 != "Christmas"').reset_index(drop=True)
        return df

    def get_data(self):
        df = pd.read_pickle(self.BASE).query(f'd >= {self.START_TRAIN}').reset_index(drop=True)
        df = df.merge(
            pd.read_pickle(self.PRICE),
            on=['id', 'd']
        ).merge(
            pd.read_pickle(self.CALENDAR),
            on=['id', 'd']
        )

        df = df.merge(
            pd.read_pickle(self.MEAN_ENC)[['id', 'd'] + self.mean_features],
            how='left',
            on=['id', 'd']
        )

        df = df.merge(
            pd.read_pickle(self.LAGS).drop('sales', axis=1),
            how='left',
            on=['id', 'd']
        )

        # Christmasの日を落とす
        df = df.query('event_name_1 != "Christmas"').reset_index(drop=True)

        df = df.sort_values(['d', 'id']).reset_index(drop=True)

        return df
    def train_test_split(self, df):
        df = df.sort_values(['d', 'id'])
        train_df = df.query(f'd <= {self.END_VALID}').reset_index(drop=True)
        test_df = df.query(f'd > {self.END_VALID}').reset_index(drop=True)
        return train_df, test_df

In [3]:
class Base_Model(object):
    def __init__(self, train_df, test_df, features, target, seed, n_splits=3, verbose=True):
        self.train_df = train_df
        self.test_df = test_df
        self.features = features
        self.n_splits = n_splits
        self.verbose = verbose
        self.target = target
        self.seed = seed
        self.params = self.get_params()
        self.y_pred, self.score, self.model = self.fit()

    def train_model(self, train_set, val_set):
        raise NotImplementedError

    def get_params(self):
        raise NotImplementedError

    def convert_dataset(self, x_train, y_train, x_val, y_val):
        raise NotImplementedError

    def convert_x(self, x):
        return x

    def generate_importance_fig(self):
        raise NotImplementedError

    def fit(self):
        y_pred = np.zeros((len(self.test_df), ))

        mlflow.log_param('train_period', self.train_df['d'].max() - self.train_df['d'].min())
        mlflow.log_params(self.params)
        models = []
        
        oof_pred = np.array([])
        oof_true = np.array([])
        for fold in range(1, 1+self.n_splits):
            val_idx = self.train_df.query(f'd >= {1942 - 28*fold} and d <= {1941 - 28*(fold-1)}').index
            train_idx = self.train_df.query(f'd < {1942 - 28*fold}').index

            print(f'FOLD:', fold)
            x_train, x_val = self.train_df[self.features].iloc[train_idx], self.train_df[self.features].iloc[val_idx]
            y_train, y_val = self.train_df[self.target][train_idx], self.train_df[self.target][val_idx]
            train_set, val_set = self.convert_dataset(x_train, y_train, x_val, y_val)
            model = self.train_model(train_set, val_set)
            models.append(model)
            conv_x_val = self.convert_x(x_val)
            fold_pred = model.predict(conv_x_val).reshape(self.train_df[self.target][val_idx].shape)
            oof_pred = np.concatenate((oof_pred, fold_pred), axis=None)
            oof_true = np.concatenate((oof_true, y_val), axis=None)
            x_test = self.convert_x(self.test_df[self.features])
            y_pred += model.predict(x_test).reshape(y_pred.shape) / self.n_splits
            fold_loss_score = np.sqrt(mean_squared_error(y_val, oof_pred[val_idx]))
            print(f'Partial score of fold {fold} is:', fold_loss_score)
            del x_train, x_val, y_train, y_val, train_set, val_set, model, conv_x_val, x_test; gc.collect()

        loss_score = np.sqrt(mean_squared_error(oof_true, oof_pred))
        print('Our oof rmse score is: ', loss_score)
        mlflow.log_metric('rmse', loss_score)
        self.generate_importance_fig(models, 'importance')
        return y_pred, loss_score, models

class Lgb_Model(Base_Model):
    def train_model(self, train_set, val_set):
        verbosity = 100 if self.verbose else 0
        return lgb.train(
            self.params,
            train_set,
            num_boost_round=10000,
            early_stopping_rounds=100,
            valid_sets=[train_set, val_set],
            verbose_eval=verbosity,
        )

    def convert_dataset(self, x_train, y_train, x_val, y_val):
        train_set = lgb.Dataset(x_train, y_train)
        val_set = lgb.Dataset(x_val, y_val)
        return train_set, val_set

    def get_params(self):        
        params = {
            'boosting_type': 'gbdt',
            'objective': 'tweedie',
            'tweedie_variance_power': 1.1,
            'metric': 'rmse',
            'subsample': 0.5,
            'subsample_freq': 1,
            'learning_rate': 0.005,
            'num_leaves': 2**11-1,
            'min_data_in_leaf': 2**12-1,
            'feature_fraction': 0.5,
            'max_bin': 100,
            'boost_from_average': False,
            'verbose': -1,
        }

        params['seed'] = self.seed
        return params

    def generate_importance_fig(self, models, fig_path):
        plt.figure(figsize=(12, 30))
        plt.tight_layout()
        importance = pd.DataFrame()
        importance['column'] = models[0].feature_name()
        importance['importance'] = sum([m.feature_importance('gain') for m in models])
        importance = importance.sort_values('importance', ascending=False)
        sns.barplot(x='importance', y='column', data=importance)
        plt.savefig(f'{fig_path}_gain.png', bbox_inches="tight")
        mlflow.log_artifact(f'{fig_path}_gain.png')

        plt.figure(figsize=(12, 30))
        plt.tight_layout()
        importance = pd.DataFrame()
        importance['column'] = models[0].feature_name()
        importance['importance'] = sum([m.feature_importance('split') for m in models])
        importance = importance.sort_values('importance', ascending=False)
        sns.barplot(x='importance', y='column', data=importance)
        plt.savefig(f'{fig_path}_split.png', bbox_inches="tight")
        mlflow.log_artifact(f'{fig_path}_split.png')

In [None]:
def convert_submit_data(row):
    if row['d'] >= m5.START_VALID and row['d'] <= m5.END_VALID:
        row['id'] = row['id'].replace('evaluation', 'validation')
        row['d'] = f"F{row['d'] - m5.END_TRAIN}"
    elif row['d'] >= m5.START_TEST and row['d'] <= m5.END_TEST:
        row['id'] = row['id'].replace('validation', 'evaluation')
        row['d'] = f"F{row['d'] - m5.END_VALID}"
    else:
        raise ValueError()
    return row

mlflow.set_experiment('RMSE')

START_TRAIN = 1
m5 = M5(START_TRAIN=START_TRAIN)

pred_df = pd.DataFrame()

df = m5.get_data()
current_train_df, test_df = m5.train_test_split(df=df)

for day in tqdm(range(m5.START_TEST, m5.END_TEST+1)):
    print('day', day)
    
    submission = pd.DataFrame(columns=pd.read_csv('/kaggle/input/m5-forecasting-accuracy/sample_submission.csv').columns)
    print('Start Train:', START_TRAIN)
    current_test_df = test_df.query(f'd == {day}').reset_index(drop=True)
    remove_features = []

    for col in m5.lag_columns:
        if current_test_df[col].isnull().sum() > 0:
            remove_features.append(col)
            print('remove_features', col)
    features = m5.feature_columns + [col for col in m5.lag_columns if col not in remove_features]

    current_train_df = current_train_df.drop(remove_features, axis=1)
    current_test_df = current_test_df.drop(remove_features, axis=1)

#     trainで欠損値がある行を落とす
    for c in tqdm(m5.lag_columns + ['sell_price']):
        if c in features and current_train_df[col].isnull().sum() > 0:
            current_train_df = current_train_df[current_train_df[c].notnull()].reset_index(drop=True)
            print('drop NaN row...', c, current_train_df.shape)

    print(features)
    m5.seed_everything()

    mlflow.start_run()
    mlflow.log_param('START_TRAIN', START_TRAIN)
    mlflow.log_param('PRED_DAY', day)

    lgb_model = Lgb_Model(
        train_df=current_train_df,
        test_df=current_test_df,
        features=features,
        target=m5.TARGET,
        seed=m5.SEED,
    )

    pred_df['id'] = current_test_df['id'].drop_duplicates()
    pred_df[f'd_{day}'] = lgb_model.y_pred
#     current_test_df = current_test_df[['id', 'd', 'pred']].apply(convert_submit_data, axis=1)
#     submission = submission.append(pd.pivot(current_test_df, columns='d', index='id', values='pred').reset_index())
#     submission.to_csv('./submission.csv', index=False)
    pred_df.to_csv('./pred_df.csv', index=False)

    mlflow.log_artifact('./pred_df.csv')
    mlflow.end_run()
    del m5, df, current_test_df, lgb_model; gc.collect()

HBox(children=(FloatProgress(value=0.0, max=28.0), HTML(value='')))

day 1942
Start Train: 1
remove_features rolling_mean_180
remove_features rolling_std_180


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))

drop NaN row... sales_lag_1 (46726819, 202)
drop NaN row... sales_lag_2 (46696377, 202)
drop NaN row... sales_lag_3 (46665887, 202)
drop NaN row... sales_lag_4 (46635437, 202)
drop NaN row... sales_lag_5 (46604998, 202)
drop NaN row... sales_lag_6 (46574546, 202)
drop NaN row... sales_lag_7 (46544056, 202)
drop NaN row... sales_lag_8 (46513566, 202)
drop NaN row... sales_lag_9 (46483107, 202)
drop NaN row... sales_lag_10 (46452617, 202)
drop NaN row... sales_lag_11 (46422177, 202)
drop NaN row... sales_lag_12 (46391743, 202)
drop NaN row... sales_lag_13 (46361281, 202)
drop NaN row... sales_lag_14 (46330791, 202)
drop NaN row... sales_lag_15 (46300301, 202)
drop NaN row... sales_lag_16 (46269837, 202)
drop NaN row... sales_lag_17 (46239347, 202)
drop NaN row... sales_lag_18 (46208930, 202)
drop NaN row... sales_lag_19 (46178472, 202)
drop NaN row... sales_lag_20 (46148014, 202)
drop NaN row... sales_lag_21 (46117527, 202)
drop NaN row... sales_lag_22 (46087037, 202)
drop NaN row... sal