## Overview

Challenge link: https://competitions4dev.org/forecastingprize

## Model Summary

Average ensemble of six LightGBM models to predict `stock_distributed` for t+1, t+2, and t+3

* Cross-validation: 4x time series CV
  * Block 43: Jul-Sep 2019
  * Block 40: Apr-Jun 2019
  * Block 37: Jan-Mar 2019
  * Block 34: Oct-Dec 2018
* Feature engineering
  * Lag t-1, t-2, t-3, t-4
  * Longitude, latitude
  * Categorical features: product, region, type
* Modeling
  * Optimize MAE, not RMSE, since the evaluation metric is MASE (MAE divided by a per-series constant)
  * Full training using 1000 rounds
  * Learning rate is 0.025

In [1]:
# Main package
import numpy as np
import pandas as pd
import lightgbm as lgb

# Utility
from itertools import combinations, product
import random
import calendar
from pandas.tseries.offsets import MonthEnd
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 200)

In [2]:
# Global configuration shared by the functions and cells below
TARGET = 'stock_distributed'   # column being forecast
SEED = 2020                    # seed passed to seed_everything() and to the LightGBM params

# Columns cast to the pandas 'category' dtype before training
categorical_features = [
    'site_code',
    'product_code',
    'region',
    'district',
    'site_type',
    'product_type',
]

# Columns that are never used as model inputs (raw same-month values and bookkeeping columns)
remove_features = [
    'stock_initial',
    'stock_received',
    'stock_adjustment',
    'stock_distributed',
    'stock_end',
    'average_monthly_consumption',
    'stock_stockout_days',
    'stock_ordered',
    'ds',
    'isna',
    'idx',
    'product_name',
]

# Numeric columns that can be passed as `features` to build lag features from
numerical_features = [
    'stock_initial',
    'stock_received',
    'stock_adjustment',
    'stock_distributed',
    'stock_end',
    'average_monthly_consumption',
    'stock_ordered',
]


# Utility
def seed_everything(seed=0):
    '''
    Seed both Python's `random` module and NumPy's global RNG with `seed`.
    '''
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    

# Main functions
def process_data(features = [TARGET], test_block = 43, lag = [1,2,3,4]):
    '''
    Build the modelling data frame: read the raw data, append the three
    future (test) months, add the cumulative NA / zero flags and finally
    the lag features.

    features: columns to zero-fill and to build lag features from
    test_block: accepted for interface compatibility; not used here
    lag: list of lags (in months) to generate for each feature
    '''
    raw = _read_data(features = features)
    with_future = _generate_test_set(raw)
    flagged = _get_cumulative_nonzero(with_future)
    return generate_lag_features(flagged, lag = lag, features = features)

def get_mase_constant_agg(test_block = [46,43,40,37,34]):
    '''
    Generate MASE constant (denominator) for each CV block.

    For every block in `test_block` the per-series constant is computed from
    the history before that block (see `_get_mase_constant`) and replicated
    for the three forecast months `block`, `block + 1` and `block + 2`.

    Returns a data frame with columns site_code, product_code, mase_constant
    and idx, sorted by (site_code, product_code, idx).
    '''
    key_columns = ['site_code','product_code','mase_constant']
    df = _read_data()
    df = _get_cumulative_nonzero(df)

    # Collect the pieces and concatenate once: DataFrame.append was removed
    # in pandas 2.0 and growing a frame inside a loop is quadratic anyway.
    pieces = []
    for block in test_block:
        df_block = _get_mase_constant(df, block)
        df_block = df_block[key_columns].drop_duplicates()
        for horizon in range(3):
            pieces.append(df_block.assign(idx = block + horizon))

    if not pieces:
        # No block requested: return an empty frame with the expected schema
        return pd.DataFrame(columns = key_columns + ['idx'])

    summary_block = pd.concat(pieces)
    summary_block = summary_block.sort_values(by = ['site_code','product_code','idx']).reset_index(drop=True)
    return summary_block

def process_train_cv(df, verbose = 500, is_print = False,
                     use_log = False, use_weight = False,
                     remove_first_na = False, remove_first_zero = False,
                     cv_block = [43,40,37,34], full_train = True,
                     version = 0,
                     use_separate_model = False,
                     use_id = False,
                     use_diff = False,
                     use_weekend = False,
                     use_month = False,
                     use_quarter = False,
                     use_year = False):
    '''
    Train on every block in `cv_block` (plus the final block 46 when
    `full_train` is True), print the CV summary and save all predictions
    to `data/temp/lgb_v{version}_{base|individual}_pred.csv`.

    A first model is always trained without the lag 1 and lag 2 columns.
    With `use_separate_model` two more models are trained (one without lag 1,
    one with every lag) and the predictions are stitched per horizon:
    first model -> idx test_block + 2, second -> test_block + 1,
    third -> test_block.

    The `use_id`, `use_diff`, `use_weekend`, `use_month`, `use_quarter` and
    `use_year` flags add the corresponding optional features before training.
    Only blocks with `test_block <= 43` are scored.

    Returns the concatenated predictions of all blocks.
    '''
    import os  # local import: only needed to create the output directory

    if use_id:
        df = generate_id(df)
    if use_diff:
        df = generate_diff_features(df)
    if use_weekend:
        df = get_weekend_in_month(df, use_percentage=True)
    if use_month:
        df = generate_date_features(df, use_month=True)
    if use_quarter:
        df = generate_date_features(df, use_quarter=True, use_month=False)
    if use_year:
        df = generate_date_features(df, use_year=True, use_month=False)

    # Options forwarded unchanged to every process_train call
    train_kwargs = dict(verbose = verbose, is_print = is_print,
                        use_log = use_log, use_weight = use_weight,
                        remove_first_na = remove_first_na, remove_first_zero = remove_first_zero)

    cv = []
    cv_mae = []
    cv_rmse = []
    fold_preds = []
    if full_train:
        # Builds a new list, so the (mutable) default argument is not modified
        cv_block = cv_block + [46]

    for test_block in cv_block:
        mdl, pred = process_train(df.loc[:, ~df.columns.str.contains('lag_(1|2)', case=False)],
                                  test_block = test_block, **train_kwargs)
        if use_separate_model:
            # NOTE(review): this mutates the global `lgb_params` and is never
            # undone, so the first model of the very first block is trained
            # without lambda_l2 while every later model (and every later call)
            # is trained with it. Confirm whether that asymmetry is intended.
            lgb_params.update({'lambda_l2': 0.1})
            mdl2, pred2 = process_train(df.loc[:, ~df.columns.str.contains('lag_1', case=False)],
                                        test_block = test_block, **train_kwargs)
            mdl3, pred3 = process_train(df,
                                        test_block = test_block, **train_kwargs)
            pred = pd.concat([pred[pred['idx'] == test_block+2],
                              pred2[pred2['idx'] == test_block+1],
                              pred3[pred3['idx'] == test_block]], axis=0)
            pred['block'] = test_block

        fold_preds.append(pred)
        if test_block <= 43:
            # Negative forecasts are clipped to zero before scoring
            clipped = np.where(pred.preds < 0, 0, pred.preds)
            cv.append(mase_df(pred))
            cv_mae.append(mae(pred.stock_distributed, clipped))
            cv_rmse.append(rmse(pred.stock_distributed, clipped))

    # Single concat instead of DataFrame.append, which was removed in pandas 2.0
    pred_overall = pd.concat(fold_preds, ignore_index=True) if fold_preds else pd.DataFrame({})

    if cv:
        print('CV details is {}'.format([round(val, 4) for val in cv]))
        print('CV-1 is {:.4f}, CV mean is {:.4f} and CV std is {:.4f}'.format(cv[0], np.array(cv).mean(), np.array(cv).std()))

        # These lines report plain MAE / RMSE (they were previously labelled MASE)
        print('MAE-CV details is {}'.format([round(val, 4) for val in cv_mae]))
        print('MAE-CV-1 is {:.4f}, mean is {:.4f} and std is {:.4f}'.format(cv_mae[0], np.array(cv_mae).mean(), np.array(cv_mae).std()))

        print('RMSE-CV details is {}'.format([round(val, 4) for val in cv_rmse]))
        print('RMSE-CV-1 is {:.4f}, mean is {:.4f} and std is {:.4f}'.format(cv_rmse[0], np.array(cv_rmse).mean(), np.array(cv_rmse).std()))

    if use_separate_model:
        m = 'individual'
    else:
        m = 'base'
    version = str(version)

    # Make sure the output directory exists before writing
    os.makedirs('data/temp', exist_ok=True)
    pred_overall.to_csv(f'data/temp/lgb_v{version}_{m}_pred.csv', index=False)
    return pred_overall


# Features Engineering
def _read_data(features = [TARGET], path='data/'):
    '''
    Load the combined data frame from `path` and replace missing values
    in the `features` columns with 0.
    '''
    panel = _combine_data(path=path)
    filled = panel[features].fillna(0)
    panel[features] = filled
    print('Read data, data frame size: {}'.format(panel.shape))
    return panel

def _combine_data(path='data/'):
    '''
    Read the three raw CSV files under `path` and return one data frame with
    a row for every (month, site, product) combination.

    `path` is concatenated directly with the file names, so it must end with
    a path separator.
    '''
    # Read raw data
    train = pd.read_csv(path+'contraceptive_logistics_data.csv')
    location = pd.read_csv(path+'service_delivery_site_data.csv')
    product = pd.read_csv(path+'product.csv')

    # Expand data frame: cross join (via the dummy key `j`) of every observed
    # (year, month) with every observed site/product combination, then left
    # join the observations so that unreported months become NaN rows
    month_year = train[['year','month']].drop_duplicates().reset_index(drop=True)
    product_site = train[['region','district','site_code','product_code']].drop_duplicates().reset_index(drop=True)
    train_base = pd.merge(month_year.assign(j=1), product_site.assign(j=1)).drop(columns = 'j')
    train = pd.merge(train_base, train, how='left')

    # Add date and index: `ds` is the first day of the month, `isna` flags rows
    # whose target is missing, `idx` numbers the months within each
    # (site_code, product_code) series in chronological order, starting at 1
    train['day'] = 1
    train['ds'] = pd.to_datetime(train[['year','month','day']])
    train = train.sort_values(by=['site_code','product_code','ds']).reset_index(drop=True)
    train['isna'] = train['stock_distributed'].isna()
    train['idx'] = train.groupby(['site_code','product_code'])['ds'].rank(method='first', ascending=True)
    train = train.drop(columns = ['year','month','day'])

    # Join with location (pd.merge defaults to an inner join on the shared
    # column names — presumably site_code; verify against the CSV header)
    train = pd.merge(train, location.drop(columns=['site_region','site_district']))

    # Join with product (inner join on the shared column names — presumably
    # product_code; verify against the CSV header)
    train = pd.merge(train, product)

    # Rearrange columns so that the series keys come first
    train = train[['site_code','product_code'] + train.drop(columns=['site_code','product_code']).columns.tolist()]

    # Change category: restore the sort order after the merges, store `idx`
    # as int and `ds` as a 'YYYY-MM-DD' string
    train = train.sort_values(by=['site_code','product_code','ds']).reset_index(drop=True)
    train['idx'] = train['idx'].astype(int)
    train['ds'] = train['ds'].dt.date.astype(str)
    train['product_name'] = train['product_name'].str.strip()
    
    return(train)

def _generate_test_set(df):
    df_test = pd.DataFrame({})
    for i, dt in enumerate(['2019-10-01','2019-11-01','2019-12-01']):
        test_set = df[df['idx'] == 45].reset_index(drop=True)
        test_set['idx'] = test_set['idx'] + i + 1
        test_set['ds'] = dt
        test_set[['stock_initial','stock_received','stock_distributed',
                  'stock_adjustment','stock_end','average_monthly_consumption',
                  'stock_stockout_days','stock_ordered']] = np.inf
        df_test = df_test.append(test_set)
    df = df.append(df_test).sort_values(by = ['site_code','product_code','idx']).reset_index(drop=True)
    return df

def _get_cumulative_nonzero(df):
    '''
    This function needs to be used before any data removal 
    because of lagging or rolling features.
    Exclude first NA or zero data by df.loc[df['isna_int'] > 0] or df.loc[df['iszero_int'] > 0].shape
    '''
    df['isna_int'] = [0 if x == True else 1 for x in df['isna']]
    df['iszero_int'] = [0 if x == 0 else 1 for x in df['stock_distributed']]
    df[['isna_int', 'iszero_int']] = df.groupby(['site_code', 'product_code'])[['isna_int', 'iszero_int']].transform(lambda x: x.cumsum())
    print('Get cumulative nonzero flag')
    return df

def _get_mase_constant(df, test_block = 46, remove_first_na = True, remove_first_zero = False):
    '''
    This function needs to be used by applying `get_cumulative_nonzero` 
    to exclude first NA or zero data.
    It also needs to be used before any data removal
    The default test_block is 46 ( data) which will be used as the  (43 for latest CV)
    constant of the mase denominator for each series.
    In default, remove first NA data from the training set
    '''
    df['diff_abs'] = df.loc[(df['isna_int'] > 0) & (df['idx'] < test_block)].groupby(['site_code', 'product_code'])['stock_distributed'].transform(lambda x: abs(x-x.shift(1)))
    df['mase_constant'] = df.groupby(['site_code', 'product_code'])['diff_abs'].transform(lambda x: x.mean())
    df['mase_constant'] = 1 / df['mase_constant']
    df['mase_constant'] = df['mase_constant'].replace(np.inf, 0).replace(np.nan, 0)
    print('Get MASE constant')
    return df

def generate_lag_features(df, lag = [3,4], features = [TARGET]):
    '''
    Add `{feature}_lag_{l}` columns holding each feature shifted by `l` rows
    within its (site_code, product_code) series, then drop every row that
    has a missing value in any column whose name contains 'lag'.
    '''
    series = df.groupby(['site_code', 'product_code'])
    shifted = {}
    for l in lag:
        for col in features:
            shifted['{}_lag_{}'.format(col, l)] = series[col].shift(l)
    df = df.assign(**shifted)

    lag_features = [name for name in df.columns if 'lag' in name]
    df = df.dropna(subset = lag_features)
    print('Generate lag features {}, data frame size: {}'.format(lag, df.shape))
    return df

def generate_diff_features(df, minus = True, ratio = False):
    '''
    For every pair of existing columns whose name contains 'lag', add the
    difference (`{a}_minus_{b}`) when `minus` is set and/or the ratio
    (`{a}_div_{b}`, missing results replaced by 0) when `ratio` is set.
    The columns are added to `df` in place and `df` is returned.
    '''
    # Freeze the pair list first so that newly created columns are not paired again
    pairs = list(combinations([name for name in df.columns if 'lag' in name], 2))
    for left, right in pairs:
        if minus:
            df['{}_minus_{}'.format(left, right)] = df[left] - df[right]
        if ratio:
            quotient = df[left] / df[right]
            df['{}_div_{}'.format(left, right)] = quotient.fillna(0)
    print('Generate diff features')
    return df

def generate_id(df):
    '''
    Add a categorical `id` column of the form '<site_code>-<product_code>'.
    The column is added to `df` in place and `df` is returned.
    '''
    series_key = df['site_code'] + '-' + df['product_code']
    df['id'] = series_key.astype('category')
    print('Generate ID features')
    return df

def generate_date_features(df, use_month = True, use_quarter = False, use_year = False, use_category = True):
    '''
    Add the requested calendar parts of `ds` (month, quarter, year) as
    columns of `df`. With `use_category` every column named month, quarter
    or year that is present afterwards is cast to the 'category' dtype.
    The columns are added to `df` in place and `df` is returned.
    '''
    requested = (('month', use_month), ('quarter', use_quarter), ('year', use_year))
    for part, enabled in requested:
        if enabled:
            df[part] = getattr(pd.to_datetime(df['ds']).dt, part)

    date_features = df.filter(regex = '^(month|quarter|year)$').columns.tolist()
    if use_category:
        df[date_features] = df[date_features].astype('category')
    print('Generate date features {}'.format(date_features))
    return df

def get_weekend_in_month(df, use_percentage = False):
    '''
    Add a `weekend_in_month` column: the number of days in the month of `ds`
    minus the number of business days counted by `np.busday_count` between
    `ds` and `ds` plus one month. With `use_percentage` the value is divided
    by the number of days in the month.
    The column is added to `df` in place and `df` is returned.
    '''
    # NOTE(review): the result is the number of weekend days of the month only
    # if `ds` is the first day of the month — confirm this holds for all callers.
    # np.busday_count uses its default week mask (Monday-Friday) and no holidays.
    df['weekend_in_month'] = pd.to_datetime(df['ds']).dt.days_in_month - np.busday_count(
        pd.to_datetime(df['ds']).dt.date.values.astype('datetime64[D]'), 
        (pd.to_datetime(df['ds']).dt.date + pd.DateOffset(months=1)).values.astype('datetime64[D]') 
    )
    if use_percentage:
        df['weekend_in_month'] = df['weekend_in_month'] / pd.to_datetime(df['ds']).dt.days_in_month
    print('Get number of weekend days in month')
    return df

def get_day_in_month(df):
    '''
    Add a `day_in_month` column with the number of days in the month of `ds`.
    The column is added to `df` in place and `df` is returned.
    '''
    month_start = pd.to_datetime(df['ds'])
    df['day_in_month'] = month_start.dt.days_in_month
    print('Get days in month')
    return df

def remove_unnecessary_columns(df, column_list = []):
    '''
    Return a copy of `df` without `diff_abs` and without the columns named
    in `column_list`. Names that are not present in `df` are ignored.
    Intended for helper columns created during features engineering that
    are not already listed in `remove_features`.
    '''
    candidates = dict.fromkeys(['diff_abs'] + column_list)
    present = [name for name in candidates if name in df.columns]
    df = df.drop(columns = present)
    print('Remove unnecessary columns')
    return df


# Error function
def rmse(y, y_pred):
    '''Root mean squared error between `y` and `y_pred`.'''
    residual = y - y_pred
    mean_squared = np.mean(np.square(residual))
    return np.sqrt(mean_squared)

def mae(y, y_pred):
    '''Mean absolute error between `y` and `y_pred`.'''
    absolute_error = np.abs(y - y_pred)
    return np.mean(absolute_error)

def mase_df(pred_df, clip_lower = True):
    '''
    MASE of a prediction frame: the absolute error of every row is multiplied
    by the `mase_constant` looked up in the global `summary_block` (joined on
    the shared columns), averaged per (site_code, product_code) series and
    then averaged over the series.

    With `clip_lower` negative predictions are replaced by 0 before scoring.
    `pred_df` itself is not modified.
    '''
    scale = summary_block[['site_code', 'product_code', 'idx', 'mase_constant']]
    scored = pd.merge(pred_df, scale)
    if clip_lower:
        scored[['preds']] = scored[['preds']].clip(lower = 0)
    absolute_error = abs(scored['stock_distributed'] - scored['preds'])
    scored['scaled_error'] = absolute_error * scored['mase_constant']
    per_series = scored.groupby(['site_code', 'product_code'])['scaled_error'].mean()
    return per_series.mean()

def mae_row(pred_df, clip_lower = True):
    '''
    Row-level MAE between `preds` and `stock_distributed`.
    With `clip_lower` negative predictions are first replaced by 0; this
    writes the clipped values back into the caller's `pred_df`.
    '''
    if not clip_lower:
        return mae(pred_df.preds, pred_df.stock_distributed)
    pred_df[['preds']] = pred_df[['preds']].clip(lower = 0)
    return mae(pred_df.preds, pred_df.stock_distributed)


# Modeling
def process_train(df, test_block = 43, verbose = 500, is_print = True,
                  use_log = False, use_weight = False,
                  remove_first_na = False, remove_first_zero = False,
                  version = 0):
    '''
    Train one LightGBM model on the rows before `test_block` and predict the
    following months.

    The number of predicted months depends on the shortest lag among the
    model features: 1 month if a lag_1 column is present, 2 if the shortest
    is lag_2, otherwise 3.

    df: modelling frame produced by `process_data`
    test_block: first idx of the validation block; 46 means the final test
        block, which is trained without a validation set
    verbose: forwarded to `lgb.train` as `verbose_eval`
    is_print: print the feature list and the validation idx range
    use_log: train on log1p(target) and transform predictions back
    use_weight: weight rows by `mase_constant` (the column must be in `df`)
    remove_first_na: train only on rows with `isna_int > 0`
    remove_first_zero: used only when `remove_first_na` is False; train only
        on rows with `iszero_int > 0`
    version: accepted for interface compatibility; not used here

    Reads the global `lgb_params`. Returns `(estimator, predictions)` where
    predictions has the columns site_code, product_code, idx, TARGET, preds.
    '''
    df = df.copy()
    local_params = lgb_params.copy()

    if use_log:
        df[TARGET] = np.log1p(df[TARGET])

    # Categorical feature: only cast the columns that are actually present
    # (explicit check instead of a bare `except` that hid every error)
    for col in categorical_features:
        if col in df.columns:
            df[col] = df[col].astype('category')

    # Our features
    remove_additional_features = ['isna_int', 'iszero_int', 'mase_constant', 'diff_abs']
    remove_additional_features_selected = list(set(remove_additional_features) & set(df.columns.tolist()))
    all_features = [col for col in list(df) if col not in (remove_features + remove_additional_features_selected)]
    if is_print: print(all_features)

    # Check lag: the shortest available lag decides how many months can be predicted
    if len([col for col in all_features if 'lag_1' in col]) > 0:
        block_next = 1
    elif len([col for col in all_features if 'lag_2' in col]) > 0:
        block_next = 2
    else:
        block_next = 3

    if remove_first_na:
        train_mask = (df['idx']<test_block) & (df['isna_int']>0)
    elif remove_first_zero:
        train_mask = (df['idx']<test_block) & (df['iszero_int']>0)
    else:
        train_mask = df['idx']<test_block
    # NOTE(review): for the final block the placeholder rows inherit `isna`
    # from the idx-45 row they were copied from, so series that were missing
    # in that month get no prediction — confirm this is intended.
    valid_mask = (df['idx'].isin(range(test_block,test_block + block_next))) & (df['isna'] == False)

    if use_weight:
        train_data = lgb.Dataset(df[train_mask][all_features], label=df[train_mask][TARGET], weight=df[train_mask]['mase_constant'])
        valid_data = lgb.Dataset(df[valid_mask][all_features], label=df[valid_mask][TARGET], weight=df[valid_mask]['mase_constant'])
    else:
        train_data = lgb.Dataset(df[train_mask][all_features], label=df[train_mask][TARGET])
        valid_data = lgb.Dataset(df[valid_mask][all_features], label=df[valid_mask][TARGET])

    print('Train data frame size: ({}, {})'.format(len(train_mask[train_mask]), len(all_features)))
    print('Train time block', df[train_mask]['idx'].min(), df[train_mask]['idx'].max())
    if is_print:
        print('Valid time block', df[valid_mask]['idx'].min(), df[valid_mask]['idx'].max())

    # Copy the slice: it is written to below, and writing to a view of `df`
    # is a chained assignment
    temp_df = df[valid_mask].copy()
    del df
    seed_everything(SEED)
    if test_block != 46:
        estimator = lgb.train(local_params,
                              train_data,
                              valid_sets = [valid_data],
                              verbose_eval = verbose)
    else:
        # Final model: there is no ground truth to validate or early-stop on
        if 'early_stopping_rounds' in local_params:
            del local_params['early_stopping_rounds']
        estimator = lgb.train(local_params,
                              train_data)

    temp_df['preds'] = estimator.predict(temp_df[all_features])
    if use_log:
        temp_df['preds'] = np.expm1(temp_df['preds'])
        temp_df[TARGET] = np.expm1(temp_df[TARGET])
    temp_df = temp_df[['site_code','product_code','idx',TARGET,'preds']]
    if ('mase_constant' in remove_additional_features_selected) and (test_block != 46):
        print('MASE is {}'.format(mase_df(temp_df)))
    return estimator, temp_df

## Modeling

Get MASE constant (denominator) for each CV

In [3]:
# Global lookup of the MASE constants per series and forecast month; read by mase_df()
summary_block = get_mase_constant_agg()

Read data, data frame size: (61065, 20)
Get cumulative nonzero flag
Get MASE constant
Get MASE constant
Get MASE constant
Get MASE constant
Get MASE constant


### LightGBM Separate Models

Generate predictions for all CV blocks with three different seeds (2020, 3030, 1010), using a separate LightGBM model for each forecast horizon (t+1, t+2, t+3)

In [4]:
# Global LightGBM parameters, read by process_train(); trained and evaluated on MAE
lgb_params = {'boosting_type': 'gbdt', 
              'objective': 'mean_absolute_error',
              'metric': ['mae'], 
              'learning_rate': 0.025,      
              'subsample': 0.9,       
              'subsample_freq': 1,     
              'num_leaves': 255,            
              'min_data_in_leaf': 255, 
              'feature_fraction': 0.5,
              'n_estimators': 1000,   
              'seed': SEED,
              'verbose': -1}

In [5]:
# Build lag features from every numerical stock column, not only the target
df = process_data(features = numerical_features)
df.head()

Read data, data frame size: (61065, 20)
Get cumulative nonzero flag
Generate lag features [1, 2, 3, 4], data frame size: (59708, 50)


Unnamed: 0,site_code,product_code,region,district,stock_initial,stock_received,stock_distributed,stock_adjustment,stock_end,average_monthly_consumption,stock_stockout_days,stock_ordered,ds,isna,idx,site_type,site_latitude,site_longitude,product_type,product_name,isna_int,iszero_int,stock_initial_lag_1,stock_received_lag_1,stock_adjustment_lag_1,stock_distributed_lag_1,stock_end_lag_1,average_monthly_consumption_lag_1,stock_ordered_lag_1,stock_initial_lag_2,stock_received_lag_2,stock_adjustment_lag_2,stock_distributed_lag_2,stock_end_lag_2,average_monthly_consumption_lag_2,stock_ordered_lag_2,stock_initial_lag_3,stock_received_lag_3,stock_adjustment_lag_3,stock_distributed_lag_3,stock_end_lag_3,average_monthly_consumption_lag_3,stock_ordered_lag_3,stock_initial_lag_4,stock_received_lag_4,stock_adjustment_lag_4,stock_distributed_lag_4,stock_end_lag_4,average_monthly_consumption_lag_4,stock_ordered_lag_4
4,C1004,AS21126,AGNEBY-TIASSA-ME,AGBOVILLE,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,2016-05-01,True,5,Hospital,5.92834,-4.21145,Injectable Contraceptive,MEDROXYPROGESTERONE 104MG/0.65ML INJ UNITE (SA...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,C1004,AS21126,AGNEBY-TIASSA-ME,AGBOVILLE,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,2016-06-01,True,6,Hospital,5.92834,-4.21145,Injectable Contraceptive,MEDROXYPROGESTERONE 104MG/0.65ML INJ UNITE (SA...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,C1004,AS21126,AGNEBY-TIASSA-ME,AGBOVILLE,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,2016-07-01,True,7,Hospital,5.92834,-4.21145,Injectable Contraceptive,MEDROXYPROGESTERONE 104MG/0.65ML INJ UNITE (SA...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,C1004,AS21126,AGNEBY-TIASSA-ME,AGBOVILLE,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,2016-08-01,True,8,Hospital,5.92834,-4.21145,Injectable Contraceptive,MEDROXYPROGESTERONE 104MG/0.65ML INJ UNITE (SA...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,C1004,AS21126,AGNEBY-TIASSA-ME,AGBOVILLE,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,2016-09-01,True,9,Hospital,5.92834,-4.21145,Injectable Contraceptive,MEDROXYPROGESTERONE 104MG/0.65ML INJ UNITE (SA...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# All-feature run with the default seed: one model per horizon, training rows
# before the first reported value of each series are dropped
pred1 = process_train_cv(df, is_print=False, verbose=-1, 
                         version='allf_seed_2020_remove_first_na',
                         use_separate_model=True, remove_first_na=True)

Train data frame size: (41653, 22)
Train time block 5 42
Train data frame size: (41653, 29)
Train time block 5 42
Train data frame size: (41653, 36)
Train time block 5 42
Train data frame size: (37636, 22)
Train time block 5 39
Train data frame size: (37636, 29)
Train time block 5 39
Train data frame size: (37636, 36)
Train time block 5 39
Train data frame size: (33867, 22)
Train time block 5 36
Train data frame size: (33867, 29)
Train time block 5 36
Train data frame size: (33867, 36)
Train time block 5 36
Train data frame size: (30229, 22)
Train time block 5 33
Train data frame size: (30229, 29)
Train time block 5 33
Train data frame size: (30229, 36)
Train time block 5 33
Train data frame size: (45704, 22)
Train time block 5 45
Train data frame size: (45704, 29)
Train time block 5 45
Train data frame size: (45704, 36)
Train time block 5 45
CV details is [0.9557, 1.1277, 1.0153, 1.0285]
CV-1 is 0.9557, CV mean is 1.0318 and CV std is 0.0618
MASE-CV details is [9.6989, 9.6286, 9.8592,

In [7]:
# Same run with LightGBM seed 3030.
# NOTE(review): process_train() still calls seed_everything(SEED), so only
# LightGBM's own 'seed' changes; lgb_params also still carries the 'lambda_l2'
# entry added by the previous process_train_cv call.
lgb_params.update({'seed': 3030})
pred2 = process_train_cv(df, is_print=False, verbose=-1, 
                         version='allf_seed_3030_remove_first_na',
                         use_separate_model=True, remove_first_na=True)

Train data frame size: (41653, 22)
Train time block 5 42
Train data frame size: (41653, 29)
Train time block 5 42
Train data frame size: (41653, 36)
Train time block 5 42
Train data frame size: (37636, 22)
Train time block 5 39
Train data frame size: (37636, 29)
Train time block 5 39
Train data frame size: (37636, 36)
Train time block 5 39
Train data frame size: (33867, 22)
Train time block 5 36
Train data frame size: (33867, 29)
Train time block 5 36
Train data frame size: (33867, 36)
Train time block 5 36
Train data frame size: (30229, 22)
Train time block 5 33
Train data frame size: (30229, 29)
Train time block 5 33
Train data frame size: (30229, 36)
Train time block 5 33
Train data frame size: (45704, 22)
Train time block 5 45
Train data frame size: (45704, 29)
Train time block 5 45
Train data frame size: (45704, 36)
Train time block 5 45
CV details is [0.9587, 1.1365, 1.0223, 1.0349]
CV-1 is 0.9587, CV mean is 1.0381 and CV std is 0.0637
MASE-CV details is [9.7176, 9.6865, 9.8932,

In [8]:
# Same run with LightGBM seed 1010 (lgb_params is updated in place, see the previous cell)
lgb_params.update({'seed': 1010})
pred3 = process_train_cv(df, is_print=False, verbose=-1, 
                         version='allf_seed_1010_remove_first_na',
                         use_separate_model=True, remove_first_na=True)

Train data frame size: (41653, 22)
Train time block 5 42
Train data frame size: (41653, 29)
Train time block 5 42
Train data frame size: (41653, 36)
Train time block 5 42
Train data frame size: (37636, 22)
Train time block 5 39
Train data frame size: (37636, 29)
Train time block 5 39
Train data frame size: (37636, 36)
Train time block 5 39
Train data frame size: (33867, 22)
Train time block 5 36
Train data frame size: (33867, 29)
Train time block 5 36
Train data frame size: (33867, 36)
Train time block 5 36
Train data frame size: (30229, 22)
Train time block 5 33
Train data frame size: (30229, 29)
Train time block 5 33
Train data frame size: (30229, 36)
Train time block 5 33
Train data frame size: (45704, 22)
Train time block 5 45
Train data frame size: (45704, 29)
Train time block 5 45
Train data frame size: (45704, 36)
Train time block 5 45
CV details is [0.9555, 1.1313, 1.0116, 1.0267]
CV-1 is 0.9555, CV mean is 1.0313 and CV std is 0.0636
MASE-CV details is [9.6858, 9.6567, 9.9324,

### Use Single Feature Only

In [9]:
# Reset the parameters (this also discards the 'lambda_l2' and 'seed' changes
# made by the earlier cells) and rebuild the frame with lags of the target only.
# NOTE(review): no diff features are generated in this cell although the
# version name ends in '_diff', and remove_first_na is left at its default (False).
lgb_params = {'boosting_type': 'gbdt', 
              'objective': 'mean_absolute_error',
              'metric': ['mae'], 
              'learning_rate': 0.025,      
              'subsample': 0.9,       
              'subsample_freq': 1,     
              'num_leaves': 255,            
              'min_data_in_leaf': 255, 
              'feature_fraction': 0.5,
              'n_estimators': 1000,   
              'seed': SEED,
              'verbose': -1}
df = process_data()
pred4 = process_train_cv(df, is_print=False, verbose=-1, 
                         version='basef_seed_2020_diff',
                         use_separate_model=True)

Read data, data frame size: (61065, 20)
Get cumulative nonzero flag
Generate lag features [1, 2, 3, 4], data frame size: (59708, 26)
Train data frame size: (51566, 10)
Train time block 5 42
Train data frame size: (51566, 11)
Train time block 5 42
Train data frame size: (51566, 12)
Train time block 5 42
Train data frame size: (47495, 10)
Train time block 5 39
Train data frame size: (47495, 11)
Train time block 5 39
Train data frame size: (47495, 12)
Train time block 5 39
Train data frame size: (43424, 10)
Train time block 5 36
Train data frame size: (43424, 11)
Train time block 5 36
Train data frame size: (43424, 12)
Train time block 5 36
Train data frame size: (39353, 10)
Train time block 5 33
Train data frame size: (39353, 11)
Train time block 5 33
Train data frame size: (39353, 12)
Train time block 5 33
Train data frame size: (55637, 10)
Train time block 5 45
Train data frame size: (55637, 11)
Train time block 5 45
Train data frame size: (55637, 12)
Train time block 5 45
CV details i

In [10]:
# Target-only lags plus pairwise lag differences.
# NOTE(review): the result is stored in `pred4` again, overwriting the
# predictions of the previous cell in memory (the CSV files are distinct).
lgb_params = {'boosting_type': 'gbdt', 
              'objective': 'mean_absolute_error',
              'metric': ['mae'], 
              'learning_rate': 0.025,      
              'subsample': 0.9,       
              'subsample_freq': 1,     
              'num_leaves': 255,            
              'min_data_in_leaf': 255, 
              'feature_fraction': 0.5,
              'n_estimators': 1000,   
              'seed': SEED,
              'verbose': -1}
df = process_data()
df = generate_diff_features(df)
pred4 = process_train_cv(df, is_print=False, verbose=-1, 
                         version='basef_seed_2020_diff_2',
                         use_separate_model=True)

Read data, data frame size: (61065, 20)
Get cumulative nonzero flag
Generate lag features [1, 2, 3, 4], data frame size: (59708, 26)
Generate diff features
Train data frame size: (51566, 11)
Train time block 5 42
Train data frame size: (51566, 14)
Train time block 5 42
Train data frame size: (51566, 18)
Train time block 5 42
Train data frame size: (47495, 11)
Train time block 5 39
Train data frame size: (47495, 14)
Train time block 5 39
Train data frame size: (47495, 18)
Train time block 5 39
Train data frame size: (43424, 11)
Train time block 5 36
Train data frame size: (43424, 14)
Train time block 5 36
Train data frame size: (43424, 18)
Train time block 5 36
Train data frame size: (39353, 11)
Train time block 5 33
Train data frame size: (39353, 14)
Train time block 5 33
Train data frame size: (39353, 18)
Train time block 5 33
Train data frame size: (55637, 11)
Train time block 5 45
Train data frame size: (55637, 14)
Train time block 5 45
Train data frame size: (55637, 18)
Train time 

### RMSE

In [None]:
# Parameters for the squared-error variant: 'regression' is LightGBM's L2
# objective (the previous value 'regre' is not a valid objective name).
# The evaluation metric stays MAE so results remain comparable with the other runs.
lgb_params = {'boosting_type': 'gbdt',
              'objective': 'regression',
              'metric': ['mae'],
              'learning_rate': 0.025,
              'subsample': 0.9,
              'subsample_freq': 1,
              'num_leaves': 255,
              'min_data_in_leaf': 255,
              'feature_fraction': 0.5,
              'n_estimators': 1000,
              'seed': SEED,
              'verbose': -1}