In [11]:
import dask.dataframe as dask_df
import numpy as np
import pandas as pd
import plotly.express as px
import warnings
import pickle
from distributed import Client
from lightgbm import LGBMRegressor
from lightgbm import Booster
from lightgbm import plot_importance
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.preprocessing import LabelEncoder

# suppress warning messages
warnings.filterwarnings('ignore')

client = Client()
client

0,1
Client  Scheduler: tcp://127.0.0.1:61741  Dashboard: http://127.0.0.1:61740/status,Cluster  Workers: 4  Cores: 8  Memory: 8.43 GB


In [2]:
# this function identifies the smallest data type that can hold the largest value in a numeric dataframe column
# takes in a list of triads (column header, current data type, maximum column value)
# returns a dictionary of column header as key and datatype as value

def identify_numeric_type(list):
    
    new_type = []
    
    for data_head, data_type, data_max in list:
        if 'int' in str(data_type):
            if data_max < np.iinfo(np.int8).max:
                new_type.append((data_head, 'int8'))
            elif data_max < np.iinfo(np.int16).max:
                new_type.append((data_head, 'int16'))
            elif data_max < np.iinfo(np.int32).max:
                new_type.append((data_head, 'int32'))
            elif data_max < np.iinfo(np.int64).max:
                new_type.append((data_head, 'int64'))
        elif 'float' in str(data_type):
            if data_max < np.finfo(np.float16).max:
                new_type.append((data_head, 'float16'))
            elif data_max < np.finfo(np.float32).max:
                new_type.append((data_head, 'float32'))
            elif data_max < np.finfo(np.float64).max:
                new_type.append((data_head, 'float64'))
                
    return dict(new_type)

In [3]:
# train and test dataframes are around 852.9 MB and 27.4 MB in size

train = dask_df.read_csv('./m5-forecasting-accuracy/train_test_split/train/*', header = 'infer', \
                         parse_dates = ['date']).compute().set_index('date')
train.info(verbose = False)

test = dask_df.read_csv('./m5-forecasting-accuracy/train_test_split/test/*', header = 'infer', \
                        parse_dates = ['date']).compute().set_index('date')
test.info(verbose = False)

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2661777 entries, 2013-12-04 to 2016-04-24
Columns: 41 entries, store_id to variance_trend_lag_7
dtypes: float64(18), int64(15), object(8)
memory usage: 852.9+ MB
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 85372 entries, 2016-04-25 to 2016-05-22
Columns: 41 entries, store_id to variance_trend_lag_7
dtypes: float64(18), int64(15), object(8)
memory usage: 27.4+ MB


In [4]:
# converting numeric type columns to minimal datatypes to reduce storage

train = train.drop('day', axis = 1)
test = test.drop('day', axis = 1)

numeric_columns = train.select_dtypes(include = np.number).columns.tolist()

data_head = train.loc[:, numeric_columns].columns.to_list()
data_type = train.loc[:, numeric_columns].dtypes.to_list()
data_max = train.loc[:, numeric_columns].max().to_list()
data_list = np.stack([data_head, data_type, data_max], axis = 1)

train_type_dict = identify_numeric_type(data_list)
train = train.astype(train_type_dict)

data_head = test.loc[:, numeric_columns].columns.to_list()
data_type = test.loc[:, numeric_columns].dtypes.to_list()
data_max = test.loc[:, numeric_columns].max().to_list()
data_list = np.stack([data_head, data_type, data_max], axis = 1)

test_type_dict = identify_numeric_type(data_list)
test = test.astype(test_type_dict)

# it reduces the space used by train dataframe to 332.5 MB
train.info(verbose = False)

# it reduces the space used by test dataframe to 10.2 MB
test.info(verbose = False)

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2661777 entries, 2013-12-04 to 2016-04-24
Columns: 40 entries, store_id to variance_trend_lag_7
dtypes: float16(18), int16(9), int8(5), object(8)
memory usage: 332.5+ MB
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 85372 entries, 2016-04-25 to 2016-05-22
Columns: 40 entries, store_id to variance_trend_lag_7
dtypes: float16(18), int16(3), int8(11), object(8)
memory usage: 10.2+ MB


In [5]:
# encoding values in all non-numeric type columns to numbers
# saving the LabelEncoder objects to disk to use them for decoding later

other_columns = train.select_dtypes(exclude = np.number).columns.tolist()

for column in other_columns:
    label_encoder = LabelEncoder()
    train[column] = label_encoder.fit_transform(train[column])
    test[column] = label_encoder.transform(test[column])
    pickle.dump(label_encoder, open('./Pickle/label_encoder_' + column + '.pkl', 'wb'))

In [6]:
# splitting training, validation, and testing sets into features and target variable

X_train = train[train.index < '2016-3-28'].drop('units_sold', axis = 1)
y_train = train[train.index < '2016-3-28']['units_sold']

X_validation = train[train.index >= '2016-3-28'].drop('units_sold', axis = 1)
y_validation = train[train.index >= '2016-3-28']['units_sold']

X_test = test.drop('units_sold', axis = 1)
y_test = test['units_sold']

In [43]:
# running a randomized search for hyperparameters of LGBM Regressor using a time series cross validation of 5 splits

%%time

lgbm_estimator = LGBMRegressor()

param_distributions = {'boosting_type': ['gbdt'],
                       'objective': ['tweedie'],
                       'tweedie_variance_power': [1.1, 1.3, 1.5],
                       'n_estimators': [500, 1000],
                       'metric': ['rmse'],
                       'max_depth': [10, 30, 50],
                       'num_leaves': [50, 100, 250, 500],
                       'learning_rate': [0.03, 0.1, 0.3],
                       'feature_fraction': [0.5, 0.7],
                       'bagging_fraction': [0.5, 0.7]}

n_iter = 1

time_series_split = TimeSeriesSplit(n_splits = 5)

randomized_search_cv = RandomizedSearchCV(estimator = lgbm_estimator,
                                   param_distributions = param_distributions,
                                   n_iter = n_iter,
                                   cv = time_series_split,
                                   scoring = 'neg_mean_squared_error',
                                   n_jobs = -1)

randomized_search_cv.fit(X_train,
                         y_train,
                         eval_metric = 'rmse',
                         eval_set = [(X_train, y_train), (X_validation, y_validation)],
                         verbose = 1)

[1]	training's rmse: 3.76023	valid_1's rmse: 3.45158
[2]	training's rmse: 3.73757	valid_1's rmse: 3.42874
[3]	training's rmse: 3.71597	valid_1's rmse: 3.40682
[4]	training's rmse: 3.69352	valid_1's rmse: 3.3841
[5]	training's rmse: 3.67099	valid_1's rmse: 3.36136
[6]	training's rmse: 3.648	valid_1's rmse: 3.33833
[7]	training's rmse: 3.62462	valid_1's rmse: 3.31485
[8]	training's rmse: 3.60196	valid_1's rmse: 3.29182
[9]	training's rmse: 3.57857	valid_1's rmse: 3.26842
[10]	training's rmse: 3.55479	valid_1's rmse: 3.2448
[11]	training's rmse: 3.53058	valid_1's rmse: 3.22075
[12]	training's rmse: 3.50651	valid_1's rmse: 3.19684
[13]	training's rmse: 3.48206	valid_1's rmse: 3.17261
[14]	training's rmse: 3.45739	valid_1's rmse: 3.14835
[15]	training's rmse: 3.43276	valid_1's rmse: 3.12407
[16]	training's rmse: 3.40764	valid_1's rmse: 3.0994
[17]	training's rmse: 3.3825	valid_1's rmse: 3.0749
[18]	training's rmse: 3.35796	valid_1's rmse: 3.05086
[19]	training's rmse: 3.33239	valid_1's rmse

[149]	training's rmse: 2.16144	valid_1's rmse: 2.02084
[150]	training's rmse: 2.16047	valid_1's rmse: 2.02062
[151]	training's rmse: 2.15969	valid_1's rmse: 2.01959
[152]	training's rmse: 2.15916	valid_1's rmse: 2.0193
[153]	training's rmse: 2.15837	valid_1's rmse: 2.01909
[154]	training's rmse: 2.15768	valid_1's rmse: 2.01901
[155]	training's rmse: 2.15654	valid_1's rmse: 2.01876
[156]	training's rmse: 2.15548	valid_1's rmse: 2.01844
[157]	training's rmse: 2.15501	valid_1's rmse: 2.01838
[158]	training's rmse: 2.15415	valid_1's rmse: 2.01837
[159]	training's rmse: 2.15353	valid_1's rmse: 2.01815
[160]	training's rmse: 2.15238	valid_1's rmse: 2.01758
[161]	training's rmse: 2.15172	valid_1's rmse: 2.01722
[162]	training's rmse: 2.1509	valid_1's rmse: 2.01677
[163]	training's rmse: 2.15012	valid_1's rmse: 2.01642
[164]	training's rmse: 2.1497	valid_1's rmse: 2.01634
[165]	training's rmse: 2.1488	valid_1's rmse: 2.01479
[166]	training's rmse: 2.14818	valid_1's rmse: 2.0146
[167]	training'

[299]	training's rmse: 2.09611	valid_1's rmse: 1.99068
[300]	training's rmse: 2.09585	valid_1's rmse: 1.9906
[301]	training's rmse: 2.0952	valid_1's rmse: 1.99027
[302]	training's rmse: 2.09497	valid_1's rmse: 1.99006
[303]	training's rmse: 2.09461	valid_1's rmse: 1.98982
[304]	training's rmse: 2.0945	valid_1's rmse: 1.98986
[305]	training's rmse: 2.09393	valid_1's rmse: 1.9892
[306]	training's rmse: 2.09385	valid_1's rmse: 1.98918
[307]	training's rmse: 2.09331	valid_1's rmse: 1.98734
[308]	training's rmse: 2.09295	valid_1's rmse: 1.98715
[309]	training's rmse: 2.09291	valid_1's rmse: 1.98711
[310]	training's rmse: 2.09237	valid_1's rmse: 1.98702
[311]	training's rmse: 2.09178	valid_1's rmse: 1.98687
[312]	training's rmse: 2.09147	valid_1's rmse: 1.98675
[313]	training's rmse: 2.09126	valid_1's rmse: 1.98667
[314]	training's rmse: 2.09107	valid_1's rmse: 1.98649
[315]	training's rmse: 2.09045	valid_1's rmse: 1.98632
[316]	training's rmse: 2.0899	valid_1's rmse: 1.98614
[317]	training'

[449]	training's rmse: 2.04878	valid_1's rmse: 1.96153
[450]	training's rmse: 2.04854	valid_1's rmse: 1.96141
[451]	training's rmse: 2.04847	valid_1's rmse: 1.9614
[452]	training's rmse: 2.04781	valid_1's rmse: 1.96116
[453]	training's rmse: 2.04708	valid_1's rmse: 1.96083
[454]	training's rmse: 2.04706	valid_1's rmse: 1.96079
[455]	training's rmse: 2.04659	valid_1's rmse: 1.96045
[456]	training's rmse: 2.04622	valid_1's rmse: 1.96019
[457]	training's rmse: 2.04585	valid_1's rmse: 1.9598
[458]	training's rmse: 2.04567	valid_1's rmse: 1.95973
[459]	training's rmse: 2.04474	valid_1's rmse: 1.95952
[460]	training's rmse: 2.0443	valid_1's rmse: 1.95927
[461]	training's rmse: 2.04394	valid_1's rmse: 1.95913
[462]	training's rmse: 2.04376	valid_1's rmse: 1.95805
[463]	training's rmse: 2.04367	valid_1's rmse: 1.95803
[464]	training's rmse: 2.04364	valid_1's rmse: 1.95801
[465]	training's rmse: 2.04316	valid_1's rmse: 1.95768
[466]	training's rmse: 2.04244	valid_1's rmse: 1.95723
[467]	trainin

[600]	training's rmse: 2.00857	valid_1's rmse: 1.93496
[601]	training's rmse: 2.00819	valid_1's rmse: 1.93452
[602]	training's rmse: 2.00738	valid_1's rmse: 1.93427
[603]	training's rmse: 2.00706	valid_1's rmse: 1.93411
[604]	training's rmse: 2.00678	valid_1's rmse: 1.93401
[605]	training's rmse: 2.00664	valid_1's rmse: 1.93383
[606]	training's rmse: 2.00662	valid_1's rmse: 1.93382
[607]	training's rmse: 2.00634	valid_1's rmse: 1.9338
[608]	training's rmse: 2.00615	valid_1's rmse: 1.93376
[609]	training's rmse: 2.00598	valid_1's rmse: 1.93362
[610]	training's rmse: 2.00596	valid_1's rmse: 1.93361
[611]	training's rmse: 2.00574	valid_1's rmse: 1.93326
[612]	training's rmse: 2.00562	valid_1's rmse: 1.93297
[613]	training's rmse: 2.00531	valid_1's rmse: 1.93257
[614]	training's rmse: 2.00522	valid_1's rmse: 1.93256
[615]	training's rmse: 2.00503	valid_1's rmse: 1.93218
[616]	training's rmse: 2.00472	valid_1's rmse: 1.93201
[617]	training's rmse: 2.00387	valid_1's rmse: 1.93072
[618]	train

[750]	training's rmse: 1.97086	valid_1's rmse: 1.90672
[751]	training's rmse: 1.97071	valid_1's rmse: 1.90674
[752]	training's rmse: 1.97067	valid_1's rmse: 1.90669
[753]	training's rmse: 1.97046	valid_1's rmse: 1.90666
[754]	training's rmse: 1.97025	valid_1's rmse: 1.90643
[755]	training's rmse: 1.97021	valid_1's rmse: 1.90643
[756]	training's rmse: 1.97001	valid_1's rmse: 1.90619
[757]	training's rmse: 1.96996	valid_1's rmse: 1.90617
[758]	training's rmse: 1.96967	valid_1's rmse: 1.90609
[759]	training's rmse: 1.96931	valid_1's rmse: 1.90587
[760]	training's rmse: 1.96891	valid_1's rmse: 1.90553
[761]	training's rmse: 1.96867	valid_1's rmse: 1.90534
[762]	training's rmse: 1.96853	valid_1's rmse: 1.90522
[763]	training's rmse: 1.96849	valid_1's rmse: 1.90513
[764]	training's rmse: 1.96828	valid_1's rmse: 1.90501
[765]	training's rmse: 1.96825	valid_1's rmse: 1.90498
[766]	training's rmse: 1.96822	valid_1's rmse: 1.90496
[767]	training's rmse: 1.96813	valid_1's rmse: 1.90488
[768]	trai

[900]	training's rmse: 1.94017	valid_1's rmse: 1.88042
[901]	training's rmse: 1.93951	valid_1's rmse: 1.87984
[902]	training's rmse: 1.93949	valid_1's rmse: 1.87982
[903]	training's rmse: 1.93938	valid_1's rmse: 1.87976
[904]	training's rmse: 1.93935	valid_1's rmse: 1.87974
[905]	training's rmse: 1.93894	valid_1's rmse: 1.8794
[906]	training's rmse: 1.93865	valid_1's rmse: 1.87916
[907]	training's rmse: 1.93841	valid_1's rmse: 1.87768
[908]	training's rmse: 1.93826	valid_1's rmse: 1.87768
[909]	training's rmse: 1.93801	valid_1's rmse: 1.8776
[910]	training's rmse: 1.93729	valid_1's rmse: 1.87709
[911]	training's rmse: 1.93716	valid_1's rmse: 1.87714
[912]	training's rmse: 1.93714	valid_1's rmse: 1.87714
[913]	training's rmse: 1.93713	valid_1's rmse: 1.87714
[914]	training's rmse: 1.93664	valid_1's rmse: 1.87704
[915]	training's rmse: 1.93661	valid_1's rmse: 1.87701
[916]	training's rmse: 1.93578	valid_1's rmse: 1.8767
[917]	training's rmse: 1.93576	valid_1's rmse: 1.8767
[918]	training

RandomizedSearchCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None),
                   estimator=LGBMRegressor(), n_iter=1, n_jobs=-1,
                   param_distributions={'bagging_fraction': [0.5, 0.7],
                                        'boosting_type': ['gbdt'],
                                        'feature_fraction': [0.5, 0.7],
                                        'learning_rate': [0.03, 0.1, 0.3],
                                        'max_depth': [10, 30, 50],
                                        'metric': ['rmse'],
                                        'n_estimators': [500, 1000],
                                        'num_leaves': [50, 100, 250, 500],
                                        'objective': ['tweedie'],
                                        'tweedie_variance_power': [1.1, 1.3,
                                                                   1.5]},
                   scoring='neg_mean_squared_error')

In [44]:
# hyperparameters of the best estimator obtained by the randomized search
# we do not need to run this step again

randomized_search_cv.best_estimator_

LGBMRegressor(bagging_fraction=0.7, feature_fraction=0.5, learning_rate=0.03,
              max_depth=10, metric='rmse', n_estimators=1000, num_leaves=250,
              objective='tweedie', tweedie_variance_power=1.3)

In [7]:
# training an LGBM Regressor over the entire training set using the best hyperparameters obtained above
# saving to model to disk for later use

X_train = train.drop('units_sold', axis = 1)
y_train = train['units_sold']

lgbm_regressor_model = LGBMRegressor(objective = 'tweedie',
                                     tweedie_variance_power = 1.3,
                                     boosting_type = 'gbdt',
                                     metric = 'rmse',
                                     n_estimators = 1000,
                                     num_leaves = 250,
                                     max_depth = 10,
                                     learning_rate = 0.03,
                                     feature_fraction = 0.5,
                                     bagging_fraction = 0.7,
                                     n_iter = 1000)

lgbm_regressor_model.fit(X_train, y_train, eval_set = [(X_train, y_train)], \
                         eval_metric = 'rmse', early_stopping_rounds = 20, verbose = 20)

lgbm_regressor_model.booster_.save_model('C:/Big Data Project/Models/lgbm_regressor_model.mdl')

Training until validation scores don't improve for 20 rounds
[20]	training's rmse: 3.3074
[40]	training's rmse: 2.8196
[60]	training's rmse: 2.48248
[80]	training's rmse: 2.31131
[100]	training's rmse: 2.23027
[120]	training's rmse: 2.19243
[140]	training's rmse: 2.16832
[160]	training's rmse: 2.15238
[180]	training's rmse: 2.13978
[200]	training's rmse: 2.12992
[220]	training's rmse: 2.12245
[240]	training's rmse: 2.11535
[260]	training's rmse: 2.10995
[280]	training's rmse: 2.10216
[300]	training's rmse: 2.09585
[320]	training's rmse: 2.08913
[340]	training's rmse: 2.08379
[360]	training's rmse: 2.07802
[380]	training's rmse: 2.07192
[400]	training's rmse: 2.06492
[420]	training's rmse: 2.05892
[440]	training's rmse: 2.05184
[460]	training's rmse: 2.0443
[480]	training's rmse: 2.03818
[500]	training's rmse: 2.03357
[520]	training's rmse: 2.0272
[540]	training's rmse: 2.02263
[560]	training's rmse: 2.0182
[580]	training's rmse: 2.01305
[600]	training's rmse: 2.00857
[620]	training's r

<lightgbm.basic.Booster at 0x1fc02b52080>

In [8]:
# load the LGBM Regressor model from the disk
# obtain predictions over the test set and evaluate using the metric rmse

lgbm_regressor_model = Booster(model_file = 'C:/Big Data Project/Models/lgbm_regressor_model.mdl')

y_test_predictions = lgbm_regressor_model.predict(X_test)
rmse = sqrt(mean_squared_error(y_test, y_test_predictions))

In [9]:
rmse

2.1234526334106243

In [14]:
# obtain feature importances from the LGBM Regressor generated

feature_importances_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': lgbm_regressor_model.feature_importance()})

In [18]:
# the plot below shows the feature importances of our features in a descending order

figure = px.bar(feature_importances_df, x = 'Importance', y = 'Feature', orientation = 'h')
figure.update_layout(title_text = 'Feature Importance Plot: Random Forests Regression', template = 'seaborn', height = 800, \
                     yaxis={'categoryorder':'total ascending'})
figure.show()