# Predicting sales using LGBM

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelBinarizer, StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import GridSearchCV

import lightgbm as lgb


## Read in data

In [28]:
# read in features of one store (TX_2)
features = pd.read_pickle('TX_2_features.pkl').dropna()

## Define cost function

To simplify things, we use RMSE as an approximate. In future, a custom loss function can be defined for this particular problem

In [7]:
def rmse(y_true,y_pred):
    return np.sqrt(mse(y_true,y_pred))

## Prepare training and validation set

In [10]:
train = features[(features.d<=1863) & (features.d>=1069+NUM_LAG_DAYS)].dropna()
val = features[(features.d > 1863) & (features.d < 1914)].dropna()
test = features[features.d >= 1914]
drop_cols = ['sales','id','d','wm_yr_wk','original_id','store_id','item_id']
num_weeks = NUM_LAG_DAYS // 7
for j in range(1,num_weeks+1):
    drop_cols += ['lag'+str(j) for j in range(j*7-6,j*7)]
categorical_cols = ['event_name_1','event_name_2','event_type_1','event_type_2']
x_train = train.drop(drop_cols,axis=1)
y_train = train.sales.values
x_val = val.drop(drop_cols,axis=1)
y_val = val.sales.values    

## Finding the best hyperparameters using GridSearch

There are a lot of hyperparameters that we can tune, such as number of leaves, maximum depth, how we sample the data and features etc. We can find the best set of parameters using GridSearch

In [25]:
# lgbm gridsearch
lgb_params = {
        'boosting_type': 'gbdt',         
        'objective': 'regression',       
        'metric': ['rmse'],             
        'subsample': 0.8,                
        'subsample_freq': 1,
        'learning_rate': 0.03,           
        'num_leaves': 2**9-1,            
        'min_data_in_leaf': 2**8-1,      
        'feature_fraction': 0.8,
        'n_estimators': 1,            
        'early_stopping_rounds': 30,     
        'verbose': -1,
        'max_bin':2**9-1
            } 
train_set = lgb.Dataset(x_train, y_train)
val_set = lgb.Dataset(x_val, y_val)

lgbm_model = lgb.LGBMRegressor(boosting_type='gbdt',
                                num_leaves=2**9,
                               learning_rate=0.05,
                               n_estimators=10,
                               subsample=0.8,
                               subsample_freq=1,
                               objective='regression',
                              min_child_sample=2**4)
gridParams = {
    'num_leaves': [2**9,2**10], 
    'subsample' : [0.6,0.7,0.8],
    'min_child_sample': [2**4,2**5],      
    'feature_fraction': [0.6,0.7,0.8],
    }

grid = GridSearchCV(lgbm_model, gridParams, verbose=1,cv=2, n_jobs=-1)
grid.fit(x_train, y_train)
print(grid.best_params_)
print(grid.best_score_)

Fitting 2 folds for each of 36 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:  6.6min finished


{'feature_fraction': 0.6, 'min_child_sample': 16, 'num_leaves': 512, 'subsample': 0.6}
nan


## Training an LGBM model 

Using the best performing hyperparameters found using GridSearch, we can train the model. Of note, training will be stopped pre-maturely when validation RMSE does not improve for 30 boosting rounds.

In [26]:
lgb_params = {
        'boosting_type': 'gbdt',         
        'objective': 'regression',       
        'metric': ['rmse'],             
        'subsample': 0.6,                
        'subsample_freq': 1,
        'learning_rate': 0.05,           
        'num_leaves': 2**9,            
        'min_data_in_leaf': 16,      
        'feature_fraction': 0.6,
        'n_estimators': 5000,            
        'early_stopping_rounds': 30,     
        'verbose': -1,
            } 
train_set = lgb.Dataset(x_train, y_train)
val_set = lgb.Dataset(x_val, y_val)
lgb_model = lgb.train(lgb_params, train_set, num_boost_round = 2000, valid_sets = [train_set, val_set], verbose_eval = 100)

Training until validation scores don't improve for 30 rounds
[100]	training's rmse: 1.76869	valid_1's rmse: 1.78133
Early stopping, best iteration is:
[79]	training's rmse: 1.81904	valid_1's rmse: 1.77623


In [29]:
# class DataFrameSelector(BaseEstimator, TransformerMixin):
#     def __init__(self, attribute_names):
#         self.attribute_names = attribute_names
#     def fit(self, X, y=None):
#         return self
#     def transform(self, X):
#         return X[self.attribute_names].values

# # construct feature preprocessing pipelines
# categorical_cols = ['event_name_1','event_name_2','event_type_1','event_type_2']
# numerical_cols = [col for col in features.columns if col not in categorical_cols]

# numerical_pipeline = Pipeline([
#     ('selector', DataFrameSelector(numerical_cols)),
# #     ('std_scaler', StandardScaler()),
# ])
# categorical_pipeline = Pipeline([
#     ('selector', DataFrameSelector(categorical_cols)),
#     ('one_hot_encoder', OneHotEncoder(sparse=False))
# ])

# full_pipeline = FeatureUnion(transformer_list=[
#     ("numerical_pipeline", numerical_pipeline),
#     ("categorical_pipeline", categorical_pipeline),
# ])

# NUM_LAG_DAYS = 28

# drop_cols = ['sales','id','d','wm_yr_wk','original_id','store_id','item_id']
# num_weeks = NUM_LAG_DAYS // 7
# for j in range(1,num_weeks+1):
#     drop_cols += ['lag'+str(j) for j in range(j*7-6,j*7)]
# y = features.sales
# lm_features = features.drop(drop_cols,axis=1) 
# lm_features = full_pipeline.fit_transform(features)

# train-test split
# cut = lm_features.shape[0]//5
# x_train = lm_features[cut:]
# x_val = lm_features[:cut]
# y_train = y[cut:]
# y_val = y[:cut]  

# best_lasso_model = None
# best_lasso_score = float('inf')
# for a in tqdm(range(1,51)):
#     lasso_model = Lasso(alpha=a/10).fit(x_train.drop(categorical_cols,axis=1),y_train)
#     val_pred_lasso = lasso_model.predict(x_val.drop(categorical_cols,axis=1))
#     val_score_lasso = rmse(val_pred_lasso,y_val)
#     if val_score_lasso < best_lasso_score:
#         best_lasso_score = val_score_lasso
#         best_lasso_model = lasso_model
# print(f'RMSE for lasso: {best_lasso_score}')
# best_ridge_model = None
# best_ridge_score = float('inf')
# for a in tqdm(range(1,51)):
#     ridge_model = Ridge(alpha=a/10).fit(x_train.drop(categorical_cols,axis=1),y_train)
#     val_pred_ridge = ridge_model.predict(x_val.drop(categorical_cols,axis=1))
#     val_score_ridge = rmse(val_pred_ridge,y_val)
#     if val_score_ridge < best_ridge_score:
#         best_ridge_score = val_score_ridge
#         best_ridge_model = ridge_model
# print(f'RMSE for ridge: {best_ridge_score}')

KeyError: "['lag6', 'lag12', 'lag16', 'sales', 'lag17', 'lag8', 'lag26', 'lag3', 'lag15', 'lag2', 'lag19', 'lag22', 'lag4', 'lag23', 'lag9', 'original_id', 'lag10', 'd', 'lag27', 'lag25', 'store_id', 'lag1', 'id', 'lag5', 'wm_yr_wk', 'lag18', 'item_id', 'lag11', 'lag24', 'lag13', 'lag20'] not in index"