In [1]:
#importing necessary libraries
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import SGDRegressor


import lightgbm as lgb
import xgboost as xgb

import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
from tqdm import tqdm
from lightgbm import LGBMRegressor, Dataset
from sklearn.metrics import mean_squared_error as mse
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.svm import SVR
from sklearn.isotonic import IsotonicRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

import math
import pickle
import joblib
from downcast import reduce

import warnings
warnings.filterwarnings("ignore")


# to display all column of datapoints
pd.set_option('display.max_columns', None)

pd.options.mode.chained_assignment = None
pd.set_option('display.float_format', '{:.2f}'.format)


In [2]:
#Reload the dataframe as pickle file

with open('final.pkl', 'rb') as fp:
    
    data = pickle.load(fp)
    
data=reduce(data)

In [3]:
data.shape

(29544810, 39)

In [4]:
data.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,day,sales,wm_yr_wk,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price,lag_28,lag_35,lag_42,lag_49,lag_56,lag_63,lag_70,lag_77,lag_84,rolling_mean_7,rolling_median_7,rolling_mean_14,rolling_median_14,rolling_mean_28,rolling_median_28,rolling_mean_35,rolling_median_35,rolling_mean_60,rolling_median_60
30490000,14370,1437,3,1,0,0,1001,2,11339,7,10,2013,-1,-1,-1,-1,0,0,0,8.26,1.0,2.0,1.0,0.0,1.0,0.0,1.0,0.0,2.0,0.43,0.0,0.36,0.0,0.43,0.0,0.43,0.0,0.45,0.0
30490001,14380,1438,3,1,0,0,1001,0,11339,7,10,2013,-1,-1,-1,-1,0,0,0,3.97,2.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.14,0.0,0.17,0.0,0.15,0.0
30490002,14390,1439,3,1,0,0,1001,0,11339,7,10,2013,-1,-1,-1,-1,0,0,0,2.97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30490003,14400,1440,3,1,0,0,1001,0,11339,7,10,2013,-1,-1,-1,-1,0,0,0,4.64,0.0,1.0,1.0,1.0,1.0,0.0,0.0,4.0,2.0,1.29,1.0,2.0,2.0,1.89,1.5,1.77,1.0,1.75,1.0
30490004,14410,1441,3,1,0,0,1001,1,11339,7,10,2013,-1,-1,-1,-1,0,0,0,3.08,1.0,0.0,2.0,1.0,0.0,0.0,2.0,0.0,1.0,1.0,1.0,1.07,1.0,0.96,1.0,1.11,1.0,1.23,1.0


###  **Train, Cross Validation, Test split  for the model**

In [5]:
# Now we are splitting our data into, Train, Test, Cross Validate.
# Being a time series model, we are splitting the data based on time.

# Records till day 1914 will be used for training the model.
X_train = data[(data['day']<1914)]
X_train=reduce(X_train)

# Records between days 1914 and 1941 will be used for Validation
X_val = data[(data['day']>=1914) & (data['day']<=1941)]
X_val=reduce(X_val)

# Records after day 1942 will used for final test
X_test = data[(data['day']>=1942)]
X_test=reduce(X_test)

In [6]:
y_train = X_train['sales']

y_val = X_val['sales']

In [7]:
# We are drpping the features which are not required.
X_train.drop(['sales'],axis = 1,inplace = True)

X_val.drop(['sales'],axis = 1,inplace = True)

X_test.drop(['sales'],axis = 1,inplace = True)

In [8]:
X_train.shape,y_train.shape,X_val.shape,y_val.shape,X_test.shape

((27837370, 38), (27837370,), (853720, 38), (853720,), (853720, 38))

# **LightGBM**

In [9]:
# After running several combinations, we found these parameter values to give best result

learning_rate = 0.085
num_leaves = 114
min_data_in_leaf = 124

In [11]:
# Now that we have found the best model which has the lowest RMSE score, we are using that parameter values to retrain the model.

lgb = LGBMRegressor( learning_rate=learning_rate ,
                           num_leaves=num_leaves ,
                           min_data_in_leaf=min_data_in_leaf)

lgb.fit(X_train, y_train)

y_pred = lgb.predict(X_val)

# Now that it has predicted the values for X_val, we are calculating it's rmse
rmse = mse(y_val,y_pred)
r2 = r2_score(y_val,y_pred)
    
print(f"For learning rate {learning_rate}, num_leaves {num_leaves} and min_data_in_leaf {min_data_in_leaf} the RMSE is {rmse} and R2 score is {r2}")
    

For learning rate 0.085, num_leaves 114 and min_data_in_leaf 124 the RMSE is 3.0085378251096873 and R2 score is 0.77301243329196


In [12]:
# Now we are predicting the values for teh X_val and X_test for the submission

pred_val_array  = lgb.predict(X_val)
pred_test_array  = lgb.predict(X_test)

pred_val_array = np.reshape(pred_val_array, (-1, 28),order = 'F')
pred_test_array = np.reshape(pred_test_array, (-1, 28),order = 'F')

In [13]:
sub = pd.read_csv("sample_submission.csv")
sub_1 = sub.iloc[:30490,:]
sub_2 = sub.iloc[30490:,:]
f_cols = sub.columns[1:]

for i in range(len(f_cols)):
    sub_1[f_cols[i]] = pred_val_array[:,i]
    sub_2[f_cols[i]] = pred_test_array[:,i]

sub = pd.concat([sub_1,sub_2])
sub

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_validation,0.95,0.74,0.62,0.49,0.55,1.01,1.67,1.35,1.27,1.36,1.26,1.37,1.60,0.86,0.95,1.12,1.10,0.97,1.34,2.07,1.92,1.22,0.86,0.74,1.12,1.34,1.06,1.23
1,HOBBIES_1_002_CA_1_validation,0.14,0.26,0.12,0.12,0.15,0.15,0.15,0.14,0.00,0.12,0.12,0.15,0.18,0.15,0.14,0.12,-0.00,-0.00,0.01,0.18,0.53,0.53,0.62,0.62,0.62,0.68,0.72,0.35
2,HOBBIES_1_003_CA_1_validation,0.66,0.49,0.49,0.62,0.55,0.88,0.85,0.66,0.62,0.49,0.38,0.68,0.74,1.00,0.79,0.74,0.86,0.86,0.81,0.91,0.73,0.53,0.49,0.38,0.62,0.93,1.25,1.05
3,HOBBIES_1_004_CA_1_validation,1.91,1.55,1.68,1.84,2.16,1.67,2.43,2.45,2.23,2.07,1.85,1.81,2.54,2.03,1.37,1.34,1.44,1.57,1.56,1.98,1.33,1.26,0.98,0.98,1.23,1.25,1.84,2.88
4,HOBBIES_1_005_CA_1_validation,1.73,1.44,1.60,1.73,1.72,1.80,1.71,1.47,1.72,1.58,1.34,1.78,2.65,2.52,2.05,1.71,1.81,1.91,1.73,1.92,1.41,1.05,0.87,0.63,0.40,0.69,1.05,1.09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60975,FOODS_3_823_WI_3_evaluation,0.53,0.39,0.27,0.28,0.30,0.16,0.01,-0.00,0.01,0.00,-0.01,0.01,0.01,0.01,0.01,-0.00,0.00,0.00,0.01,0.01,0.03,0.01,0.00,0.00,0.00,0.01,0.01,-0.01
60976,FOODS_3_824_WI_3_evaluation,0.28,0.27,0.27,0.13,0.15,0.01,0.01,-0.00,0.00,0.00,0.00,0.01,0.01,0.01,0.01,-0.00,0.00,0.00,0.01,0.01,0.01,0.01,0.00,0.00,0.00,0.01,0.01,-0.01
60977,FOODS_3_825_WI_3_evaluation,0.54,0.51,0.40,0.39,0.29,0.33,0.01,0.00,0.00,0.01,-0.01,0.04,0.01,0.01,0.02,0.01,0.03,0.00,0.01,0.01,0.01,0.03,0.00,-0.00,0.00,0.01,0.01,-0.00
60978,FOODS_3_826_WI_3_evaluation,1.23,0.41,0.39,0.27,0.18,0.01,0.02,0.00,0.00,0.00,0.00,0.01,0.01,0.02,0.01,0.01,0.01,0.01,0.02,0.01,0.01,0.04,0.03,0.00,0.00,0.02,0.01,0.01
