In [1]:
import numpy as np
import pandas as pd
import os, time, random

In [2]:
#remove columns which might lead to ordinal behavior
        
remove_features = ['id','item_id','dept_id','state_id','store_id', 'release', 'event_name_1','event_type_1','event_name_2','event_type_2','event_name_1_lag_1', 'event_type_1_lag_1',
                   'event_name_1_lag_2', 'event_type_1_lag_2', 'event_name_1_lag_3', 'event_type_1_lag_3',
                   'release','date','wm_yr_wk','d','sales','temp_d']

In [3]:
#read the dataset prepared after feature engineering
data_path = 'data.csv'
data = pd.read_csv(data_path)

#Understand the dataframe and datatypes for each column
print(data.info())

  interactivity=interactivity, compiler=compiler, result=result)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4883327 entries, 0 to 4883326
Data columns (total 47 columns):
id                    object
item_id               object
dept_id               object
cat_id                object
store_id              object
state_id              object
d                     object
sales                 float64
release               int64
wm_yr_wk              int64
date                  object
event_name_1          object
event_type_1          object
event_name_2          object
event_type_2          object
snap_TX               int64
event_name_1_lag_1    object
event_type_1_lag_1    object
event_name_1_lag_2    object
event_type_1_lag_2    object
event_name_1_lag_3    object
event_type_1_lag_3    object
snap_TX_lag_1         float64
snap_TX_lag_2         float64
snap_TX_lag_3         float64
day                   int64
week                  int64
month                 int64
year                  int64
dayofweek             int64
weekend              

In [4]:
#Convert the object columns to datatype 'category'
category_columns=['id','item_id','dept_id','cat_id','store_id','state_id','event_name_1','event_type_1','event_name_2','event_type_2','event_name_1_lag_1', 'event_type_1_lag_1',
                   'event_name_1_lag_2', 'event_type_1_lag_2', 'event_name_1_lag_3', 'event_type_1_lag_3']

#convert each category in the list one 
for col in category_columns:
    data[col] = data[col].astype(str)
    data[col].fillna('0',inplace=True)
    data[col] = data[col].astype('category')

#Create a list of Store Ids for which data is considered 
stores_ids = data['store_id']
stores_ids = list(stores_ids.unique())


In [5]:
l1 = list(data.columns.values)

for col in l1:
    if col not in category_columns:
        data[col] = data[col].fillna(0)

In [6]:
#Check if it is indeed 'TX_1', since we choose this specific store for modeling purposes
#due to processing power limitations and to avoid OOM(Out of Memory) Error
stores_ids

#copy the dataframe into new df
df = data.copy()


In [7]:
#create a temporary date column with integer values which denotes day number 
#this is later used for subsetting the data into test/train
df['temp_d'] = pd.to_numeric(data['d'].str[2:])

#Once selected categorical columns are dummy encoded, 
#create list of categorical columns to remove from df 
features = [col for col in list(df) if col not in remove_features]

In [8]:
#List of features that we are finally considering for Modeling
features

['cat_id',
 'snap_TX',
 'snap_TX_lag_1',
 'snap_TX_lag_2',
 'snap_TX_lag_3',
 'day',
 'week',
 'month',
 'year',
 'dayofweek',
 'weekend',
 'lag_28',
 'lag_29',
 'lag_30',
 'lag_31',
 'lag_32',
 'lag_33',
 'lag_34',
 'lag_35',
 'rolling_mean_7',
 'rolling_std_7',
 'rolling_mean_14',
 'rolling_std_14',
 'rolling_mean_30',
 'rolling_std_30',
 'rolling_mean_60',
 'rolling_std_60']

In [9]:

#Creating variables for limiting the data by dates
START_TRAIN = 1000      # Start day for training data
END_TRAIN   = 1885      # End day of our train data, 
#28 days after this are left for testing(1886 - Start day for Testing Data)  
LimitData   = 1913      # End day for Testing Data

#Subset the data for 1000 to 1913 days
df = df[(df['temp_d']>=START_TRAIN) & (df['temp_d']<=LimitData)].reset_index(drop=True)


In [10]:
#Create train and test datasets
train_mask = df['temp_d']<=END_TRAIN
#valid_mask = train_mask&(df['temp_d']>(END_TRAIN-P_HORIZON))
preds_mask = df['temp_d']>(END_TRAIN)


train = df[train_mask.values]
test = df[preds_mask.values]

#Split both train and test datasets for independant and depandant variables 
x_train = train[features]
y_train = train[['sales']]


x_test = test[features]
y_test = test[['sales']]



In [11]:
one_hot_encoded_testing_predictors = pd.get_dummies(x_test)
one_hot_encoded_testing_predictors.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 85372 entries, 2597874 to 2683245
Data columns (total 29 columns):
snap_TX             85372 non-null int64
snap_TX_lag_1       85372 non-null float64
snap_TX_lag_2       85372 non-null float64
snap_TX_lag_3       85372 non-null float64
day                 85372 non-null int64
week                85372 non-null int64
month               85372 non-null int64
year                85372 non-null int64
dayofweek           85372 non-null int64
weekend             85372 non-null int64
lag_28              85372 non-null float64
lag_29              85372 non-null float64
lag_30              85372 non-null float64
lag_31              85372 non-null float64
lag_32              85372 non-null float64
lag_33              85372 non-null float64
lag_34              85372 non-null float64
lag_35              85372 non-null float64
rolling_mean_7      85372 non-null float64
rolling_std_7       85372 non-null float64
rolling_mean_14     85372 non-null fl

In [12]:
#one_hot_encoded_training_predictors

one_hot_encoded_training_predictors = pd.get_dummies(x_train)

In [13]:
x_train

Unnamed: 0,cat_id,snap_TX,snap_TX_lag_1,snap_TX_lag_2,snap_TX_lag_3,day,week,month,year,dayofweek,...,lag_34,lag_35,rolling_mean_7,rolling_std_7,rolling_mean_14,rolling_std_14,rolling_mean_30,rolling_std_30,rolling_mean_60,rolling_std_60
0,HOBBIES,0,0.0,0.0,0.0,24,43,10,2,3,...,0.0,0.0,0.1428,0.3780,0.0714,0.2673,0.16660,0.4612,0.1500,0.4443
1,HOBBIES,0,0.0,0.0,0.0,24,43,10,2,3,...,1.0,0.0,0.1428,0.3780,0.0714,0.2673,0.06665,0.2537,0.0500,0.2197
2,HOBBIES,0,0.0,0.0,0.0,24,43,10,2,3,...,0.0,0.0,1.0000,1.2910,1.1430,1.7480,1.16700,1.6840,1.3660,1.5940
3,HOBBIES,0,0.0,0.0,0.0,24,43,10,2,3,...,0.0,0.0,1.0000,1.1540,0.6430,0.9287,0.60000,0.8550,0.5835,0.9070
4,HOBBIES,0,0.0,0.0,0.0,24,43,10,2,3,...,0.0,0.0,0.1428,0.3780,0.2856,0.6113,0.33330,0.6610,0.2167,0.5240
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2597869,FOODS,0,0.0,0.0,0.0,27,12,3,5,6,...,0.0,0.0,0.5713,0.9760,0.4285,0.7560,0.56700,0.8174,0.7500,1.2020
2597870,FOODS,0,0.0,0.0,0.0,27,12,3,5,6,...,0.0,0.0,0.0000,0.0000,0.0000,0.0000,0.00000,0.0000,0.0000,0.0000
2597871,FOODS,0,0.0,0.0,0.0,27,12,3,5,6,...,0.0,0.0,0.7144,0.9510,0.5713,0.7560,0.43330,0.6787,0.3333,0.6010
2597872,FOODS,0,0.0,0.0,0.0,27,12,3,5,6,...,0.0,0.0,0.5713,0.7866,0.8570,1.1670,0.73340,0.8680,0.8667,0.9650


In [14]:
#Import the necessary libraries for Decision Tree Regressor
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree

dec_model = tree.DecisionTreeRegressor(max_depth=3)
#dec_model.fit(x_train,y_train)
dec_model.fit(one_hot_encoded_training_predictors,y_train)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=3,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

In [15]:
print("Training R-Square",dec_model.score(one_hot_encoded_training_predictors,y_train))
print("Testing R-Square",dec_model.score(one_hot_encoded_testing_predictors,y_test))

Training R-Square 0.5023225504106406
Testing R-Square 0.6038418493678969


In [19]:
#!pip install graphviz
#!pip install pydotplus

In [20]:
#With the Decision Tree Regressor model built above, predict the sales for test timeframe


dec_tree_predictions = dec_model.predict(one_hot_encoded_testing_predictors)

from sklearn import metrics
from sklearn.metrics import mean_squared_error
print(np.sqrt(mean_squared_error(y_test, dec_tree_predictions)))

1.9300465164617353


In [17]:
features_new = list()
for col in one_hot_encoded_testing_predictors:
    features_new.append(col)

print(len(features_new))
features_new

29


['snap_TX',
 'snap_TX_lag_1',
 'snap_TX_lag_2',
 'snap_TX_lag_3',
 'day',
 'week',
 'month',
 'year',
 'dayofweek',
 'weekend',
 'lag_28',
 'lag_29',
 'lag_30',
 'lag_31',
 'lag_32',
 'lag_33',
 'lag_34',
 'lag_35',
 'rolling_mean_7',
 'rolling_std_7',
 'rolling_mean_14',
 'rolling_std_14',
 'rolling_mean_30',
 'rolling_std_30',
 'rolling_mean_60',
 'rolling_std_60',
 'cat_id_FOODS',
 'cat_id_HOBBIES',
 'cat_id_HOUSEHOLD']