In [3]:
import numpy as np
import pandas as pd
import os, time, random

In [4]:
#remove columns which might lead to ordinal behavior
remove_features = ['id','store_id','state_id','release','date','wm_yr_wk','d','sales']

In [5]:
#read the dataset prepared after feature engineering
data_path = 'data.csv'
data = pd.read_csv(data_path)

#Understand the dataframe and datatypes for each column
print(data.info())

  interactivity=interactivity, compiler=compiler, result=result)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4883327 entries, 0 to 4883326
Data columns (total 47 columns):
id                    object
item_id               object
dept_id               object
cat_id                object
store_id              object
state_id              object
d                     object
sales                 float64
release               int64
wm_yr_wk              int64
date                  object
event_name_1          object
event_type_1          object
event_name_2          object
event_type_2          object
snap_TX               int64
event_name_1_lag_1    object
event_type_1_lag_1    object
event_name_1_lag_2    object
event_type_1_lag_2    object
event_name_1_lag_3    object
event_type_1_lag_3    object
snap_TX_lag_1         float64
snap_TX_lag_2         float64
snap_TX_lag_3         float64
day                   int64
week                  int64
month                 int64
year                  int64
dayofweek             int64
weekend              

In [6]:
#Convert the object columns to datatype 'category'
category_columns=['item_id','dept_id','cat_id','event_name_1','event_type_1','event_name_2','event_type_2','event_name_1_lag_1', 'event_type_1_lag_1',
                   'event_name_1_lag_2', 'event_type_1_lag_2', 'event_name_1_lag_3', 'event_type_1_lag_3']

#convert each category in the list one 
for col in category_columns:
    data[col] = data[col].astype(str)
    data[col].fillna('0',inplace=True)
    data[col] = data[col].astype('category')

#Create a list of Store Ids for which data is considered 
stores_ids = data['store_id']
stores_ids = list(stores_ids.unique())

In [7]:
l1 = list(data.columns.values)

for col in l1:
    if col not in category_columns:
        data[col] = data[col].fillna(0)

In [8]:
#Check if it is indeed 'TX_1', since we choose this specific store for modeling purposes
#due to processing power limitations and to avoid OOM(Out of Memory) Error
stores_ids

#copy the dataframe into new df
df = data.copy()

In [9]:
#create a temporary date column with integer values which denotes day number 
#this is later used for subsetting the data into test/train
df['temp_d'] = pd.to_numeric(data['d'].str[2:])

#Once selected categorical columns are dummy encoded, 
#create list of categorical columns to remove from df 
features = [col for col in list(df) if col not in remove_features]

In [10]:
#List of features that we are finally considering for Modeling
print(len(features))
features

40


['item_id',
 'dept_id',
 'cat_id',
 'event_name_1',
 'event_type_1',
 'event_name_2',
 'event_type_2',
 'snap_TX',
 'event_name_1_lag_1',
 'event_type_1_lag_1',
 'event_name_1_lag_2',
 'event_type_1_lag_2',
 'event_name_1_lag_3',
 'event_type_1_lag_3',
 'snap_TX_lag_1',
 'snap_TX_lag_2',
 'snap_TX_lag_3',
 'day',
 'week',
 'month',
 'year',
 'dayofweek',
 'weekend',
 'lag_28',
 'lag_29',
 'lag_30',
 'lag_31',
 'lag_32',
 'lag_33',
 'lag_34',
 'lag_35',
 'rolling_mean_7',
 'rolling_std_7',
 'rolling_mean_14',
 'rolling_std_14',
 'rolling_mean_30',
 'rolling_std_30',
 'rolling_mean_60',
 'rolling_std_60',
 'temp_d']

In [11]:
#Creating variables for limiting the data by dates
START_TRAIN = 1000      # Start day for training data
END_TRAIN   = 1885      # End day of our train data, 
#28 days after this are left for testing(1886 - Start day for Testing Data)  
LimitData   = 1913      # End day for Testing Data

#Subset the data for 1000 to 1913 days
df = df[(df['temp_d']>=START_TRAIN) & (df['temp_d']<=LimitData)].reset_index(drop=True)


In [12]:
#Create train and test datasets
train_mask = df['temp_d']<=END_TRAIN
#valid_mask = train_mask&(df['temp_d']>(END_TRAIN-P_HORIZON))
preds_mask = df['temp_d']>(END_TRAIN)


train = df[train_mask.values]
test = df[preds_mask.values]

#Split both train and test datasets for independant and depandant variables 
x_train = train[features]
y_train = train[['sales']]

x_test = test[features]
y_test = test[['sales']]


In [13]:
x_train.describe()

Unnamed: 0,snap_TX,snap_TX_lag_1,snap_TX_lag_2,snap_TX_lag_3,day,week,month,year,dayofweek,weekend,...,lag_35,rolling_mean_7,rolling_std_7,rolling_mean_14,rolling_std_14,rolling_mean_30,rolling_std_30,rolling_mean_60,rolling_std_60,temp_d
count,2597874.0,2597874.0,2597874.0,2597874.0,2597874.0,2597874.0,2597874.0,2597874.0,2597874.0,2597874.0,...,2597874.0,2597874.0,2597874.0,2597874.0,2597874.0,2597874.0,2597874.0,2597874.0,2597874.0,2597874.0
mean,0.3272537,0.3271925,0.3271306,0.3270798,15.76274,26.41239,6.447003,3.561989,3.006865,0.2868215,...,1.048216,1.047793,0.9158739,1.04779,0.9907993,1.047985,1.060277,1.049045,1.119706,1453.865
std,0.4692108,0.4691883,0.4691655,0.4691468,8.795408,16.19863,3.692978,0.7673356,1.999712,0.4522776,...,3.148365,2.666149,1.534451,2.602818,1.542978,2.553143,1.582501,2.525986,1.655362,252.9782
min,0.0,0.0,0.0,0.0,1.0,1.0,1.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1000.0
25%,0.0,0.0,0.0,0.0,8.0,11.0,3.0,3.0,1.0,0.0,...,0.0,0.0,0.0,0.0714,0.2673,0.1333,0.3457,0.1333,0.3992,1239.0
50%,0.0,0.0,0.0,0.0,16.0,26.0,6.0,4.0,3.0,0.0,...,0.0,0.2856,0.5347,0.3572,0.6333,0.3667,0.6787,0.3833,0.7124,1459.0
75%,1.0,1.0,1.0,1.0,23.0,42.0,10.0,4.0,5.0,1.0,...,1.0,1.0,1.134,1.0,1.167,0.967,1.217,0.967,1.255,1672.0
max,1.0,1.0,1.0,1.0,31.0,53.0,12.0,5.0,6.0,1.0,...,323.0,196.6,122.06,190.4,108.3,195.9,82.06,154.4,98.9,1885.0


In [14]:
def rmse(predictions, targets):
    return np.sqrt(((predictions - targets) ** 2).mean())

In [15]:
#one_hot_encoded_training_predictors = pd.get_dummies(x_train,columns=['type'])

from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

for col in category_columns:
    encoder.fit(x_train[col])
    x_train[col] = encoder.transform(x_train[col])

for col in category_columns:
    encoder.fit(x_test[col])
    x_test[col] = encoder.transform(x_test[col])

x_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


Unnamed: 0,item_id,dept_id,cat_id,event_name_1,event_type_1,event_name_2,event_type_2,snap_TX,event_name_1_lag_1,event_type_1_lag_1,...,lag_35,rolling_mean_7,rolling_std_7,rolling_mean_14,rolling_std_14,rolling_mean_30,rolling_std_30,rolling_mean_60,rolling_std_60,temp_d
0,1437,3,1,30,4,2,2,0,30,4,...,0.0,0.1428,0.3780,0.0714,0.2673,0.16660,0.4612,0.1500,0.4443,1000
1,1438,3,1,30,4,2,2,0,30,4,...,0.0,0.1428,0.3780,0.0714,0.2673,0.06665,0.2537,0.0500,0.2197,1000
2,1440,3,1,30,4,2,2,0,30,4,...,0.0,1.0000,1.2910,1.1430,1.7480,1.16700,1.6840,1.3660,1.5940,1000
3,1441,3,1,30,4,2,2,0,30,4,...,0.0,1.0000,1.1540,0.6430,0.9287,0.60000,0.8550,0.5835,0.9070,1000
4,1442,3,1,30,4,2,2,0,30,4,...,0.0,0.1428,0.3780,0.2856,0.6113,0.33330,0.6610,0.2167,0.5240,1000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2597869,1432,2,0,4,0,2,2,0,30,4,...,0.0,0.5713,0.9760,0.4285,0.7560,0.56700,0.8174,0.7500,1.2020,1885
2597870,1433,2,0,4,0,2,2,0,30,4,...,0.0,0.0000,0.0000,0.0000,0.0000,0.00000,0.0000,0.0000,0.0000,1885
2597871,1434,2,0,4,0,2,2,0,30,4,...,0.0,0.7144,0.9510,0.5713,0.7560,0.43330,0.6787,0.3333,0.6010,1885
2597872,1435,2,0,4,0,2,2,0,30,4,...,0.0,0.5713,0.7866,0.8570,1.1670,0.73340,0.8680,0.8667,0.9650,1885


In [16]:
x_test

Unnamed: 0,item_id,dept_id,cat_id,event_name_1,event_type_1,event_name_2,event_type_2,snap_TX,event_name_1_lag_1,event_type_1_lag_1,...,lag_35,rolling_mean_7,rolling_std_7,rolling_mean_14,rolling_std_14,rolling_mean_30,rolling_std_30,rolling_mean_60,rolling_std_60,temp_d
2597874,1437,3,1,0,0,0,0,0,0,0,...,0.0,0.2856,0.488,0.2856,0.4688,0.23330,0.5040,0.20000,0.5140,1886
2597875,1438,3,1,0,0,0,0,0,0,0,...,0.0,0.0000,0.000,0.0000,0.0000,0.03333,0.1826,0.01666,0.1292,1886
2597876,1439,3,1,0,0,0,0,0,0,0,...,0.0,0.0000,0.000,0.0714,0.2673,0.10000,0.3052,0.10000,0.3025,1886
2597877,1440,3,1,0,0,0,0,0,0,0,...,1.0,1.5710,1.512,0.8570,1.2920,0.40000,0.9683,0.30000,0.8090,1886
2597878,1441,3,1,0,0,0,0,0,0,0,...,1.0,0.8570,1.215,1.0000,0.9610,0.86670,0.8604,0.71700,0.9760,1886
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2683241,1432,2,0,0,0,0,0,0,0,0,...,0.0,0.0000,0.000,0.0000,0.0000,0.26660,0.5210,0.38330,0.6660,1913
2683242,1433,2,0,0,0,0,0,0,0,0,...,0.0,0.0000,0.000,0.0000,0.0000,0.00000,0.0000,0.00000,0.0000,1913
2683243,1434,2,0,0,0,0,0,0,0,0,...,0.0,0.0000,0.000,0.2856,0.6113,0.40000,0.6750,0.41670,0.6714,1913
2683244,1435,2,0,0,0,0,0,0,0,0,...,1.0,1.2860,0.951,1.0710,0.8286,0.76660,0.7740,0.76660,0.8100,1913


In [17]:
#one_hot_encoded_training_predictors
#Import the necessary libraries for Bagging Regressor

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingRegressor

rf = BaggingRegressor(max_features=27,max_samples=0.3,n_estimators=50)
rf.fit(x_train,y_train)
rf_acc = rf.score(x_test,y_test)
print(rf_acc)

  return column_or_1d(y, warn=True)


0.4639742958003834


In [18]:
#With the bagging regressor model built above, predict the sales for test timeframe
bag_predictions = rf.predict(x_test)

In [19]:
len(bag_predictions)

85372

In [20]:
y_test

Unnamed: 0,sales
2597874,1.0
2597875,0.0
2597876,0.0
2597877,0.0
2597878,0.0
...,...
2683241,0.0
2683242,0.0
2683243,0.0
2683244,0.0


In [22]:
shaped_bag_predictions = bag_predictions.reshape(85372,1)
print(np.sqrt(rmse(y_test, shaped_bag_predictions)))

sales    1.49835
dtype: float64
