In [66]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,mean_absolute_error,accuracy_score,r2_score

from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.svm import SVR

In [3]:
train = pd.read_csv("train_data.csv")

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 982644 entries, 0 to 982643
Data columns (total 9 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   Store          982644 non-null  int64 
 1   DayOfWeek      982644 non-null  int64 
 2   Date           982644 non-null  object
 3   Sales          982644 non-null  int64 
 4   Customers      982644 non-null  int64 
 5   Open           982644 non-null  int64 
 6   Promo          982644 non-null  int64 
 7   StateHoliday   982644 non-null  object
 8   SchoolHoliday  982644 non-null  int64 
dtypes: int64(7), object(2)
memory usage: 67.5+ MB


In [5]:
train.memory_usage().sum() / 1024**2

67.47293090820312

In [6]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df


def import_data(file):
    """create a dataframe and optimize its memory usage"""
    df = pd.read_csv(file, parse_dates=True, keep_date_col=True)
    df = reduce_mem_usage(df)
    return df

In [7]:
train = import_data("train_data.csv")

Memory usage of dataframe is 67.47 MB
Memory usage after optimization is: 14.10 MB
Decreased by 79.1%


In [8]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 982644 entries, 0 to 982643
Data columns (total 9 columns):
 #   Column         Non-Null Count   Dtype   
---  ------         --------------   -----   
 0   Store          982644 non-null  int16   
 1   DayOfWeek      982644 non-null  int8    
 2   Date           982644 non-null  category
 3   Sales          982644 non-null  int32   
 4   Customers      982644 non-null  int16   
 5   Open           982644 non-null  int8    
 6   Promo          982644 non-null  int8    
 7   StateHoliday   982644 non-null  category
 8   SchoolHoliday  982644 non-null  int8    
dtypes: category(2), int16(2), int32(1), int8(4)
memory usage: 14.1 MB


In [9]:
train.memory_usage(index=False)

Store            1965288
DayOfWeek         982644
Date             2005640
Sales            3930576
Customers        1965288
Open              982644
Promo             982644
StateHoliday      982856
SchoolHoliday     982644
dtype: int64

In [10]:
test = import_data("test_data_hidden.csv")

Memory usage of dataframe is 2.37 MB
Memory usage after optimization is: 0.40 MB
Decreased by 83.3%


In [11]:
train.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
0,1,2,2015-06-30,5735,568,1,1,0,0
1,2,2,2015-06-30,9863,877,1,1,0,0
2,3,2,2015-06-30,13261,1072,1,1,0,1
3,4,2,2015-06-30,13106,1488,1,1,0,0
4,5,2,2015-06-30,6635,645,1,1,0,0


In [12]:
test.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
0,1,5,2015-07-31,5263,555,1,1,0,1
1,2,5,2015-07-31,6064,625,1,1,0,1
2,3,5,2015-07-31,8314,821,1,1,0,1
3,4,5,2015-07-31,13995,1498,1,1,0,1
4,5,5,2015-07-31,4822,559,1,1,0,1


In [13]:
train['year'] = pd.to_datetime(train['Date']).dt.year
test['year'] = pd.to_datetime(test['Date']).dt.year

In [14]:
train = train[train['year'] != 2013]

In [15]:
test.isnull().sum()

Store            0
DayOfWeek        0
Date             0
Sales            0
Customers        0
Open             0
Promo            0
StateHoliday     0
SchoolHoliday    0
year             0
dtype: int64

In [16]:
train.describe()

Unnamed: 0,Store,DayOfWeek,Sales,Customers,Open,Promo,SchoolHoliday,year
count,575670.0,575670.0,575670.0,575670.0,575670.0,575670.0,575670.0,575670.0
mean,558.760074,4.000938,5833.171649,635.404732,0.827316,0.387479,0.160019,2014.350574
std,321.936094,2.000236,3881.482393,465.833493,0.377974,0.487175,0.366624,0.47715
min,1.0,1.0,0.0,0.0,0.0,0.0,0.0,2014.0
25%,281.0,2.0,3775.0,409.0,1.0,0.0,0.0,2014.0
50%,558.0,4.0,5827.0,614.0,1.0,0.0,0.0,2014.0
75%,838.0,6.0,7957.0,841.0,1.0,1.0,0.0,2015.0
max,1115.0,7.0,41551.0,5494.0,1.0,1.0,1.0,2015.0


In [17]:
train.columns

Index(['Store', 'DayOfWeek', 'Date', 'Sales', 'Customers', 'Open', 'Promo',
       'StateHoliday', 'SchoolHoliday', 'year'],
      dtype='object')

In [18]:
for column in train:
    if column != 'Date':
        print(column,'\n','Unique values: ', train[column].unique(),'\n', 
              'number of unique values', train[column].nunique(), '\n', '-'*100)

Store 
 Unique values:  [   1    2    3 ... 1113 1114 1115] 
 number of unique values 1115 
 ----------------------------------------------------------------------------------------------------
DayOfWeek 
 Unique values:  [2 1 7 6 5 4 3] 
 number of unique values 7 
 ----------------------------------------------------------------------------------------------------
Sales 
 Unique values:  [ 5735  9863 13261 ... 16841 19794 21614] 
 number of unique values 19754 
 ----------------------------------------------------------------------------------------------------
Customers 
 Unique values:  [ 568  877 1072 ... 3872 3052 2709] 
 number of unique values 3849 
 ----------------------------------------------------------------------------------------------------
Open 
 Unique values:  [1 0] 
 number of unique values 2 
 ----------------------------------------------------------------------------------------------------
Promo 
 Unique values:  [1 0] 
 number of unique values 2 
 ------------

In [19]:
train["StateHoliday"] = train["StateHoliday"].replace({"0":0, "a":1, "b":2, "c":3})

train["Month"]= pd.to_datetime(train["Date"]).dt.month
train["Day"]= pd.to_datetime(train["Date"]).dt.day

train.drop(['Date', 'Customers'], axis=1, inplace=True)

In [20]:
train.head()

Unnamed: 0,Store,DayOfWeek,Sales,Open,Promo,StateHoliday,SchoolHoliday,year,Month,Day
0,1,2,5735,1,1,0,0,2015,6,30
1,2,2,9863,1,1,0,0,2015,6,30
2,3,2,13261,1,1,0,1,2015,6,30
3,4,2,13106,1,1,0,0,2015,6,30
4,5,2,6635,1,1,0,0,2015,6,30


In [21]:
test["StateHoliday"] = test["StateHoliday"].replace({"0":0, "a":1, "b":2, "c":3})

test["Month"]= pd.to_datetime(test["Date"]).dt.month
test["Day"]= pd.to_datetime(test["Date"]).dt.day

test.drop(['Date', 'Customers'], axis=1, inplace=True)

In [22]:
train = reduce_mem_usage(train)

Memory usage of dataframe is 27.45 MB
Memory usage after optimization is: 12.63 MB
Decreased by 54.0%


In [23]:
train_dummies = pd.get_dummies(data=train, columns=['Store'])

In [24]:
train_dummies.memory_usage().sum() / 1024**2
#before reduce_size:- 623.11692237854 

623.6659240722656

In [25]:
test = reduce_mem_usage(test)

Memory usage of dataframe is 1.09 MB
Memory usage after optimization is: 0.43 MB
Decreased by 60.6%


In [26]:
test_dummies = pd.get_dummies(data=test, columns=['Store'])

In [27]:
test_dummies.memory_usage().sum() / 1024**2

37.11730766296387

In [28]:
train_dummies.head()

Unnamed: 0,DayOfWeek,Sales,Open,Promo,StateHoliday,SchoolHoliday,year,Month,Day,Store_1,...,Store_1106,Store_1107,Store_1108,Store_1109,Store_1110,Store_1111,Store_1112,Store_1113,Store_1114,Store_1115
0,2,5735,1,1,0,0,2015,6,30,1,...,0,0,0,0,0,0,0,0,0,0
1,2,9863,1,1,0,0,2015,6,30,0,...,0,0,0,0,0,0,0,0,0,0
2,2,13261,1,1,0,1,2015,6,30,0,...,0,0,0,0,0,0,0,0,0,0
3,2,13106,1,1,0,0,2015,6,30,0,...,0,0,0,0,0,0,0,0,0,0
4,2,6635,1,1,0,0,2015,6,30,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
train_dummies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 575670 entries, 0 to 575669
Columns: 1124 entries, DayOfWeek to Store_1115
dtypes: int16(1), int32(1), int8(7), uint8(1115)
memory usage: 623.7 MB


In [30]:
train_dummies.memory_usage(index=False)

DayOfWeek        575670
Sales           2302680
Open             575670
Promo            575670
StateHoliday     575670
                 ...   
Store_1111       575670
Store_1112       575670
Store_1113       575670
Store_1114       575670
Store_1115       575670
Length: 1124, dtype: int64

In [31]:
test_dummies.head()

Unnamed: 0,DayOfWeek,Sales,Open,Promo,StateHoliday,SchoolHoliday,year,Month,Day,Store_1,...,Store_1106,Store_1107,Store_1108,Store_1109,Store_1110,Store_1111,Store_1112,Store_1113,Store_1114,Store_1115
0,5,5263,1,1,0,1,2015,7,31,1,...,0,0,0,0,0,0,0,0,0,0
1,5,6064,1,1,0,1,2015,7,31,0,...,0,0,0,0,0,0,0,0,0,0
2,5,8314,1,1,0,1,2015,7,31,0,...,0,0,0,0,0,0,0,0,0,0
3,5,13995,1,1,0,1,2015,7,31,0,...,0,0,0,0,0,0,0,0,0,0
4,5,4822,1,1,0,1,2015,7,31,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
train_dummies.shape

(575670, 1124)

In [33]:
test_dummies.shape

(34565, 1124)

In [34]:
test_dummies.head()

Unnamed: 0,DayOfWeek,Sales,Open,Promo,StateHoliday,SchoolHoliday,year,Month,Day,Store_1,...,Store_1106,Store_1107,Store_1108,Store_1109,Store_1110,Store_1111,Store_1112,Store_1113,Store_1114,Store_1115
0,5,5263,1,1,0,1,2015,7,31,1,...,0,0,0,0,0,0,0,0,0,0
1,5,6064,1,1,0,1,2015,7,31,0,...,0,0,0,0,0,0,0,0,0,0
2,5,8314,1,1,0,1,2015,7,31,0,...,0,0,0,0,0,0,0,0,0,0
3,5,13995,1,1,0,1,2015,7,31,0,...,0,0,0,0,0,0,0,0,0,0
4,5,4822,1,1,0,1,2015,7,31,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
X_train = train_dummies.drop('Sales', axis=1)
y_train = train_dummies['Sales']

X_test = test_dummies.drop('Sales', axis=1)
y_test = test_dummies['Sales']

In [36]:
X_test.head()

Unnamed: 0,DayOfWeek,Open,Promo,StateHoliday,SchoolHoliday,year,Month,Day,Store_1,Store_2,...,Store_1106,Store_1107,Store_1108,Store_1109,Store_1110,Store_1111,Store_1112,Store_1113,Store_1114,Store_1115
0,5,1,1,0,1,2015,7,31,1,0,...,0,0,0,0,0,0,0,0,0,0
1,5,1,1,0,1,2015,7,31,0,1,...,0,0,0,0,0,0,0,0,0,0
2,5,1,1,0,1,2015,7,31,0,0,...,0,0,0,0,0,0,0,0,0,0
3,5,1,1,0,1,2015,7,31,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,1,1,0,1,2015,7,31,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
y_test.head()

0     5263
1     6064
2     8314
3    13995
4     4822
Name: Sales, dtype: int16

In [38]:
lr = LinearRegression()
lr.fit(X_train, y_train)

# train_pred = lr.predict(X_train)
# print('R-squared error for training set is: ', r2_score(train_pred, y_train))
# print('RMSE for training set is: ', mean_squared_error( train_pred, y_train, squared=False))
# print('MAE for training set is: ',mean_absolute_error(train_pred, y_train))

test_pred = lr.predict(X_test)
print('R-squared error for testing set is: ', r2_score(test_pred, y_test))
print('RMSE for testing set is: ', mean_squared_error( test_pred, y_test, squared=False))
print('MAE for testing set is: ',mean_absolute_error(test_pred, y_test))


R-squared error for testing set is:  0.8102641491626623
RMSE for testing set is:  1437.7624780773178
MAE for testing set is:  1075.0563887783885


In [43]:
def predict_price(DayOfWeek,Open,Promo,StateHoliday,SchoolHoliday,year,Month,Day,Store):    
    loc_index = np.where(X_test.columns==Store)[0][0]
    
    x = np.zeros(len(X_test.columns))
    x[0] = DayOfWeek
    x[1] = Open
    x[2] = Promo
    x[3] = StateHoliday
    x[4] = SchoolHoliday
    x[5] = year
    x[6] = Month
    x[7] = Day
    
    if loc_index >= 0:
        x[loc_index] = 1

    return lr.predict([x])[0]

In [44]:
predict_price(5,1,1,0,1,2015,7,31,'Store_1')

6403.94140625

In [45]:
# R-squared error for training set is:  0.7833026542985042
# RMSE for training set is:  1638.2451667805224
# MAE for training set is:  1168.6414348172564
# R-squared error for testing set is:  0.8102641491626623
# RMSE for testing set is:  1437.7624780773178
# MAE for testing set is:  1075.0563887783885

In [62]:
from sklearn.linear_model import Ridge, Lasso, ElasticNet

In [64]:
ridge = Ridge()

ridge.fit(X_train, y_train)
test_pred = ridge.predict(X_test)
print('R-squared error for testing set is: ', r2_score(test_pred, y_test))
print('RMSE for testing set is: ', mean_squared_error(test_pred, y_test, squared=False))
print('MAE for testing set is: ',mean_absolute_error(test_pred, y_test))

R-squared error for testing set is:  0.7938452500644303
RMSE for testing set is:  1105.3855200385728
MAE for testing set is:  870.4635696763534


In [67]:
# lasso = Lasso()
# lasso.fit(X_train, y_train)
# test_pred = lasso.predict(X_test)
# print('R-squared error for testing set is: ', r2_score(test_pred, y_test))
# print('RMSE for testing set is: ', mean_squared_error(test_pred, y_test, squared=False))
# print('MAE for testing set is: ',mean_absolute_error(test_pred, y_test))

In [68]:
# el = ElasticNet()
# el.fit(X_train, y_train)
# test_pred = el.predict(X_test)
# print('R-squared error for testing set is: ', r2_score(test_pred, y_test))
# print('RMSE for testing set is: ', mean_squared_error(test_pred, y_test, squared=False))
# print('MAE for testing set is: ',mean_absolute_error(test_pred, y_test))

In [52]:
Y_pred = np.zeros(test.shape[0])
train_store = train.groupby(['Store'])
test_store = test.groupby(['Store'])
    
for i in range(1,1116):
    a = train_store.get_group(i)
    b = test_store.get_group(i)
    
    X_train = a.drop(['Sales','Store'],axis=1)
    X_test = b.drop(['Sales','Store'],axis=1)
    y_train = a['Sales']
    y_test = b['Sales']
    
    ridge.fit(X_train, y_train)
    test_pred = ridge.predict(X_test)
    
    i = 0
    for j in b.index:
        Y_pred[j] = test_pred[i]
        i+=1
        
print('R-squared error for testing set is: ', r2_score(test_pred, y_test))
print('RMSE for testing set is: ', mean_squared_error(test_pred, y_test, squared=False))
print('MAE for testing set is: ',mean_absolute_error(test_pred, y_test))

R-squared error for testing set is:  0.7938452500644303
RMSE for testing set is:  1105.3855200385728
MAE for testing set is:  870.4635696763534


In [56]:
train2 = train[train['Sales'] != 0]
test2 = test[test['Sales']!=0]

train2 = pd.get_dummies(data=train2, columns=['Store'], drop_first=True)
test2 = pd.get_dummies(data=test2, columns=['Store'], drop_first=True)

In [57]:
X_train2 = train2.drop('Sales', axis=1)
y_train2 = train2['Sales']

X_test2 = test2.drop('Sales', axis=1)
y_test2 = test['Sales']

lr.fit(X_train2, y_train2)
test_pred = lr.predict(X_test2)
print('R-squared error for testing set is: ', r2_score(test_pred, y_test2))
print('RMSE for testing set is: ', mean_squared_error(test_pred, y_test2, squared=False))
print('MAE for testing set is: ',mean_absolute_error(test_pred, y_test2))

R-squared error for testing set is:  0.7641565213034062
RMSE for testing set is:  1298.4947577653597
MAE for testing set is:  984.9644907537763


In [65]:
train2 = train[train['Sales'] != 0]
test2 = test[test['Sales']!=0]

Y_pred = np.zeros(test.shape[0])
train_store = train2.groupby(['Store'])
test_store = test2.groupby(['Store'])
    
for i in range(1, 1116):
    a = train_store.get_group(i)
    b = test_store.get_group(i)
    
    X_train = a.drop(['Sales'],axis=1)
    X_test = b.drop(['Sales'],axis=1)
    y_train = a['Sales']
    y_test = b['Sales']
    
    dt_.fit(X_train, y_train)
    test_pred = lr.predict(X_test)
    
    i = 0
    for j in b.index:
        Y_pred[j] = test_pred[i]
        i+=1
        
print('R-squared error for testing set is: ', r2_score(test_pred, y_test))
print('RMSE for testing set is: ', mean_squared_error(test_pred, y_test, squared=False))
print('MAE for testing set is: ',mean_absolute_error(test_pred, y_test))


R-squared error for testing set is:  -0.466084493020535
RMSE for testing set is:  1249.2148273903058
MAE for testing set is:  1015.0478028387124


In [67]:
dt = DecisionTreeRegressor()

In [70]:
train2 = train[train['Sales'] != 0]
test2 = test[test['Sales']!=0]

Y_pred = np.zeros(test.shape[0])
train_store = train2.groupby(['Store'])
test_store = test2.groupby(['Store'])
    
for i in range(1, 1116):
    a = train_store.get_group(i)
    b = test_store.get_group(i)
    
    X_train = a.drop(['Sales'],axis=1)
    X_test = b.drop(['Sales'],axis=1)
    y_train = a['Sales']
    y_test = b['Sales']
    
    dt.fit(X_train, y_train)
    test_pred = dt.predict(X_test)
    
    i = 0
    for j in b.index:
        Y_pred[j] = test_pred[i]
        i+=1
        
print('R-squared error for testing set is: ', r2_score(test_pred, y_test))
print('RMSE for testing set is: ', mean_squared_error(test_pred, y_test, squared=False))
print('MAE for testing set is: ',mean_absolute_error(test_pred, y_test))


R-squared error for testing set is:  0.2616423056849927
RMSE for testing set is:  1616.0896564788038
MAE for testing set is:  1251.7037037037037


In [71]:
rf = RandomForestRegressor()

In [72]:
train2 = train[train['Sales'] != 0]
test2 = test[test['Sales']!=0]

Y_pred = np.zeros(test.shape[0])
train_store = train2.groupby(['Store'])
test_store = test2.groupby(['Store'])
    
for i in range(1, 1116):
    a = train_store.get_group(i)
    b = test_store.get_group(i)
    
    X_train = a.drop(['Sales'],axis=1)
    X_test = b.drop(['Sales'],axis=1)
    y_train = a['Sales']
    y_test = b['Sales']
    
    rf.fit(X_train, y_train)
    test_pred = rf.predict(X_test)
    
    i = 0
    for j in b.index:
        Y_pred[j] = test_pred[i]
        i+=1
        
print('R-squared error for testing set is: ', r2_score(test_pred, y_test))
print('RMSE for testing set is: ', mean_squared_error(test_pred, y_test, squared=False))
print('MAE for testing set is: ',mean_absolute_error(test_pred, y_test))


R-squared error for testing set is:  0.5884776041350588
RMSE for testing set is:  1039.757740628218
MAE for testing set is:  778.4166666666666


In [112]:
X_train

Unnamed: 0,Store,DayOfWeek,Open,Promo,StateHoliday,SchoolHoliday,year,Month,Day
1114,1115,2,1,1,0,0,2015,6,30
2229,1115,1,1,1,0,0,2015,6,29
4459,1115,6,1,0,0,0,2015,6,27
5574,1115,5,1,0,0,0,2015,6,26
6689,1115,4,1,0,0,0,2015,6,25
...,...,...,...,...,...,...,...,...,...
568979,1115,2,1,1,0,1,2014,1,7
570094,1115,1,1,1,0,1,2014,1,6
572324,1115,6,1,0,0,1,2014,1,4
573439,1115,5,1,0,0,1,2014,1,3


In [109]:
test_pred

array([9238.88, 8543.48, 7945.77, 7767.89, 9615.16, 7415.28, 5765.57,
       5198.43, 5167.87, 4900.98, 5130.54, 7621.51, 7876.69, 7402.19,
       7492.91, 7526.61, 9191.05, 7745.99, 5634.37, 4964.66, 5239.59,
       5444.72, 5673.94, 8939.1 , 9347.42, 9396.01, 9248.96])

In [111]:
y_test

1114      8680
2229      8405
3344      7661
4459      8093
5574     10712
7804      6897
8919      5816
10034     6150
11149     5342
12264     5074
13379     6083
15609     7264
16724     7874
17839     6590
18954     6039
20069     7562
21184    10598
23414     7164
24529     5844
25644     5686
26759     5900
27874     5138
28989     6501
31219     6543
32334     7412
33449     6858
34564     7701
Name: Sales, dtype: int16

In [93]:
test2.head(5)

Unnamed: 0,Store,DayOfWeek,Sales,Open,Promo,StateHoliday,SchoolHoliday,year,Month,Day
0,1,5,5263,1,1,0,1,2015,7,31
1,2,5,6064,1,1,0,1,2015,7,31
2,3,5,8314,1,1,0,1,2015,7,31
3,4,5,13995,1,1,0,1,2015,7,31
4,5,5,4822,1,1,0,1,2015,7,31


In [88]:
X_train[:5].values

array([[1115,    2,    1,    1,    0,    0, 2015,    6,   30],
       [1115,    1,    1,    1,    0,    0, 2015,    6,   29],
       [1115,    6,    1,    0,    0,    0, 2015,    6,   27],
       [1115,    5,    1,    0,    0,    0, 2015,    6,   26],
       [1115,    4,    1,    0,    0,    0, 2015,    6,   25]],
      dtype=int16)

In [89]:
rf.predict([[1115,    4,    1,    0,    0,    0, 2015,    6,   25]])

array([5092.96])

In [87]:
y_train

1114       8610
2229      11006
4459       6676
5574       5549
6689       5015
          ...  
568979     6242
570094     8536
572324     5586
573439     5050
574554     5657
Name: Sales, Length: 451, dtype: int32

In [91]:
train.head(3)

Unnamed: 0,Store,DayOfWeek,Sales,Open,Promo,StateHoliday,SchoolHoliday,year,Month,Day
0,1,2,5735,1,1,0,0,2015,6,30
1,2,2,9863,1,1,0,0,2015,6,30
2,3,2,13261,1,1,0,1,2015,6,30


In [102]:
train[:3].values

array([[    1,     2,  5735,     1,     1,     0,     0,  2015,     6,
           30],
       [    2,     2,  9863,     1,     1,     0,     0,  2015,     6,
           30],
       [    3,     2, 13261,     1,     1,     0,     1,  2015,     6,
           30]], dtype=int32)

In [104]:
rf.predict([[    3,     2,     1,     1,     0,     1,  2015,     6,
           30]])

array([8526.13])