# Exploring the Data


In [1]:
import pandas as pd

import os
for dirname, _, filenames in os.walk('../Data/'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

../Data/data.md
../Data/holidays_events.csv
../Data/oil.csv
../Data/sample_submission.csv
../Data/stores.csv
../Data/train.csv
../Data/transactions.csv


In [2]:
df = pd.read_csv('../Data/train.csv')

# View the first and last few rows of the data
df.head(-5)


Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,2013-01-01,1,AUTOMOTIVE,0.000,0
1,1,2013-01-01,1,BABY CARE,0.000,0
2,2,2013-01-01,1,BEAUTY,0.000,0
3,3,2013-01-01,1,BEVERAGES,0.000,0
4,4,2013-01-01,1,BOOKS,0.000,0
...,...,...,...,...,...,...
3000878,3000878,2017-08-15,9,MAGAZINES,11.000,0
3000879,3000879,2017-08-15,9,MEATS,449.228,0
3000880,3000880,2017-08-15,9,PERSONAL CARE,522.000,11
3000881,3000881,2017-08-15,9,PET SUPPLIES,6.000,0


In [3]:
# Get summary statistics for the data
df.describe()

Unnamed: 0,id,store_nbr,sales,onpromotion
count,3000888.0,3000888.0,3000888.0,3000888.0
mean,1500444.0,27.5,357.7757,2.60277
std,866281.9,15.58579,1101.998,12.21888
min,0.0,1.0,0.0,0.0
25%,750221.8,14.0,0.0,0.0
50%,1500444.0,27.5,11.0,0.0
75%,2250665.0,41.0,195.8473,0.0
max,3000887.0,54.0,124717.0,741.0


see how many nonzero entries there are

In [4]:
sum(df['sales']!=0)

2061758

## Try random forest

for this we have to masssage the data a bit

In [5]:
df2 = pd.read_csv("../Data/train.csv",
    index_col='id',
    parse_dates=['date'])
df2.head(-5)

Unnamed: 0_level_0,date,store_nbr,family,sales,onpromotion
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2013-01-01,1,AUTOMOTIVE,0.000,0
1,2013-01-01,1,BABY CARE,0.000,0
2,2013-01-01,1,BEAUTY,0.000,0
3,2013-01-01,1,BEVERAGES,0.000,0
4,2013-01-01,1,BOOKS,0.000,0
...,...,...,...,...,...
3000878,2017-08-15,9,MAGAZINES,11.000,0
3000879,2017-08-15,9,MEATS,449.228,0
3000880,2017-08-15,9,PERSONAL CARE,522.000,11
3000881,2017-08-15,9,PET SUPPLIES,6.000,0


Define the target variable (the one we are going to predict) 

In [65]:
y = df2.sales

See which are the features we can use for the prediction (in principle we can use the variables that are numerical)

In [7]:
df2.columns

Index(['date', 'store_nbr', 'family', 'sales', 'onpromotion'], dtype='object')

In [8]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3000888 entries, 0 to 3000887
Data columns (total 5 columns):
 #   Column       Dtype         
---  ------       -----         
 0   date         datetime64[ns]
 1   store_nbr    int64         
 2   family       object        
 3   sales        float64       
 4   onpromotion  int64         
dtypes: datetime64[ns](1), float64(1), int64(2), object(1)
memory usage: 137.4+ MB


first we need to make the date into an integer or float

In [9]:
# An example
from datetime import datetime
curr_dt = datetime.now()
print("Current datetime: ", curr_dt)
timestamp = int(round(curr_dt.timestamp()))
print("Integer timestamp of current datetime: ",
      timestamp)

Current datetime:  2023-05-16 15:41:55.246650
Integer timestamp of current datetime:  1684244515


In [10]:
# apply to all
df2['date'] = df2['date'].map(lambda a : int(round(a.timestamp())) )

In [11]:
df2.head(-5)

Unnamed: 0_level_0,date,store_nbr,family,sales,onpromotion
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1356998400,1,AUTOMOTIVE,0.000,0
1,1356998400,1,BABY CARE,0.000,0
2,1356998400,1,BEAUTY,0.000,0
3,1356998400,1,BEVERAGES,0.000,0
4,1356998400,1,BOOKS,0.000,0
...,...,...,...,...,...
3000878,1502755200,9,MAGAZINES,11.000,0
3000879,1502755200,9,MEATS,449.228,0
3000880,1502755200,9,PERSONAL CARE,522.000,11
3000881,1502755200,9,PET SUPPLIES,6.000,0


In [12]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3000888 entries, 0 to 3000887
Data columns (total 5 columns):
 #   Column       Dtype  
---  ------       -----  
 0   date         int64  
 1   store_nbr    int64  
 2   family       object 
 3   sales        float64
 4   onpromotion  int64  
dtypes: float64(1), int64(3), object(1)
memory usage: 137.4+ MB


In [26]:
family_enum = {}
family_array = df2['family'].unique()
for i in range(len(family_array)):
    family_enum[family_array[i]] = i
print(family_enum)

{'AUTOMOTIVE': 0, 'BABY CARE': 1, 'BEAUTY': 2, 'BEVERAGES': 3, 'BOOKS': 4, 'BREAD/BAKERY': 5, 'CELEBRATION': 6, 'CLEANING': 7, 'DAIRY': 8, 'DELI': 9, 'EGGS': 10, 'FROZEN FOODS': 11, 'GROCERY I': 12, 'GROCERY II': 13, 'HARDWARE': 14, 'HOME AND KITCHEN I': 15, 'HOME AND KITCHEN II': 16, 'HOME APPLIANCES': 17, 'HOME CARE': 18, 'LADIESWEAR': 19, 'LAWN AND GARDEN': 20, 'LINGERIE': 21, 'LIQUOR,WINE,BEER': 22, 'MAGAZINES': 23, 'MEATS': 24, 'PERSONAL CARE': 25, 'PET SUPPLIES': 26, 'PLAYERS AND ELECTRONICS': 27, 'POULTRY': 28, 'PREPARED FOODS': 29, 'PRODUCE': 30, 'SCHOOL AND OFFICE SUPPLIES': 31, 'SEAFOOD': 32}


In [32]:
df2['family_number'] = df2['family'].map(lambda a : family_enum[a] )
df2.head()

Unnamed: 0_level_0,date,store_nbr,family,sales,onpromotion,family_number
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1356998400,1,AUTOMOTIVE,0.0,0,0
1,1356998400,1,BABY CARE,0.0,0,1
2,1356998400,1,BEAUTY,0.0,0,2
3,1356998400,1,BEVERAGES,0.0,0,3
4,1356998400,1,BOOKS,0.0,0,4


lets see if we can make "family" into a number (todo)

In [64]:
features = ['date', 'store_nbr', 'onpromotion', 'family_number']
x = df2[features]
print(x)

               date  store_nbr  onpromotion  family_number
id                                                        
0        1356998400          1            0              0
1        1356998400          1            0              1
2        1356998400          1            0              2
3        1356998400          1            0              3
4        1356998400          1            0              4
...             ...        ...          ...            ...
3000883  1502755200          9            0             28
3000884  1502755200          9            1             29
3000885  1502755200          9          148             30
3000886  1502755200          9            8             31
3000887  1502755200          9            0             32

[3000888 rows x 4 columns]


In [61]:

from sklearn.tree import DecisionTreeRegressor

# Define model. Specify a number for random_state to ensure same results each run
tree_model = DecisionTreeRegressor(random_state=1)

# Fit model
tree_model.fit(x, y)

DecisionTreeRegressor(random_state=1)

### Test prediction

In [62]:
print("Making predictions for the following 15 items:")
print(df2.iloc[-150:-135,:])
print("The predictions are")
prediction = tree_model.predict(x.iloc[-150:-135,:])
print(prediction)

Making predictions for the following 15 items:
               date  store_nbr                   family    sales  onpromotion  \
id                                                                              
3000738  1502755200         54       HOME AND KITCHEN I   24.000            0   
3000739  1502755200         54      HOME AND KITCHEN II   19.000            0   
3000740  1502755200         54          HOME APPLIANCES    0.000            0   
3000741  1502755200         54                HOME CARE  202.000            7   
3000742  1502755200         54               LADIESWEAR    0.000            0   
3000743  1502755200         54          LAWN AND GARDEN    0.000            0   
3000744  1502755200         54                 LINGERIE    4.000            0   
3000745  1502755200         54         LIQUOR,WINE,BEER  210.000            2   
3000746  1502755200         54                MAGAZINES    2.000            0   
3000747  1502755200         54                    MEATS   57.8

# Calculate Success rate

In [63]:
data = df2.iloc[-150:-135,3]
# data = data.reset_index(drop=True)
print(data)
print(prediction)

eff = (prediction - data)/data
print("Efficiency: (expressed as 100% distance from expected value) " , eff*100)

id
3000738     24.000
3000739     19.000
3000740      0.000
3000741    202.000
3000742      0.000
3000743      0.000
3000744      4.000
3000745    210.000
3000746      2.000
3000747     57.842
3000748    169.000
3000749      0.000
3000750      2.000
3000751     59.619
3000752     94.000
Name: sales, dtype: float64
[ 19.      0.    134.     11.      0.      0.     32.      0.    110.07
 110.      0.      2.    147.044   9.    129.609]
Efficiency: (expressed as 100% distance from expected value)  id
3000738     -20.833333
3000739    -100.000000
3000740            inf
3000741     -94.554455
3000742            NaN
3000743            NaN
3000744     700.000000
3000745    -100.000000
3000746    5403.500000
3000747      90.173231
3000748    -100.000000
3000749            inf
3000750    7252.200000
3000751     -84.904141
3000752      37.881915
Name: sales, dtype: float64


without family: out of 15 predictions: 3 predictions are spot-on, 2 are less than 50% off, the rest are off by up to 1000%
with family: all spot on

does it makes sense to train on the whole training set and to test it on that? does random forest select a subset to train, can we tell it to save a subset for the testing? 

now to move onto the test set...

In [53]:
testdf = pd.read_csv('../Data/test.csv')
testdf.head()

Unnamed: 0,id,date,store_nbr,family,onpromotion
0,3000888,2017-08-16,1,AUTOMOTIVE,0
1,3000889,2017-08-16,1,BABY CARE,0
2,3000890,2017-08-16,1,BEAUTY,2
3,3000891,2017-08-16,1,BEVERAGES,20
4,3000892,2017-08-16,1,BOOKS,0
