In [1]:
#for this round, testing "drift" strategy on the most recent weeks of training data (2017-06-28 onward)
#importing necessary forecasting tools from sktime
from sktime.forecasting.base import ForecastingHorizon
from sktime.forecasting.naive import NaiveForecaster

#import pandas library for data analysis & manipulation
import pandas as pd

#import numpy library for scientific computing
import numpy as np


#input/output locations (Windows-style paths; adjust to the local project layout)
#training data for model
train_path = "***\\Project_Data\\train.csv.zip"
#test data for prediction model
test_path = "**\\Project_Data\\test.csv"
#output path
forecast_path = "***\\Project_Models\\sktime_naive_drift\\submission.csv"

#import data into pandas dataframes
#only the columns needed for forecasting are read; "date" is parsed to datetime64
#infer_datetime_format is no longer passed: it is deprecated since pandas 2.0
train_data = pd.read_csv(
    train_path,
    usecols=["store_nbr", "family", "date", "sales"],
    dtype={"store_nbr": "int", "family": "object", "sales": "float32"},
    parse_dates=["date"],
    compression="zip",
)

#the test file is read without a "sales" column, so no dtype is declared for it
test_data = pd.read_csv(
    test_path,
    usecols=["id", "store_nbr", "family", "date"],
    dtype={"id": "int", "store_nbr": "int", "family": "object"},
    parse_dates=["date"],
)

#check number of rows & columns in each set
print("training set",train_data.shape)
print("testing set", test_data.shape)

training set (3000888, 4)
testing set (28512, 4)


In [2]:
#sktime needs the time column to carry an explicit frequency,
#so convert the parsed timestamps to daily periods in both frames
train_data["date"] = train_data["date"].dt.to_period(freq="D")
test_data["date"] = test_data["date"].dt.to_period(freq="D")

In [3]:
#review training data: index range, column dtypes and memory footprint
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000888 entries, 0 to 3000887
Data columns (total 4 columns):
 #   Column     Dtype    
---  ------     -----    
 0   date       period[D]
 1   store_nbr  int32    
 2   family     object   
 3   sales      float32  
dtypes: float32(1), int32(1), object(1), period[D](1)
memory usage: 68.7+ MB


In [4]:
#peek at the training data (pandas truncates the display to the first and last rows)
train_data

Unnamed: 0,date,store_nbr,family,sales
0,2013-01-01,1,AUTOMOTIVE,0.000000
1,2013-01-01,1,BABY CARE,0.000000
2,2013-01-01,1,BEAUTY,0.000000
3,2013-01-01,1,BEVERAGES,0.000000
4,2013-01-01,1,BOOKS,0.000000
...,...,...,...,...
3000883,2017-08-15,9,POULTRY,438.132996
3000884,2017-08-15,9,PREPARED FOODS,154.552994
3000885,2017-08-15,9,PRODUCE,2419.729004
3000886,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.000000


In [5]:
#review test data: column dtypes and non-null counts
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28512 entries, 0 to 28511
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype    
---  ------     --------------  -----    
 0   id         28512 non-null  int32    
 1   date       28512 non-null  period[D]
 2   store_nbr  28512 non-null  int32    
 3   family     28512 non-null  object   
dtypes: int32(2), object(1), period[D](1)
memory usage: 668.4+ KB


In [6]:
#peek at the first rows of the test data
test_data.head()

Unnamed: 0,id,date,store_nbr,family
0,3000888,2017-08-16,1,AUTOMOTIVE
1,3000889,2017-08-16,1,BABY CARE
2,3000890,2017-08-16,1,BEAUTY
3,3000891,2017-08-16,1,BEVERAGES
4,3000892,2017-08-16,1,BOOKS


In [7]:
#prepare data: order rows series-by-series (store, then family) and chronologically within each
train_data = train_data.sort_values(
    by=["store_nbr", "family", "date"],
    ascending=True,
)

In [8]:
#selecting subset of data: keep only observations dated 2017-06-28 or later
#.copy() makes train_sub an independent frame, so modifying it in later cells
#cannot trigger SettingWithCopyWarning or touch train_data
train_sub = train_data.loc[train_data.date >= "2017-06-28"].copy()
train_sub

Unnamed: 0,date,store_nbr,family,sales
2913570,2017-06-28,1,AUTOMOTIVE,2.0
2915352,2017-06-29,1,AUTOMOTIVE,1.0
2917134,2017-06-30,1,AUTOMOTIVE,11.0
2918916,2017-07-01,1,AUTOMOTIVE,7.0
2920698,2017-07-02,1,AUTOMOTIVE,4.0
...,...,...,...,...
2993627,2017-08-11,54,SEAFOOD,0.0
2995409,2017-08-12,54,SEAFOOD,1.0
2997191,2017-08-13,54,SEAFOOD,2.0
2998973,2017-08-14,54,SEAFOOD,0.0


In [9]:
#sktime forecasting requires a hierarchical multi-index in the pandas dataframe:
#series identifiers first (store_nbr, family), the time index (date) as the last level
#the result is assigned rather than using inplace=True, so the frame is not mutated in place
train_sub = train_sub.set_index(["store_nbr", "family", "date"])

In [10]:
#checking results: store_nbr, family and date should now form the index, leaving sales as the only column
train_sub

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sales
store_nbr,family,date,Unnamed: 3_level_1
1,AUTOMOTIVE,2017-06-28,2.0
1,AUTOMOTIVE,2017-06-29,1.0
1,AUTOMOTIVE,2017-06-30,11.0
1,AUTOMOTIVE,2017-07-01,7.0
1,AUTOMOTIVE,2017-07-02,4.0
...,...,...,...
54,SEAFOOD,2017-08-11,0.0
54,SEAFOOD,2017-08-12,1.0
54,SEAFOOD,2017-08-13,2.0
54,SEAFOOD,2017-08-14,0.0


In [11]:
#checking types of the indexed frame (index type, sales dtype, row count)
train_sub.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 87318 entries, (1, 'AUTOMOTIVE', Period('2017-06-28', 'D')) to (54, 'SEAFOOD', Period('2017-08-15', 'D'))
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   sales   87318 non-null  float32
dtypes: float32(1)
memory usage: 602.3+ KB


In [12]:
#verifying timepoint index landed where expected:
#level=-1 selects the last index level, which should hold the daily "date" periods
train_sub.index.get_level_values(level=-1)

PeriodIndex(['2017-06-28', '2017-06-29', '2017-06-30', '2017-07-01',
             '2017-07-02', '2017-07-03', '2017-07-04', '2017-07-05',
             '2017-07-06', '2017-07-07',
             ...
             '2017-08-06', '2017-08-07', '2017-08-08', '2017-08-09',
             '2017-08-10', '2017-08-11', '2017-08-12', '2017-08-13',
             '2017-08-14', '2017-08-15'],
            dtype='period[D]', name='date', length=87318)

In [13]:
#forecast horizon: 16 consecutive daily periods starting 2017-08-16,
#passed as absolute dates (is_relative=False) rather than step offsets
horizon = ForecastingHorizon(
    pd.PeriodIndex(pd.date_range(start="2017-08-16", periods=16, freq="D")),
    is_relative=False,
)
horizon

ForecastingHorizon(['2017-08-16', '2017-08-17', '2017-08-18', '2017-08-19',
             '2017-08-20', '2017-08-21', '2017-08-22', '2017-08-23',
             '2017-08-24', '2017-08-25', '2017-08-26', '2017-08-27',
             '2017-08-28', '2017-08-29', '2017-08-30', '2017-08-31'],
            dtype='period[D]', is_relative=False)

In [14]:
#sktime naive forecaster offers several strategies; "drift" is the one tested in this notebook
#NOTE(review): per the sktime docs, drift extrapolates the line through the first and last
#observation of each series - confirm against the installed sktime version
forecaster = NaiveForecaster(strategy="drift")

In [15]:
#run prediction: fit on the indexed training window, then forecast every date in the horizon
#note: despite its name, next15 holds one prediction per horizon date (16 days)
forecaster.fit(y=train_sub, fh=horizon)
next15 = forecaster.predict()

In [16]:
#check results: one forecast per store/family series and horizon date
next15

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sales
store_nbr,family,date,Unnamed: 3_level_1
1,AUTOMOTIVE,2017-08-16,4.041667
1,AUTOMOTIVE,2017-08-17,4.083333
1,AUTOMOTIVE,2017-08-18,4.125000
1,AUTOMOTIVE,2017-08-19,4.166667
1,AUTOMOTIVE,2017-08-20,4.208333
...,...,...,...
54,SEAFOOD,2017-08-27,3.500000
54,SEAFOOD,2017-08-28,3.541667
54,SEAFOOD,2017-08-29,3.583333
54,SEAFOOD,2017-08-30,3.625000


In [17]:
#resetting as normal pandas frame: the index levels become ordinary columns again
next15_now = next15.reset_index()

In [18]:
#looking at my results as a flat frame (store_nbr, family, date, sales columns)
next15_now

Unnamed: 0,store_nbr,family,date,sales
0,1,AUTOMOTIVE,2017-08-16,4.041667
1,1,AUTOMOTIVE,2017-08-17,4.083333
2,1,AUTOMOTIVE,2017-08-18,4.125000
3,1,AUTOMOTIVE,2017-08-19,4.166667
4,1,AUTOMOTIVE,2017-08-20,4.208333
...,...,...,...,...
28507,54,SEAFOOD,2017-08-27,3.500000
28508,54,SEAFOOD,2017-08-28,3.541667
28509,54,SEAFOOD,2017-08-29,3.583333
28510,54,SEAFOOD,2017-08-30,3.625000


In [19]:
#looking at my test data again: its rows are ordered by date first, unlike the forecasts above
test_data

Unnamed: 0,id,date,store_nbr,family
0,3000888,2017-08-16,1,AUTOMOTIVE
1,3000889,2017-08-16,1,BABY CARE
2,3000890,2017-08-16,1,BEAUTY
3,3000891,2017-08-16,1,BEVERAGES
4,3000892,2017-08-16,1,BOOKS
...,...,...,...,...
28507,3029395,2017-08-31,9,POULTRY
28508,3029396,2017-08-31,9,PREPARED FOODS
28509,3029397,2017-08-31,9,PRODUCE
28510,3029398,2017-08-31,9,SCHOOL AND OFFICE SUPPLIES


In [20]:
#adding predictions to test data
#left join keeps every test row (and its id) even if a forecast were missing,
#instead of silently dropping it as the default inner join would;
#validate guards against duplicated keys silently multiplying rows
test_fc = pd.merge(
    test_data,
    next15_now,
    on=["store_nbr", "family", "date"],
    how="left",
    validate="one_to_one",
)

In [21]:
#checking results: each test row should now carry its forecast in a sales column
test_fc

Unnamed: 0,id,date,store_nbr,family,sales
0,3000888,2017-08-16,1,AUTOMOTIVE,4.041667
1,3000889,2017-08-16,1,BABY CARE,0.000000
2,3000890,2017-08-16,1,BEAUTY,3.979167
3,3000891,2017-08-16,1,BEVERAGES,1936.562500
4,3000892,2017-08-16,1,BOOKS,0.000000
...,...,...,...,...,...
28507,3029395,2017-08-31,9,POULTRY,484.914327
28508,3029396,2017-08-31,9,PREPARED FOODS,170.757657
28509,3029397,2017-08-31,9,PRODUCE,2821.211995
28510,3029398,2017-08-31,9,SCHOOL AND OFFICE SUPPLIES,160.666667


In [22]:
#formatting for submission: keep only the id and its predicted sales
#drift extrapolates a straight line, which can fall below zero for declining series;
#sales cannot be negative, so floor the predictions at 0
forecast = pd.DataFrame({"id": test_fc.id, "sales": test_fc.sales.clip(lower=0)})

In [23]:
#checking results: two-column frame (id, sales) ready for export
forecast

Unnamed: 0,id,sales
0,3000888,4.041667
1,3000889,0.000000
2,3000890,3.979167
3,3000891,1936.562500
4,3000892,0.000000
...,...,...
28507,3029395,484.914327
28508,3029396,170.757657
28509,3029397,2821.211995
28510,3029398,160.666667


In [24]:
#exporting to csv at forecast_path; index=False keeps the row index out of the file
forecast.to_csv(forecast_path, index=False)