In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.arima.model import ARIMA

In [2]:
train = pd.read_csv('../data/dummy_train.csv')
test = pd.read_csv('../data/dummy_test.csv')

In [3]:
train.head()

Unnamed: 0,row_id,date,num_sold,Finland,Norway,Sweden,KaggleMart,KaggleRama,Kaggle Hat,Kaggle Mug,Kaggle Sticker
0,0,2015-01-01,329,1,0,0,1,0,0,1,0
1,1,2015-01-01,520,1,0,0,1,0,1,0,0
2,2,2015-01-01,146,1,0,0,1,0,0,0,1
3,3,2015-01-01,572,1,0,0,0,1,0,1,0
4,4,2015-01-01,911,1,0,0,0,1,1,0,0


In [4]:
train.dtypes

row_id             int64
date              object
num_sold           int64
Finland            int64
Norway             int64
Sweden             int64
KaggleMart         int64
KaggleRama         int64
Kaggle Hat         int64
Kaggle Mug         int64
Kaggle Sticker     int64
dtype: object

In [5]:
#creating the train and validation set
train_df = train[:int(0.8*(len(train)))].drop(columns=['row_id'])
valid = train[int(0.8*(len(train))):].drop(columns=['row_id'])

In [6]:
train_df.dtypes

date              object
num_sold           int64
Finland            int64
Norway             int64
Sweden             int64
KaggleMart         int64
KaggleRama         int64
Kaggle Hat         int64
Kaggle Mug         int64
Kaggle Sticker     int64
dtype: object

In [7]:
train_df['date'] = pd.to_datetime(train_df['date'])

In [8]:
train_df.dtypes

date              datetime64[ns]
num_sold                   int64
Finland                    int64
Norway                     int64
Sweden                     int64
KaggleMart                 int64
KaggleRama                 int64
Kaggle Hat                 int64
Kaggle Mug                 int64
Kaggle Sticker             int64
dtype: object

In [9]:
np.asarray(train_df)

array([[Timestamp('2015-01-01 00:00:00'), 329, 1, ..., 0, 1, 0],
       [Timestamp('2015-01-01 00:00:00'), 520, 1, ..., 1, 0, 0],
       [Timestamp('2015-01-01 00:00:00'), 146, 1, ..., 0, 0, 1],
       ...,
       [Timestamp('2018-03-14 00:00:00'), 321, 0, ..., 0, 0, 1],
       [Timestamp('2018-03-14 00:00:00'), 247, 0, ..., 0, 1, 0],
       [Timestamp('2018-03-14 00:00:00'), 491, 0, ..., 1, 0, 0]],
      dtype=object)

In [10]:
train_df.set_index('date', inplace= True)

In [11]:
train_df.index = pd.DatetimeIndex(train_df.index).to_period('D')

In [12]:
from statsmodels.tsa.stattools import adfuller
def adf_test(timeseries):
    print ('Results of Dickey-Fuller Test:')
    dftest = adfuller(timeseries, autolag='AIC')
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
    for key,value in dftest[4].items():
        dfoutput['Critical Value (%s)'%key] = value
    print (dfoutput)
# Call the function and run the test

adf_test(train_df['num_sold'])

Results of Dickey-Fuller Test:
Test Statistic                -9.589357e+00
p-value                        2.055850e-16
#Lags Used                     4.600000e+01
Number of Observations Used    2.099100e+04
Critical Value (1%)           -3.430662e+00
Critical Value (5%)           -2.861678e+00
Critical Value (10%)          -2.566843e+00
dtype: float64


p-value is well below the 5% threshold!  

In [13]:
train_df.head()

Unnamed: 0_level_0,num_sold,Finland,Norway,Sweden,KaggleMart,KaggleRama,Kaggle Hat,Kaggle Mug,Kaggle Sticker
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2015-01-01,329,1,0,0,1,0,0,1,0
2015-01-01,520,1,0,0,1,0,1,0,0
2015-01-01,146,1,0,0,1,0,0,0,1
2015-01-01,572,1,0,0,0,1,0,1,0
2015-01-01,911,1,0,0,0,1,1,0,0


In [14]:
endog = train_df['num_sold']
exog = train_df.drop(columns = 'num_sold')
mod = ARIMA(endog, exog)
res = mod.fit()
print(res.summary())

                               SARIMAX Results                                
Dep. Variable:               num_sold   No. Observations:                21038
Model:                          ARIMA   Log Likelihood             -132282.903
Date:                Wed, 26 Jan 2022   AIC                         264585.807
Time:                        11:08:19   BIC                         264665.348
Sample:                    01-01-2015   HQIC                        264611.766
                         - 03-14-2018                                         
Covariance Type:                  opg                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
const            175.5325      0.589    298.078      0.000     174.378     176.687
Finland          -23.3844      1.544    -15.146      0.000     -26.410     -20.358
Norway           169.0311      1.287

In [15]:
res.predict()

TypeError: '>' not supported between instances of 'int' and 'slice'

In [16]:
train_df.shape

(21038, 9)

In [None]:
#alright so now I've got a model that functions!  Now how do I get that to predict my test data.

In [17]:
test

Unnamed: 0,row_id,date,Finland,Norway,Sweden,KaggleMart,KaggleRama,Kaggle Hat,Kaggle Mug,Kaggle Sticker
0,26298,2019-01-01,1,0,0,1,0,0,1,0
1,26299,2019-01-01,1,0,0,1,0,1,0,0
2,26300,2019-01-01,1,0,0,1,0,0,0,1
3,26301,2019-01-01,1,0,0,0,1,0,1,0
4,26302,2019-01-01,1,0,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...
6565,32863,2019-12-31,0,0,1,1,0,1,0,0
6566,32864,2019-12-31,0,0,1,1,0,0,0,1
6567,32865,2019-12-31,0,0,1,0,1,0,1,0
6568,32866,2019-12-31,0,0,1,0,1,1,0,0


In [20]:
preds.to_csv('./submission.csv')