# ARMAモデルを全店舗に適用して予測

1. データの前処理（集計期間が店舗ごとに異なるため）
   1. 集計が始まる前のVisitorsをすべてNullにする
   2. 集計期間の中で欠損した日付がある場合は0で補填
2. 店舗ごとにARMAモデルを適用
   1. 差分をとる
   2. ARMAで学習
   3. 学習結果で予測
   4. 差分をもとに戻す
3. Submit file を作成し、出力

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.tsa.api as smt
import statsmodels.api as sm
from statsmodels.tsa.arima_model import AR, ARMA, ARIMA
from sklearn import metrics
from IPython.display import  display
sns.set_style('whitegrid')
%matplotlib inline

In [2]:
data_dir = '../data/'

## データの読み込み

In [3]:
# データ読み込み関数
def import_csv(file_name='features_format', data_dir = '../data/', datetime_keys='visit_date', sort_keys=['air_store_id', 'visit_date']):
    df = pd.read_csv(os.path.join(data_dir, file_name + '.csv'), engine='python')
    df[datetime_keys] = pd.to_datetime(df[datetime_keys])
    df = df.sort_values(by=sort_keys)
    return df
    

In [4]:
features_format = import_csv('features_format')
features_format.head()

Unnamed: 0,air_store_id,visit_date,visitors
0,air_00a91d42b08b08d9,2016-01-01,
1,air_00a91d42b08b08d9,2016-01-02,
2,air_00a91d42b08b08d9,2016-01-03,
3,air_00a91d42b08b08d9,2016-01-04,
4,air_00a91d42b08b08d9,2016-01-05,


In [5]:
test_data = import_csv('test_data')
test_data.head()

Unnamed: 0,air_store_id,visit_date,id,visitors
0,air_00a91d42b08b08d9,2017-03-15,air_00a91d42b08b08d9_2017-03-15,42.0
1,air_00a91d42b08b08d9,2017-03-16,air_00a91d42b08b08d9_2017-03-16,39.0
2,air_00a91d42b08b08d9,2017-03-17,air_00a91d42b08b08d9_2017-03-17,37.0
3,air_00a91d42b08b08d9,2017-03-18,air_00a91d42b08b08d9_2017-03-18,4.0
4,air_00a91d42b08b08d9,2017-03-19,air_00a91d42b08b08d9_2017-03-19,


In [40]:
train = pd.concat([features_format, test_data[['air_store_id', 'visit_date', 'visitors']]], axis=0)
train = train.sort_values(by=['air_store_id', 'visit_date'])
train = train.reset_index()
train.head()

Unnamed: 0,index,air_store_id,visit_date,visitors
0,0,air_00a91d42b08b08d9,2016-01-01,
1,1,air_00a91d42b08b08d9,2016-01-02,
2,2,air_00a91d42b08b08d9,2016-01-03,
3,3,air_00a91d42b08b08d9,2016-01-04,
4,4,air_00a91d42b08b08d9,2016-01-05,


## データの前処理（集計期間が店舗ごとに異なるため）

 1. 集計が始まる前のVisitorsをすべてNullにする
 2. 集計期間の中で欠損した日付がある場合は0で補填

In [None]:
air_store_id = 'air_00a91d42b08b08d9'

In [None]:
# air_store_idで抽出
train_by_store = train[train['air_store_id']==air_store_id]

# visitorsの1つ前のレコードを持つ列を追加
train_by_store['visitors_shift_-1'] = train_by_store['visitors'].shift(-1)



In [41]:
# それぞれの店舗を日付順に並べ、
# 前の行が同じ店舗のレコードの場合に、
# 当日がnullで前日もnullなら何もしない
# 当日がnullで前日がnullでないなら0


for i in range(1, len(train)):
    if train['air_store_id'].loc[i] == train['air_store_id'].loc[i-1]:
        if np.isnan(train['visitors'].loc[i]):
            if np.isnan(train['visitors'].loc[i-1]):
                pass
            else:
                train.loc[i,'visitors'] = 0
                    

### 集計の空白期間が長すぎる店舗はないか？

## 店舗ごとにARMAモデルを適用

   1. 差分をとる
   2. ARMAで学習
   3. 学習結果で予測
   4. 差分をもとに戻す

In [42]:
air_store_id = 'air_00a91d42b08b08d9'
order = (6,2)

In [47]:
train_by_store['visit_date'].head(10)

183   2016-07-02
184   2016-07-03
185   2016-07-04
186   2016-07-05
187   2016-07-06
188   2016-07-07
189   2016-07-08
190   2016-07-09
191   2016-07-10
192   2016-07-11
Name: visit_date, dtype: datetime64[ns]

In [48]:
# air_store_idで抽出
train_by_store = train[train['air_store_id']==air_store_id]

# 差分をとる
visitors_diff = train_by_store['visitors'].diff()
visitors_diff.name = 'visitors' + '_diff'

# 差分の列をtrainに結合
train_by_store = pd.concat([train_by_store, visitors_diff], axis=1)

# nullのある行を削除
train_by_store = train_by_store.dropna(axis=0)

# 学習
result = ARMA(train_by_store['visitors_diff'].values, order=order, dates=train_by_store['visit_date']).fit()

# 予測
predicts_diff = result.predict(start='2017-04-23', end='2017-05-31')

# 差分をもとにもどす
last_train_value = train_by_store['visitors'].iloc[-1]
predicts = [predicts_diff[0] + last_train_value]
for i in range(1, len(predicts_diff)):
    predicts.append(predicts_diff[i] + predicts[-1])



In [8]:
def arma_pred(air_store_id, order=(6,2)):

    # air_store_idで抽出
    train_by_store = train[train['air_store_id']==air_store_id]

    # 差分をとる
    visitors_diff = train_by_store['visitors'].diff()
    visitors_diff.name = 'visitors' + '_diff'

    # 差分の列をtrainに結合
    train_by_store = pd.concat([train_by_store, visitors_diff], axis=1)

    # nullのある行を削除
    train_by_store = train_by_store.dropna(axis=0)

    # 学習
    result = ARMA(train_by_store['visitors_diff'].values, order=order, dates=train_by_store['visit_date']).fit()

    # 予測
    predicts_diff = result.predict(start='2017-04-23', end='2017-05-31')

    # 差分をもとにもどす
    last_train_value = train_by_store['visitors'].iloc[-1]
    predicts = [predicts_diff[0] + last_train_value]
    for i in range(1, len(predicts_diff)):
        predicts.append(predicts_diff[i] + predicts[-1])
        
    return predicts

In [34]:
arma_pred('air_00a91d42b08b08d9',order=(4,2))

ValueError: No frequency information was provided with date index and no frequency could be inferred.

In [30]:
predicts_appended = []
num_i = 0
for air_store_id in train['air_store_id'].unique():
    num_i += 1
    air_store_id_i = air_store_id
    predicts_appended.append(arma_pred(air_store_id, order=(6,2)))

ValueError: No frequency information was provided with date index and no frequency could be inferred.

In [29]:
num_i

0

In [31]:
air_store_id_i

'air_00a91d42b08b08d9'

In [12]:
for i in

SyntaxError: invalid syntax (<ipython-input-12-e187c755dce4>, line 1)

In [13]:
predicts_diff = result.predict(start='2017-04-23', end='2017-05-31')
predicts_diff

NameError: name 'result' is not defined

In [14]:
result.summary()

NameError: name 'result' is not defined

submissionデータをインポート

In [None]:
sample_submission = pd.read_csv(os.path.join(data_dir, 'sample_submission.csv'), engine='python')
sample_submission.head()