# ARMAモデルを全店舗に適用して予測

1. データの前処理（集計期間が店舗ごとに異なるため）
   1. 集計が始まる前のVisitorsをすべてNullにする
   2. 集計期間の中で欠損した日付がある場合は、visitors を 0 で補完する
2. 店舗ごとにARMAモデルを適用
   1. 差分をとる
   2. ARMAで学習
   3. 学習結果で予測
   4. 差分をもとに戻す
3. Submit file を作成し、出力

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.tsa.api as smt
import statsmodels.api as sm
from statsmodels.tsa.arima_model import AR, ARMA, ARIMA
from sklearn import metrics
from IPython.display import  display
sns.set_style('whitegrid')
%matplotlib inline

In [4]:
# Directory (relative path) that holds the input CSV files read by this notebook.
data_dir = '../data/'

## データの読み込み

In [51]:
# Data loading helper
def import_csv(file_name='features_format', data_dir='../data/', datetime_keys='visit_date', sort_keys=None):
    """Load ``<data_dir>/<file_name>.csv``, parse its date column(s) and sort the rows.

    Parameters
    ----------
    file_name : str
        CSV file name without the ``.csv`` extension.
    data_dir : str
        Directory that contains the file.
    datetime_keys : label or list/tuple of labels
        Column(s) converted with ``pd.to_datetime``.
    sort_keys : label or list of labels, optional
        Column(s) passed to ``DataFrame.sort_values``. When omitted, rows are
        sorted by ``['air_store_id', 'visit_date']``.

    Returns
    -------
    pandas.DataFrame
        The loaded rows in sorted order; row labels from the file are kept.
    """
    # A None sentinel avoids sharing one mutable default list between calls.
    if sort_keys is None:
        sort_keys = ['air_store_id', 'visit_date']

    df = pd.read_csv(os.path.join(data_dir, file_name + '.csv'), engine='python')

    # pd.to_datetime on a multi-column selection would try to assemble a single
    # date from year/month/day columns, so convert each column on its own.
    if isinstance(datetime_keys, (list, tuple)):
        date_columns = list(datetime_keys)
    else:
        date_columns = [datetime_keys]
    for column in date_columns:
        df[column] = pd.to_datetime(df[column])

    return df.sort_values(by=sort_keys)
    

In [53]:
# Load features_format.csv through import_csv (default data_dir) and preview the first rows.
features_format = import_csv('features_format')
features_format.head()

Unnamed: 0,air_store_id,visit_date,visitors
0,air_00a91d42b08b08d9,2016-01-01,
1,air_00a91d42b08b08d9,2016-01-02,
2,air_00a91d42b08b08d9,2016-01-03,
3,air_00a91d42b08b08d9,2016-01-04,
4,air_00a91d42b08b08d9,2016-01-05,


In [57]:
# Load test_data.csv through import_csv (default data_dir) and preview the first rows.
test_data = import_csv('test_data')
test_data.head()

Unnamed: 0,air_store_id,visit_date,id,visitors
0,air_00a91d42b08b08d9,2017-03-15,air_00a91d42b08b08d9_2017-03-15,42.0
1,air_00a91d42b08b08d9,2017-03-16,air_00a91d42b08b08d9_2017-03-16,39.0
2,air_00a91d42b08b08d9,2017-03-17,air_00a91d42b08b08d9_2017-03-17,37.0
3,air_00a91d42b08b08d9,2017-03-18,air_00a91d42b08b08d9_2017-03-18,4.0
4,air_00a91d42b08b08d9,2017-03-19,air_00a91d42b08b08d9_2017-03-19,


In [67]:
# Stack features_format on top of the store / date / visitors columns of test_data.
train = pd.concat([features_format, test_data[['air_store_id', 'visit_date', 'visitors']]], axis=0)
# Order by store, then date. concat keeps the row labels of both inputs, so the
# labels can repeat; replace them with a fresh 0..n-1 index to keep later
# label-based selection and assignment unambiguous.
train = train.sort_values(by=['air_store_id', 'visit_date']).reset_index(drop=True)
train.head()

Unnamed: 0,air_store_id,visit_date,visitors
0,air_00a91d42b08b08d9,2016-01-01,
1,air_00a91d42b08b08d9,2016-01-02,
2,air_00a91d42b08b08d9,2016-01-03,
3,air_00a91d42b08b08d9,2016-01-04,
4,air_00a91d42b08b08d9,2016-01-05,


## データの前処理（集計期間が店舗ごとに異なるため）

 1. 集計が始まる前のVisitorsをすべてNullにする
 2. 集計期間の中で欠損した日付がある場合は、visitors を 0 で補完する

In [68]:
# Zero-fill the gaps inside each store's recording period.
#
# With each store's rows taken in date order:
#   - a missing value that comes before the store's first recorded value is left as NaN;
#   - a missing value that comes after a recorded value becomes 0. Runs of
#     consecutive missing days are filled entirely, which is what an in-place
#     "previous day is not null -> 0" pass produces.
#
# NaN never compares equal to anything, itself included, so `== np.NaN` can never
# match; missing values are detected with notna() instead. The write goes through
# `.loc` on the frame itself so it cannot land on a temporary copy.
#
# Relies on train being ordered by visit_date within each store (it is sorted that
# way when it is built).
# NOTE(review): missing values after a store's last recorded day are filled with 0
# as well - confirm this is wanted for rows whose visitors are still unknown.
store_ids = train['air_store_id'].to_numpy()
has_value = train['visitors'].notna()
# Running count of recorded values per store; a positive count means recording has begun.
recorded_so_far = has_value.astype(int).groupby(store_ids).cumsum()
# Work on plain arrays so the (possibly repeated) row labels play no part in the selection.
gap_rows = ~has_value.to_numpy() & (recorded_so_far.to_numpy() > 0)
train.loc[gap_rows, 'visitors'] = 0
                


In [35]:
# Keep rows with visit_date before 2016-07-01, count the non-null entries per store,
# and list the stores in ascending order of their visitors count.
# NOTE(review): air_visit_data is not defined in the cells shown here - presumably
# it was loaded in an earlier cell or session; confirm before re-running.
air_visit_data.query('visit_date < "2016-07-01"').groupby('air_store_id').count().sort_values(by='visitors')

Unnamed: 0_level_0,visit_date,visitors
air_store_id,Unnamed: 1_level_1,Unnamed: 2_level_1
air_d98380a4aeb0290b,1,1
air_df554c4527a1cfe6,1,1
air_2a485b92210c98b5,11,11
air_a9133955abccf071,24,24
air_0382c794b73b51ad,35,35
air_8d50c64692322dff,40,40
air_764f71040a413d4d,46,46
air_f2c5a1f24279c531,66,66
air_b259b4e4a51a690d,67,67
air_1033310359ceeac1,74,74


### 集計の空白期間が長すぎる店舗はないか？

submissionデータをインポート

In [14]:
# Read sample_submission.csv from data_dir and preview the first rows.
sample_submission = pd.read_csv(os.path.join(data_dir, 'sample_submission.csv'), engine='python')
sample_submission.head()

Unnamed: 0,id,visitors
0,air_00a91d42b08b08d9_2017-04-23,0
1,air_00a91d42b08b08d9_2017-04-24,0
2,air_00a91d42b08b08d9_2017-04-25,0
3,air_00a91d42b08b08d9_2017-04-26,0
4,air_00a91d42b08b08d9_2017-04-27,0
