In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# 데이터 분석

## Load Data

In [None]:
# Single place to change if the competition data is mounted somewhere else.
DATA_DIR = '../input/recruit-restaurant-visitor-forecasting'


def _load_table(name):
    """Read `<DATA_DIR>/<name>.csv.zip` into a DataFrame.

    pandas infers zip compression from the file extension, so the archives
    are read directly without being unpacked first.
    """
    return pd.read_csv(f'{DATA_DIR}/{name}.csv.zip')


air_visit_data = _load_table('air_visit_data')
air_store_info = _load_table('air_store_info')
hpg_store_info = _load_table('hpg_store_info')
air_reserve = _load_table('air_reserve')
hpg_reserve = _load_table('hpg_reserve')
store_id_relation = _load_table('store_id_relation')
sample_submission = _load_table('sample_submission')
date_info = _load_table('date_info')

## EDA

In [None]:
# Preview the first rows of the air visit table.
air_visit_data.head()

In [None]:
# Preview the first rows of the air store table.
air_store_info.head()

In [None]:
# Preview the first rows of the hpg store table.
hpg_store_info.head()

In [None]:
# Preview the first rows of the hpg reservation table.
hpg_reserve.head()

In [None]:
# Preview the first rows of the store id relation table.
store_id_relation.head()

In [None]:
# Preview the first rows of the sample submission file.
sample_submission.head()

In [None]:
# Preview the first rows of the calendar table that split_date_time joins against.
date_info.head()

## 시각화

In [None]:
# Split datetime columns into date / time / year / month / day parts.
def split_date_time(df, calendar=None):
    """Return a copy of ``df`` enriched with date parts and calendar columns.

    For each prefix in ``('visit', 'reserve')``:

    * if ``<prefix>_datetime`` exists it is parsed to datetime, and
      ``<prefix>_date`` / ``<prefix>_time`` string columns are derived from it;
    * if ``<prefix>_date`` exists (original or derived) the frame is
      inner-joined with the calendar table on ``calendar_date``, the date
      column is converted to datetime, and ``<prefix>_year``,
      ``<prefix>_month`` and ``<prefix>_day`` are added.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. It is not modified.
    calendar : pandas.DataFrame, optional
        Calendar table with a ``calendar_date`` column of ``YYYY-MM-DD``
        strings. Defaults to the notebook-level ``date_info`` frame.

    Returns
    -------
    pandas.DataFrame
        New frame with a fresh index. Rows whose date has no match in the
        calendar are dropped by the inner join.

    Notes
    -----
    When both a visit and a reserve date are present the calendar is joined
    twice, so its columns come back with pandas' default ``_x`` (visit) and
    ``_y`` (reserve) suffixes, e.g. ``day_of_week_x`` / ``day_of_week_y``.
    """
    if calendar is None:
        # Resolved at call time so the function can be defined before the data is loaded.
        calendar = date_info

    result = df.copy()
    prefixes = ('visit', 'reserve')

    # Pass 1: derive '<prefix>_date' / '<prefix>_time' strings from the datetime columns.
    # Both prefixes are handled before any merge so the column order matches the
    # original two-step layout.
    for prefix in prefixes:
        stamp_col = f'{prefix}_datetime'
        if stamp_col not in result.columns:
            continue
        # Plain column assignment replaces the column (and its dtype).
        # `result.loc[:, col] = ...` writes in place on pandas >= 2.0 and would
        # leave the column as object dtype.
        result[stamp_col] = pd.to_datetime(result[stamp_col])
        result[f'{prefix}_date'] = result[stamp_col].dt.date.astype(str)
        result[f'{prefix}_time'] = result[stamp_col].dt.time.astype(str)

    # Pass 2: attach calendar columns and numeric date parts.
    for prefix in prefixes:
        date_col = f'{prefix}_date'
        if date_col not in result.columns:
            continue
        # The join key is still a 'YYYY-MM-DD' string here, matching calendar_date.
        result = pd.merge(result, calendar, how='inner',
                          left_on=date_col, right_on='calendar_date')
        result[date_col] = pd.to_datetime(result[date_col])
        result[f'{prefix}_year'] = result[date_col].dt.year
        result[f'{prefix}_month'] = result[date_col].dt.month
        result[f'{prefix}_day'] = result[date_col].dt.day
    return result

### 전체 방문자 추이 확인

In [None]:
# Enrich the visit table with calendar columns and year / month / day parts.
air_visit_df = split_date_time(air_visit_data)
air_visit_df.head()

#### 전체 기간

In [None]:
# Daily visitor totals (summed over all rows sharing a visit_date) for the whole period.
fig, ax = plt.subplots(figsize=(20, 10))
sns.lineplot(x='visit_date', y='visitors', data=air_visit_df, estimator=sum, ax=ax)
plt.show()

#### 요일별

In [None]:
# Weekday labels in calendar order; passed as `order` so the bars read Monday -> Sunday.
# (The previous list had Thursday ahead of Wednesday.)
m = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
plt.figure(figsize=(20,10))
sns.barplot(data = air_visit_df, x='day_of_week', y='visitors', order=m)
plt.show()

#### 월별

In [None]:
# Visitors per row, aggregated by calendar month with barplot's default estimator.
fig, ax = plt.subplots(figsize=(20, 10))
sns.barplot(x='visit_month', y='visitors', data=air_visit_df, ax=ax)
plt.show()

#### 전체 기간 동안 방문자와 예약자 추이

In [None]:
# Enrich the air reservation table with visit / reserve date parts.
air_reserve_df = split_date_time(air_reserve)
# Show a preview only, like the other cells, instead of displaying the whole frame.
air_reserve_df.head()

In [None]:
# Overlay daily visitor totals and daily reserved-visitor totals on one axis.
# Each curve gets a label so the legend tells them apart, and the y-axis gets a
# neutral label (otherwise it would carry only the last plotted column name).
plt.figure(figsize=(20,10))
sns.lineplot(data = air_visit_df, x='visit_date', y='visitors', estimator=sum, label='visitors')
sns.lineplot(data = air_reserve_df, x='visit_date', y='reserve_visitors', color='r', estimator=sum, label='reserve_visitors')
plt.ylabel('people per day (sum)')
plt.legend()
plt.show()

### 2016년도 추이 확인

### 예약자 수

#### air

In [None]:
# NOTE(review): this repeats the split_date_time(air_reserve) call made in the
# earlier visitors-vs-reservations section and produces the same frame again.
air_reserve_df = split_date_time(air_reserve)
air_reserve_df.head()

In [None]:
# Daily reserved-visitor totals (air) for visits that fall in 2016.
fig, ax = plt.subplots(figsize=(20, 10))
sns.lineplot(
    x='visit_date',
    y='reserve_visitors',
    data=air_reserve_df.loc[air_reserve_df['visit_year'].eq(2016)],
    estimator=sum,
    ax=ax,
)
plt.show()

In [None]:
# Daily reserved-visitor totals (air) for visits in January 2016.
fig, ax = plt.subplots(figsize=(20, 10))
sns.lineplot(
    x='visit_date',
    y='reserve_visitors',
    data=air_reserve_df.loc[air_reserve_df['visit_year'].eq(2016) & air_reserve_df['visit_month'].eq(1)],
    estimator=sum,
    ax=ax,
)
plt.show()

#### hpg

In [None]:
# Enrich the hpg reservation table the same way as the air one.
hpg_reserve_df = split_date_time(hpg_reserve)
hpg_reserve_df.head()

In [None]:
# Daily reserved-visitor totals (hpg) for visits that fall in 2016.
fig, ax = plt.subplots(figsize=(20, 10))
sns.lineplot(
    x='visit_date',
    y='reserve_visitors',
    data=hpg_reserve_df.loc[hpg_reserve_df['visit_year'].eq(2016)],
    estimator=sum,
    ax=ax,
)
plt.show()

### 방문자 수

In [None]:
# Rebuild the enriched visit table for this section.
air_visit_df = split_date_time(air_visit_data)
# Make sure visit_date is datetime by parsing the frame's OWN column.
# Assigning from the raw air_visit_data frame matches rows by index label,
# which is only correct while the merge inside split_date_time neither drops
# nor reorders rows.
air_visit_df['visit_date'] = pd.to_datetime(air_visit_df['visit_date'])
air_visit_df.head()

In [None]:
# Rows whose visit falls in 2016.
air_visit_df.loc[air_visit_df['visit_year'].eq(2016)]

In [None]:
# Daily visitor totals for 2016.
fig, ax = plt.subplots(figsize=(20, 10))
sns.lineplot(
    x='visit_date',
    y='visitors',
    data=air_visit_df.loc[air_visit_df['visit_year'].eq(2016)],
    estimator=sum,
    ax=ax,
)
plt.show()

##### 한달 기간 시각화

In [None]:
# Daily visitor totals for January 2016.
fig, ax = plt.subplots(figsize=(20, 10))
sns.lineplot(
    x='visit_date',
    y='visitors',
    data=air_visit_df.loc[air_visit_df['visit_year'].eq(2016) & air_visit_df['visit_month'].eq(1)],
    estimator=sum,
    ax=ax,
)
plt.show()

##### 1주 기간 시각화

In [None]:
# ISO week number of each visit date.
# The previous lambda called `date.isocalendar()` on an undefined name instead of
# its argument, which raises NameError. The vectorised accessor (pandas >= 1.1)
# avoids the per-row call; the cast gives a plain integer column for the
# `== 2` comparison used when plotting.
air_visit_df['visit_week'] = air_visit_df['visit_date'].dt.isocalendar().week.astype(int)

In [None]:
# Daily visitor totals for rows in 2016, month 1, visit_week 2.
fig, ax = plt.subplots(figsize=(20, 10))
sns.lineplot(
    x='visit_date',
    y='visitors',
    data=air_visit_df.loc[
        air_visit_df['visit_year'].eq(2016)
        & air_visit_df['visit_month'].eq(1)
        & air_visit_df['visit_week'].eq(2)
    ],
    estimator=sum,
    ax=ax,
)
plt.show()

# 데이터 전처리

# 모델 학습

# 모델 평가