# Time Series Forecast using ARIMA model
- Let's predict the stock of 10 ft empty containers at Busan Harbor.
- Trained on 2018~2019 data and tested on 2020 Jan ~ 2020 May data.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings  
# Silence every warning for the rest of the session. Note that this also hides
# any deprecation or convergence warnings the libraries used below may emit.
warnings.filterwarnings('ignore')

In [None]:
# Read the container table from the input directory; the first CSV column is
# used as the DataFrame index.
csv_path = '../input/monthly-container-holding-of-ports-in-south-korea/container.csv'
data = pd.read_csv(csv_path, index_col=0)
# Preview the first rows (rendered as the cell output).
data.head()

# Data Integration

In [None]:
# Drop the 'East Sea, Mukho' harbor before combining the two row groups.
data = data.loc[data['Harbor'].ne('East Sea, Mukho')]

# Rows flagged isKorean == False are kept aside and joined onto the rows
# flagged isKorean == True, matching on harbor and date. Columns present on
# both sides receive the default '_x' (left / isKorean True) and '_y'
# (right / isKorean False) suffixes. With a left join, a harbor/date that has
# no isKorean == False row ends up with NaN in its '_y' columns.
data_notKR = data.loc[data['isKorean'].eq(False)]
data = data.loc[data['isKorean'].eq(True)].merge(
    data_notKR, how='left', on=['Harbor', 'Date']
)

# The flag columns carry no information after the merge.
data = data.drop(columns=['isKorean_x', 'isKorean_y'])
data.head()

# Forecasting Empty_40 ft inventory of next month
- Train data: Jan 2018 ~ Dec 2019 Empty 40 data of each harbor
- Test data: Jan 2020 Empty 40 data of each harbor

In [None]:
import itertools

# NOTE(review): GridSearchCV is not used in this cell; the import is kept in
# case another cell relies on it.
from sklearn.model_selection import GridSearchCV
# statsmodels.tsa.arima_model.ARIMA was deprecated in statsmodels 0.12 and
# removed in 0.13; statsmodels.tsa.arima.model is its replacement.
from statsmodels.tsa.arima.model import ARIMA, ARIMAResults

# Per-harbor outcome of the order search below.
best_orders = {}  # harbor -> (p, d, q) order with the smallest error
best_scores = {}  # harbor -> that smallest absolute error

# Candidate (p, d, q) orders: every combination with each term in {0, 1, 2},
# in the same sequence nested loops over the three terms would produce.
orders = list(itertools.product(range(3), repeat=3))

for harbor in data['Harbor'].unique():
    # .copy() gives an independent frame, so adding a column below is not a
    # chained assignment on a slice of `data`.
    h_data = data[data['Harbor'] == harbor].copy()
    h_data['Empty_40_total'] = (
        h_data['Empty_40_x'] + h_data['Empty_40_y']
    ).astype('float64')
    # Raw values are passed so they are placed positionally on the Date index
    # instead of being aligned by label.
    series = pd.Series(h_data['Empty_40_total'].to_numpy(), index=h_data['Date'])

    # Label-based split on the Date index: everything up to 2019-12-31 is the
    # training history, the 2020-01-01 entry is the value to predict.
    # NOTE(review): presumably Date holds ascending 'YYYY-MM-DD' strings with
    # one row per harbor and month -- confirm against the CSV.
    x = series[:'2019-12-31']
    y = series['2020-01-01']

    # Kept as a NumPy scalar (not a plain float) so that `.item()` still works
    # on the stored score when no candidate order could be fitted.
    best_score = np.float64(np.inf)
    best_order = (0, 0, 0)

    # Grid search over the candidate orders.
    # NOTE: the order is selected by its error on the held-out month itself,
    # so the scores recorded here are optimistic rather than an unbiased
    # out-of-sample estimate.
    for order in orders:
        try:
            # trend='n' (no constant) replaces the old fit(trend='nc').
            # A plain array is passed so forecast() returns a plain ndarray.
            fitted = ARIMA(x.to_numpy(), order=order, trend='n').fit()
        except Exception:
            # Best-effort search: skip orders that cannot be estimated, but
            # let KeyboardInterrupt / SystemExit propagate.
            continue
        forecast = fitted.forecast(steps=1)[0]  # forecasted next month
        err = np.abs(y - forecast)
        if err < best_score:
            best_score = err
            best_order = order

    best_orders[harbor] = best_order
    best_scores[harbor] = best_score

In [None]:
# Show the (harbor, best score) pairs currently stored in `best_scores`.
print(best_scores.items())

# Evaluation

## Error rate by harbor

In [None]:
# Per-harbor evaluation: express each stored absolute error as a percentage
# of the actual 2020-01-01 total (Empty_40_x + Empty_40_y).
err_rate = []  # percentage error per harbor
err = []       # absolute error per harbor
for harbor, score in best_scores.items():
    # A stored score may be a plain float (e.g. the initial np.inf when no
    # order could be fitted), a NumPy scalar or a one-element array;
    # np.asarray(...).item() turns all of them into a Python float, whereas
    # calling .item() directly fails on a plain float.
    abs_err = np.asarray(score).item()

    rows = data[(data['Harbor'] == harbor) & (data['Date'] == '2020-01-01')]
    actual = rows['Empty_40_x'] + rows['Empty_40_y']

    # The division is done on the pandas Series so a zero total yields inf
    # instead of raising; .item() then requires exactly one matching row.
    error_rate = (abs_err / actual * 100).item()

    err.append(abs_err)
    err_rate.append(error_rate)
    print(harbor, error_rate)

## Average ERROR

In [None]:
# Arithmetic mean of the values collected in `err_rate` and in `err`.
print('Avg. Error Rate:', np.mean(err_rate))
print('Avg. Error:', np.mean(err))