## Add data를 활용해 'Daily Climate Timeseries data' 데이터 불러오기

In [None]:
import pandas as pd

# Read the daily Delhi climate training CSV and index the rows by their
# parsed 'date' column.
daily = pd.read_csv(r'../input/daily-climate-time-series-data/DailyDelhiClimateTrain.csv')
daily.index = pd.to_datetime(daily['date'])

# Keep only the column at position 1 as a one-column DataFrame
# (presumably the mean temperature - confirm against the CSV header),
# then convert the daily index to monthly periods.
monthly = daily.iloc[:, [1]].to_period('M')

# Turn daily data into monthly data: sum all rows that share a month and
# convert the period index back to timestamps.
monthly = monthly.groupby(monthly.index).sum()
monthly.index = monthly.index.to_timestamp()

# Drop the final month.
# NOTE(review): presumably because the last month is incomplete - confirm.
df = monthly.iloc[:-1, :]
df

## 위 데이터를 활용해 Holt-Winters exponential smoothing의 trend, seasonal 파라미터를 조정하며 9가지 경우에 대해 모델을 구축한 후, test 데이터에 대한 예측 plot과 r2_score 구하기

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
from statsmodels.tsa.seasonal import seasonal_decompose # 위 markdown 수식에 따라 원본 데이터를 추세, 계절성, 잔차로 분해
result = seasonal_decompose(df, model='multiplicative',period=12)
plt.rcParams['figure.figsize'] = [12, 8]
result.plot(observed=True, seasonal=True, trend=True, resid=True)
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out the last 20% of the rows as the test set.
# Important: shuffle=False keeps the rows in their original (time) order.
train_data, test_data = train_test_split(
    df,
    test_size=0.2,
    shuffle=False,
)

In [None]:
from statsmodels.tsa.holtwinters import ExponentialSmoothing

# Candidate settings for the trend and seasonal components; every pairing is
# fitted, giving 3 x 3 = 9 models.
trends = [None, 'add', 'mul']
seasonals = [None, 'add', 'mul']

ESmodels = []  # fitted models, in the same order as `titles`
titles = []    # (trend, seasonal) pair used for each model, kept for plot titles

for trend_kind in trends:
    for seasonal_kind in seasonals:
        print(trend_kind, seasonal_kind)
        titles.append((trend_kind, seasonal_kind))
        smoother = ExponentialSmoothing(
            train_data,
            trend=trend_kind,
            seasonal=seasonal_kind,
            freq='MS',
        )
        ESmodels.append(smoother.fit())

In [None]:
# Display the held-out test split (the last 20% of rows, in original order).
test_data

In [None]:
from sklearn.metrics import r2_score

# Score recorded for a model whose forecast cannot be evaluated (for example
# when the forecast contains NaN).  -inf ranks such a model below every real
# R^2 value, which an in-range sentinel such as -5 would not guarantee.
FAILED_R2 = float('-inf')

# Forecast exactly the span covered by the test split instead of relying on
# hard-coded dates.
forecast_start = test_data.index[0]
forecast_end = test_data.index[-1]

predicted_values = []  # one forecast per fitted model, same order as ESmodels
r2_scores = []         # matching R^2 against test_data (FAILED_R2 on failure)

for config, es_model in zip(titles, ESmodels):
    forecast = es_model.predict(start=forecast_start, end=forecast_end)
    predicted_values.append(forecast)
    try:
        score = r2_score(test_data, forecast)
    except ValueError as err:
        # r2_score rejects invalid input with ValueError; report it rather
        # than hiding it, and keep the lists aligned with ESmodels.
        print('r2_score failed for {}: {}'.format(config, err))
        score = FAILED_R2
    r2_scores.append(score)

In [None]:
predict_index = list(test_data.index)
# Boundary between the training and the test period.
forecast_start = predict_index[0]

# Overview figure: the full observed series with every model's forecast on
# top.  Each forecast is labelled with its (trend, seasonal) setting so the
# curves can be told apart in the legend.
fig, ax = plt.subplots(figsize=(18, 6))
ax.plot(df.index, df, label='Observed')
# axvline spans the whole axes height, so no hard-coded y-range is needed.
ax.axvline(forecast_start, linestyle='--', color='r', label='Start of Forecast')
for config, forecast in zip(titles, predicted_values):
    ax.plot(predict_index, forecast, label='Prediction {}'.format(config))
ax.legend()
plt.show()

# One figure per model, titled with its setting and its test-set R^2.
for config, forecast, score in zip(titles, predicted_values, r2_scores):
    fig, ax = plt.subplots(figsize=(18, 6))
    ax.plot(df.index, df, label='Observed')
    ax.axvline(forecast_start, linestyle='--', color='r', label='Start of Forecast')
    ax.plot(predict_index, forecast, label='Prediction')
    ax.set_title('{}, r2_score: {}'.format(config, score))
    ax.legend()
    plt.show()

## 위 데이터를 활용해 ARIMA 모델의 최적 pdq를 탐색하고, test set에 대한 r2_score를 구하고, 예측 plot 그리기

In [None]:
from statsmodels.tsa.arima.model import ARIMA
import itertools
import math

print('Examples of parameter combinations for original ARIMA...')
p = range(0, 3)
d = range(0, 2)
q = range(0, 3)
seasonal = 12  # seasonal period passed as the 4th element of seasonal_order

# All 18 (p, d, q) orders, and the same 18 combinations reused as seasonal
# (P, D, Q, s) orders.
pdq = list(itertools.product(p, d, q))
seasonal_pdq = [(x[0], x[1], x[2], seasonal) for x in pdq]

# AIC recorded for a combination that cannot be fitted or yields a non-finite
# AIC.  +inf can never win an argmin, unlike a finite sentinel, and unlike NaN
# (which numpy's argmin would select).
FAILED_AIC = float('inf')

# aics receives exactly one entry per (order, seasonal_order) pair, in loop
# order, so an index into it identifies the pair.
# NOTE(review): the search fits the untransformed training values; confirm
# this matches the scale used for the final model.
aics = []
for i in pdq:
    for j in seasonal_pdq:
        try:
            model = ARIMA(train_data.values, order=i, seasonal_order=j)
            fitted_model = model.fit()
        except Exception:
            # Skip combinations the model rejects or fails to fit.  Catching
            # Exception (not a bare except) keeps the search interruptible.
            aics.append(FAILED_AIC)
            continue
        aic = fitted_model.aic
        print('SARIMA: ',i,j,'>> AIC : ',round(aic,2))
        aics.append(aic if math.isfinite(aic) else FAILED_AIC)

In [None]:
import numpy as np

# aics holds one entry per (order, seasonal_order) pair, filled with pdq as
# the outer loop and seasonal_pdq as the inner loop.  The position of the
# smallest AIC therefore splits into an index into each list with divmod.
# (A manual counter with an outer "break" check can exit before assigning
# anything when the best index is a multiple of len(seasonal_pdq).)
best_index = int(np.argmin(aics))
order_pos, seasonal_pos = divmod(best_index, len(seasonal_pdq))
best_pdq = pdq[order_pos]
best_seasonal_pdq = seasonal_pdq[seasonal_pos]

print(best_pdq, best_seasonal_pdq)

# Fit the final model on the log of the training values; forecasts from this
# model are therefore on the log scale and need np.exp to be compared with
# the original data.
# NOTE(review): the AIC values used to choose the orders were computed on
# untransformed values - confirm the chosen orders suit the log series.
model = ARIMA(np.log(train_data.values), order=best_pdq, seasonal_order=best_seasonal_pdq)
fitted_model = model.fit()

In [None]:
# Display the underlying array of training values.
train_data.values

In [None]:
# Forecast one step for every row of the test split and display the
# resulting forecast object.
horizon = len(test_data)
prediction = fitted_model.get_forecast(steps=horizon)
prediction

In [None]:
# Point forecasts for each test step.  The model was fitted on
# log-transformed values, so these numbers are on the log scale.
prediction.predicted_mean

In [None]:
# Confidence bounds for each forecast step (log scale, like the forecasts).
# NOTE(review): default confidence level is presumably 95% - confirm.
prediction.conf_int()

In [None]:
from sklearn.metrics import r2_score

# Forecast the test period.  The model was fitted on log values, so the
# forecasts and their bounds are mapped back with np.exp.
prediction = fitted_model.get_forecast(steps=len(test_data))
predicted_value = np.exp(prediction.predicted_mean)

# conf_int() is indexed as a 2-D array here: column 0 is the lower bound and
# column 1 the upper bound.
interval = np.exp(prediction.conf_int())
predicted_lb = interval[:, 0]
predicted_ub = interval[:, 1]

predict_index = list(test_data.index)
r2 = r2_score(test_data, predicted_value)
# A bare expression in the middle of a cell is not displayed, so print it.
print('r2_score: {}'.format(r2))

# Observed series with the forecast and its interval over the test period.
fig, ax = plt.subplots(figsize=(18, 6))
ax.plot(df.index, df, label='Observed')
# axvline spans the whole axes height, so no hard-coded y-range is needed;
# the boundary is the first test timestamp.
ax.axvline(predict_index[0], linestyle='--', color='r', label='Start of Forecast')
ax.plot(predict_index, predicted_value, label='Prediction')
ax.fill_between(predict_index, predicted_lb, predicted_ub, color='k', alpha=0.1, label='0.95 Prediction Interval')
ax.legend()
plt.show()