# Yahoo Stock Price Forecast

In [None]:
import os
# Lazily enumerate the full path of every file below the Kaggle input directory
# and print them one per line.
input_files = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_files:
    print(input_path)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import itertools as itt
import warnings

#from pmdarima.arima import ndiffs,nsdiffs
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn import metrics
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

In [None]:
# Read the Yahoo stock price CSV from the Kaggle input directory.
csv_path = "/kaggle/input/time-series-forecasting-with-yahoo-stock-price/yahoo_stock.csv"
df = pd.read_csv(csv_path)
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Use the 'Date' column as a daily DatetimeIndex and then drop the column.
# The guard makes the cell safe to re-run: once 'Date' has been moved into the
# index a second execution would otherwise fail because the column is gone.
if 'Date' in df.columns:
    df.index = pd.DatetimeIndex(df['Date'], freq='D')
    df = df.drop(columns='Date')
df.head()

In [None]:
# Number of missing values in each column.
df.isna().sum()

There are no null values: the dataset is complete.

## Visualization

In [None]:
# Overlay the four daily price columns on a single figure: Close and Open as
# solid lines, High and Low as thin, semi-transparent dashed lines.
plt.figure(figsize=(16,9))

price_line_styles = {
    'Close': dict(lw=2, color='r'),
    'High': dict(lw=1, color='g', linestyle='--', alpha=0.7),
    'Low': dict(lw=1, color='m', linestyle='--', alpha=0.7),
    'Open': dict(lw=2, color='b'),
}
for column, line_style in price_line_styles.items():
    # The column name doubles as the legend label.
    plt.plot(df.index, df[column], label=column, **line_style)

plt.legend()
plt.xlabel('Date')
plt.ylabel('Stock Price')
plt.title('Yahoo Stock Prices')
plt.show()

In [None]:
# Average the opening price per calendar month ('MS' = month-start bins).
df_open_month_avg = df['Open'].resample('MS').mean()

plt.figure(figsize=(16,9))

# Draw on the figure created above and label it through the returned axes.
ax = df_open_month_avg.plot(color='b')
ax.set_xlabel('Date')
ax.set_ylabel('Stock Price')
ax.set_title('Yahoo Stock Prices Monthly Average')

plt.show()

## Search Seasonality for SARIMA

In [None]:
# Decompose the monthly average series (model='add') and plot the components
# returned by statsmodels.
S = seasonal_decompose(df_open_month_avg, model='add')
decomposition_figure = S.plot()
plt.show()

In [None]:
def adf_test(timeseries):
    """Run the augmented Dickey-Fuller test on ``timeseries`` and print a report.

    The report lists the test statistic, the p-value, the number of lags used,
    the number of observations used and one critical value per significance
    level returned by ``adfuller``.

    Nothing is returned; the summary is only written to standard output.
    """
    print ('Results of Dickey-Fuller Test:')
    adf_result = adfuller(timeseries, autolag='AIC')
    # The first four entries of the result are scalars; the fifth maps each
    # significance level to its critical value.
    labels = ['Test Statistic','p-value','#Lags Used','Number of Observations Used']
    summary = pd.Series(adf_result[:4], index=labels)
    for level, critical_value in adf_result[4].items():
        summary['Critical Value ({})'.format(level)] = critical_value
    print (summary)

# adf_test prints its own report and has no return value, so it is called
# directly; wrapping it in print() appended a stray "None" to the output.
adf_test(df_open_month_avg)

In [None]:
# First-order differencing (each month minus the previous month); the leading
# NaN produced by the difference is dropped.
df_open_t_adj = df_open_month_avg.diff().dropna()
df_open_t_adj.plot()
plt.show()

# adf_test prints its own report and has no return value, so it is called
# directly; wrapping it in print() appended a stray "None" to the output.
adf_test(df_open_t_adj)

In [None]:
# Lag-12 differencing of the already differenced series (each value minus the
# value 12 periods earlier); the first 12 NaN results are dropped.
df_open_s_adj = df_open_t_adj.diff(12).dropna()
df_open_s_adj.plot()
plt.show()

In [None]:
# Show the autocorrelation plot followed by the partial autocorrelation plot of
# the doubly differenced series, each as its own figure.
for draw_correlogram in (plot_acf, plot_pacf):
    draw_correlogram(df_open_s_adj)
    plt.show()

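* In the ACF plot there is an initial spike at lag 1 and a seasonal spike at lag 4, which suggests a probable MA order of 1 and a seasonal MA order of 0 or 1 (the ACF is the plot used to choose moving-average orders).
* In the PACF plot there is an initial spike at lag 1 and seasonal spikes at lags 4 and 6, which suggests a probable AR order of 1 and a seasonal AR order of 0, 1 or 2 (the PACF is the plot used to choose autoregressive orders).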

The maximum orders suggested for SARIMA are $(1,1,1)\times(1,0,1)_{12}$.

Let's grid-search for the best orders, using the AIC of each fitted model as the selection criterion.

In [None]:
# Candidate orders: p and q in {0, 1, 2}, d fixed to 1. The same (p, d, q)
# triples are reused for the seasonal part with a period of 12.
p = range(0, 3)
d = range(1,2)
q = range(0, 3)
pdq = list(itt.product(p, d, q))
seasonal_pdq = [(x[0], x[1], x[2], 12) for x in pdq]
print('Examples of parameter combinations for Seasonal ARIMA...')
print('SARIMAX: {} x {}'.format(pdq[1], seasonal_pdq[1]))
print('SARIMAX: {} x {}'.format(pdq[1], seasonal_pdq[2]))
print('SARIMAX: {} x {}'.format(pdq[2], seasonal_pdq[3]))
print('SARIMAX: {} x {}'.format(pdq[2], seasonal_pdq[4]))

# Parallel lists holding, for every retained fit, its order, seasonal order and AIC.
params=[]
params_seasonal=[]
AIC = []
for param, param_seasonal in itt.product(pdq, seasonal_pdq):
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        try:
            mod = SARIMAX(df_open_month_avg,
                          order=param,
                          seasonal_order=param_seasonal,
                          enforce_stationarity=False,
                          enforce_invertibility=False)
            # disp=False keeps the optimizer from printing its progress for every fit.
            results = mod.fit(disp=False)
            aic = results.aic
        except Exception:
            # Best effort: a combination that cannot be estimated is skipped.
            # Only Exception is caught so that interrupting the kernel
            # (KeyboardInterrupt) still stops the search.
            continue
    # NOTE(review): the 50 threshold presumably discards degenerate fits with an
    # implausibly low AIC - confirm. A NaN AIC also fails this comparison.
    if aic > 50:
        params.append(param)
        params_seasonal.append(param_seasonal)
        AIC.append(aic)

if not AIC:
    # Without this guard the lookup below fails with an unrelated-looking error.
    raise RuntimeError('Grid search produced no SARIMAX fit with AIC > 50')

# Columns: 0 = order, 1 = seasonal order, 2 = AIC. The best row minimises AIC.
GS_params = pd.DataFrame(list(zip(params,params_seasonal,AIC)))
Best_params=GS_params.loc[GS_params[2].idxmin()]
print('Best Parameters had been: ARIMA{0}x{1} - AIC:{2}'.format(Best_params[0],Best_params[1],Best_params[2]))

In [None]:
# Chronological split of the monthly series: the first 90% of the observations
# go to the training set, the remainder to the test set.
series_to_pred = df_open_month_avg

y_index = series_to_pred.index

# Number of leading observations assigned to the training set.
date_train = int(0.9 * len(y_index))

train_dates, test_dates = y_index[:date_train], y_index[date_train:]

# Select by date label, then pin every series to the month-start frequency.
y_train = series_to_pred[train_dates].asfreq('MS')
y_test = series_to_pred[test_dates].asfreq('MS')
series_to_pred = series_to_pred.asfreq('MS')

y_train.tail()

In [None]:
# First five rows of the hold-out set.
y_test.head(5)

In [None]:
# SARIMAX specification used on the training data: order (0,1,2) with seasonal
# order (0,1,2) at period 12, without enforcing stationarity or invertibility.
train_spec = dict(
    order=(0, 1, 2),
    seasonal_order=(0, 1, 2, 12),
    enforce_stationarity=False,
    enforce_invertibility=False,
)
mod = SARIMAX(y_train, **train_spec)

results = mod.fit()
# Diagnostic plots for the fitted model; the trailing ';' suppresses the repr.
results.plot_diagnostics(lags=4, figsize=(15, 12));

In [None]:
# Forecast as many steps as there are observations in the hold-out set and
# fetch the matching confidence interval frame.
pred_uc = results.get_forecast(steps=len(y_test), dynamic=True)
pred_ci = pred_uc.conf_int()
# The first two columns of the interval frame delimit the shaded band.
band_first, band_second = pred_ci.iloc[:, 0], pred_ci.iloc[:, 1]

ax = series_to_pred.plot(label='Actual values', figsize=(16, 6))

pred_uc.predicted_mean.plot(ax=ax, label='Prediction')

ax.fill_between(pred_ci.index, band_first, band_second, color='k', alpha=.25)

ax.set(
    xlabel='Date',
    ylabel='Dollars',
    title='Yahoo Stock Price Actual and Predicted',
)

plt.legend()
plt.show()

In [None]:
# Score the point forecast against the hold-out data.
# pred_ci.iloc[:, 0] is the first bound of the confidence interval, not the
# forecast itself, so the RMSE has to be computed from predicted_mean.
y_pred = pred_uc.predicted_mean
# Replace any missing hold-out value by the hold-out mean before scoring.
y_test=y_test.fillna(np.mean(y_test))
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
print("The ARIMA model with paremeters {}x{}, has given an rmse in test of {}".format((0,1,2), (0,1,2,12),round(rmse, 2)))

In [None]:
# Refit the same SARIMAX specification, this time on the complete monthly
# series pinned to the month-start frequency.
series_to_pred = series_to_pred.asfreq('MS')
full_spec = dict(
    order=(0, 1, 2),
    seasonal_order=(0, 1, 2, 12),
    enforce_stationarity=False,
    enforce_invertibility=False,
)
mod = SARIMAX(series_to_pred, **full_spec)

results = mod.fit()

In [None]:
# Forecast the 12 steps that follow the end of the fitted series, together
# with the confidence interval of each step.
forecast_horizon = 12
forecast_res = results.get_forecast(steps=forecast_horizon)
forecast_ci = forecast_res.conf_int()

In [None]:
# matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and the
# old names were removed afterwards, so use whichever darkgrid style this
# installation provides and fall back to the default style otherwise.
darkgrid_style = next(
    (style_name
     for style_name in ("seaborn-v0_8-darkgrid", "seaborn-darkgrid")
     if style_name in plt.style.available),
    "default",
)

with plt.style.context(darkgrid_style):

    ax = series_to_pred.plot(label = 'Actual values', figsize = (16, 6))

    forecast_res.predicted_mean.plot(ax = ax, label = 'Forecast')

    # Shade the band between the first two columns of the interval frame.
    ax.fill_between(forecast_ci.index,
                    forecast_ci.iloc[:, 0],
                    forecast_ci.iloc[:, 1], color = 'k', alpha = .25)
    ax.set_xlabel('Date')
    ax.set_ylabel('Dollars')

    # Title, legend and rendering stay inside the context so that the selected
    # style is still active when they are created and drawn.
    plt.title("Yahoo Stock Price forecasting")
    plt.legend()
    plt.show()