# 0. Set up the environment

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the read-only Kaggle input
# directory, then print the paths one per line in the order os.walk yields them.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. Read the training data

In [None]:
# Load the training set from the Kaggle input directory (the path is relative to the notebook's working directory).
df = pd.read_csv("../input/av-hackathon-4/Train_awoL0xl.csv")

In [None]:
# Preview the first few rows to check the available columns and their values
df.head()

### Each row contains the stock number and the date, plus the open, high, low and close prices for that date. There is also an indicator showing whether the day was a holiday, and an unpredictability score showing how volatile the stock is.

# 2. Install package pmdarima

In [None]:
!pip install pmdarima

# 3. Import necessary libraries

In [None]:
# Standard library
import os
import warnings
# Silence every warning for the rest of the session; note that this also hides
# deprecation notices raised by the libraries imported below.
warnings.filterwarnings('ignore')
# Numerical / tabular data handling
import numpy as np
import pandas as pd
# Plotting: apply the 'fivethirtyeight' style sheet and a 10 x 6 default figure size
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
from pylab import rcParams
rcParams['figure.figsize'] = 10, 6
# Time-series tooling: ADF stationarity test, seasonal decomposition, ARIMA models
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose
# NOTE(review): ARIMA is not referenced in any of the cells visible here — confirm whether this import is still needed.
from statsmodels.tsa.arima_model import ARIMA
from pmdarima.arima import auto_arima
# Error metrics and math helpers
from sklearn.metrics import mean_squared_error, mean_absolute_error
import math
# NOTE(review): numpy is already imported above; this repeated import is redundant.
import numpy as np

# 4. Plot the closing price of Stock 1

In [None]:
# (rows, columns) of the full, unfiltered training set
df.shape

In [None]:
# Keep only the rows whose `stock` value is 0; every later cell works on this subset.
df2 = df.loc[df["stock"] == 0]

# Line chart of the closing price. The series is plotted against its own index,
# so the x-axis shows the DataFrame row labels.
fig, ax = plt.subplots(figsize=(10, 6))
ax.grid(True)
ax.set_xlabel('Dates')
ax.set_ylabel('Close Prices')
ax.plot(df2['Close'])
ax.set_title('Stock #1 closing price')
plt.show()

# 5. Test for stationarity

### A stationarity check is important in time series analysis because it tells us which transformations the series needs before it can be modelled and forecast. A stationary series has a constant mean and variance over time, which makes it more predictable than a non-stationary one. One test for stationarity is the Augmented Dickey–Fuller (ADF) test.

### Wiki link: https://en.wikipedia.org/wiki/Augmented_Dickey%E2%80%93Fuller_test

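**Critical values for the Dickey–Fuller t-distribution**

| Sample size | Without trend (1%) | Without trend (5%) | With trend (1%) | With trend (5%) |
|---|---|---|---|---|
| T = 25 | −3.75 | −3.00 | −4.38 | −3.60 |
| T = 50 | −3.58 | −2.93 | −4.15 | −3.50 |
| T = 100 | −3.51 | −2.89 | −4.04 | −3.45 |
| T = 250 | −3.46 | −2.88 | −3.99 | −3.43 |
| T = 500 | −3.44 | −2.87 | −3.98 | −3.42 |
| T = ∞ | −3.43 | −2.86 | −3.96 | −3.41 |

Source: [2], p. 373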

In [None]:
#Test for stationarity
def test_stationarity(timeseries, window=12):
    """Plot rolling statistics of a series and print its Augmented Dickey-Fuller test.

    Parameters
    ----------
    timeseries : pandas.Series
        Series to examine. It is passed unchanged to ``adfuller``.
    window : int, default 12
        Number of observations in the rolling window used for the rolling
        mean and rolling standard deviation curves.

    Returns
    -------
    None
        The figure is shown and the labelled test results are printed.
    """
    # Rolling statistics: for a stationary series both curves stay roughly level.
    rolmean = timeseries.rolling(window).mean()
    rolstd = timeseries.rolling(window).std()

    # Overlay the original series with its rolling mean and rolling std.
    plt.plot(timeseries, color='blue', label='Original')
    plt.plot(rolmean, color='red', label='Rolling Mean')
    plt.plot(rolstd, color='black', label='Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean and Standard Deviation')
    plt.show(block=False)

    print("Results of Dickey Fuller test")
    adft = adfuller(timeseries, autolag='AIC')
    # adfuller returns an unlabelled tuple: name its first four entries, then
    # append one entry per critical value from the dict stored at position 4.
    output = pd.Series(
        adft[0:4],
        index=['Test Statistics', 'p-value', 'No. of lags used', 'Number of observations used'],
    )
    for key, values in adft[4].items():
        output['critical value (%s)' % key] = values
    print(output)


test_stationarity(df2.Close)

### Based on the high p-value (0.90), we cannot reject the null hypothesis of a unit root, so the time series is non-stationary.

# 6. Check for seasonal decomposition

In [None]:
# Decompose the closing price into trend, seasonal and residual components.
# `period` (the current name of the deprecated `freq` argument) is the number of
# observations per seasonal cycle; it is passed explicitly because the series is
# indexed by row number rather than by date.
result = seasonal_decompose(df2.Close, model='multiplicative', period=30)
# result.plot() creates its own figure, so no separate plt.figure() call is needed.
fig = result.plot()
fig.set_size_inches(16, 9)

### Based on the multiplicative model, the time series has a strong trend and a seasonal component. The residual is mostly flat, so most of the variance is explained by trend and seasonality.

# 7. Check the 12-period moving average

In [None]:
# Default figure size for this and later plots (rcParams is imported in the import cell).
rcParams['figure.figsize'] = 10, 6

# Work on the log of the closing price, then compute the mean and standard
# deviation over a rolling window of 12 observations.
df_log = np.log(df2.Close)
moving_avg = df_log.rolling(12).mean()
std_dev = df_log.rolling(12).std()

plt.title('Moving Average')
plt.plot(std_dev, color ="black", label = "Standard Deviation")
plt.plot(moving_avg, color="red", label = "Mean")
# Build the legend only after the labelled lines exist; calling it earlier finds no handles.
plt.legend()
plt.show()

# 8. Split the data into 90% training and 10% test

In [None]:
# Split the log series chronologically into a training set and a test set:
# everything before the 90% mark is training data, the rest is test data.
split_idx = int(len(df_log) * 0.9)
# NOTE(review): the training slice skips the first three observations, as the
# original split did — confirm this is intentional.
train_data, test_data = df_log.iloc[3:split_idx], df_log.iloc[split_idx:]

plt.figure(figsize=(10,6))
plt.grid(True)
plt.xlabel('Dates')
plt.ylabel('Closing Prices')
# Plot the training slice itself (not the full series) under the 'Train data' label.
plt.plot(train_data, 'green', label='Train data')
plt.plot(test_data, 'blue', label='Test data')
plt.legend()
plt.show()

# 9. Run Autoarima with exogenous variables

### auto_arima is useful because it searches iteratively for the best values of p, d and q for the series. Below is a brief explanation of the p, d and q values.

### 1. p -> is the number of autoregressive terms
### 2. d -> is the number of nonseasonal differences 
### 3. q -> is the number of lagged forecast errors

In [None]:
# Exogenous regressors supplied alongside the closing price.
df3 = df2[["holiday","unpredictability_score"]]

# Stepwise search over non-seasonal ARIMA orders. The regressors are passed as
# `X`, the current name of the deprecated `exogenous` keyword in pmdarima.
model_autoARIMA = auto_arima(df2.Close,
                      X=df3,            # exogenous regressors
                      start_p=0, start_q=0,
                      test='adf',       # use the ADF test to find the optimal 'd'
                      max_p=3, max_q=3, # maximum p and q
                      m=1,              # frequency of series
                      d=None,           # let the model determine 'd'
                      seasonal=False,   # no seasonality
                      start_P=0,
                      D=0,
                      trace=True,
                      error_action='ignore',
                      suppress_warnings=True,
                      stepwise=True)
print(model_autoARIMA.summary())

### The best model is ARIMA(1,0,3): the stock closing price (Y) is described by combining a first-order autoregressive model and a third-order moving-average model, with no differencing (d = 0).

# 10. Plot the diagnostic metrics for autoarima

In [None]:
model_autoARIMA.plot_diagnostics(figsize=(15,8))
plt.show()

### Based on the above diagnostic plots, the model appears to fit the data adequately

### * Residual does not have specific trend
### * Histogram is near normal

# 11. Pick the best model for ARIMAX

In [None]:
from statsmodels.tsa.statespace.sarimax import SARIMAX

# ARIMAX specification: (p, d, q) = (1, 0, 3) on the closing price with the two
# exogenous regressors in df3. The stationarity and invertibility constraints
# on the estimated parameters are both switched off.
arimax_order = (1, 0, 3)
model = SARIMAX(
    endog=df2["Close"],
    exog=df3,
    order=arimax_order,
    enforce_stationarity=False,
    enforce_invertibility=False,
)

# 12. Read in the test file

In [None]:
# Load the test set from the same Kaggle input directory as the training file.
df_test = pd.read_csv("../input/av-hackathon-4/Test_QQKW4dv.csv")
# Preview the first few rows
df_test.head()

In [None]:
# Keep only the test rows whose `stock` value is 0, mirroring the filter applied to the training data.
df_test2 = df_test[df_test.stock == 0]

In [None]:
# Exogenous regressors for the forecast horizon, using the same two columns the model was built with.
df_test3 = df_test2[["holiday", "unpredictability_score"]]

# Estimate the model parameters, then forecast one step per stock-0 row of the test set.
results = model.fit()
n_steps = len(df_test2.stock)
forecast_1 = results.forecast(steps=n_steps, exog=df_test3)

In [None]:
# Show the forecast values (one per stock-0 row of the test set)
print(forecast_1)

# 13. Plot the forecast together with the training set

In [None]:
# NOTE: from here on `train_data` is the raw closing price of stock 0 and
# `test_data` is the model forecast.
train_data = df2.Close
test_data = forecast_1

plt.figure(figsize=(10,6))
plt.grid(True)
plt.xlabel('Dates')
plt.ylabel('Closing Prices')
plt.plot(train_data, 'green', label='Train data')
# The blue line is the model output, not observed test prices, so label it as the forecast.
plt.plot(test_data, 'blue', label='Forecast')
plt.legend()
plt.show()

### The forecast is almost a straight line and not as good as expected. More work is required to get a better forecast; I will explore this further in the next version, so stay tuned.