In [None]:
#importing the libraries
import numpy as np
import matplotlib.pyplot as plt 
import pandas as pd

In [None]:
# Read the Google stock-price training CSV into a DataFrame and preview its first rows
dataset_train = pd.read_csv(
    "../input/google-stock-price/Google_Stock_Price_Train.csv"
)
dataset_train.head()

In [None]:
# Display the (rows, columns) dimensions of the training DataFrame
dataset_train.shape

In [None]:
# Select column 1 (the Open price, per the author's note) as the series to forecast.
# Slicing with 1:2 instead of indexing with 1 keeps the selection two-dimensional,
# and .values converts the DataFrame slice into a NumPy array.
open_price_frame = dataset_train.iloc[:, 1:2]
training_set = open_price_frame.values

In [None]:
# Feature scaling: rescale the prices into the [0, 1] range.
# The fitted scaler `sc` is reused later to scale the test inputs and to
# convert the model's predictions back to price units.
from sklearn.preprocessing import MinMaxScaler

sc = MinMaxScaler(feature_range=(0, 1), copy=True)
training_set_scaled = sc.fit(training_set).transform(training_set)

In [None]:
# Build the supervised training samples with a sliding window:
#   x_train[k] -> the 60 scaled values that precede row i
#   y_train[k] -> the scaled value at row i (the next timestep to predict)
n_timesteps = 60  # look-back window: number of previous rows fed to the model

x_train = []
y_train = []

# Iterate up to the real length of the scaled series instead of a hard-coded
# 1258, so the cell neither crashes on a shorter CSV nor silently ignores
# extra rows in a longer one.
for i in range(n_timesteps, len(training_set_scaled)):
    x_train.append(training_set_scaled[i - n_timesteps:i, 0])
    y_train.append(training_set_scaled[i, 0])

x_train, y_train = np.array(x_train), np.array(y_train)



In [None]:
# Append a trailing axis of size 1 so x_train becomes (samples, timesteps, predictors).
# There is a single predictor: the open stock price.
x_train = x_train.reshape(x_train.shape[0], x_train.shape[1], 1)


**Building the RNN**

In [None]:
#import libraries
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dropout

In [None]:
#Initialize the RNN as an empty Sequential model; the layers are added to it in the cells below
regressor = Sequential()

In [None]:
# Show the number of training samples and the number of timesteps per sample
x_train.shape[0], x_train.shape[1] 

In [None]:
# First LSTM layer, followed by dropout regularization to limit overfitting.
#   units=50              -> number of LSTM cells in the layer
#   return_sequences=True -> more LSTM layers are stacked on top of this one
#   input_shape           -> (timesteps, predictors); one predictor here
regressor.add(
    LSTM(
        units=50,
        return_sequences=True,
        input_shape=(x_train.shape[1], 1),
    )
)
# rate=0.2 is the fraction of neurons dropped on each training iteration
regressor.add(Dropout(rate=0.2))


In [None]:
# Second LSTM layer plus dropout.
#   units=50              -> number of LSTM cells in the layer
#   return_sequences=True -> another LSTM layer still follows this one
regressor.add(
    LSTM(
        units=50,
        return_sequences=True,
    )
)
# rate=0.2 is the fraction of neurons dropped on each training iteration
regressor.add(Dropout(rate=0.2))


In [None]:
# Third LSTM layer plus dropout.
#   units=50              -> number of LSTM cells in the layer
#   return_sequences=True -> one more LSTM layer is stacked after this one
regressor.add(
    LSTM(
        units=50,
        return_sequences=True,
    )
)
# rate=0.2 is the fraction of neurons dropped on each training iteration
regressor.add(Dropout(rate=0.2))


In [None]:
# Fourth (last) LSTM layer plus dropout.
#   units=50               -> number of LSTM cells in the layer
#   return_sequences=False -> this is the final LSTM layer, so no further
#                             LSTM layer needs the full sequence
regressor.add(
    LSTM(
        units=50,
        return_sequences=False,
    )
)
# rate=0.2 is the fraction of neurons dropped on each training iteration
regressor.add(Dropout(rate=0.2))


In [None]:
#Add the output layer for full connection:
#a Dense layer with a single unit, i.e. one predicted value per input sequence

regressor.add(Dense(units=1 ))

In [None]:
#Compiling the RNN: choose the optimizer and the loss function used during training

regressor.compile(optimizer="adam", loss = "mean_squared_error") #optimizer="rmsprop" is often recommended for RNNs, but adam is a safe and good choice
#loss = "mean_squared_error" because this is a regression problem (predicting a continuous value)

In [None]:
#Fitting the RNN to the Training set

regressor.fit(x_train,y_train, epochs= 100, batch_size=32) #Experimented with 50 and 100 epochs; the loss converged at 100, i.e. it didn't change much over the last 20 to 30 epochs


**Making the predictions and visualizing the results**

In [None]:
# Load the test CSV (the real 2017 stock prices, per the author's note) and keep
# column 1 as the ground truth. The 1:2 slice keeps the array two-dimensional.
dataset_test = pd.read_csv(
    "../input/google-stock-price/Google_Stock_Price_Test.csv"
)
test_open_frame = dataset_test.iloc[:, 1:2]
real_stock_price = test_open_frame.values


In [None]:
# Stack the train and test "Open" columns row-wise into one continuous series
dataset_total = pd.concat((dataset_train["Open"], dataset_test["Open"]), axis=0)

# Every test row needs the 60 values before it (the model was trained with
# 60 timesteps), so keep the last 60 training rows plus all test rows.
first_needed_row = len(dataset_total) - len(dataset_test) - 60
inputs = dataset_total.iloc[first_needed_row:].values

# Turn the 1-D values into a single column, then scale them with the scaler
# fitted on the training data (transform only, no re-fitting).
inputs = sc.transform(inputs.reshape(-1, 1))


In [None]:
# Display the dimensions of the scaled input array built in the previous cell
inputs.shape

In [None]:
# Build one 60-step window per test row, in the same layout the RNN was trained on.
n_timesteps = 60  # look-back window length used when training the model

x_test = []
# `inputs` holds the last 60 training rows followed by every test row, so
# iterating up to len(inputs) yields exactly one window per test row. This
# replaces the hard-coded upper bound of 80, which assumed 20 test rows.
for i in range(n_timesteps, len(inputs)):
    x_test.append(inputs[i - n_timesteps:i, 0])

x_test = np.array(x_test)

# Convert to the 3-D (samples, timesteps, predictors) format the RNN takes as input
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))



In [None]:
# Predict on the test windows (outputs are on the scaled 0-1 range), then
# undo the scaling so the predictions are expressed in price units again.
scaled_predictions = regressor.predict(x_test)
predicted_stock_price = sc.inverse_transform(scaled_predictions)

In [None]:
# Plot the predicted and the real prices on the same axes for visual comparison
fig, ax = plt.subplots()
ax.plot(predicted_stock_price, color="red", label="Predicted")
ax.plot(real_stock_price, color="green", label="Real")
ax.set_xlabel("Period")
ax.set_ylabel("Stock Price")
ax.set_title("Stock Price Forecast")
ax.legend()
plt.show()



In [None]:
#Model Evaluation of the RNN against the real test prices
import math

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# The trailing values in the comments are the results of the author's run.
MAE_RNN = mean_absolute_error(real_stock_price, predicted_stock_price)  #8.477951782226564
MSE_RNN = mean_squared_error(real_stock_price, predicted_stock_price)  #131.51032080751082
RMSE_RNN = math.sqrt(MSE_RNN)  #11.467794940942692
R2_RNN = r2_score(real_stock_price, predicted_stock_price)  #0.39490875968951855
real_stock_price_mean = real_stock_price.mean()  #807.5260000000001

# Report every metric, finishing with the RMSE relative to the mean real price
rnn_report = (
    ("Mean Absolute Error:", MAE_RNN),
    ("Mean Squared Error:", MSE_RNN),
    ("Root Mean Squared Error:", RMSE_RNN),
    ("R Squared:", R2_RNN),
    ("Real Stock Price Mean:", real_stock_price_mean),
    ("RMSE_RNN/REAL_STOCK_PRICE_MEAN:", RMSE_RNN / real_stock_price_mean),  #0.014201146391500325
)
for label, value in rnn_report:
    print(label, value)



**ARIMA**

In [None]:
#importing the libraries
import numpy as np
import matplotlib.pyplot as plt 
import pandas as pd

In [None]:
# Load the training CSV again for the ARIMA section, this time using the
# parsed 'Date' column as the index.
df = pd.read_csv(
    "../input/google-stock-price/Google_Stock_Price_Train.csv",
    index_col='Date',
    parse_dates=True,
)

df.head()
 

In [None]:
# Print the (rows, columns) dimensions of the date-indexed DataFrame
print('Shape of data', df.shape)

In [None]:
#Plot the 'Open' column against the DataFrame index on a 12x5 figure
df['Open'].plot(figsize=(12,5))

In [None]:
pip install pmdarima

In [None]:
# Search for the best (p, d, q) order of the ARIMA model.
# The goal is to pick the order with the lowest AIC.
from pmdarima import auto_arima
import warnings

# Ignore harmless warnings from here on
warnings.filterwarnings("ignore")

stepwise_fit = auto_arima(
    df['Open'],
    trace=True,
    suppress_warnings=True,
)
stepwise_fit.summary()

#Best model (author's run):  ARIMA(0,1,0)(0,0,0)[0]



In [None]:
# Print the DataFrame dimensions; the row count is used to size the train/test split below
print(df.shape)


In [None]:
# Hold out the last 257 rows (roughly 20% of the total rows) as the test sample.
#
# ARIMA is imported from statsmodels.tsa.arima.model: the legacy
# statsmodels.tsa.arima_model.ARIMA class was deprecated in statsmodels 0.12
# and removed in 0.13, where it can no longer be used.
# NOTE(review): the new class forecasts on the original (levels) scale by
# default, so the typ='levels' arguments in later cells are presumably
# redundant with it -- confirm against the installed statsmodels version.
from statsmodels.tsa.arima.model import ARIMA

train = df.iloc[:-257]
test = df.iloc[-257:]
print(train.shape, test.shape)

In [None]:
"""
Train the model with the best model parameters gotten earlier
"""

# Fit an ARIMA model of order (p, d, q) = (0, 1, 0) on the training 'Open'
# prices and keep the fitted result in `model`.
model = ARIMA(train['Open'], order=(0, 1, 0)).fit()
model.summary()


In [None]:
"""
Make Predictions on Test Set
"""

# Positions of the first and last hold-out rows, counted from the start of `train`
start = len(train)
end = start + len(test) - 1

# Predict the whole hold-out range with the model fitted on `train`
pred = model.predict(start=start, end=end, typ='levels')
print(pred)

# Re-label the predictions with the matching dates from the DataFrame index
holdout_dates = df.index[start:end + 1]
pred.index = holdout_dates
print(pred)


In [None]:
# Evaluate the ARIMA hold-out predictions against the real 'Open' prices of `test`
import math
from math import sqrt

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# The trailing values in the comments are the results of the author's run.
actual_open = test['Open']
MAE = mean_absolute_error(actual_open, pred)  #65.16489906613461
MSE = mean_squared_error(actual_open, pred)  #4877.095560904092
RMSE = math.sqrt(MSE)  #69.83620522983828
R2 = r2_score(actual_open, pred)  #-3.0842872575999865
real_arima_stock_price_mean = actual_open.mean()

# Report every metric, finishing with the RMSE relative to the mean real price
arima_report = (
    ("Mean Absolute Error:", MAE),
    ("Mean Squared Error:", MSE),
    ("Root Mean Squared Error:", RMSE),
    ("R Squared:", R2),
    ("Real Stock Price Mean:", real_arima_stock_price_mean),
    ("RMSE/REAL_STOCK_PRICE_MEAN:", RMSE / real_arima_stock_price_mean),  #0.0938522023072058
)
for label, value in arima_report:
    print(label, value)

In [None]:
"""
Now that we know that the model is good, we retrain on the entire data and not just training data
"""

# Refit the same (0, 1, 0) ARIMA order on the full 'Open' series and keep the
# fitted result in `model2`; then show the last rows to see where the data ends.
model2 = ARIMA(df['Open'], order=(0, 1, 0)).fit()
df.tail()
#My data ends on 2016-12-30

In [None]:
# Show the first and last positions of a 20-step forecast that starts right after the last row of df
len(df) , len(df)+19

In [None]:
# The data ends on 2016-12-30, so forecast the days that follow it in order to
# compare the results with the RNN on the same test set.
# The horizon is derived from the test set instead of a hard-coded 20 steps
# (len(df)+19): the forecast is re-indexed with the test dates below, which
# requires both to have exactly the same length.
forecast_horizon = len(dataset_test)
forecast_start = len(df)
forecast_end = forecast_start + forecast_horizon - 1

pred = model2.predict(start=forecast_start, end=forecast_end, typ='levels').rename('ARIMA Predictions')
print(pred)

# Use the test-set dates as the index of the predictions
pred.index = dataset_test["Date"].values
print(pred)



In [None]:
#Model Evaluation of the ARIMA forecast against the real test prices
#(the same `real_stock_price` array that was used to evaluate the RNN)
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score


import math


# The trailing values in the comments are the results of the author's run,
# shown side by side for the RNN and the ARIMA (AR) model.
MAE_AR = mean_absolute_error(real_stock_price,pred) #RNN:8.477951782226564 AR: 21.384797136038213

MSE_AR = mean_squared_error(real_stock_price,pred) #RNN: 131.51032080751082 AR: 615.985950151971

RMSE_AR = math.sqrt(MSE_AR) #RNN: 11.467794940942692 AR: 24.819064248113204

R2_AR = r2_score(real_stock_price,pred) #RNN: 0.39490875968951855 AR: -1.8342087548918768

# The mean of the real prices does not depend on the model: it is the same
# value as in the RNN evaluation.
real_stock_price_mean = real_stock_price.mean()  #807.5260000000001


print("Mean Absolute Error:", MAE_AR)
print("Mean Squared Error:", MSE_AR)
print("Root Mean Squared Error:", RMSE_AR)
print("R Squared:", R2_AR)
print("Real Stock Price Mean:", real_stock_price_mean)
# Label fixed: this ratio uses the ARIMA RMSE, not the RNN one
print("RMSE_AR/REAL_STOCK_PRICE_MEAN:", RMSE_AR/real_stock_price_mean) #RNN: 0.014201146391500325 AR: 0.030734693679352987

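**The RNN performed considerably better than ARIMA on the test set, with lower MAE, MSE and RMSE and a higher R².** Note that the comparison is not like-for-like: each RNN prediction is a one-step-ahead forecast built from the previous 60 actual opening prices (including real test-period prices), whereas the ARIMA model forecasts the whole test period at once from the end of the training data.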