In [26]:
import numpy as np 
import pandas as pd 
import seaborn as sns
from keras.models import Sequential
from keras.layers import LSTM,Dense
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from subprocess import check_output


In [15]:
# Data that we use: S&P 500 stock data, available here:
# https://www.kaggle.com/camnugent/sandp500
# The CSV is read via a relative path, so it must sit in the working directory.
# Each row holds one ticker's Date/Open/High/Low/Close/Volume (see the preview).
data = pd.read_csv('all_stocks_5yr.csv')
data.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Name
0,2012-08-13,92.29,92.59,91.74,92.4,2075391.0,MMM
1,2012-08-14,92.36,92.5,92.01,92.3,1843476.0,MMM
2,2012-08-15,92.0,92.74,91.94,92.54,1983395.0,MMM
3,2012-08-16,92.75,93.87,92.21,93.74,3395145.0,MMM
4,2012-08-17,93.93,94.3,93.59,94.24,3069513.0,MMM


In [16]:
# Summary statistics for the numeric columns of the dataset
data.describe()

Unnamed: 0,Open,High,Low,Close,Volume
count,606417.0,606593.0,606574.0,606801.0,606395.0
mean,79.529041,80.257435,78.799338,79.55792,4500925.0
std,93.383162,94.187977,92.5353,93.382168,9336171.0
min,1.62,1.69,1.5,1.59,0.0
25%,38.07,38.46,37.7,38.09,1077091.0
50%,59.24,59.79,58.69,59.27,2131913.0
75%,89.39,90.15,88.62,89.43,4442768.0
max,2044.0,2067.99,2035.11,2049.0,618237600.0


In [17]:
# Check for missing values: count the nulls in each column
data.isnull().sum()

Date        0
Open      384
High      208
Low       227
Close       0
Volume    406
Name        0
dtype: int64

In [18]:
# Among the rows that contain at least one missing value,
# count the distinct values found in each column
has_missing = data.isnull().any(axis=1)
data[has_missing].nunique()

Date      411
Open      395
High      565
Low       556
Close     781
Volume    381
Name      382
dtype: int64

In [20]:
# (rows, columns) of the raw frame, before any rows are dropped
data.shape

(606801, 7)

In [21]:
# Rows with missing values make up only a small fraction of the dataset,
# so they are dropped outright rather than imputed.
# The result goes into a new frame, leaving the raw `data` untouched.
clean_data = data.dropna()

In [22]:
# (rows, columns) after dropping the rows with missing values
clean_data.shape

(606011, 7)

In [23]:
# Number of distinct tickers (companies) in the cleaned dataset
clean_data['Name'].nunique()

503

In [24]:
# EDA
# Have a look at several companies

In [25]:
# Keep only the rows whose ticker is 'GOOGL' and preview them
is_google = clean_data['Name'] == 'GOOGL'
Google = clean_data[is_google]
Google.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Name
32024,2012-08-13,324.03,330.41,323.66,330.34,3268073.0,GOOGL
32025,2012-08-14,329.95,336.76,329.83,334.66,3662178.0,GOOGL
32026,2012-08-15,335.48,337.46,332.38,334.1,2411100.0,GOOGL
32027,2012-08-16,334.09,337.66,333.87,336.77,1717691.0,GOOGL
32028,2012-08-17,337.4,338.96,336.19,338.91,2177896.0,GOOGL


In [None]:
# Plot GOOGL's Open / Close / High / Low prices on a 2x2 grid with shared axes.
# The dates are parsed first: read_csv leaves them as strings, which matplotlib
# would otherwise treat as categories (one tick per row).
google_ts = Google.assign(Date=pd.to_datetime(Google['Date']))
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, sharex=True, sharey=True)
for ax, column in zip((ax1, ax2, ax3, ax4), ("Open", "Close", "High", "Low")):
    # Pass the target axes explicitly instead of relying on the "current axes" state
    sns.lineplot(x="Date", y=column, data=google_ts, ax=ax)
    ax.set_title(column)
# Rotate the date tick labels so they do not overlap
fig.autofmt_xdate()

In [None]:
#We will be using the closing data of MMM(want to do something other than AAPL :))
close_prices = data.loc[data['Name']=='MMM', 'Close'].values.reshape(-1,1)

# Fit the scaler on the chronologically first 80% of the series only, so the
# min/max of the held-out tail do not leak into training. The fraction mirrors
# the 80/20 train/test split applied to the windowed data in the next cell.
n_fit = int(close_prices.shape[0]*0.80)
scl = MinMaxScaler()
scl.fit(close_prices[:n_fit])

#Scale the whole series with the train-fitted parameters; values in the tail
#may fall slightly outside [0, 1], which is expected
cl = scl.transform(close_prices)
# Preview only the first rows instead of dumping the full array
cl[:5]

In [None]:
#Create a function to process the data into 7 day look back slices
def processData(data,lb):
    """Slice a series into overlapping look-back windows and next-step targets.

    Parameters
    ----------
    data : np.ndarray of shape (n_steps, n_columns)
        Input series; only column 0 is read.
    lb : int
        Look-back length, i.e. the number of consecutive steps in each window.

    Returns
    -------
    X : np.ndarray of shape (n_samples, lb)
        Window ``i`` holds ``data[i:i+lb, 0]``.
    Y : np.ndarray of shape (n_samples,)
        Target ``i`` is ``data[i+lb, 0]``, the value right after window ``i``.

    ``n_samples`` is ``max(len(data) - lb, 0)``: every window that still has a
    following value is used, including the last one.
    """
    X,Y = [],[]
    # The last valid start index is len(data)-lb-1 (its target is the final row),
    # so the range must run up to len(data)-lb exclusive.
    for i in range(max(len(data)-lb, 0)):
        X.append(data[i:(i+lb),0])
        Y.append(data[(i+lb),0])
    if not X:
        # Keep the 2-D (samples, lb) layout even when the series is too short
        return np.empty((0,lb)),np.empty((0,))
    return np.array(X),np.array(Y)
# Window the scaled series with a 7-step look-back, then split it
# chronologically: the first 80% of the samples train, the rest test
X,y = processData(cl,7)
split_at = int(X.shape[0]*0.80)
X_train,X_test = X[:split_at],X[split_at:]
y_train,y_test = y[:split_at],y[split_at:]
# Sample counts, in the order X_train, X_test, y_train, y_test
for part in (X_train,X_test,y_train,y_test):
    print(part.shape[0])

In [None]:
#Reshape data for (Sample,Timestep,Features)
X_train = X_train.reshape((X_train.shape[0],X_train.shape[1],1))
X_test = X_test.reshape((X_test.shape[0],X_test.shape[1],1))
#Build the model; the input shape is read from the data so it always matches
#the look-back length used when the windows were created
model = Sequential()
model.add(LSTM(256,input_shape=(X_train.shape[1],X_train.shape[2])))
model.add(Dense(1))
model.compile(optimizer='adam',loss='mse')
#Fit model with history to check for overfitting.
#shuffle=False keeps the samples in their original (chronological) order.
#NOTE(review): the test split doubles as validation data here, so val_loss is
#not an independent estimate if it is used to choose the number of epochs.
history = model.fit(X_train,y_train,epochs=400,validation_data=(X_test,y_test),shuffle=False)

In [None]:
# Training vs. validation loss per epoch; a widening gap indicates overfitting
plt.plot(history.history['loss'], label='train loss')
plt.plot(history.history['val_loss'], label='validation loss')
plt.xlabel('epoch')
plt.ylabel('MSE loss')
plt.legend();

In [None]:
# Inspect the first test window; after the reshape in the model cell it has
# shape (look-back, 1)
X_test[0]

In [None]:
# Predict every test window and compare with the true values, both mapped
# back from the scaled range to the original Close scale
Xt = model.predict(X_test)
plt.plot(scl.inverse_transform(y_test.reshape(-1,1)), label='actual')
plt.plot(scl.inverse_transform(Xt), label='predicted')
plt.xlabel('test sample')
plt.ylabel('Close')
plt.legend();

In [None]:
# Spot-check a single test sample: predict it and keep the de-scaled
# prediction and target in `pred` / `act`
act = []
pred = []
i=249
# Add a leading batch axis without hard-coding the window length
Xt = model.predict(X_test[i][np.newaxis])
# De-scale once and reuse the results for both printing and storing
pred_price = scl.inverse_transform(Xt)
act_price = scl.inverse_transform(y_test[i].reshape(-1,1))
print('predicted:{0}, actual:{1}'.format(pred_price,act_price))
pred.append(pred_price)
act.append(act_price)

In [None]:
# Flatten the collected predictions and targets into one column each
result_df = pd.DataFrame({
    column: list(np.asarray(values).reshape(-1))
    for column, values in (('pred', pred), ('act', act))
})

In [None]:
# Recompute predictions for the whole test set (Xt was overwritten by the
# single-sample check above) and plot them against the true values,
# both mapped back to the original Close scale
Xt = model.predict(X_test)
plt.plot(scl.inverse_transform(y_test.reshape(-1,1)), label='actual')
plt.plot(scl.inverse_transform(Xt), label='predicted')
plt.xlabel('test sample')
plt.ylabel('Close')
plt.legend();