In [1]:
!pip install yfinance  # yahoo finance

Collecting yfinance
  Downloading yfinance-0.1.63.tar.gz (26 kB)
Collecting lxml>=4.5.1
  Downloading lxml-4.6.3-cp37-cp37m-manylinux2014_x86_64.whl (6.3 MB)
[K     |████████████████████████████████| 6.3 MB 11.1 MB/s 
Building wheels for collected packages: yfinance
  Building wheel for yfinance (setup.py) ... [?25l[?25hdone
  Created wheel for yfinance: filename=yfinance-0.1.63-py2.py3-none-any.whl size=23919 sha256=c74fe1a87af9c78f9434f5c3e9a0daf46c4076a17b1158f96d1651d3a3ad28a3
  Stored in directory: /root/.cache/pip/wheels/fe/87/8b/7ec24486e001d3926537f5f7801f57a74d181be25b11157983
Successfully built yfinance
Installing collected packages: lxml, yfinance
  Attempting uninstall: lxml
    Found existing installation: lxml 4.2.6
    Uninstalling lxml-4.2.6:
      Successfully uninstalled lxml-4.2.6
Successfully installed lxml-4.6.3 yfinance-0.1.63


In [2]:
import yfinance as yf
import numpy as np
import pandas as pd
import tensorflow as tf

In [3]:
data = yf.download("AMZN" , start = "2019-01-01" , interval = '1d')

# interval = '1d' will each(1) day data per row

[*********************100%***********************]  1 of 1 completed


In [4]:
data.shape

(645, 6)

In [5]:
data.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-01-02,1465.199951,1553.359985,1460.930054,1539.130005,1539.130005,7983100
2019-01-03,1520.01001,1538.0,1497.109985,1500.280029,1500.280029,6975600
2019-01-04,1530.0,1594.0,1518.310059,1575.390015,1575.390015,9182600
2019-01-07,1602.310059,1634.560059,1589.189941,1629.51001,1629.51001,7993200
2019-01-08,1664.689941,1676.609985,1616.609985,1656.579956,1656.579956,8881400


<h2 style='color:blue'>Understanding Trends with in the Data</h2>

In [6]:
# Sort the data points based on indexes just for confirmation 
data.sort_index(inplace = True)

In [7]:
# Remove any duplicate index 
data = data.loc[~data.index.duplicated(keep='first')]

In [8]:
data.tail()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-07-19,3532.580078,3550.209961,3499.159912,3549.590088,3549.590088,3784600
2021-07-20,3567.320068,3592.0,3518.0,3573.189941,3573.189941,3251900
2021-07-21,3576.379883,3586.449951,3543.639893,3585.199951,3585.199951,2305400
2021-07-22,3587.22998,3640.02002,3582.27002,3638.030029,3638.030029,3259600
2021-07-23,3640.0,3665.98999,3622.040039,3656.639893,3656.639893,2415951


In [9]:
# Check for missing values 
data.isnull().sum()

Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64

In [10]:
# Get the statistics of the data
data.describe()

# The difference between Q3 and Max is very high.
#This means that there are a lot of spikes in volume.

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
count,645.0,645.0,645.0,645.0,645.0,645.0
mean,2465.29901,2491.890958,2436.675255,2465.084678,2465.084678,4218930.0
std,689.78885,698.284233,678.67883,687.539185,687.539185,1777897.0
min,1465.199951,1538.0,1460.930054,1500.280029,1500.280029,881300.0
25%,1814.630005,1829.469971,1800.790039,1817.459961,1817.459961,2974100.0
50%,2200.469971,2292.0,2186.209961,2283.320068,2283.320068,3759100.0
75%,3181.01001,3208.540039,3135.26001,3175.110107,3175.110107,5056200.0
max,3744.0,3773.080078,3696.790039,3731.409912,3731.409912,15567300.0


In [11]:
import plotly.graph_objects as go

# Check the trend in Closing Values 
fig = go.Figure()

fig.add_trace(go.Scatter(x = data.index , y = data['Close'] , mode = 'lines'))

fig.update_layout(height = 500 , width = 900, xaxis_title='Date' , yaxis_title='Close')
fig.show()

# because of covid it was down for some time (upto 15 march)

# after 15 march stocks increased

In [12]:
# Check the trend in open Traded
fig = go.Figure()

fig.add_trace(go.Scatter(x = data.index , y = data['Open'] , mode = 'lines'))
fig.update_layout(height = 500 , width = 900, 
                  xaxis_title='Date' , yaxis_title='Open')
fig.show()

In [13]:
# Check the trend in Volume Traded
fig = go.Figure()

fig.add_trace(go.Scatter(x = data.index , y = data['Volume'] , mode = 'lines'))
fig.update_layout(height = 500 , width = 900, 
                  xaxis_title='Date' , yaxis_title='Volume')
fig.show()

# A sudden spike determines the sudden interest in stock.

# It could be either in long or short.

# If you keep track of the data point where there is a spike in volume, 
# we would see a sudden jump or dump in the price of the stock value.

# This is a common pattern due to panic selling or panic buying from people when the price is decreasing

<h2 style='color:blue'>Data Preparation</h2>

In [14]:
from sklearn.preprocessing import MinMaxScaler 
import pickle 

# graphical progress bar which can be used to track the progress of our preprocessing.
from tqdm.notebook import tnrange

In [15]:
# Filter only required data 
data = data[['Close' , 'Volume']]
data.head(3)

# We understood that the columns open, close, high and low, They have very similar patterns.

# Thus it is better to only choose one among them.

Unnamed: 0_level_0,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-01-02,1539.130005,7983100
2019-01-03,1500.280029,6975600
2019-01-04,1575.390015,9182600


**volume plays a crucial role in finding an increase or decrease in stock price. We will have to keep volume in our feature vector. So here we are, moving with close and volume as final features.**

As we have collected past three years of data, it is better to keep at least 10 percent or three to four months of recent data as the test set.

In [16]:
# Confirm the Testing Set length 
test_length = data[(data.index >= '2021-03-01')].shape[0]

1. **feature length**, is the number of past data points, the model looks to predict the next data point.

Suppose if the feature length is 13, then we are looking at the close and volume values for the past 30 days to predict the close of the 31st day.

2. Next, we iterate on the indexes of the data and select the close and volume values from current index to feature length. We append these values to feature Vector X, we select the close of feature length as the target and append it to the target Vector Y.

In [17]:
def CreateFeatures_and_Targets(data, feature_length):
    X = []  # features
    Y = []  # targets

    for i in tnrange(len(data) - feature_length): 
        X.append(data.iloc[i : i + feature_length,:].values)
        Y.append(data["Close"].values[i+feature_length])

    X = np.array(X)
    Y = np.array(Y)

    return X , Y

In [18]:
X , Y = CreateFeatures_and_Targets(data , 32)

HBox(children=(FloatProgress(value=0.0, max=613.0), HTML(value='')))




In [19]:
# Check the shapes
X.shape , Y.shape

# our features have 614 records where each record has 32 days of information of two values which are close and volume.

((613, 32, 2), (613,))

In [20]:
Xtrain , Xtest , Ytrain , Ytest = X[:-test_length] , X[-test_length:] , Y[:-test_length] , Y[-test_length:]

# test_length = data[(data.index >= '2021-03-01')].shape[0]

In [21]:
# Check Training Dataset Shape 
Xtrain.shape , Ytrain.shape

((511, 32, 2), (511,))

In [22]:
# Check Testing Dataset Shape
Xtest.shape , Ytest.shape

((102, 32, 2), (102,))

We created a custom class to apply Scaling on multiple dimensions. This is because we have our features in three dimensions and scales provided by sklearn only support Scaling on two dimensions in the constructor.

1. Reinitialize an empty list of scalars, This object keeps track of all scales applied on the data set in the fit transform method.

2. We are looping over the third dimension of our data and at each loop we create a new scalar and fit it over that dimension.

In this process, we also keep collecting the fitted scalers and return the transformed data in the transformed method.

These functions are used to save and load python objects using Pikle

In [23]:
# Create a Scaler to Scale Vectors with Multiple Dimensions 

class MultiDimensionScaler():
    def __init__(self):
        self.scalers = []

    def fit_transform(self , X):
        total_dims = X.shape[2]
        for i in range(total_dims):
            Scaler = MinMaxScaler()
            X[:, :, i] = Scaler.fit_transform(X[:, :, i])
            self.scalers.append(Scaler)
        return X

    def transform(self , X):
        for i in range(X.shape[2]):
            X[:, :, i] = self.scalers[i].transform(X[:,:,i])
        return X 

In [24]:
Feature_Scaler = MultiDimensionScaler()
Xtrain = Feature_Scaler.fit_transform(Xtrain)
Xtest = Feature_Scaler.transform(Xtest)

In [25]:
Target_Scaler = MinMaxScaler()
Ytrain = Target_Scaler.fit_transform(Ytrain.reshape(-1,1))
Ytest = Target_Scaler.transform(Ytest.reshape(-1,1))

In [26]:
def save_object(obj , name : str):
    pickle_out = open(f"{name}.pck","wb")
    pickle.dump(obj, pickle_out)
    pickle_out.close()

def load_object(name : str):
    pickle_in = open(f"{name}.pck","rb")
    data = pickle.load(pickle_in)
    return data

In [27]:
# Save your objects for future purposes 
save_object(Feature_Scaler , "Feature_Scaler")
save_object(Target_Scaler , "Target_Scaler")

<h2 style='color:blue'>Model Building</h2>

In [28]:
from tensorflow.keras.callbacks import ModelCheckpoint , ReduceLROnPlateau

save_best = ModelCheckpoint("best_weights.h5", monitor='val_loss', save_best_only=True, save_weights_only=True)

reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.25,patience=4, min_lr=0.00001,verbose = 1)

In [29]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense , Dropout , LSTM , Bidirectional

model = Sequential()

# if you are planning to use to LSTM layer one after the other(means Bidirectional) 
# then you should set the return sequences arguments to True in the first LSTM layer.
# this will return a 3D or sequence output from the LSTM layer.
model.add(Bidirectional(LSTM(512 ,return_sequences=True , recurrent_dropout=0.1, input_shape=(32, 2))))


model.add(LSTM(256 ,recurrent_dropout=0.1))
model.add(Dropout(0.3))
model.add(Dense(64 , activation='elu'))
model.add(Dropout(0.3))
model.add(Dense(32 , activation='elu'))
model.add(Dense(1 , activation='linear'))



1. SGD is cjoosen over Adam in this problem statement is due to the batch size.

2. As we are trying to predict a complex pattern, we need frequent updates. That means the batch size has to be looked at.

3. SGD has perform better compare to Adam un lesser batch size

In [30]:
#optimizer = tf.keras.optimizers.Adam(learning_rate=0.002)
optimizer = tf.keras.optimizers.SGD(learning_rate = 0.002)
model.compile(loss='mse', optimizer=optimizer)

In [31]:
history = model.fit(Xtrain, Ytrain,
            epochs=10,
            batch_size = 1,
            verbose=1,
            shuffle=False ,
            validation_data=(Xtest , Ytest),
            callbacks=[reduce_lr , save_best])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10

Epoch 00009: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 10/10


In [32]:
# Checking the model Structure 
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional (Bidirectional (1, 32, 1024)             2109440   
_________________________________________________________________
lstm_1 (LSTM)                (1, 256)                  1311744   
_________________________________________________________________
dropout (Dropout)            (1, 256)                  0         
_________________________________________________________________
dense (Dense)                (1, 64)                   16448     
_________________________________________________________________
dropout_1 (Dropout)          (1, 64)                   0         
_________________________________________________________________
dense_1 (Dense)              (1, 32)                   2080      
_________________________________________________________________
dense_2 (Dense)              (1, 1)                    3

In [33]:
# Load the best weights
model.load_weights("best_weights.h5")

<h2 style='color:blue'>Visualize prediction on Test Set</h2>

In [34]:
Predictions = model.predict(Xtest)

In [35]:
Predictions = Target_Scaler.inverse_transform(Predictions)
Actual = Target_Scaler.inverse_transform(Ytest)

In [37]:
Predictions.shape

(102, 1)

1. Here we see that in inverse transformed objects have two dimensions where the second dimension is one.
2. To convert them into lists, We need to remove the dimension on the first axis.
3. np.squeeze is used to remove any unwanted axis with value one


In [38]:
Predictions = np.squeeze(Predictions , axis = 1)
Actual = np.squeeze(Actual , axis = 1)

In [39]:
# Creating Sample Test Dataframe
test_dataframe_dict = {'Actual' : list(Actual) , 'Predicted' : list(Predictions)}
test_df = pd.DataFrame.from_dict(test_dataframe_dict)

test_df.index = data.index[-test_length:]

In [40]:
test_df.head()

Unnamed: 0_level_0,Actual,Predicted
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-03-01,3146.139893,3253.879883
2021-03-02,3094.530029,3246.326172
2021-03-03,3005.0,3237.349365
2021-03-04,2977.570068,3225.803467
2021-03-05,3000.459961,3213.660645


In [41]:
# Check the trend in Volume Traded
fig = go.Figure()

fig.add_trace(go.Scatter(x = test_df.index , y = Actual , mode = 'lines' , name='Actual'))
fig.add_trace(go.Scatter(x = test_df.index , y = Predictions , mode = 'lines' , name='Predicted'))
fig.show()

<h2 style='color:blue'>Visualize Prediction on whole data</h2>

In [42]:
Total_features = np.concatenate((Xtrain , Xtest) , axis = 0)

In [43]:
Total_Targets = np.concatenate((Ytrain , Ytest) , axis = 0)

In [44]:
Predictions = model.predict(Total_features)

In [45]:
Predictions = Target_Scaler.inverse_transform(Predictions)
Actual = Target_Scaler.inverse_transform(Total_Targets)

In [46]:
Predictions = np.squeeze(Predictions , axis = 1)
Actual = np.squeeze(Actual , axis = 1)

In [47]:
# Check the trend in Volume Traded
fig = go.Figure()

fig.add_trace(go.Scatter(x = data.index , y = Actual , mode = 'lines' , name='Actual'))
fig.add_trace(go.Scatter(x = data.index , y = Predictions , mode = 'lines' , name='Predicted'))
fig.show()

So this looks interesting when we see it that the predictions are quite smooth and they follow the structure of actual values, but they do not seem to capture the granular patterns, maybe we need to add a few more meaningful features that can help models make better predictions.