In [1]:
import matplotlib.pyplot as plt
import numpy as np
import sklearn
from sklearn import tree
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import KFold
import pandas as pd

from keras.models import Sequential
from keras.layers import Activation
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.optimizers import Adamax

from sklearn.preprocessing import MinMaxScaler



Using TensorFlow backend.


# Don't change these functions

In [2]:
'''
This function loads the data
'''
def loadData(filename):
    rawData = pd.read_csv(filename)
    closePrice = np.array(rawData.loc[:, [' CLOSE']])
    return   closePrice[:,0]

'''
This function extracts X and Y data where X is the price for past n days
n = lookback for this func., Y is price for the next m days.
m = f_horizon for this func
NOTE: WE ARE NOT USING FUNCTION FOR THIS EXERCISE. THIS IS JUST FOR YOUR LEARNING
'''
def create_dataset(closePrice,look_back,f_horizon):    
    dataX = []
    dataY = []        
    for i in range(0,len(closePrice)-look_back-f_horizon,1):
        a = closePrice[i:i+look_back]
        b = closePrice[i+look_back:i+look_back + f_horizon]               
        dataX.append(a.tolist())
        dataY.append(b.tolist())
    return dataX, dataY

'''
This function extracts X and Y data where X is the price for past n days
n = lookback for this func., Y is the labels that can have 3 values -1, 0, 1
-1 denotes a drop in stock price, 0 denotes no change, and 1 denotes an increase in stock price. 
'''
def create_dataset_discrete(closePrice,look_back):    
    dataX = []
    dataY = []
    dataY_raw = []
        
    for i in range(0,len(closePrice)-look_back-1,1):
        a = closePrice[i:i+look_back]
        b = (closePrice[i+look_back] - closePrice[i+look_back-1])/ (closePrice[i+look_back-1]) * 100
        # b is the percentage change in price for the next day
        
        labels = 0
        if b > 0.25:
            labels = 1
        elif b < -0.25:
            labels = -1
        dataX.append(a.tolist())
        dataY.append(labels)
        dataY_raw.append(b)
        
    return dataX, dataY, dataY_raw


# You may want to code some of these functions to prepare your data

In [3]:
# Create Normalized Dataset
def create_normalized_dataset(dataX, dataY):
    # you may want to add your code to normalize the dataset
    scaler = MinMaxScaler()
    scaler.fit(dataX)
    dataX = scaler.transform(dataX)
    return dataX
    

# Create Normalized Dataset
def preprocessData(dataX, dataY):
    # you may want to add your code to preprocess data 
    onehot_encoded = list()
    for i in dataY:
        if i == 0:
            onehot_encoded.append([0,0,1])
        if i == 1:
            onehot_encoded.append([0,1,0])
        if i == -1:
            onehot_encoded.append([1,0,0])
    
    dataX = dataX.reshape(dataX.shape[0],dataX.shape[1],1)
        
    return dataX,onehot_encoded
            
# Create Normalized Dataset
def extractFeatures(dataX, dataY):
    # you may generate some features such as moving averages, Relative strength index etc    
    pass


# This func. create train and test data. Given the entire dataset
# Note that the code doesn't shuffle the data
def createData_TrainTest(dataX, dataY, percent_train_data):
    num_training_data = int (len(dataY)*percent_train_data)
    idx = np.arange(0 , len(dataY))
    #np.random.shuffle(idx)  # Shuffling can provide future info.
    # Extract Test and Train data
    trainX = [dataX[i] for i in idx[0:num_training_data]]
    testX  = [dataX[i] for i in idx[num_training_data:]]
    trainY = [dataY[i] for i in idx[0:num_training_data]]
    testY =  [dataY[i] for i in idx[num_training_data:]]  
    
    return trainX, trainY, testX, testY


# Classifiers

In [27]:
'''
This function is just an example for naive bayes implementation.
It takes the training features and labels as input and learns a
decision tree model using SKLearn's naive bayes algorithm. It runs 10-fold
cross-validation on the training data to identify the best depth.

PLEASE DONT USE THIS FUNCTION. THIS FUNCTION IS BY NO MEAN A GUIDELINE FOR YOUR CODE.
THIS FUNCTION IS ONLY INCLUDED FOR YOUR LEARNING
'''
def learn_naive_bayes(X, y):
    # This list tracks the learned decision tree with the best accuracy
    best_model = [ None, float("-inf") ]
    # Create the object that will split the training set into training and
    # validation sets
    kf = KFold(n_splits=10)
    # Iterate over each of the 10 splits on the data set
    for train, test in kf.split(X):
        # Pull out the features and labels that will be used to train this model
        train_X = [ X[dp] for dp in train ]
        train_y = [ y[dp] for dp in train ]
        # Pull out the features and labels that will be used to validate this
        # model
        valid_X = [ X[dp] for dp in test ]
        valid_y = [ y[dp] for dp in test ]
        # Create the decision tree object
        clf = GaussianNB()
        # Learn the model on the training data that will be used for this
        # fold
        clf = clf.fit(train_X, train_y)
        # Evaluate the learned model on the validation set
        accuracy = clf.score(valid_X, valid_y)
        # Check whether or not this learned model is the most accuracy model
        if accuracy > best_model[1]:
            # Update best_model so that it holds this learned model and its
            # associated accuracy and hyper-parameter information
            best_model = [ clf, accuracy ]
    return best_model


def MLClassifier(trainX,trainY,testX,testY, batch_size, batch_per_step, epochs, training_steps):
    # code as many classifier as you want. You can have as many functions for classifier as you need
    # you can also write Neural network function if you are interested
    def LSTM_keras():

        # LSTM keras model
        model = Sequential()

        # LSTM layer1
        model.add(LSTM(128, return_sequences=True,input_shape=(trainX.shape[1], trainX.shape[2]),dropout=0.2, recurrent_dropout=0.2))
        # model.add(LSTM(128, return_sequences=True,input_shape=(batch_size, trainX.shape[2]),dropout=0.2, recurrent_dropout=0.2))

        # LSTM layer2
        model.add(LSTM(128,return_sequences=True, dropout=0.2, recurrent_dropout=0.2,))

        # LSTM layer3
        model.add(LSTM(256,dropout=0.2, activation='relu' ,recurrent_dropout=0.2))

        # Dense Layer
        model.add(Dense(64,activation='relu'))

        # Final Layer output 3 classes
        model.add(Dense(3,activation='softmax'))

        # optimizer
        adamax = Adamax(lr=0.0035, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0)

        compiled_model = model.compile(optimizer=adamax,loss='categorical_crossentropy',metrics=['accuracy'])

        return compiled_model

    
            
    LSTM_model = LSTM_keras()

    # network parameters
    
    batch_size = batch_size
    batch_per_step = batch_per_step
    epochs = epochs
    training_steps = training_steps
    
    
    print("No. of training Steps required: ",training_steps)

    
    c=0
    
    # train the model
    for i in range(0,trainY.shape[0],batch_size):
        print("\nTraining Step: ", c)
        
        batch_x,batch_y = get_batches(i,batch_size,trainX,trainY)
        LSTM_model.fit(batch_x,batch_y,batch_size=batch_per_step,epochs=epochs)

        c+=1
    
    # returns trained model
    return LSTM_model

# Visualization Tools

In [5]:
def visualize():
    # write your visualization tools here: you can code as many functions as you would like
    pass

def hist_visualize(dataY, x_label_name,y_label_name, figure_title):
    n, bins, patches = plt.hist(x=dataY, bins='auto', color='#0504aa', alpha=0.7, rwidth=0.85)
    plt.grid(axis='y', alpha=0.75)
    plt.xlabel(x_label_name)
    plt.ylabel(y_label_name)
    plt.title(figure_title)
    plt.text(23, 45, r'$\mu=15, b=3$')
    plt.ylim()
    return plt.show()

# helper function: That doesn't fall under any of the above categories

In [6]:
def get_batches(i,batch_size,trainX,trainY):
    batch_x = trainX[i:i+batch_size]
    batch_y = trainY[i:i+batch_size]
    
    return batch_x,batch_y

def to_np_array(trainX, trainY, testX, testY):
    trainX, trainY, testX, testY = np.array(trainX),np.array(trainY),np.array(testX),np.array(testY)
    
    return trainX, trainY, testX, testY

# Main Code Here:

In [7]:
# Input Variable
filename = 'stock_data.txt' # file containing stock data 
look_back = 300  # number of days to lookback to predict the future
percent_train_data = .7 # fraction of training data, .3 = fraction of test data.
                        # we are not dividing into train-test-validation for this problem

closePrice= loadData(filename) # Load closing price of stock from the file. Check below visualization to understand
'''
extracts X and y data,  where X is the price for past n ( = lookback) days
Y is the labels: -1, 0, 1
-1 denotes a drop in stock price, 0 denotes no change, and 1 denotes an increase in stock price. 
'''
dataX, dataY, dataY_raw = create_dataset_discrete(closePrice,look_back) # dataY_raw in not useful for this exercise


In [8]:
# Start writing your code here: Below added code is just a guideline for your understanding

# you may want to code these functions above. You are allowed to use/ change/ reorder below structure
# dataX is a time series data.
extractFeatures(dataX, dataY)  
dataX = create_normalized_dataset(dataX, dataY)
dataX, dataY = preprocessData(dataX, dataY)


trainX, trainY, testX, testY = createData_TrainTest(dataX, dataY, percent_train_data) # split training-testing data
trainX, trainY, testX, testY = to_np_array(trainX, trainY, testX, testY)

# Check the function above. We are not shuffling the data

# Call to ML Classifier in this case Keras LSTM


In [22]:
# batch_per_step can be set to lower value for better accuracy
# epochs can be increased for better generalisatoin
# 
batch_size = 200
batch_per_step = 20
epochs = 2
training_steps = trainY.shape[0]//batch_size

model = MLClassifier(trainX,trainY,testX,testY, batch_size, batch_per_step, epochs, training_steps)

No. of training Steps required:  15

Training Step:  0
Epoch 1/2
Epoch 2/2

Training Step:  1
Epoch 1/2
Epoch 2/2

Training Step:  2
Epoch 1/2
Epoch 2/2

Training Step:  3
Epoch 1/2
Epoch 2/2

Training Step:  4
Epoch 1/2
Epoch 2/2

Training Step:  5
Epoch 1/2
Epoch 2/2

Training Step:  6
Epoch 1/2
Epoch 2/2

Training Step:  7
Epoch 1/2
Epoch 2/2

Training Step:  8
Epoch 1/2
Epoch 2/2

Training Step:  9
Epoch 1/2
Epoch 2/2

Training Step:  10
Epoch 1/2
Epoch 2/2

Training Step:  11
Epoch 1/2
Epoch 2/2

Training Step:  12
Epoch 1/2
Epoch 2/2

Training Step:  13
Epoch 1/2
Epoch 2/2

Training Step:  14
Epoch 1/2
Epoch 2/2

Training Step:  15
Epoch 1/2
Epoch 2/2


In [None]:
# Data Exploration: This shows that the class distribution is balanced and
# we have almost the same number of labels in all the classess
hist_visualize(dataY, 'Class Labels','frequency', 'class_distribution') # class distribution is good


In [None]:
# THESE Three LINES BELOW ARE JUST EXAMPLES/ please delete or comment these lines in your final submission
best_naive_bayes = learn_naive_bayes(trainX, trainY)  
naive_bayes_accuracy = best_naive_bayes[0].score(testX, testY)
naive_bayes_accuracy
#