## Importing basic libraries

In [None]:
#Setup
%matplotlib inline
%config IPCompleter.greedy=True
import time
import os
import psycopg2
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
from pythonLib.helper import *
import sqlalchemy
# fix random seed for reproducibility
seed = 7
DATA_DIR = 'data' 
np.random.seed(seed)
dbString = 'postgresql://s2c:JANver95@localhost:5432/stockdata'
engine = sqlalchemy.create_engine(dbString) #

# Load Data into postgres

We need to load the data into a postgres database. First, we go through each file appending the file name as an added column, then we store each file into the database under the HistoricalData table

In [None]:
# # Loads everything into postgres, Uncomment if needed
# # i = 0
# for each_csv in os.listdir(DATA_DIR):
#     i = i+1
#     File = os.path.join(DATA_DIR,each_csv)
#     try:
#         dataInit = readData(File)
#     except:
#         print(each_csv)
#     height = np.shape(dataInit)[0]
#     width = 1
#     tickers = pd.DataFrame(each_csv[:-4], index=range(height), columns=range(width))
#     tickers.columns = ['ticker']
#     dataInit = tickers.join(dataInit)
#     dataInit['datetime'] = dataInit['datetime'].apply(lambda d: str(d))
#     engine = sqlalchemy.create_engine('postgresql://s2c:JANver95@localhost:5432/stockdata')
#     dataInit.to_sql("histdata",engine,index = False,dtype={'datetime':sqlalchemy.TIMESTAMP(timezone=True)},if_exists='append')

  

# Connect to Database, retrieve a dataset

In [3]:
query = "SELECT ticker,avg(volume) FROM histdata GROUP BY ticker ORDER BY avg DESC"
print(query)
stockSet = pd.read_sql(query,engine)

stockSet

# We use this to select DLF

SELECT ticker,avg(volume) FROM histdata GROUP BY ticker ORDER BY avg DESC


Unnamed: 0,ticker,avg
0,DLF,27371.004363
1,FEDERALBNK,20092.896278
2,RELIANCE,19765.357590
3,HINDPETRO,17819.197704
4,JSWSTEEL,17499.262486
5,TV18BRDCST,13962.737137
6,TATAPOWER,11868.993745
7,HFCL,10676.823751
8,RASOYPR,9808.606920
9,JSWENERGY,8716.109570


In [140]:
query = "SELECT * FROM histdata WHERE ticker = 'DLF' or ticker = 'FEDERALBNK' or ticker = 'RELIANCE'"
res = pd.read_sql(query,engine)



## Some Helper Functions

These functions are more or less general functions that should prove to be fairly useful


- **ReadData(filename)** : Reads data from Zerodha API historical data files and returns a Pandas DataFrame
- **sycTimeSeries(ts1,ts2)** : Making sure that 2 timeseries are synced to the smaller time series
- **timeseriesLagged(data, lag=60)**: Creates Lagged series.Goes through a series and generates an lag+1  dimensional   pandas DataFrame that has each previous lag timeunit.
- **binarizeTime(resLagged, rate=0.01)** : Binarizes the last column into 1,-1 or 0 depending whether the price increased, decreased or stayed the same from the beginning to the end of the lag period (triggers on changes by magnitutde = rate*current price).
- **findLag(data, targetCorr,suppressed)** :  Finds the right lag given a target correlation.

# Reading some Data and Getting a feel 

We use an autocorrelation plot to help us figure out what is an optimal amount of lag. We are really looking for a lag that correlates highly. We go through the lags till we reach the last lag that guarantees 0.97 autocorrelation

In [141]:
# # Setup Parameters
dataInit = res # Read the stock price data. This is 1 minute data
data = dataInit['close'] # extract the 'close' column as a Pandas series
# plt.figure()
# pd.tools.plotting.lag_plot(data) # Lag plot to check randomness
# plt.figure()
# pd.tools.plotting.autocorrelation_plot(data) # Auto correlation plot to check if series is autocorrelated at all

# # Find the right lag manually
# targetCorr = 0.99 # autocorrelation we want
# lag = findLag(data,targetCorr,True) # Lag that is indicative 
# if lag == 99: #if lag is 99 then we can just use any number above it as autocorrelation is guaranteed.
#     lag = 120 #nice round 2  hour intervals
# print(lag)
lag = 5
series = timeseriesLagged(data,lag) # Generate the lagged series


In [142]:
# Create binary series where 0 = hold and 1 = buy
buySeries = binarizeTime(series,0)
change = buySeries.iloc[:,-1]== -1 # convert to binary
buySeries.loc[change,str(lag+1)]=0 # convert to binary


In [143]:
# Create binary series where 0 = hold and 1 = sell
sellSeries = binarizeTime(series,0)
change = sellSeries.iloc[:,-1]== 1 # find 1s and convert to 0
sellSeries.loc[change,str(lag+1)]=0 # 
change = sellSeries.iloc[:,-1]== -1 # find -1 and conver to 1s
sellSeries.loc[change,str(lag+1)]= 1 # convert to

In [144]:
print(buySeries.shape) #Total rows 
print(sum(buySeries.iloc[:,-1]==0)) # Number of holds
718929/1194225 # percentage that are holds

(1194340, 6)
719012


0.6020046473654462

In [145]:
print(sellSeries.shape) # Total rows
print(sum(sellSeries.iloc[:,-1]==0)) # Number of sells
726760/1194225 #  % that are holds

(1194340, 6)
726828


0.6085620381418911

# Generate Training Data

Now that we have an idea of what's going on in the dataset, it is a good time to generate training data. We do an 80:20 training:testing split, and then we randomize the training set because we assume that only the last LAG minutes matter

In [146]:
from keras.models import Sequential
from keras.layers import Dense,LSTM,Dropout,Conv1D,MaxPooling1D,Flatten,BatchNormalization,LeakyReLU
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from keras.losses import binary_crossentropy
from keras.optimizers import SGD

import h5py

from sklearn.utils import class_weight
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import sklearn.preprocessing as skp
import tensorflow as tf
import tempfile
tf.__version__

'1.4.1'

In [147]:
# Get values from pandas series as we need a numpy array for our classifier
BuySeriesVals = buySeries.values
trainPercent = 0.8 # first 80% of the data is used for training

#Split into train and test
trainBegin = int(trainPercent*len(BuySeriesVals)) 
trains = BuySeriesVals[0:trainBegin]
train,val = train_test_split(trains)
test = BuySeriesVals[trainBegin:]
np.random.shuffle(train) # shuffle the training dataset

# Split into x and y
xTrain,yTrain = train[:,0:-1],train[:,-1] # X is the first lag elements. Y is the lag+1 element
xVal,yVal = val[:,0:-1],val[:,-1] # Same for Validation
xTest,yTest = test[:,0:-1],test[:,-1] # Same for testing data

#scale function to local normalize each row between 0 and 1 so as to amplify any changes
# a = lambda row: ((row-np.min(row))/(np.max(row)-np.min(row)))
# xTrain = np.apply_along_axis(a,1,xTrain) #scale to 01
# xTest = np.apply_along_axis(a,1,xTest) #scale to 0 1
# xVal = np.apply_along_axis(a,1,xVal) #scale to 0 1

#Reshape for keras
xTrain = xTrain.reshape(xTrain.shape[0], xTrain.shape[1],1)
xTest = xTest.reshape(xTest.shape[0], xTest.shape[1],1)
xVal = xVal.reshape(xVal.shape[0],xVal.shape[1],1)



# # # encode class values as integers
# encoder = LabelEncoder()
# encoder.fit(yTrain)
# encodedyTrain = encoder.transform(yTrain)
# encodedyTest = encoder.transform(yTest)
# encodedyVal = encoder.transform(yVal)
# # convert integers to one hot encoded
# yTrain = np_utils.to_categorical(encodedyTrain)
# yTest = np_utils.to_categorical(encodedyTest)
# yVal = np_utils.to_categorical(encodedyVal)




In [148]:
# Compute Class weights
classWeight = class_weight.compute_class_weight('balanced', np.unique(yTrain), yTrain)
classWeight = dict(enumerate(classWeight))
yTrain

array([ 1.,  0.,  1., ...,  0.,  0.,  1.])

In [149]:
assert xTrain.shape[0] == yTrain.shape[0]
assert xTest.shape[0] == yTest.shape[0]
assert xVal.shape[0] == yTest.shape[0]
yTrain

array([ 1.,  0.,  1., ...,  0.,  0.,  1.])

# ConvNet for Buy

A CNN to predict buy signals from the above generated data

In [159]:
learnRate = 0.001
batchSize = 10
totalBatches = (xTrain.shape[0]//batchSize)
epochs = 5

nClasses = 2
nLength = xTrain.shape[1]
inputShape = (nLength,1)
# xTrainDataSet = tf.data.Dataset.from_tensors(xTrain)
# xTrainIter = xTrainDataSet.make_one_shot_iterator()

In [160]:
# Keras
buyModel = Sequential()
buyModel.add(Conv1D(10,kernel_size= 2, strides=1,
                 input_shape=inputShape,
                 batch_size = None
                   ))
#buyModel.add(BatchNormalization())
buyModel.add(LeakyReLU())
buyModel.add(Dropout(0.5))

buyModel.add(Conv1D(5, kernel_size= 2, strides=1))
#buyModel.add(BatchNormalization())
buyModel.add(LeakyReLU())
buyModel.add(Dropout(0.5))

buyModel.add(Flatten())
buyModel.add(Dense(15))
buyModel.add(BatchNormalization())
buyModel.add(LeakyReLU())

buyModel.add(Dense(1,activation='sigmoid'))

In [161]:
buyModel.summary()
buyModel.compile(loss=binary_crossentropy,
              optimizer=SGD(lr=learnRate),
              metrics=['accuracy'])


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_27 (Conv1D)           (None, 4, 10)             30        
_________________________________________________________________
leaky_re_lu_47 (LeakyReLU)   (None, 4, 10)             0         
_________________________________________________________________
dropout_29 (Dropout)         (None, 4, 10)             0         
_________________________________________________________________
conv1d_28 (Conv1D)           (None, 3, 5)              105       
_________________________________________________________________
leaky_re_lu_48 (LeakyReLU)   (None, 3, 5)              0         
_________________________________________________________________
dropout_30 (Dropout)         (None, 3, 5)              0         
_________________________________________________________________
flatten_16 (Flatten)         (None, 15)                0         
__________

In [None]:
buyModel.fit(x=xTrain,
             y=yTrain, 
             class_weight=classWeight,
             epochs = 100)


Epoch 1/100
Epoch 2/100
Epoch 3/100

In [None]:
score = buyModel.evaluate(xTest, yTest, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])


In [None]:
score = buyModel.evaluate(xVal, yVal, verbose=0)
print('Val loss:', score[0])
print('Val accuracy:', score[1])

## ConvNet for Buy Didn't work. Trying LSTM for Buy

In [116]:
# Keras
buyModel = Sequential()
buyModel.add(Dense(45,
                 input_shape=inputShape,
                 batch_size = None
                   ))
buyModel.add(BatchNormalization())
buyModel.add(LeakyReLU())
buyModel.add(Dropout(0.5))

buyModel.add(Dense(30))
buyModel.add(BatchNormalization())
buyModel.add(LeakyReLU())
buyModel.add(Dropout(0.5))

buyModel.add(Flatten())
buyModel.add(Dense(15))
buyModel.add(BatchNormalization())
buyModel.add(LeakyReLU())

buyModel.add(Dense(1,activation='sigmoid'))

In [117]:
buyModel.summary()
buyModel.compile(loss=binary_crossentropy,
              optimizer=SGD(lr=learnRate),
              metrics=['accuracy'])


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_27 (Dense)             (None, 60, 45)            90        
_________________________________________________________________
batch_normalization_25 (Batc (None, 60, 45)            180       
_________________________________________________________________
leaky_re_lu_35 (LeakyReLU)   (None, 60, 45)            0         
_________________________________________________________________
dropout_21 (Dropout)         (None, 60, 45)            0         
_________________________________________________________________
dense_28 (Dense)             (None, 60, 30)            1380      
_________________________________________________________________
batch_normalization_26 (Batc (None, 60, 30)            120       
_________________________________________________________________
leaky_re_lu_36 (LeakyReLU)   (None, 60, 30)            0         
__________

In [119]:
buyModel.fit(x=xTrain,
             y=yTrain, 
             class_weight=classWeight,
             epochs = 1)
yTrain
score = buyModel.evaluate(xTest, yTest, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

score = buyModel.evaluate(xVal, yVal, verbose=0)
print('Val loss:', score[0])
print('Val accuracy:', score[1])

Epoch 1/1
Test loss: 0.692930755878
Test accuracy: 0.510548876462
Val loss: 0.692818339284
Val accuracy: 0.509812129536
