## Importing basic libraries

In [1]:
#Setup
%matplotlib inline
%config IPCompleter.greedy=True
import time
import os
import psycopg2
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
from pythonLib.helper import *
import sqlalchemy
# fix random seed for reproducibility
seed = 7
DATA_DIR = 'data' 
np.random.seed(seed)
dbString = 'postgresql://s2c:JANver95@localhost:5432/stockdata'
engine = sqlalchemy.create_engine(dbString) #

# Load Data into postgres

We need to load the data into a postgres database. First, we go through each file appending the file name as an added column, then we store each file into the database under the HistoricalData table

In [2]:
# # Loads everything into postgres, Uncomment if needed
# # i = 0
# for each_csv in os.listdir(DATA_DIR):
#     i = i+1
#     File = os.path.join(DATA_DIR,each_csv)
#     try:
#         dataInit = readData(File)
#     except:
#         print(each_csv)
#     height = np.shape(dataInit)[0]
#     width = 1
#     tickers = pd.DataFrame(each_csv[:-4], index=range(height), columns=range(width))
#     tickers.columns = ['ticker']
#     dataInit = tickers.join(dataInit)
#     dataInit['datetime'] = dataInit['datetime'].apply(lambda d: str(d))
#     engine = sqlalchemy.create_engine('postgresql://s2c:JANver95@localhost:5432/stockdata')
#     dataInit.to_sql("histdata",engine,index = False,dtype={'datetime':sqlalchemy.TIMESTAMP(timezone=True)},if_exists='append')

  

# Connect to Database, retrieve a dataset

In [3]:
query = "SELECT ticker,avg(volume) FROM histdata GROUP BY ticker ORDER BY avg DESC"
print(query)
stockSet = pd.read_sql(query,engine)

stockSet

# We use this to select DLF

SELECT ticker,avg(volume) FROM histdata GROUP BY ticker ORDER BY avg DESC


Unnamed: 0,ticker,avg
0,DLF,27371.004363
1,FEDERALBNK,20092.896278
2,RELIANCE,19765.357590
3,HINDPETRO,17819.197704
4,JSWSTEEL,17499.262486
5,TV18BRDCST,13962.737137
6,TATAPOWER,11868.993745
7,HFCL,10676.823751
8,RASOYPR,9808.606920
9,JSWENERGY,8716.109570


In [4]:
query = "SELECT * FROM histdata WHERE ticker = 'DLF' or ticker = 'FEDERALBNK' or ticker = 'RELIANCE'"
res = pd.read_sql(query,engine)



## Some Helper Functions

These functions are more or less general functions that should prove to be fairly useful


- **ReadData(filename)** : Reads data from Zerodha API historical data files and returns a Pandas DataFrame
- **sycTimeSeries(ts1,ts2)** : Making sure that 2 timeseries are synced to the smaller time series
- **timeseriesLagged(data, lag=60)**: Creates Lagged series.Goes through a series and generates an lag+1  dimensional   pandas DataFrame that has each previous lag timeunit.
- **binarizeTime(resLagged, rate=0.01)** : Binarizes the last column into 1,-1 or 0 depending whether the price increased, decreased or stayed the same from the beginning to the end of the lag period (triggers on changes by magnitutde = rate*current price).
- **findLag(data, targetCorr,suppressed)** :  Finds the right lag given a target correlation.

# Reading some Data and Getting a feel 

We use an autocorrelation plot to help us figure out what is an optimal amount of lag. We are really looking for a lag that correlates highly. We go through the lags till we reach the last lag that guarantees 0.97 autocorrelation

In [5]:
# # Setup Parameters
dataInit = res # Read the stock price data. This is 1 minute data
data = dataInit['close'] # extract the 'close' column as a Pandas series
# plt.figure()
# pd.tools.plotting.lag_plot(data) # Lag plot to check randomness
# plt.figure()
# pd.tools.plotting.autocorrelation_plot(data) # Auto correlation plot to check if series is autocorrelated at all

# # Find the right lag manually
# targetCorr = 0.99 # autocorrelation we want
# lag = findLag(data,targetCorr,True) # Lag that is indicative 
# if lag == 99: #if lag is 99 then we can just use any number above it as autocorrelation is guaranteed.
#     lag = 120 #nice round 2  hour intervals
# print(lag)
lag = 120
series = timeseriesLagged(data,lag) # Generate the lagged series


In [6]:
binSeries = binarizeTime(series,0.000)
change = binSeries.iloc[:,-1]== -1 # convert to binary
binSeries.loc[change,'121']=0 # convert to binary


In [7]:
sum(binSeries.iloc[:,-1] == 1)

475281

In [8]:
718944/(718944+475281)

0.6020172078125982

# Generate Training Data

Now that we have an idea of what's going on in the dataset, it is a good time to generate training data. We do an 80:20 training:testing split, and then we randomize the training set because we assume that only the last LAG minutes matter

In [9]:
from keras.models import Sequential
from keras.layers import Dense,LSTM,Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import sklearn.preprocessing as skp
import tensorflow as tf
import tempfile
tf.__version__

Using TensorFlow backend.


ImportError: Traceback (most recent call last):
  File "/home/s2c/anaconda3/envs/AlgoTrading/lib/python3.4/site-packages/tensorflow/python/pywrap_tensorflow.py", line 58, in <module>
    from tensorflow.python.pywrap_tensorflow_internal import *
  File "/home/s2c/anaconda3/envs/AlgoTrading/lib/python3.4/site-packages/tensorflow/python/pywrap_tensorflow_internal.py", line 28, in <module>
    _pywrap_tensorflow_internal = swig_import_helper()
  File "/home/s2c/anaconda3/envs/AlgoTrading/lib/python3.4/site-packages/tensorflow/python/pywrap_tensorflow_internal.py", line 24, in swig_import_helper
    _mod = imp.load_module('_pywrap_tensorflow_internal', fp, pathname, description)
  File "/home/s2c/anaconda3/envs/AlgoTrading/lib/python3.4/imp.py", line 243, in load_module
    return load_dynamic(name, filename, file)
ImportError: libcudnn.so.6: cannot open shared object file: No such file or directory


Failed to load the native TensorFlow runtime.

See https://www.tensorflow.org/install/install_sources#common_installation_problems

for some common reasons and solutions.  Include the entire stack trace
above this error message when asking for help.

In [None]:
# Get values from pandas series as we need a numpy array for our classifier
seriesVals = binSeries.values
trainPercent = 0.8 # first 80% of the data is used for training

#Split into train and test
trainBegin = int(trainPercent*len(seriesVals)) 
trains = seriesVals[0:trainBegin]
train,val = train_test_split(trains)
test = seriesVals[trainBegin:]
np.random.shuffle(train) # shuffle the training dataset

# Split into x and y
xTrain,yTrain = train[:,0:-1],train[:,-1] # X is the first lag elements. Y is the lag+1 element
xVal,yVal = val[:,0:-1],val[:,-1] # Same for Validation
xTest,yTest = test[:,0:-1],test[:,-1] # Same for testing data

#scale function
a = lambda row: ((row-np.min(row))/(np.max(row)-np.min(row)))
xTrain = np.apply_along_axis(a,1,xTrain) #scale to 01
xTest = np.apply_along_axis(a,1,xTest) #scale to 0 1
xVal = np.apply_along_axis(a,1,xVal) #scale to 0 1

#Reshape for keras
xTrain = xTrain.reshape(xTrain.shape[0], xTrain.shape[1])
xTest = xTest.reshape(xTest.shape[0], xTest.shape[1])
xVal = xVal.reshape(xVal.shape[0],xVal.shape[1])


# # # encode class values as integers
encoder = LabelEncoder()
encoder.fit(yTrain)
encodedyTrain = encoder.transform(yTrain)
encodedyTest = encoder.transform(yTest)
encodedyVal = encoder.transform(yVal)
# convert integers to dummy variables (i.e. one hot encoded)
yTrain = np_utils.to_categorical(encodedyTrain)
yTest = np_utils.to_categorical(encodedyTest)
yVal = np_utils.to_categorical(encodedyVal)




# Simple Classifier

A simple CNN to see how it works with just basic stuff.

In [None]:
# Fixed
nClasses = 2
n_channels = 1


In [None]:
x = tf.placeholder(tf.float32, shape=[None, xTrain.shape[-1]])
y_ = tf.placeholder(tf.float32, shape=[None, nClasses])

def varGen(shape, bias = False):
    if bias:
        initial = tf.constant(0.1, shape = shape)
    else:
        initial = tf.truncated_normal(shape, stddev = 0.1)
    return tf.Variable(initial) 

def conv2d(x,W):
    return tf.nn.conv2d(x,W,strides = [1, 1, 1, 1], padding = 'SAME')

def meanPool(x):
    return tf.nn.avg_pool(x, ksize=[1, 1, 2, 1], #,pooling_type = "AVG",
                        strides=[1, 1, 2, 1], padding='SAME')

In [None]:
def deepnn(x):
    with tf.name_scope('reshape'):
        xImage = tf.reshape(x, [-1, 1, xTrain.shape[-1], 1])
    
    # convolve layer 1 maps 120 time to 300 features
    with tf.name_scope('conv1'):
        wConv1 = varGen([12, 1, 1, 300])
        bConv1 = varGen([300],bias=True)
        hConv1 = tf.nn.relu(conv2d(xImage, wConv1) + bConv1)
    
    # pool 2x downsample
    with tf.name_scope('pool1'):
        hPool1 = meanPool(hConv1)
        
    # convolve layer 2 maps 60 features to 120 features
    with tf.name_scope('conv2'): 
        wConv2 = varGen([12, 1, 300, 120])
        bConv2 = varGen([120],bias = True)
        hConv2 = tf.nn.relu(conv2d(hPool1, wConv2) + bConv2)
        
    # downsample 2x again
    with tf.name_scope('pool2'):
        hPool2 = meanPool(hConv2)
    
    # convolve layer 3
    with tf.name_scope('conv3'):
        wConv3 = varGen([12, 1, 120, 60])
        bConv3 = varGen([60],bias=True)
        hConv3 = tf.nn.relu(conv2d(hPool2, wConv3) + bConv3)
    
    # pool 2x downsample
    with tf.name_scope('pool3'):
        hPool3 = meanPool(hConv3)
    #print(hPool3) 
    
    # convolve layer 4
    with tf.name_scope('conv4'):
        wConv4 = varGen([12, 1, 60, 60])
        bConv4 = varGen([60],bias=True)
        hConv4 = tf.nn.relu(conv2d(hPool3, wConv4) + bConv4)
    
    # pool 2x downsample
    with tf.name_scope('pool4'):
        hPool4 = meanPool(hConv4)
    #print(hPool4)     
    
    with tf.name_scope('fc1'):
        wFC1 = varGen([8*60,120])
        bFC1 = varGen([120],bias = True)
        hPool4Flat = tf.reshape(hPool4,[-1,8*60])
        hFC1 = tf.nn.relu(tf.matmul(hPool4Flat,wFC1)+bFC1)
        
    with tf.name_scope('dropout'):
        keep_prob = tf.placeholder(tf.float32)
        hFC1Drop = tf.nn.dropout(hFC1, keep_prob)
        
    # output as classes
    with tf.name_scope('fc2'):
        wFC2 = varGen([120,nClasses])
        bFC2 = varGen([nClasses],bias = True)
        
        yConv = tf.matmul(hFC1Drop,wFC2) + bFC2
    return yConv,keep_prob

In [10]:
x = tf.placeholder(tf.float32,[None,xTrain.shape[-1]])
y_ = tf.placeholder(tf.float32,[None,nClasses])
yConv,keep_prob = deepnn(x)


NameError: name 'tf' is not defined

In [58]:
with tf.name_scope('loss'):
    class_weights = tf.constant([[1.4, 1.0]])
    weights = tf.reduce_sum(class_weights*y_,axis=1)
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(labels=y_,
                                                            logits=yConv)
    cross_entropy = tf.reduce_mean(cross_entropy * weights)

with tf.name_scope('adam_optimizer'):
    train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)

with tf.name_scope('accuracy'):
    correct_prediction = tf.equal(tf.argmax(yConv, 1), tf.argmax(y_, 1))
    correct_prediction = tf.cast(correct_prediction, tf.float32)
    
accuracy = tf.reduce_mean(correct_prediction)
saver = tf.train.Saver()



graph_location = tempfile.mkdtemp()
print('Saving graph to: %s' % graph_location)
train_writer = tf.summary.FileWriter(graph_location)
train_writer.add_graph(tf.get_default_graph())



Saving graph to: /tmp/tmpe5h89ans


In [61]:
# Creates a graph.
a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3], name='a')
b = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[3, 2], name='b')
c = tf.matmul(a, b)
# Creates a session with log_device_placement set to True.
sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
# Runs the op.
print (sess.run(c))

[[ 22.  28.]
 [ 49.  64.]]


In [59]:
batchSize = 50
with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as sess:
    sess.run(tf.global_variables_initializer())
    for i in range(0,len(xTrain),batchSize):
        xCur = xTrain[i:i+batchSize,:]
        yCur = yTrain[i:i+batchSize]
        if i % (batchSize*10) == 0:
            train_accuracy = accuracy.eval(feed_dict={
                x: xCur, y_: yCur, keep_prob: 1.0})
            print('step %d, training accuracy %g' % (i/(batchSize*10), train_accuracy))
        train_step.run(feed_dict={x: xCur, y_: yCur, keep_prob: .5})

    for i in range(0,len(xTest),batchSize):
        xCur = xTest[i:i+batchSize,:]
        yCur = yTest[i:i+batchSize]
        print('test accuracy %g step %d' % (accuracy.eval(feed_dict={
            x: xTest, y_: yTest,keep_prob: 1.0}), i/(batchsize*10)))         

step 0, training accuracy 0.48
step 1, training accuracy 0.56
step 2, training accuracy 0.58
step 3, training accuracy 0.6
step 4, training accuracy 0.58
step 5, training accuracy 0.56
step 6, training accuracy 0.58
step 7, training accuracy 0.42
step 8, training accuracy 0.58
step 9, training accuracy 0.62
step 10, training accuracy 0.66
step 11, training accuracy 0.46
step 12, training accuracy 0.56
step 13, training accuracy 0.58
step 14, training accuracy 0.62
step 15, training accuracy 0.54
step 16, training accuracy 0.6
step 17, training accuracy 0.56
step 18, training accuracy 0.6
step 19, training accuracy 0.56
step 20, training accuracy 0.66
step 21, training accuracy 0.48
step 22, training accuracy 0.58
step 23, training accuracy 0.58
step 24, training accuracy 0.5
step 25, training accuracy 0.58
step 26, training accuracy 0.68


KeyboardInterrupt: 

In [None]:
range(len(xTrain))

In [None]:
xTrain[0:30,:].shape

In [None]:
y_

In [None]:
yTest == [[0, 1 ,0 ]]

In [43]:
sum(yTrain == [[0,1]])

array([295784, 295784])

In [None]:
29933 + 149196

In [42]:
sum(yTrain == [1, 0])

array([420751, 420751])

In [44]:
295784 +  420751

716535

In [46]:
295784/716535

0.41279770004256594

In [47]:
420751/716535

0.587202299957434

In [48]:
0.587202299957434/0.41279770004256594

1.4224941173288617