In [107]:
# Message Queues

In [108]:
# What is the idea of an MQ
#
# Standard scenario we all know: Send request -> Receive reply
#
# Less so standard scenario
# 
# Send request -> periodically check for a reply for that request
#
# Even more not so standard
#
# Send request -> Read reply at some point from a stream
#

In [109]:
# The specific MQ implementation we will be learning is called ZeroMQ

In [2]:
import zmq
import numpy
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

In [111]:
#Types of sockets: PUB, SUB, REQ, REP, DEALER, ROUTER

In [112]:
#PUB = publisher, streams from socket
#SUB = subscriber, listens to a publisher
#REQ = Request
#REP = Reply
#DEALER = Non-blocking REQ socket
#ROUTER = you don't need to worry about this one (commonly used for load balancing)

In [2]:
#A simple example
import zmq
# REQ - REP
context = zmq.Context()
rep_socket = context.socket(zmq.REP)
rep_socket.bind("tcp://*:5002")

req_socket = context.socket(zmq.REQ)
req_socket.connect("tcp://localhost:5002")

def sender(socket):
    for i in range(0,5):
        socket.send(str(i))
        print(socket.recv())
def receiver(socket):
    while True:
        print(socket.recv())
        socket.send("Hello")

from threading import Thread

p1 = Thread(target=receiver, args=(rep_socket,))
p1.start()
p2 = Thread(target=sender, args=(req_socket,))
p2.start()
#PUB - SUB

0
Hello
1
Hello
2
Hello
3
Hello
4
Hello


In [1]:
#DEALER - REP
import zmq
context = zmq.Context()
rep_socket = context.socket(zmq.REP)
rep_socket.bind("tcp://*:5003")

req_socket = context.socket(zmq.DEALER)
req_socket.connect("tcp://localhost:5003")

def sender(socket):
    for i in range(0,5):
        socket.send(str(i))
def receiver(socket):
    while True:
        print(socket.recv())
        socket.send("Hello")

from threading import Thread

p1 = Thread(target=receiver, args=(rep_socket,))
p1.start()
p2 = Thread(target=sender, args=(req_socket,))
p2.start()

In [114]:
# Why would you need this.

# Broker's Feed -> ZMQ.PUB -> your application, the broker's functions are abstracted from application logic
# therefore if let's say itInvest goes bankrupt, you won't really have to re-write your code.

In [115]:
# So let's build a working example - let's call this robot (which is really just an application) RandomForest

In [116]:
# Step 1: Train a model on some data

# Step 2: Save the model

# Step 3: Application logic:

# Receive market data -> feed feature vector into model -> receive prediction -> perform trade.

#This to consider:
# 1 - MaxPosition
# 2 - Risk Management (i.e. how much money am I willing to lose today)

# So let's review this:

# RiskManagement Thread (REP)
# Order placement Thread (REP)
# Market Data Thread (PUB)
# Strategy Thread (SUB, REQ)

# Technically we could have unified the Startegy and RiskManagement Threads together, but for partical reasons
# i.e. future code use, I decided it would be best to show you how to do this with separate model.

<img src="media/models.png">

In [117]:
#What is special about this world.

#Well one of these things is that all these should work sort of independently. I.t. the risk manager and the strategy
# thread communicate, but they also work concurrently due to the nature of their tasks.

# The risk manager (RM) will close the position if your losses are triggered for example, while the Strategy thread
# will ask it if it's possible to place an order of a certain size.

# This is why Multi-threading/multi-processing is important - but due to the nature of these tasks, we're better off
# working with MQs rather than Queues, stacks, etc...

In [118]:
#Let's train our model first.

#I proposed a very very primitive idea - 5 classes:
#Strong up (greater than 5), 
#Weak up (greather than 2), 
#Nothing (0), 
#Weak down (less than -2), 
#Strong down (less than -5)

#The data that we have is not THAT useful, so we have to fix it so we have these classes.

#Let's say for the sake of the argument that we are looking at SP500 over itself (sshhh don't tell anyone I have this
# data!!!)

#Step 1 - Features and targets


#For simplicity's sake I want to looke at the deltaPrice, deltaSigmaPrice, and the
#targets are as we defined.

In [124]:
#### Load data
f = open('features.csv','r')
data = f.readlines()
f.close()
#Now let's deal a bit with it.
del data[0]

Xin = []
yin = []
#MAX = 104         -100 -50 0 50 100
#MIN = -113


for entry in data:
    block = entry.split(',')
    k = 0
    v = []
    for k in range(0,len(block)-1):
        v.append(float(block[k]))
    Xin.append(v)
    #And now the rough classification:
    t = float(block[-1].replace("\r\n",""))
    if t > 10:
        yin.append(10)
    elif t > 5 and t < 10:
        yin.append(5)
    elif t < 5 and t > -5:
        yin.append(0)
    elif t > -10 and t < -5:
        yin.append(-5)
    elif t < -10:
        yin.append(-10)

#Let's say we'll take 70% of the data for training, 30% for testing.

print("Data points for training: " + str(int(len(Xin)*0.7)))
print("Data points for testing : " + str(len(Xin)- int(len(Xin)*0.7)))
print("Total data points: " + str(len(Xin)))

X = numpy.asarray(Xin[:1770])
y = numpy.asarray(yin[:1770])

clf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=100, max_features='auto', max_leaf_nodes=None,min_impurity_split=0,
            min_samples_leaf=5, min_samples_split=4,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
            oob_score=True, random_state=0, verbose=0, warm_start=False)
clf.fit(X,y)
print("Feature information")
print(clf.feature_importances_)

print("Primitive Test")
print("=======================")
print("Predicted=" + str(clf.predict(Xin[1772:1778])) + "| Factual=" + str(yin[1772:1778]))

Data points for training: 1770
Data points for testing : 759
Total data points: 2529
Feature information
[ 0.24237824  0.26254858  0.24954985  0.24552334]
Primitive Test
Predicted=[  0   0   0 -10 -10   0]| Factual=[5, 0, 0, -5, 10, -5]


In [125]:
#Now let's save the model for later re-use
with open('model.bin', 'wb') as output:
    pickle.dump(clf,output,pickle.HIGHEST_PROTOCOL)
    

#Well just for fun let me show you something you could use in your startegy later.
print(clf.predict([Xin[1900]]))
print(yin[1900])

#The point is we can also look at this:
print("[-25,-10,0,10,25]")
print(clf.predict_proba([Xin[1900]]))

#We can also look at the overall precision/error-rate:
y_predicted = clf.predict(Xin[1772:])
good_guesses = 0
bad_guesses = 0

g_array = yin[1771:] - y_predicted

for h in g_array:
    if h == 0: good_guesses = good_guesses + 1
    else: bad_guesses = bad_guesses + 1

print("Precision: " + str(float(good_guesses)/(bad_guesses + good_guesses)))
        
#Okay well it's a shitty model but still..

[10]
10
[-25,-10,0,10,25]
[[ 0.33851552  0.03245904  0.15362436  0.05705652  0.41834457]]
Precision: 0.309114927345


In [132]:
#We can always try a different approach - instead of a classifier we can use regression...
#### Load data
f = open('features.csv','r')
data = f.readlines()
f.close()
#Now let's deal a bit with it.
del data[0]

Xin = []
yin = []
#MAX = 104         -100 -50 0 50 100
#MIN = -113


for entry in data:
    block = entry.split(',')
    k = 0
    v = []
    for k in range(0,len(block)-1):
        v.append(float(block[k]))
    Xin.append(v)
    #And now the rough classification:
    t = float(block[-1].replace("\r\n",""))
    yin.append(t)

#Let's say we'll take 70% of the data for training, 30% for testing.

print("Data points for training: " + str(int(len(Xin)*0.7)))
print("Data points for testing : " + str(len(Xin)- int(len(Xin)*0.7)))
print("Total data points: " + str(len(Xin)))

X = numpy.asarray(Xin[:1770])
y = numpy.asarray(yin[:1770])

clf = RandomForestRegressor(n_estimators=500, criterion='mse', max_depth=None, 
                            min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, 
                            max_features='auto', max_leaf_nodes=None, 
                            min_impurity_split=0.0, bootstrap=True, oob_score=True, n_jobs=1, 
                            random_state=None, verbose=0, warm_start=False)
clf.fit(X,y)
print("Feature information")
print(clf.feature_importances_)

print("Primitive Test")
print("=======================")
print("Predicted=" + str(clf.predict(Xin[1772:1778])) + "| Factual=" + str(yin[1772:1778]))

#With a regression model it becomes very hard to say what is good and what is bad.
# I will explain why.

# For example the model says that the value will be -9, the factual is -12
# That's not necessarily wrong, whereas predicted 0 factual -20 is probably wrong
# This is why because regression and classifiers are slightly different.

Data points for training: 1770
Data points for testing : 759
Total data points: 2529
Feature information
[ 0.22702869  0.29005152  0.24108769  0.24183211]
Primitive Test
Predicted=[-13.90728 -15.84512  -9.99314  -9.42304 -18.21322  -9.73744]| Factual=[5.82, -1.62, -3.12, -6.24, 12.89, -9.61]


In [144]:
#Attempts at improving models.

# A very common problem - trending data.
# THERE ARE MANY WAYS TO DE-TREND (i.e. baseline analysis, randomization, FFT)
# We will look at randomization.

# Different sampling.
#### Load data
f = open('features.csv','r')
data = f.readlines()
f.close()
#Now let's deal a bit with it.
del data[0]

Xin = []
yin = []
#MAX = 104         -100 -50 0 50 100
#MIN = -113


for entry in data:
    block = entry.split(',')
    k = 0
    v = []
    for k in range(0,len(block)-1):
        v.append(float(block[k]))
    Xin.append(v)
    #And now the rough classification:
    t = float(block[-1].replace("\r\n",""))
    if t > 10:
        yin.append(10)
    elif t > 5 and t < 10:
        yin.append(5)
    elif t < 5 and t > -5:
        yin.append(0)
    elif t > -10 and t < -5:
        yin.append(-5)
    elif t < -10:
        yin.append(-10)

#Let's say we'll take 70% of the data for training, 30% for testing.

print("Data points for training: " + str(int(len(Xin)*0.7)))
trainSize = int(len(Xin)*0.7)
print("Data points for testing : " + str(len(Xin)- int(len(Xin)*0.7)))
testSize = len(Xin)- int(len(Xin)*0.7)
print("Total data points: " + str(len(Xin)))

#This is the only part which will change.

Xinprime = []
yinprime = []
indices = []

Xtest = []
ytest = []
#Select some random indexes:

import random
while(len(Xinprime) < trainSize):
    index = random.randint(0, len(yin)-1)
    if index in indices: 
        continue
    else:
        indices.append(index)
        Xinprime.append(Xin[index])
        yinprime.append(yin[index])

        
while(len(Xtest) < testSize/3):
    index = random.randint(0, len(yin)-1)
    if index in indices: 
        continue
    else:
        indices.append(index)
        Xtest.append(Xin[index])
        ytest.append(yin[index])
    
    
X = numpy.asarray(Xinprime)
y = numpy.asarray(yinprime)

clf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=100, max_features='auto', max_leaf_nodes=None,min_impurity_split=0,
            min_samples_leaf=5, min_samples_split=4,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
            oob_score=True, random_state=0, verbose=0, warm_start=False)
clf.fit(X,y)
print("Feature information")
print("New: " + str(clf.feature_importances_))
#Is this really a boost?
print("Old: [ 0.24237824  0.26254858  0.24954985  0.24552334]")

#print("Primitive Test")
#print("=======================")
#print("Predicted=" + str(clf.predict(numpy.asarray(Xtest))) + "| Factual=" + str(ytest))

good_guesses = 0
bad_guesses = 0

g_array = numpy.asarray(ytest) - clf.predict(numpy.asarray(Xtest))

for h in g_array:
    if h == 0: good_guesses = good_guesses + 1
    else: bad_guesses = bad_guesses + 1

print("Precision: " + str(float(good_guesses)/(bad_guesses + good_guesses)))

#Very shitty.
#The point is, you have to find good drivers.



Data points for training: 1770
Data points for testing : 759
Total data points: 2529
Feature information
New: [ 0.23783342  0.25264858  0.26307523  0.24644277]
Old: [ 0.24237824  0.26254858  0.24954985  0.24552334]
Precision: 0.403162055336


In [106]:
#And now the partical side.
class marketDataReceiver:
    
    marketDataEndpoint = None
    
    def __init__(self,endpoint):
        self.marketDataEndpoint = endpoint
    
    
class robot:
    
    model = None
    
    def __init__(self,clf):
        self.model = clf
        
    def predict(v):
        return clf.predict([numpy.asarray(v)])
    
    
class riskManager:
    
    maxPosition = 2
    currentPosition = 0
    maxLoss = 10000
    blocked = False
    
    def placeMarketOrder(size):
        if(not blocked):
            if (abs(currentPosition + size) <= maxPosition):
                #place the order
                return True
            else:
                return False
        else:
            return False
    
    def checkLoss():
        #To be implemented
        return 0

In [None]:
#And now the coup de grace, putting it all together.
# Problem 1: How do we share MD with robot?
# Option 1: Use a Queue - a little ugly, because if you have multiple robot reading it, it causes problem
# Option 2: Use a PUB/SUB proxy approach
# SUB -> PUB -> SUB 
# Comes with its own drawback i.e. you'll probably have to understand the difference between TCP,UDP,IPC,INPROC,
# PGM transports, measure latencies, and so on.