Book: Succesfull algorithmic trading

Stany Vanhemelrijck, 17 July 2022

Rewriting code and putting it in functions. 

In [12]:
import datetime
import numpy as np
import pandas as pd
import sklearn

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.svm import LinearSVC, SVC

from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score

# 1. Loading data

In [13]:
def loading_data(file):
    '''This function opens a csv_file downloaded via the binance api.'''
    # Opening the file. 
    data = pd.read_csv(file)
    # Setting the index
    data = data.set_index('open time')
    return data


In [14]:
my_data = loading_data('ETHBTC-1m-data.csv')
my_data

Unnamed: 0_level_0,open,high,low,close,volume,close time,quote asset volume,number of trades,taker buy base asset volume,taker buy quote asset volume,ignore
open time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2021-05-17 00:00:00,0.077152,0.077189,0.076957,0.076982,389.7230,2021-05-17 00:00:59.999,30.035967,689,95.5450,7.361695,0
2021-05-17 00:01:00,0.076982,0.077020,0.076933,0.076996,140.6930,2021-05-17 00:01:59.999,10.831091,322,81.2270,6.253738,0
2021-05-17 00:02:00,0.077003,0.077003,0.076794,0.076944,281.7510,2021-05-17 00:02:59.999,21.666062,550,78.8200,6.063116,0
2021-05-17 00:03:00,0.076932,0.076963,0.076812,0.076832,243.7050,2021-05-17 00:03:59.999,18.745779,334,81.1090,6.238376,0
2021-05-17 00:04:00,0.076841,0.076995,0.076760,0.076965,252.3140,2021-05-17 00:04:59.999,19.395651,437,129.5530,9.956246,0
...,...,...,...,...,...,...,...,...,...,...,...
2022-05-18 11:02:00,0.068096,0.068096,0.068056,0.068058,81.4651,2022-05-18 11:02:59.999,5.545412,92,45.3375,3.086229,0
2022-05-18 11:03:00,0.068066,0.068066,0.068028,0.068029,86.7840,2022-05-18 11:03:59.999,5.904328,144,53.7935,3.659696,0
2022-05-18 11:04:00,0.068028,0.068033,0.067967,0.067971,419.9756,2022-05-18 11:04:59.999,28.565168,336,140.4554,9.552474,0
2022-05-18 11:05:00,0.067975,0.067992,0.067956,0.067964,256.8980,2022-05-18 11:05:59.999,17.462647,241,75.7388,5.147991,0


# 2. Creating features (lagged dataframe)

In [4]:
def lagged_dataframe(data, lags):
    '''This function creates the new lagged dataframe 
    and creates the amount of lags prefered. 
    It takes as inputs a DataFrame and a number of lags.'''
    # Creating the new DataFrame.
    tslag = pd.DataFrame(index=data.index)
    # Creating a column for the price.
    tslag["today"] = data["close"]
    # Creating a column for volume.
    tslag["volume"] = data["volume"]
    
    # Create the shifted lag series of prior trading period close values
    lags = lags
    for i in range(0, lags):
        tslag["lag%s" % str(i+1)] = data["close"].shift(i+1)
    
    return tslag

In [5]:
l_data = lagged_dataframe(my_data, 5)

# 3. Creating the returns (with direction) dataframe

In [6]:
def returns_dataframe(tslag, lags):
    '''This function creates the returns dataframe using the lagged dataframe, and
    creates the direction of the movement (up or down), +1 or -1.'''

    # Create the returns DataFrame.
    tsret = pd.DataFrame(index=tslag.index)
    tsret["volume"] = tslag["volume"]
    # Calculates the returns.
    tsret["today"] = tslag["today"].pct_change()*100.0
    
    # If any of the values of percentage returns equal zero, set them to
    # a small number (stops issues with QDA model in Scikit-learn)
    for i,x in enumerate(tsret["today"]):
        if (abs(x) < 0.0001):
            tsret["today"][i] = 0.0001
            
    # Create the lagged percentage returns columns
    for i in range(0, lags):
        tsret["lag%s" % str(i+1)] = \
        tslag["lag%s" % str(i+1)].pct_change()*100.0
        
    # Create the "direction" column (+1 or -1) indicating an up/down day
    tsret["direction"] = np.sign(tsret["today"])
    
    # Dropping the first 5 columns with NaN values.
    tsret.dropna(inplace=True)
    
    return tsret

In [7]:
r_data = returns_dataframe(l_data, 5)

# 4. Splitting the data

In [8]:
def split_train_test(tsret):
    # Use the prior two days of returns as predictor
    # values, with direction as the response.
    X = tsret[["lag1", "lag2"]]
    y = tsret["direction"]
    
    seventy_percent = round(len(tsret) * 0.70)
    
    X_train = X[:seventy_percent]
    X_test = X[seventy_percent:]
    y_train = y[:seventy_percent]
    y_test = y[seventy_percent:]
    
    return X_train, X_test, y_train, y_test


In [9]:
X_train, X_test, y_train, y_test = split_train_test(r_data)

# 5. Running the models

In [10]:
def run_model(X_train, X_test, y_train, y_test):
    print("Hit Rates/ Confusion Matrices:\n")
    models = [("Logistic Regression", LogisticRegression()),
             ]
    # Iterate through the models
    for m in models:
    
        # Train each of the models on the training set
        m[1].fit(X_train, y_train)
    
        # Make an array of predictions on the test set
        pred = m[1].predict(X_test)
    
        # Output the hit-rate and the confusion matrix for each model.
        print("%s:\n Hit rate: %0.3f  \n" % (m[0], m[1].score(X_test, y_test)))
        print("Confusion Matrix: \n%s\n" % confusion_matrix(pred, y_test))
        print("Precision: \n%s\n" % precision_score(y_test, pred))
        print("Recall: \n%s\n" % recall_score(y_test, pred))
        print("F1-score: \n%s\n" % f1_score(y_test, pred))
    
        # Save the predictions with the time index same as y_test
        predicted_signal = pd.DataFrame(pred, index=y_test.index, columns=['signal'])
        predicted_signal.to_csv("./results/results.csv")
    
        print('results saved.')

In [11]:
run_model(X_train, X_test, y_train, y_test)

Hit Rates/ Confusion Matrices:

Logistic Regression:
 Hit rate: 0.515  

Confusion Matrix: 
[[ 6558  6332]
 [70466 74837]]

Precision: 
0.5150409833245012

Recall: 
0.9219899222609617

F1-score: 
0.6608940619590943

results saved.


The high recall says that the model is likely to say that the direction will be up (he does not miss much up positions), but he is only correct 51% of the time (=Precision)

So basically what I found is that a model that does not miss a lot of up movements, can create significant returns. (This is found in the backtesting notebook).