<a href="https://colab.research.google.com/github/rudoletz/govmomi-examples/blob/master/O'Reilly_Non_linear_ML_Class_Gradient_Boosting_Machines_Classification_and_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Market Data

In [None]:
#Import Python Libraries
import numpy as np
import pandas as pd
from datetime import datetime

#pandas_datareader supplies the pdr.get_data_yahoo calls used in the data-import cell
import pandas_datareader.data as pdr
import fix_yahoo_finance as yf
#NOTE(review): pdr_override() presumably re-routes pdr.get_data_yahoo through this package - confirm.
#NOTE(review): fix_yahoo_finance appears to have since been published as yfinance; this import may need updating on newer environments - verify.
yf.pdr_override()

import matplotlib.pyplot as plt
#The bare 'seaborn' style sheet was renamed 'seaborn-v0_8' in matplotlib 3.6 and removed in 3.8.
#Use the new name when this matplotlib ships it and fall back to the legacy name on older installs.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

In [None]:
#Import data
#Download window shared by every series
start, end = datetime(2020, 1, 1), datetime(2020, 8, 4)

#Tickers, in order: Apple Inc. stock, S&P 500 index, volatility index, dollar index, junk bond index
stock, market, vix, dxy, junk = (
    pdr.get_data_yahoo(ticker, start, end)
    for ticker in ('AAPL', 'SPY', '^VIX', 'UUP', 'JNK')
)

In [None]:
#Create target dataframe
#Overnight return: buy on the close of the previous session and sell on the open of the current one.
#'Adj Close' is adjusted for dividends while 'Open' is not, so comparing them directly overstates every
#return that precedes an ex-dividend date. Rescale the open by the same day's Adj Close / Close factor so
#both legs of the trade are on the same adjusted basis.
#Assumes the downloaded frame also has an unadjusted 'Close' column - TODO confirm.
adjustment_factor = stock['Adj Close'] / stock['Close']
adjusted_open = stock['Open'] * adjustment_factor
previous_adjusted_close = stock['Adj Close'].shift(1)

target = ((adjusted_open - previous_adjusted_close) / previous_adjusted_close).to_frame(name='return')
target = target.dropna() #The first session has no previous close, so its return is NaN
target['direction'] = np.where(target['return'] > 0, 1, -1) #Overnight direction of the stock: 1 if up, -1 otherwise
target.head()

Unnamed: 0_level_0,return,direction
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-03,-0.005622,-1
2020-01-06,-0.007214,-1
2020-01-07,0.005221,1
2020-01-08,0.000944,1
2020-01-09,0.018513,1


In [None]:
#Create features dataframe
#Price series enter as daily percentage changes of the adjusted close.
#VIX is volatility index and is measured in percentage terms, so its plain day-over-day difference is used.
#Every column is aligned to the market series' dates; rows with any missing value are then dropped.
features = (
    market['Adj Close'].pct_change(1).mul(100).to_frame(name='market')
    .assign(
        vix=vix['Adj Close'].diff(),
        dxy=dxy['Adj Close'].pct_change(1).mul(100),
        junk=junk['Adj Close'].pct_change(1).mul(100),
    )
    .dropna()
)
features.tail()

Unnamed: 0_level_0,market,vix,dxy,junk
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-07-28,-0.634239,0.700001,0.158044,-0.313307
2020-07-29,1.229873,-1.340001,-0.473373,0.599996
2020-07-30,-0.356793,0.66,-0.396358,0.30295
2020-07-31,0.79022,-0.300001,0.596904,0.141572
2020-08-03,0.695216,-0.179998,0.0,0.014206


In [None]:
#Feature values from the last trading session, kept as a one-row frame for the prediction cells
lastknown = features.iloc[-1:]
#Drop that last row from the features matrix so that it aligns with the labels vector.
#Note: this reassigns `features`, so re-running the cell trims one more row each time.
features = features.iloc[:-1]
features.tail()

Unnamed: 0_level_0,market,vix,dxy,junk
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-07-27,0.729243,-1.1,-0.745102,0.257001
2020-07-28,-0.634239,0.700001,0.158044,-0.313307
2020-07-29,1.229873,-1.340001,-0.473373,0.599996
2020-07-30,-0.356793,0.66,-0.396358,0.30295
2020-07-31,0.79022,-0.300001,0.596904,0.141572


# Gradient boosting classifier for overnight direction

In [None]:
#Classifier labels: everything in target except the return column
direction_only = target.drop(columns=['return'])
#Skip the first label because today's features are paired with tomorrow's opening values
targetclass = direction_only.iloc[1:]
targetclass.head()

Unnamed: 0_level_0,direction
Date,Unnamed: 1_level_1
2020-01-06,-1
2020-01-07,1
2020-01-08,1
2020-01-09,1
2020-01-10,1


In [None]:
#Train a gradient boosting classifier on a random 75/25 split and report its score() on each part
#(for scikit-learn classifiers score() is mean accuracy, not Gini impurity)
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

classifier = GradientBoostingClassifier(learning_rate=0.06, max_depth=3, random_state=1)
targetclass = np.asarray(targetclass).ravel() #Flatten the single-column labels into a 1-d array
features_train, features_test, targetclass_train, targetclass_test = train_test_split(
    features,
    targetclass,
    random_state=0,
    test_size=0.25,
)
classifier.fit(features_train, targetclass_train)
print("Training score:", classifier.score(features_train, targetclass_train))
print("Testing score:", classifier.score(features_test, targetclass_test))

Training score: 0.981651376146789
Testing score: 0.6486486486486487


In [None]:
#Apply the fitted classifier to the features of the last trading session
predicted_direction = classifier.predict(lastknown)
direction_probabilities = classifier.predict_proba(lastknown) #One column per class label
print("Tomorrow's change:", predicted_direction)
print("Probability of change", direction_probabilities)
lastknown

Tomorrow's change: [1]
Probability of change [[0.17930949 0.82069051]]


Unnamed: 0_level_0,market,vix,dxy,junk
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-08-03,0.695216,-0.179998,0.0,0.014206


In [None]:
#Feature names followed by the importance the fitted classifier assigns to each (same order)
print(features.columns, classifier.feature_importances_, sep="\n")

Index(['market', 'vix', 'dxy', 'junk'], dtype='object')
[0.18207493 0.17769463 0.38729637 0.25293408]


In [None]:
#Confusion matrix of the classifier on the held-out test split
from sklearn.metrics import confusion_matrix
targetclass_predict = classifier.predict(features_test)
confusion = confusion_matrix(y_true=targetclass_test, y_pred=targetclass_predict)
confusion

array([[ 7,  7],
       [ 6, 17]])

# Gradient boosting regressor for overnight value changes

In [None]:
#Regressor labels: everything in target except the direction column
return_only = target.drop(columns=['direction'])
#Skip the first label because today's features are paired with tomorrow's opening values
targetvalue = return_only.iloc[1:]
targetvalue.head()


Unnamed: 0_level_0,return
Date,Unnamed: 1_level_1
2020-01-06,-0.007214
2020-01-07,0.005221
2020-01-08,0.000944
2020-01-09,0.018513
2020-01-10,0.008235


In [None]:
#Fit a gradient boosting regressor with default hyperparameters on a random 75/25 split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

regressor = GradientBoostingRegressor(random_state=1)
targetvalue = np.asarray(targetvalue).ravel() #Need to convert column vector into a 1-d array
features_train, features_test, targetvalue_train, targetvalue_test = train_test_split(
    features,
    targetvalue,
    random_state=0,
    test_size=0.25,
)
regressor.fit(features_train, targetvalue_train)
#Report score() on both parts, then predict from the last trading session's features
print("Training score:", regressor.score(features_train, targetvalue_train))
print("Testing score:", regressor.score(features_test, targetvalue_test))
print("Tomorrow's value change:", regressor.predict(lastknown))

Training score: 0.9539199341564906
Testing score: 0.06507846029720477
Tomorrow's value change: [0.00513603]


In [None]:
#Second regressor with explicit hyperparameters and a different random split (random_state=1).
#criterion="mae" was deprecated in scikit-learn 0.24 and removed in 1.1, so current releases reject it.
#The default split criterion is used instead, which works on every scikit-learn version.
#The variable keeps its original name because the feature-importance cell below refers to it.
regressor_mae = GradientBoostingRegressor(random_state=1, learning_rate=0.1, max_depth=5, n_estimators=200, min_samples_split=5, min_samples_leaf=5)
features_train, features_test, targetvalue_train, targetvalue_test = train_test_split(features, targetvalue, test_size = 0.25, random_state=1)
regressor_mae.fit(features_train, targetvalue_train)
#Report score() on both parts, then predict from the last trading session's features
print("Training score:", regressor_mae.score(features_train, targetvalue_train))
print("Testing score:", regressor_mae.score(features_test, targetvalue_test))
print("Tomorrow's value change:", regressor_mae.predict(lastknown))

Training score: 0.7293755326183458
Testing score: 0.24468109216042733
Tomorrow's value change: [0.00622702]


In [None]:
#Feature names, then the importances from each fitted regressor (same column order)
print(
    features.columns,
    regressor.feature_importances_,
    regressor_mae.feature_importances_,
    sep="\n",
)

Index(['market', 'vix', 'dxy', 'junk'], dtype='object')
[0.20258626 0.38088449 0.3400491  0.07648015]
[0.24189089 0.22462572 0.23469939 0.29878399]
