<a href="https://colab.research.google.com/github/robonesky/oreilly/blob/main/RM%20Lab%20Notes%20Advanced%20ML-%20part%20II.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%pip install pandas_ta

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
%pip install boruta

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
%pip install keras_tuner

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
# Machine Learning Workshop - I
# Kannan Singaravelu

# basic imports
import os, random
import pandas as pd
import numpy as np
import datetime as dt
import pandas_ta as ta
from pathlib import Path

# import boruta
from boruta import BorutaPy

# warnings
import warnings
warnings.filterwarnings('ignore')

# plotting & outputs
from pprint import pprint
import matplotlib.pyplot as plt
plt.style.use('seaborn')

# functions from helper
# from src.helper import *

# import custom transformer
# from helper import DayTransformer, TimeTransformer

# sklearn imports
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# metrics
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import auc, roc_curve #, plot_roc_curve # plot_confusion_matrix

# import classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.neighbors import KNeighborsClassifier

# tensorflow
import tensorflow as tf
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator

from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.metrics import BinaryAccuracy, Accuracy, AUC, Precision, Recall
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from tensorflow.keras.layers import Dropout, Dense, Flatten
from tensorflow.keras.layers import LSTM, BatchNormalization

# kerastuner
import keras_tuner as kt
from kerastuner import HyperParameter, HyperParameters
from kerastuner.tuners import RandomSearch, BayesianOptimization, Hyperband

from sklearn.base import BaseEstimator, TransformerMixin

# Machine Learning Workshop - I by Kannan Singaravelu
# November 2021


In [5]:
# Machine Learning Workshop - I
# Kannan Singaravelu

import pandas as pd
import numpy as np
import random
import tensorflow as tf
from sklearn.base import BaseEstimator, TransformerMixin


# define seed
def set_seeds(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)


# create custom day transformer
class DayTransformer(BaseEstimator, TransformerMixin):

    def __init__(self):
        pass

    def fit(self, X, y=None):
        self.data = pd.DataFrame(
            {
        'WeekDay': ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"]
            }
        )
        self.daysnum = np.array(self.data.index+1)

        print(f'self.daysnum is: {self.daysnum}')

        return self


    def transform(self, X): # X is a dataframe
        Xt = X.copy()
        pi = np.pi
        num = Xt.index.weekday+1

        Xt['dsin'] = np.sin(2 * pi * num / np.max(self.daysnum))
        Xt['dcos'] = np.cos(2 * pi * num / np.max(self.daysnum))
        Xt = Xt.drop(['days'], axis=1)

        return Xt


# create custom time transformer
class TimeTransformer(BaseEstimator, TransformerMixin):

    def __init__(self):
        pass

    def fit(self, X, y=None):
        self.data = pd.DataFrame(
            {
        'DayParts': ["afternoon","morning","noon"]
            }
        )
        self.timenum = np.array(self.data.index+1)
        return self


    def transform(self, X):
        Xt = X.copy()
        pi = np.pi
        num = Xt.hours.apply(lambda x: 1 if x=='afternoon' else (2 if x=='morning' else 3))
        Xt['tsin'] = np.sin(2 * pi * num / np.max(self.timenum))
        Xt['tcos'] = np.cos(2 * pi * num / np.max(self.timenum))
        Xt = Xt.drop(['hours'], axis=1)

        return Xt


# create function to read locally stored file
def getdata(filename):
    df = pd.read_csv('./'+filename+'.csv')
    df.datetime = pd.to_datetime(df.datetime)
    df = (
        df.set_index('datetime', drop=True)
        .drop('symbol', axis=1)
    )

    # add days
    df['days'] = df.index.day_name()

    # add dayparts
    df['hours'] = df.index.hour
    df['hours'] = df['hours'].apply(daypart)

    return df


# create function to group trade hours
def daypart(hour):
    if hour in [9,10,11]:
        return "morning"
    elif hour in [12,13]:
        return "noon"
    elif hour in [14,15,16,17,18,19]:
        return "afternoon"


# class weight function
def cwts(dfs):
    c0, c1 = np.bincount(dfs)
    w0=(1/c0)*(len(dfs))/2
    w1=(1/c1)*(len(dfs))/2
    return {0: w0, 1: w1}

# Machine Learning Workshop - I by Kannan Singaravelu
# November 2021

In [6]:
# some pandas config
pd.set_option('display.max_columns', 30)
pd.set_option('display.max_rows',1000)

df = getdata('NFUT1H')
df.head()

Unnamed: 0_level_0,open,high,low,close,volume,days,hours
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-08-16 14:00:00,11410.0,11428.45,11377.0,11402.0,1475100.0,Thursday,afternoon
2018-08-16 15:00:00,11401.5,11420.45,11395.35,11417.0,960075.0,Thursday,afternoon
2018-08-17 09:00:00,11439.75,11475.0,11420.0,11470.2,1789950.0,Friday,morning
2018-08-17 10:00:00,11469.55,11491.65,11462.3,11488.2,1108500.0,Friday,morning
2018-08-17 11:00:00,11488.0,11493.0,11483.5,11487.95,470550.0,Friday,morning


In [7]:
# open minus close, high minus low
df['o2c'] = df['open'] - df['close']
df['h2l'] = df['high'] - df['low']

X = df.drop(['open', 'high', 'low', 'close'], axis=1)

#why on earth we're rearranging columns is anyone's guess...
cols = X.columns
cols = list(cols)

print(f'\nbefore rearranging {cols}')
cols = cols[-2:] + cols[:-2]
print(f'\nafter rearranging {cols}')



before rearranging ['volume', 'days', 'hours', 'o2c', 'h2l']

after rearranging ['o2c', 'h2l', 'volume', 'days', 'hours']


In [8]:
X = X[cols]
X.head()

Unnamed: 0_level_0,o2c,h2l,volume,days,hours
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-08-16 14:00:00,8.0,51.45,1475100.0,Thursday,afternoon
2018-08-16 15:00:00,-15.5,25.1,960075.0,Thursday,afternoon
2018-08-17 09:00:00,-30.45,55.0,1789950.0,Friday,morning
2018-08-17 10:00:00,-18.65,29.35,1108500.0,Friday,morning
2018-08-17 11:00:00,0.05,9.5,470550.0,Friday,morning


In [9]:
# label definition
ret = df['close'].pct_change().fillna(0)
y = np.where(ret.shift(-1) > 0, 1, 0)
y

array([1, 1, 1, ..., 1, 0, 0])

In [10]:
pd.Series(y).value_counts()

1    2638
0    2362
dtype: int64

In [11]:
# first do the day of the week stuff the bad way:

days = pd.get_dummies(df.days)
days

Unnamed: 0_level_0,Friday,Monday,Saturday,Thursday,Tuesday,Wednesday
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-08-16 14:00:00,0,0,0,1,0,0
2018-08-16 15:00:00,0,0,0,1,0,0
2018-08-17 09:00:00,1,0,0,0,0,0
2018-08-17 10:00:00,1,0,0,0,0,0
2018-08-17 11:00:00,1,0,0,0,0,0
...,...,...,...,...,...,...
2021-07-14 11:00:00,0,0,0,0,0,1
2021-07-14 12:00:00,0,0,0,0,0,1
2021-07-14 13:00:00,0,0,0,0,0,1
2021-07-14 14:00:00,0,0,0,0,0,1


In [12]:
hours = pd.get_dummies(df.hours)
hours

Unnamed: 0_level_0,afternoon,morning,noon
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-08-16 14:00:00,1,0,0
2018-08-16 15:00:00,1,0,0
2018-08-17 09:00:00,0,1,0
2018-08-17 10:00:00,0,1,0
2018-08-17 11:00:00,0,1,0
...,...,...,...
2021-07-14 11:00:00,0,1,0
2021-07-14 12:00:00,0,0,1
2021-07-14 13:00:00,0,0,1
2021-07-14 14:00:00,1,0,0


In [13]:
test_hour = pd.Series(['afternoon', 'noon'])
pd.get_dummies(test_hour)

Unnamed: 0,afternoon,noon
0,1,0
1,0,1


In [14]:
# now do it correctly using a OneHotEncoder
encoder = OneHotEncoder(sparse=False)

onehot = encoder.fit_transform(df[['hours']])
onehot

array([[1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       ...,
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.]])

In [15]:
# not sure what precisely this is achieving - be careful
encoder.transform(test_hour[:, np.newaxis])

array([[1., 0., 0.],
       [0., 0., 1.]])

In [16]:
X.head()

Unnamed: 0_level_0,o2c,h2l,volume,days,hours
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-08-16 14:00:00,8.0,51.45,1475100.0,Thursday,afternoon
2018-08-16 15:00:00,-15.5,25.1,960075.0,Thursday,afternoon
2018-08-17 09:00:00,-30.45,55.0,1789950.0,Friday,morning
2018-08-17 10:00:00,-18.65,29.35,1108500.0,Friday,morning
2018-08-17 11:00:00,0.05,9.5,470550.0,Friday,morning


In [17]:
# transform date attributes
dtrans = DayTransformer()

print(f'\nX before fit / transform: \n{X.head()}')
dtrans.fit(X)
dtrans.transform(X)
print(f'\nX after fit / transform: \n{X.head()}')



X before fit / transform: 
                       o2c    h2l     volume      days      hours
datetime                                                         
2018-08-16 14:00:00   8.00  51.45  1475100.0  Thursday  afternoon
2018-08-16 15:00:00 -15.50  25.10   960075.0  Thursday  afternoon
2018-08-17 09:00:00 -30.45  55.00  1789950.0    Friday    morning
2018-08-17 10:00:00 -18.65  29.35  1108500.0    Friday    morning
2018-08-17 11:00:00   0.05   9.50   470550.0    Friday    morning
self.daysnum is: [1 2 3 4 5 6 7]

X after fit / transform: 
                       o2c    h2l     volume      days      hours
datetime                                                         
2018-08-16 14:00:00   8.00  51.45  1475100.0  Thursday  afternoon
2018-08-16 15:00:00 -15.50  25.10   960075.0  Thursday  afternoon
2018-08-17 09:00:00 -30.45  55.00  1789950.0    Friday    morning
2018-08-17 10:00:00 -18.65  29.35  1108500.0    Friday    morning
2018-08-17 11:00:00   0.05   9.50   470550.0    Friday

In [18]:
ttrans = TimeTransformer()

print(f'\nX before fit / transform: \n{X.head()}')
ttrans.fit(X)
ttrans.transform(X)
print(f'\nX after fit / transform: \n{X.head()}')



X before fit / transform: 
                       o2c    h2l     volume      days      hours
datetime                                                         
2018-08-16 14:00:00   8.00  51.45  1475100.0  Thursday  afternoon
2018-08-16 15:00:00 -15.50  25.10   960075.0  Thursday  afternoon
2018-08-17 09:00:00 -30.45  55.00  1789950.0    Friday    morning
2018-08-17 10:00:00 -18.65  29.35  1108500.0    Friday    morning
2018-08-17 11:00:00   0.05   9.50   470550.0    Friday    morning

X after fit / transform: 
                       o2c    h2l     volume      days      hours
datetime                                                         
2018-08-16 14:00:00   8.00  51.45  1475100.0  Thursday  afternoon
2018-08-16 15:00:00 -15.50  25.10   960075.0  Thursday  afternoon
2018-08-17 09:00:00 -30.45  55.00  1789950.0    Friday    morning
2018-08-17 10:00:00 -18.65  29.35  1108500.0    Friday    morning
2018-08-17 11:00:00   0.05   9.50   470550.0    Friday    morning


In [19]:
# it's not immediately obvious to me what this is actually doing.  Review later...

# anyway, start off by splitting the data:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle=False)
print(f"Train and Test Size {X_train.shape}, {X_test.shape}")

Train and Test Size (4000, 5), (1000, 5)


In [20]:
print(f'\nX_train before fit_transform: \n{X_train.head()}')


X_train before fit_transform: 
                       o2c    h2l     volume      days      hours
datetime                                                         
2018-08-16 14:00:00   8.00  51.45  1475100.0  Thursday  afternoon
2018-08-16 15:00:00 -15.50  25.10   960075.0  Thursday  afternoon
2018-08-17 09:00:00 -30.45  55.00  1789950.0    Friday    morning
2018-08-17 10:00:00 -18.65  29.35  1108500.0    Friday    morning
2018-08-17 11:00:00   0.05   9.50   470550.0    Friday    morning


In [21]:
# feature scaling:



# combine all transformers
ct = ColumnTransformer(
    transformers=[
        ('scaler', StandardScaler(), [0,1]),
        ('Normal', MinMaxScaler(), [2]),
        ('dtrans', DayTransformer(), [3]),
        ('ttrans', TimeTransformer(), [4])
    ])



pd.DataFrame(ct.fit_transform(X_train))

self.daysnum is: [1 2 3 4 5 6 7]


Unnamed: 0,0,1,2,3,4,5,6
0,0.198737,-0.072891,0.091840,-0.433884,-0.900969,8.660254e-01,-0.5
1,-0.351102,-0.648020,0.051874,-0.433884,-0.900969,8.660254e-01,-0.5
2,-0.700894,0.004593,0.116272,-0.974928,-0.222521,-8.660254e-01,-0.5
3,-0.424804,-0.555257,0.063392,-0.974928,-0.222521,-8.660254e-01,-0.5
4,0.012728,-0.988513,0.013887,-0.974928,-0.222521,-8.660254e-01,-0.5
...,...,...,...,...,...,...,...
3995,0.577776,-0.102357,0.079234,0.974928,-0.222521,-8.660254e-01,-0.5
3996,-0.864676,-0.130732,0.055552,0.974928,-0.222521,-8.660254e-01,-0.5
3997,-0.792144,-0.235499,0.066139,0.974928,-0.222521,-2.449294e-16,1.0
3998,-0.430654,-0.269330,0.070509,0.974928,-0.222521,-2.449294e-16,1.0


In [22]:
print(f'\nX_train after fit_transform: \n{X_train.head()}')


X_train after fit_transform: 
                       o2c    h2l     volume      days      hours
datetime                                                         
2018-08-16 14:00:00   8.00  51.45  1475100.0  Thursday  afternoon
2018-08-16 15:00:00 -15.50  25.10   960075.0  Thursday  afternoon
2018-08-17 09:00:00 -30.45  55.00  1789950.0    Friday    morning
2018-08-17 10:00:00 -18.65  29.35  1108500.0    Friday    morning
2018-08-17 11:00:00   0.05   9.50   470550.0    Friday    morning


In [23]:
pipe = Pipeline(
    [
        ('transformer', ct),
        ('classifier', RandomForestClassifier())
    ]
)

pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)


self.daysnum is: [1 2 3 4 5 6 7]


In [24]:
# print test data scores
print("Accuracy Score \t\t", accuracy_score(y_test, y_pred).round(4))
print("F1 Score \t\t", f1_score(y_test, y_pred, average='weighted').round(4))
print("Precision Score \t", precision_score(y_test, y_pred, average='weighted').round(4))
print("Recall Score \t\t", recall_score(y_test, y_pred, average='weighted').round(4))

Accuracy Score 		 0.478
F1 Score 		 0.4794
Precision Score 	 0.4836
Recall Score 		 0.478


In [25]:
tscv = TimeSeriesSplit(n_splits=5)

# create numerous (presmably weak) learners:
# specify estimators
dtc = Pipeline([('transformer', ct),('dtc', DecisionTreeClassifier())])
rfc = Pipeline([('transformer', ct), ('rfc', RandomForestClassifier())])
knn = Pipeline([('transformer', ct), ('knn', KNeighborsClassifier())])
gbc = Pipeline([('transformer', ct), ('gbc', GradientBoostingClassifier())])

# get cv score for each
clf = [dtc,rfc,knn,gbc]
for estimator in clf:
    score = cross_val_score(estimator, X_train, y_train, scoring = 'accuracy', cv=tscv, n_jobs=-1)
    print(f"The accuracy score of {estimator} is: {score.mean()}")

# then do stacking with them:
# perform stacking with cv
stack_model = StackingClassifier(estimators = clf, final_estimator = LogisticRegression())

# And see what the resulting score is
score = cross_val_score(stack_model, X_train, y_train, cv = tscv, scoring = 'accuracy')
print(f"The accuracy score of is: {score.mean()}")





The accuracy score of Pipeline(steps=[('transformer',
                 ColumnTransformer(transformers=[('scaler', StandardScaler(),
                                                  [0, 1]),
                                                 ('Normal', MinMaxScaler(),
                                                  [2]),
                                                 ('dtrans', DayTransformer(),
                                                  [3]),
                                                 ('ttrans', TimeTransformer(),
                                                  [4])])),
                ('dtc', DecisionTreeClassifier())]) is: 0.5
The accuracy score of Pipeline(steps=[('transformer',
                 ColumnTransformer(transformers=[('scaler', StandardScaler(),
                                                  [0, 1]),
                                                 ('Normal', MinMaxScaler(),
                                                  [2]),
                      

ValueError: ignored