In [165]:
import pandas as pd
import numpy as np
from pathlib import Path
import plotly.graph_objects as go

import keras
import sklearn

from keras.models import Sequential
from keras.layers import Dense, Dropout, CuDNNLSTM, Conv1D
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import hvplot.pandas 


pd.options.plotting.backend = 'holoviews'

In [166]:
print('Numpy version: ' + np.__version__)
print('Pandas version: ' + pd.__version__)
print('Sklearn version: ' + sklearn.__version__)
print('Keras version: ' + keras.__version__)

Numpy version: 1.21.5
Pandas version: 1.3.5
Sklearn version: 1.0.2
Keras version: 2.10.0


In [167]:
# Import the OHLC dataset into a Pandas Dataframe, drop the Volume column
df = pd.read_csv(
    Path("./Resources/SPX.csv"), 
    infer_datetime_format=True,
    index_col=0,
    parse_dates=True,
    header=(1)
)

# Review the DataFrame
display(df.head())
display(df.tail())

Unnamed: 0_level_0,open,high,low,close
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-10-03 09:30:00-04:00,2931.69,2939.86,2931.69,2937.36
2018-10-03 10:30:00-04:00,2937.36,2937.55,2931.77,2935.51
2018-10-03 11:30:00-04:00,2935.51,2937.69,2931.83,2937.1
2018-10-03 12:30:00-04:00,2937.1,2937.34,2934.03,2936.06
2018-10-03 13:30:00-04:00,2936.07,2937.37,2934.4,2935.11


Unnamed: 0_level_0,open,high,low,close
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-10-04 11:30:00-04:00,3786.58,3789.49,3778.01,3786.05
2022-10-04 12:30:00-04:00,3786.09,3788.46,3754.41,3766.3
2022-10-04 13:30:00-04:00,3766.62,3779.76,3764.71,3769.33
2022-10-04 14:30:00-04:00,3769.31,3786.8,3768.4,3779.01
2022-10-04 15:30:00-04:00,3778.92,3791.92,3774.26,3790.92


In [168]:
## check for na's 
df.isnull().sum().sum()
# No Na's

0

In [169]:
# Create plot_candlesticks function, inouts dataframe and the title of desired plot
def plot_candlesticks(dataframe, title):

    candlestick = go.Candlestick(
                            x=dataframe.index,
                            open=dataframe['open'],
                            high=dataframe['high'],
                            low=dataframe['low'],
                            close=dataframe['close']
                            )
    
    fig = go.Figure(data=[candlestick])

    fig.update_layout(
        width=1000, height=500,
        title = title,
        yaxis_title='Price',
    )
        
    fig.update_xaxes(
        rangeslider_visible=False,
        rangebreaks=[
            dict(bounds=["sat", "mon"]),  # hide weekends, eg. hide sat to before mon
            dict(bounds=[16, 9.5], pattern="hour"),  # hide hours outside of 9.30am-4pm
            dict(values=["2019-12-25", "2020-12-24"])  # hide holidays (Christmas and New Year's, etc)
            ])
    
    fig.show()

In [170]:
# Plot original DataFrame
title = 'S&P 500: October 03, 2018 - October 4, 2022'
plot_candlesticks(df, title)

# Part 1: Identify the Candlestick Patterns


Pattern Names           |   Code
   1.  Morning star          : MRNSTR 
   2.  Evening star          : EVNSTR
   3.  Bullish harami        : BLLHRM
   4.  Bearish harami        : BERHRM
   5.  Green hammer            : RDHM
   6.  Red hammer          : GRNHM 
   7.  Bull kicker           : BLLKCK
   8.  Bear kicker           : BERKCK
   9.  Green shooting star   : GRNSSTR
   10. Red shooting star     : RDSSTR 

#### Our first major hurdle with this project was accurately identifying Morning Star and Evening Star patterns, our initial plan was to use TA-Lib and the built in candlestick recognition functions. But after plotting the identified patterns we felt unsatisfied with the results and saught alternative methods.

#### We found a project containing functions to identify the patterns we needed, but we felt the morning star and evening star functions needed some tuning. After tuning those two functions we found all other functions in the referenced project to be accurate and effective for our purposes.
* https://github.com/aliisoli/candlesticks_study/blob/master/Candlesticks_Historical_Analysis.ipynb

# Part 2: Creating DataFrames for each pattern recognition function and plotting to view performance
#### All of the pattern recognition functions identified the final candlestick (row) in the pattern within the DataFrame, so each identified row and the necessary preceding rows were added to a new dataframe in order to plot the entire pattern, but only the candlesticks from our data that were in the specified pattern.

In [171]:
# Find the trends in the data, to be used when identifying patterns
## copy right this repo https://github.com/aliisoli/candlesticks_study/blob/master/Candlesticks_Historical_Analysis.ipynb
def find_trend(data, period:int):
    '''
    Inputs:
    takes in a dataframe and an interger
    Outputs:
    returns True if the trend of the simple moving average over given period is positive, else returns False
    '''
    data['SMA'] = data['close'].rolling(period).mean()
    return (data['SMA']-data['SMA'].shift(1)) > 0
# find the trends 
df['trend'] = find_trend(df, 3)

## Morning Star Method 1
* Using function from: https://github.com/aliisoli/candlesticks_study/blob/master/Candlesticks_Historical_Analysis.ipynb 



In [172]:
# Morning Star function from github repo, with slight alterations to fit scope of this project
def find_morning_star(data):
    '''
    Takes in a dataframe containing closing prices of the stock and returns True where morning star appears'''
    # First candle RED
    MS_cond_1 = data['open'].shift(2) > data['close'].shift(2) 
    # Third candle Green
    MS_cond_2 = data['close'] > data['open']
    # Third candle closes higher than the middle one
    MS_cond_3 = (data['close'] > data['close'].shift(1)) 
    
    MS_cond_4 = data['close'] > (data['open']+data['close'])/2
    MS_cond_5 = data['close'].shift(1) < data['open']
    MS_cond_6 = data['open'].shift(1) < data['open']
    MS_cond_7 = (data['close'].shift(1) < data['close'].shift(2)) & (data['open'].shift(1) < data['close'].shift(2))
    MS_cond_8 = ~ data['trend']

    return MS_cond_1 & MS_cond_2 & MS_cond_3 & MS_cond_4 & MS_cond_5 & MS_cond_6 & MS_cond_7 & MS_cond_8

In [173]:
# Run functions and add results as new columns in df
df['morning_star_1'] = find_morning_star(df)

In [174]:
# View number of morning star patterns identified
df['morning_star_1'].value_counts()

False    7023
True       14
Name: morning_star_1, dtype: int64

In [175]:
# Adding every identified row and the two preceeding rows to a new dataframe
morning_star_1 = df[pd.concat([df.morning_star_1.shift(-i)==True for i in range(3)], axis=1).any(axis=1)]

In [176]:
# Plot morning_star_1
title = 'Morning Star Method 1'
plot_candlesticks(morning_star_1, title)

In [177]:
# Dropping the identified patterns from the method we improved
df = df.drop(columns=['morning_star_1'])

## Evening Star Method 1
* Using function from: https://github.com/aliisoli/candlesticks_study/blob/master/Candlesticks_Historical_Analysis.ipynb 


In [178]:
# Evening Star function from github repo, with slight alterations to fit scope of this project
def find_evening_star(data):
    '''
    Takes in a dataframe containing closing prices of the stock and returns True where evening star appears'''
    # First candle GREEN
    ES_cond_1 = data['close'].shift(2) > data['open'].shift(2)
    ES_cond_2 = data['open'] > data['close'] #next candle RED
    ES_cond_3 = data['close'] < (data['close'].shift(2) + data['open'].shift(2))/2
    ES_cond_4 = data['open'].shift(1) > data['close'].shift(2)
    ES_cond_5 = data['close'].shift(1) > data['close'].shift(2)
    ES_cond_6 = data['open'].shift(1) > data['open']
    ES_cond_7 = data['close'].shift(1) > data['open'] 
    ES_cond_8 = data['trend']
    return ES_cond_1 & ES_cond_2 & ES_cond_3 & ES_cond_4 & ES_cond_5 & ES_cond_6 & ES_cond_7 & ES_cond_8

In [179]:
# Run functions and add results as new columns in df
df['evening_star_1'] = find_evening_star(df)


In [180]:
# View number of evening star patterns identified
df['evening_star_1'].value_counts()

False    7035
True        2
Name: evening_star_1, dtype: int64

In [181]:
# Adding every identified row and the two preceeding rows to a new dataframe
evening_star_1 = df[pd.concat([df.evening_star_1.shift(-i)==True for i in range(3)], axis=1).any(axis=1)]

In [182]:
# Plot evening_star_1
title = 'Evening Star Method 1'
plot_candlesticks(evening_star_1, title)

In [183]:
# Dropping the identified patterns from the method we improved
df = df.drop(columns=['evening_star_1'])

## Morning Star Method 2 : MRNSTR
* Inspired by function from: https://github.com/aliisoli/candlesticks_study/blob/master/Candlesticks_Historical_Analysis.ipynb 
* Code from above GitHub repo was altered to fulfill our standards


In [184]:
# function for finding morning star
def find_morning_star_2(data):
    '''
    Takes in a dataframe containing closing prices of the stock and returns True where morning star appears'''
    # First candle RED
    MS_cond_1 = data['close'].shift(2) < data['open'].shift(2)
    
    #Third candle GREEN
    MS_cond_2 = data['open'] < data['close']
    
    # Second candle INDECISION
    # Middle candle body < one third the size of FIRST candle body
    MS_cond_3 = (data['open'].shift(2) - data['close'].shift(2)) /3 > abs(data['close'].shift(1) - data['open'].shift(1)) 
    
    # Second candle INDECISION
    # Middle candle body < one third the size of LAST candle body
    MS_cond_4 = (data['close'] - data['open']) /3 > abs(data['close'].shift(1) - data['open'].shift(1)) 
    
    # Second candle INDECISION
    # Middle candle close < first candle open
    MS_cond_5 = data['close'].shift(1) < data['open'].shift(2)
    
    # Third candle ENGLULFING
    MS_cond_6 = data['close'] > data['open'].shift(2)

    return MS_cond_1 & MS_cond_2 & MS_cond_3 & MS_cond_4 & MS_cond_5 & MS_cond_6

In [185]:
# Run find_morning_star_2 function and add results as new columns in df
df['morning_star_2'] = find_morning_star_2(df)

In [186]:
# View number of morning star patterns identified
df['morning_star_2'].value_counts()

False    6946
True       91
Name: morning_star_2, dtype: int64

In [187]:
# find the percentage of the dataset that is morning star 
100 * df['morning_star_2'].sum()/df.shape[0]

1.2931647008668468

In [188]:
# Adding every identified row and the two preceeding rows to a new dataframe
morning_star_2 = df[pd.concat([df.morning_star_2.shift(-i)==True for i in range(3)], axis=1).any(axis=1)]

In [189]:
# Plot morning_star_2
title = 'Morning Star Method 2'
plot_candlesticks(morning_star_2, title)

## Evening Star Method 2 : EVNSTR
* Inspired by function from: https://github.com/aliisoli/candlesticks_study/blob/master/Candlesticks_Historical_Analysis.ipynb 
* Code from above GitHub repo was altered to fulfill our standards


In [190]:
# function for finding evening star
def find_evening_star_2(data):
    '''
    Takes in a dataframe containing closing prices of the stock and returns True where evening star appears'''
    # First candle GREEN
    ES_cond_1 = data['close'].shift(2) > data['open'].shift(2)
    
    #Third candle RED
    ES_cond_2 = data['open'] > data['close']
    
    # Second candle INDECISION
    # Middle candle body < one third the size of FIRST candle body
    ES_cond_3 = (data['close'].shift(2) - data['open'].shift(2)) /3 > abs(data['close'].shift(1) - data['open'].shift(1)) 
    
    # Second candle INDECISION
    # Middle candle body < one third the size of LAST candle body
    ES_cond_4 = (data['open'] - data['close']) /3 > abs(data['close'].shift(1) - data['open'].shift(1)) 
    
    # Second candle INDECISION
    # Middle candle close > first candle open
    ES_cond_5 = data['close'].shift(1) > data['open'].shift(2)
    
    # Third candle ENGLULFING
    ES_cond_6 = data['close'] < data['open'].shift(2)

    return ES_cond_1 & ES_cond_2 & ES_cond_3 & ES_cond_4 & ES_cond_5 & ES_cond_6

In [191]:
# Run find_evening_star_2 function and add results as new columns in df
df['evening_star_2'] = find_evening_star_2(df)

In [192]:
# View number of evening star patterns identified
df['evening_star_2'].value_counts()

False    6918
True      119
Name: evening_star_2, dtype: int64

In [193]:
# find the percentage of the dataset that is evening star 
100 * df['evening_star_2'].sum()/df.shape[0]

1.6910615319027995

In [194]:
# Adding every identified row and the two preceeding rows to a new dataframe
evening_star_2 = df[pd.concat([df.evening_star_2.shift(-i)==True for i in range(3)], axis=1).any(axis=1)]

In [195]:
# Plot evening_star_2
title = 'Evening Star Method 2'
plot_candlesticks(evening_star_2, title)

##  Bullish harami        : BLLHRM


In [196]:
# function for finding bullish harami 
def find_bullish_harami(data):
    '''
    Takes in a dataframe containing closing prices of the stock and returns True where bullish harami appears
    '''
    # Opened higher than previous close
    condition_1_BH = data['open'] > data['close'].shift(1) 
    # closed lower than prev open
    condition_2_BH = data['close'] < data['open'].shift(1) 
    # previous candle is red
    condition_3_BH = data['open'].shift(1) > data['close'].shift(1) 
    # the candle is green
    condition_4_BH = data['close'] > data['open'] 
    # must appear in a downtrend
    condition_5_BH = ~ data['trend']
    return condition_1_BH & condition_2_BH & condition_3_BH & condition_4_BH & condition_5_BH

In [197]:
# Run function and add results as new columns in df
df['bullish_harami'] = find_bullish_harami(df)

In [198]:
# View number of patterns identified
df['bullish_harami'].value_counts()

False    6809
True      228
Name: bullish_harami, dtype: int64

In [199]:
# find the percentage of the dataset that is bullish harami 
100 * df['bullish_harami'].sum()/df.shape[0]

3.24001705272133

In [200]:
# Adding every identified row and the one preceeding rows to a new dataframe
bullish_harami = df[pd.concat([df.bullish_harami.shift(-i)==True for i in range(2)], axis=1).any(axis=1)]


In [201]:
# Plot bullish_harami
title = 'Bullish Harami'
plot_candlesticks(bullish_harami, title)

##  Bearish harami        : BERHRM

In [202]:
# function for finding bearish harami 
def find_bearish_harami(data):
    '''
    
    Takes in a dataframe containing closing prices of the stock and returns True where bearish harami appears
    '''
    Bear_Har_cond_1 = data['close'].shift(1) > data['open']
    Bear_Har_cond_2 = data['close'] > data['open'].shift(1)
    Bear_Har_cond_3 = data['close'].shift(1) > data['open'].shift(1)
    Bear_Har_cond_4 = data['open'] > data['close'] 
    Bear_Har_cond_5 = data['trend']
    
    return Bear_Har_cond_1 & Bear_Har_cond_2 & Bear_Har_cond_3 & Bear_Har_cond_4 & Bear_Har_cond_5


In [203]:
# Run function and add results as new columns in df
df['bearish_harami'] = find_bearish_harami(df)

In [204]:
# View number of patterns identified
df['bearish_harami'].value_counts()

False    6823
True      214
Name: bearish_harami, dtype: int64

In [205]:
# find the percentage of the dataset that is bearish harami 
100 * df['bearish_harami'].sum()/df.shape[0]

3.0410686372033537

In [206]:
# Adding every identified row and the one preceeding rows to a new dataframe
bearish_harami = df[pd.concat([df.bearish_harami.shift(-i)==True for i in range(2)], axis=1).any(axis=1)]


In [207]:
# Plot bearish_harami
title = 'Bearish Harami'
plot_candlesticks(bearish_harami, title)

##  Green hammer            : RDHM

In [208]:
# function for finding green hammer 
def find_green_hammer(data):
    '''
    Takes in a dataframe containing closing prices of the stock and returns True where green hammer appears
    '''
    # lower shadow at least twice as long as body
    Gr_Ham_cond_1 = (data['open'] - data['low']) > 2*(data['close']-data['open']) 
    # Upper shadow shorter than a tenth of the body
    Gr_Ham_cond_2 = (data['close']-data['open']) > 10*(data['high'] - data['close'])
    # candle should be green
    Gr_Ham_cond_3 = data['close']>data['open']
    # downtrend
    Gr_Ham_cond_4 = ~ data['trend']
    return Gr_Ham_cond_1 & Gr_Ham_cond_2 & Gr_Ham_cond_3 & Gr_Ham_cond_4

In [209]:
# Run function and add results as new columns in df
df['green_hammer'] = find_green_hammer(df)

In [210]:
# View number of patterns identified
df['green_hammer'].value_counts()

False    7026
True       11
Name: green_hammer, dtype: int64

In [211]:
# find the percentage of the dataset that is green_hammer
100 * df['green_hammer'].sum()/df.shape[0]

0.15631661219269574

In [212]:
# Adding every identified row and the one preceeding rows to a new dataframe
green_hammer = df[pd.concat([df.green_hammer.shift(-i)==True for i in range(1)], axis=1).any(axis=1)]

In [213]:
# Plot green_hammer
title = 'Green Hammer'
plot_candlesticks(green_hammer, title)

##  Red hammer          : GRNHM 

In [214]:
# function for finding red hammer 
def find_red_hammer(data):
    '''
    Takes in a dataframe containing closing prices of the stock and returns True where red hammer appears'''
    # The wick should be at least twice as long as the body
    Rd_Ham_cond_1 = (data['close'] - data['low']) > 2*(data['open']-data['close'])
    # The lower shadow must be very small, at least 10 times smaller than the body
    Rd_Ham_cond_2 = (data['open']-data['close']) > 10*(data['high'] - data['open']) 
    # candle should be bearish
    Rd_Ham_cond_3 = data['open'] > data['close']
    
    Rd_Ham_cond_4 = ~ data['trend']
    return Rd_Ham_cond_1 & Rd_Ham_cond_2 & Rd_Ham_cond_3 & Rd_Ham_cond_4

In [215]:
# Run function and add results as new columns in df
df['red_hammer'] = find_red_hammer(df)

In [216]:
# View number of patterns identified
df['red_hammer'].value_counts()

False    6999
True       38
Name: red_hammer, dtype: int64

In [217]:
# find the percentage of the dataset that is red_hammer
100 * df['red_hammer'].sum()/df.shape[0]

0.5400028421202216

In [218]:
# Adding every identified row and the one preceeding rows to a new dataframe
red_hammer = df[pd.concat([df.red_hammer.shift(-i)==True for i in range(1)], axis=1).any(axis=1)]

In [219]:
# Plot red_hammer
title = 'Red Hammer'
plot_candlesticks(red_hammer, title)

##  Bull kicker           : BLLKCK

In [220]:
# function for finding bull kicker 
def find_bull_kicker(data):
    '''
    Takes in a dataframe containing closing prices of the stock and returns True where bull kicker appears'''
    # the two candles open at the same level (within a small tolerance)
    BK_cond_1 = abs(data['open'].shift(1) - data['open']) < 0.002 * data['open']
    BK_cond_2 = data['open'].shift(1) > data['close'].shift(1) # last candle red
    BK_cond_3 = data['close'] > data['open'] #candle GREEN
    BK_cond_4 = data['low'] > data['open'].shift(1)
    return BK_cond_1 & BK_cond_2 & BK_cond_3 & BK_cond_4

In [221]:
# Run function and add results as new columns in df
df['bull_kicker'] = find_bull_kicker(df)

In [222]:
# View number of patterns identified
df['bull_kicker'].value_counts()

False    7007
True       30
Name: bull_kicker, dtype: int64

In [223]:
# find the percentage of the dataset that is bull_kicker
100 * df['bull_kicker'].sum()/df.shape[0]

0.4263180332528066

In [224]:
# Adding every identified row and the one preceeding rows to a new dataframe
bull_kicker = df[pd.concat([df.bull_kicker.shift(-i)==True for i in range(2)], axis=1).any(axis=1)]


In [225]:
# Plot bull_kicker
title = 'Bull Kicker'
plot_candlesticks(bull_kicker, title)

##  Bear kicker           : BERKCK

In [226]:
# function for finding bear kicker
def find_bear_kicker(data):
    '''
    Takes in a dataframe containing closing prices of the stock and 
    returns True where bear kicker appears
    '''
    # the two candles open at the same level (within a small tolerance)
    BRK_cond_1 = abs(data['open'].shift(1) - data['open']) < 0.002 * data['open']
    # last candle GREEN
    BRK_cond_2 = data['close'].shift(1) > data['open'].shift(1)
    # this candle RED
    BRK_cond_3 = data['open'] > data['close'] 
    BRK_cond_4 = data['open'].shift(1) > data['high']
    return BRK_cond_1 & BRK_cond_2 & BRK_cond_3 & BRK_cond_4


In [227]:
# Run function and add results as new columns in df
df['bear_kicker'] = find_bear_kicker(df)

In [228]:
# View number of patterns identified
df['bear_kicker'].value_counts()

False    7016
True       21
Name: bear_kicker, dtype: int64

In [229]:
# find the percentage of the dataset that is bear_kicker
df['bear_kicker'].sum()/df.shape[0]*100

0.2984226232769646

In [230]:
# Adding every identified row and the one preceeding rows to a new dataframe
bear_kicker = df[pd.concat([df.bear_kicker.shift(-i)==True for i in range(2)], axis=1).any(axis=1)]


In [231]:
# Plot bear_kicker
title = 'Bear Kicker'
plot_candlesticks(bear_kicker, title)

##  Green shooting star   : GRNSSTR

In [232]:
# function for finding green shooting star
def find_green_shooting_star(data):
    '''
    Takes in a dataframe containing closing prices of the stock and returns True where shooting star appears'''
    SHS_1 =(data['high'] - data['close']) > 2 * (data['close'] - data['open'])
    SHS_2 =(data['close'] - data['open']) > 10*(data['open'] - data['low'])
    SHS_3 = data['close'] > data['open']
    SHS_4 = data['trend'] 
    return SHS_1 & SHS_2 & SHS_3 & SHS_4


In [233]:
# Run function and add results as new columns in df
df['green_shooting_star'] = find_green_shooting_star(df)

In [234]:
# View number of patterns identified
df['green_shooting_star'].value_counts()

False    7014
True       23
Name: green_shooting_star, dtype: int64

In [235]:
# find the percentage of the dataset that is green_shooting_star
df['green_shooting_star'].sum()/df.shape[0]*100

0.3268438254938184

In [236]:
# Adding every identified row and the one preceeding rows to a new dataframe
green_shooting_star = df[pd.concat([df.green_shooting_star.shift(-i)==True for i in range(1)], axis=1).any(axis=1)]


In [237]:
# Plot green_shooting_star
title = 'Green Shooting Star'
plot_candlesticks(green_shooting_star, title)

##  Red shooting star     : RDSSTR 

In [238]:
# red shooting star
def find_red_shooting_star(data):
    '''
    Takes in a dataframe containing closing prices of the stock and returns True where shooting star appears'''
    RSH_1 =(data['high'] - data['open']) > 2 * (data['open'] - data['close'])
    RSH_2 =(data['open'] - data['close']) > 10*(data['close'] - data['low'])
    RSH_3 = data['open'] > data['close'] #is Red
    RSH_4 = data['trend']
    return RSH_1 & RSH_2 & RSH_3 & RSH_4


In [239]:
df['red_shooting_star'] = find_red_shooting_star(df)

In [240]:
df['red_shooting_star'].value_counts()

False    7020
True       17
Name: red_shooting_star, dtype: int64

In [241]:
df['red_shooting_star'].sum()/df.shape[0]*100

0.24158021884325706

In [242]:
# Adding every identified row and the one preceeding rows to a new dataframe
red_shooting_star = df[pd.concat([df.red_shooting_star.shift(-i)==True for i in range(1)], axis=1).any(axis=1)]


In [243]:
# Plot Red_Shooting_Star
title = 'Red Shooting Star'
plot_candlesticks(red_shooting_star, title)

# Data Analysis

In [244]:
# df columns 
df.columns

Index(['open', 'high', 'low', 'close', 'SMA', 'trend', 'morning_star_2',
       'evening_star_2', 'bullish_harami', 'bearish_harami', 'green_hammer',
       'red_hammer', 'bull_kicker', 'bear_kicker', 'green_shooting_star',
       'red_shooting_star'],
      dtype='object')

In [245]:
# Display the data types for our dataframe
df.dtypes

open                   float64
high                   float64
low                    float64
close                  float64
SMA                    float64
trend                     bool
morning_star_2            bool
evening_star_2            bool
bullish_harami            bool
bearish_harami            bool
green_hammer              bool
red_hammer                bool
bull_kicker               bool
bear_kicker               bool
green_shooting_star       bool
red_shooting_star         bool
dtype: object

In [246]:
# Preview first 5 rows and last 5 rows of our dataframe
df.head()

Unnamed: 0_level_0,open,high,low,close,SMA,trend,morning_star_2,evening_star_2,bullish_harami,bearish_harami,green_hammer,red_hammer,bull_kicker,bear_kicker,green_shooting_star,red_shooting_star
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2018-10-03 09:30:00-04:00,2931.69,2939.86,2931.69,2937.36,,False,False,False,False,False,False,False,False,False,False,False
2018-10-03 10:30:00-04:00,2937.36,2937.55,2931.77,2935.51,,False,False,False,False,False,False,False,False,False,False,False
2018-10-03 11:30:00-04:00,2935.51,2937.69,2931.83,2937.1,2936.656667,False,False,False,False,False,False,False,False,False,False,False
2018-10-03 12:30:00-04:00,2937.1,2937.34,2934.03,2936.06,2936.223333,False,False,False,False,False,False,False,False,False,False,False
2018-10-03 13:30:00-04:00,2936.07,2937.37,2934.4,2935.11,2936.09,False,False,False,False,False,False,False,False,False,False,False


In [247]:
# Drop the two NA values created by the 3 period SMA
df = df.dropna()

In [248]:
# Rename the morning star and evening star columns, we used our modified version of the function (method 2)
df = df.rename(columns={'morning_star_2':'morning_star', 'evening_star_2':'evening_star'})

In [249]:
# Create one column which identifies the row a pattern is completed, for all patterns

# Bullish Harami
df['candlesticks'] = np.where((df['bullish_harami'] == True), 'BLLHRM', False)

# Bearish Harami
df['candlesticks'] = np.where((df['bearish_harami'] == True), \
                               'BERHRM', df['candlesticks'])
## Green Hammer 
df['candlesticks'] = np.where((df['green_hammer'] == True), \
                               'GRNHM', df['candlesticks'])
## Red Hammer 
df['candlesticks'] = np.where((df['red_hammer'] == True), \
                               'RDHM', df['candlesticks'])
## Morning Star
df['candlesticks'] = np.where((df['morning_star'] == True), \
                               'MRNSTR', df['candlesticks'])
## Evening Star
df['candlesticks'] = np.where((df['evening_star'] == True), \
                               'EVNSTR', df['candlesticks'])
##  Bull Kicker
df['candlesticks'] = np.where((df['bull_kicker'] == True), \
                               'BLLKCK', df['candlesticks'])
##  Bear Kicker
df['candlesticks'] = np.where((df['bear_kicker'] == True), \
                               'BERKCK', df['candlesticks'])
##  Green Shooting Star
df['candlesticks'] = np.where((df['green_shooting_star'] == True), \
                               'GRNSSTR', df['candlesticks'])
##  Red Shooting Star
df['candlesticks'] = np.where((df['red_shooting_star'] == True), \
                               'RDSSTR', df['candlesticks'])

In [250]:
## analyze the new column
df['candlesticks'].value_counts()

False      6253
BLLHRM      226
BERHRM      212
EVNSTR      116
MRNSTR       90
RDHM         37
BLLKCK       29
GRNSSTR      23
BERKCK       21
RDSSTR       17
GRNHM        11
Name: candlesticks, dtype: int64

In [251]:
# Plot the number of times each pattern appears in our price history data
df['candlesticks'][df['candlesticks'] != 'False'].value_counts().plot(kind='bar')

In [252]:
df.head()

Unnamed: 0_level_0,open,high,low,close,SMA,trend,morning_star,evening_star,bullish_harami,bearish_harami,green_hammer,red_hammer,bull_kicker,bear_kicker,green_shooting_star,red_shooting_star,candlesticks
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2018-10-03 11:30:00-04:00,2935.51,2937.69,2931.83,2937.1,2936.656667,False,False,False,False,False,False,False,False,False,False,False,False
2018-10-03 12:30:00-04:00,2937.1,2937.34,2934.03,2936.06,2936.223333,False,False,False,False,False,False,False,False,False,False,False,False
2018-10-03 13:30:00-04:00,2936.07,2937.37,2934.4,2935.11,2936.09,False,False,False,False,False,False,False,False,False,False,False,False
2018-10-03 14:30:00-04:00,2935.11,2935.16,2921.36,2923.74,2931.636667,False,False,False,False,False,False,False,False,False,False,False,False
2018-10-03 15:30:00-04:00,2923.76,2927.33,2923.54,2925.51,2928.12,False,False,False,True,False,False,False,False,False,False,False,BLLHRM


In [253]:
# see column names 
df.columns

Index(['open', 'high', 'low', 'close', 'SMA', 'trend', 'morning_star',
       'evening_star', 'bullish_harami', 'bearish_harami', 'green_hammer',
       'red_hammer', 'bull_kicker', 'bear_kicker', 'green_shooting_star',
       'red_shooting_star', 'candlesticks'],
      dtype='object')

In [254]:
## see a summary of the df
df.describe()

Unnamed: 0,open,high,low,close,SMA
count,7035.0,7035.0,7035.0,7035.0,7035.0
mean,3555.508196,3563.324218,3547.14743,3555.533863,3555.412921
std,668.121884,668.684404,667.607706,668.138261,668.080532
min,2208.87,2245.88,2191.86,2208.92,2230.323333
25%,2933.645,2938.53,2929.58,2933.845,2933.68
50%,3420.72,3425.55,3413.13,3421.07,3417.863333
75%,4183.775,4191.17,4175.545,4182.995,4184.67
max,4813.26,4818.62,4805.64,4813.24,4802.396667


In [255]:
# Save csv file of price history with all identified patterns
df.to_csv('data/all_candles.csv')

# Machine Learning

The Machine Learning Process will include the following steps
 -  Drop Columns
 -  Drop Na's and Null's 
 -  Convert the target (categorical data) column to numeric
 -  Scale the feature columns
 -  Split the df to training and testing dataset
 -  Create a Machine learning object
 -  Train the model
 -  Test the model
 -  Evaluate the model


In [443]:
# make a copy of the df 

df_copy = df.copy()

In [291]:
# drop unwanted columns 
# drop SMA and trend columns
df.drop(columns=['SMA', 'trend', 'morning_star', 'evening_star', \
       'bullish_harami', 'bearish_harami', 'green_hammer', 'red_hammer', \
       'bull_kicker', 'bear_kicker', 'green_shooting_star', \
       'red_shooting_star'], inplace=True)

In [292]:
# verify that the columns are dropped 
df.columns

Index(['open', 'high', 'low', 'close', 'candlesticks'], dtype='object')

In [293]:
# drop duplicate rows

df = df.drop_duplicates()

In [294]:
df['candlesticks'].value_counts()

False      6253
BLLHRM      226
BERHRM      212
EVNSTR      116
MRNSTR       90
RDHM         37
BLLKCK       29
GRNSSTR      23
BERKCK       21
RDSSTR       17
GRNHM        11
Name: candlesticks, dtype: int64

In [295]:
# drop rows where candlesticks in False 
df = df[df['candlesticks'] != 'False']

In [296]:
# drop Na's 
df = df.dropna()

In [297]:
# drop index 
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,open,high,low,close,candlesticks
0,2923.76,2927.33,2923.54,2925.51,BLLHRM
1,2919.35,2919.78,2901.65,2903.85,BERKCK
2,2904.34,2904.47,2887.77,2888.55,EVNSTR
3,2876.5,2876.6,2862.08,2872.17,RDHM
4,2884.69,2889.45,2884.43,2884.43,RDSSTR


In [298]:
## summary of current data 
# we have 782 rows and 5 columns
# about 11% of the original dataset
df.shape

(782, 5)

In [299]:
# create feature and target dataset
# X is the features 
# y is target dataset

X = df.drop(columns='candlesticks')
y = df['candlesticks'].values


In [300]:
display(X.head(2))
display(y[:5])

Unnamed: 0,open,high,low,close
0,2923.76,2927.33,2923.54,2925.51
1,2919.35,2919.78,2901.65,2903.85


array(['BLLHRM', 'BERKCK', 'EVNSTR', 'RDHM', 'RDSSTR'], dtype=object)

In [301]:
# encode the target column (candelstick)
from sklearn.preprocessing import LabelEncoder
# make a label encoder object
le = LabelEncoder()
y = le.fit_transform(y)
y[:10]

array([2, 1, 4, 8, 9, 0, 6, 5, 0, 2])

In [302]:
# split the data to training and testing set
#  stratifying the target value y, so that every candlestick labelencoded is represented in 
# the training and testing dataset 
# spliting the dataset 80% training and 20% testing 

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y,  stratify=y, test_size=0.2, random_state=0)

In [303]:
X_train.head()

Unnamed: 0,open,high,low,close
405,3513.65,3514.49,3509.61,3511.62
325,2963.48,2970.5,2960.24,2964.47
207,2997.21,2998.0,2995.92,2997.39
259,3294.75,3296.29,3288.25,3289.84
584,4415.9,4452.75,4414.47,4449.96


In [304]:
X_test.head()

Unnamed: 0,open,high,low,close
106,2816.97,2823.28,2811.74,2811.76
211,2985.73,2985.74,2971.96,2976.32
406,3506.7,3521.58,3505.5,3509.44
731,3854.66,3866.65,3853.11,3862.55
184,2929.52,2929.52,2917.02,2926.32


In [305]:
X_train[:2]

Unnamed: 0,open,high,low,close
405,3513.65,3514.49,3509.61,3511.62
325,2963.48,2970.5,2960.24,2964.47


In [306]:
X_test[:2]

Unnamed: 0,open,high,low,close
106,2816.97,2823.28,2811.74,2811.76
211,2985.73,2985.74,2971.96,2976.32


#### Training Naive Bayes Model with Sklearn


In [307]:
from sklearn.naive_bayes import CategoricalNB
# Create an NB object 

classifier = CategoricalNB()

In [308]:
# fit the model 
classifier.fit(X_train, y_train)

CategoricalNB()

In [309]:
## test the model 
y_pred = classifier.predict(X_test)

# print the prediction compare with actual target values 

print(np.concatenate((y_pred.reshape(len(y_pred), 1), y_test.reshape(len(y_test), 1)), 1))

[[2 0]
 [2 4]
 [2 2]
 [2 2]
 [0 8]
 [2 9]
 [0 2]
 [0 2]
 [2 0]
 [2 1]
 [2 0]
 [4 4]
 [2 0]
 [7 0]
 [2 0]
 [2 0]
 [2 2]
 [2 8]
 [2 2]
 [2 0]
 [2 2]
 [2 4]
 [2 0]
 [2 0]
 [2 0]
 [2 2]
 [2 4]
 [2 7]
 [2 2]
 [2 0]
 [0 6]
 [7 0]
 [0 0]
 [2 7]
 [2 0]
 [2 0]
 [0 4]
 [0 0]
 [2 0]
 [2 0]
 [0 8]
 [2 4]
 [2 3]
 [2 3]
 [2 0]
 [0 7]
 [2 6]
 [2 7]
 [2 3]
 [0 2]
 [0 2]
 [0 4]
 [0 0]
 [2 7]
 [2 6]
 [2 2]
 [2 2]
 [2 4]
 [2 9]
 [2 0]
 [2 3]
 [4 2]
 [2 0]
 [2 3]
 [2 0]
 [2 1]
 [2 4]
 [2 7]
 [0 4]
 [0 8]
 [4 2]
 [2 5]
 [2 7]
 [2 2]
 [2 4]
 [2 2]
 [2 0]
 [2 2]
 [2 2]
 [2 3]
 [2 4]
 [2 6]
 [2 7]
 [2 4]
 [0 4]
 [2 0]
 [2 2]
 [2 0]
 [2 4]
 [2 7]
 [0 8]
 [2 0]
 [2 2]
 [2 2]
 [2 0]
 [0 2]
 [0 0]
 [2 2]
 [2 0]
 [2 2]
 [2 2]
 [0 7]
 [2 6]
 [2 4]
 [2 0]
 [0 7]
 [2 1]
 [2 0]
 [4 5]
 [0 2]
 [0 2]
 [2 2]
 [2 2]
 [2 4]
 [2 4]
 [4 7]
 [0 2]
 [2 0]
 [2 2]
 [2 7]
 [4 2]
 [2 2]
 [2 8]
 [0 4]
 [0 0]
 [2 0]
 [2 2]
 [2 2]
 [0 7]
 [2 2]
 [0 7]
 [2 2]
 [0 0]
 [0 4]
 [2 0]
 [2 2]
 [0 1]
 [2 0]
 [2 8]
 [2 2]
 [2 2]
 [2 7]
 [2 2]

#### Confusion Matrix
Evaluate the model performance

In [310]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[ 7  0 34  0  0  0  0  2  0  0]
 [ 1  0  3  0  0  0  0  0  0  0]
 [ 9  0 33  0  3  0  0  0  0  0]
 [ 0  0  6  0  0  0  0  0  0  0]
 [ 6  0 15  0  2  0  0  0  0  0]
 [ 0  0  1  0  1  0  0  0  0  0]
 [ 1  0  4  0  0  0  0  0  0  0]
 [ 5  0 11  0  2  0  0  0  0  0]
 [ 4  0  4  0  0  0  0  0  0  0]
 [ 1  0  2  0  0  0  0  0  0  0]]


0.267515923566879

The Naive Bayes CategoricalNB model only performed at 26%. 


So we decide to try another Naive Bayes model GaussianNB

In [311]:

# We can scale the dataset when we doing GaussianNB
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [312]:
X_train[:4]

array([[-0.01961629, -0.02925767, -0.01282746, -0.02243903],
       [-0.84097487, -0.84054514, -0.83285764, -0.83867002],
       [-0.79061875, -0.79953261, -0.77959904, -0.78956041],
       [-0.34641599, -0.35467344, -0.34324572, -0.35328744]])

In [313]:
# Using Niave bayes GaussianNB
from sklearn.naive_bayes import GaussianNB

gnb_classifier = GaussianNB()
gnb_classifier.fit(X_train, y_train)

GaussianNB()

In [314]:
# evaluate the model 
y_pred_gnb = gnb_classifier.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred_gnb), 1), y_test.reshape(len(y_test), 1)), 1))

[[2 0]
 [2 4]
 [2 2]
 [2 2]
 [0 8]
 [2 9]
 [0 2]
 [0 2]
 [2 0]
 [2 1]
 [2 0]
 [4 4]
 [2 0]
 [7 0]
 [2 0]
 [2 0]
 [2 2]
 [2 8]
 [2 2]
 [2 0]
 [2 2]
 [2 4]
 [2 0]
 [2 0]
 [2 0]
 [2 2]
 [2 4]
 [2 7]
 [2 2]
 [2 0]
 [0 6]
 [7 0]
 [0 0]
 [2 7]
 [2 0]
 [2 0]
 [0 4]
 [0 0]
 [2 0]
 [2 0]
 [0 8]
 [2 4]
 [2 3]
 [2 3]
 [2 0]
 [0 7]
 [2 6]
 [2 7]
 [2 3]
 [0 2]
 [0 2]
 [0 4]
 [0 0]
 [2 7]
 [2 6]
 [2 2]
 [2 2]
 [2 4]
 [2 9]
 [2 0]
 [2 3]
 [4 2]
 [2 0]
 [2 3]
 [2 0]
 [2 1]
 [2 4]
 [2 7]
 [0 4]
 [0 8]
 [4 2]
 [2 5]
 [2 7]
 [2 2]
 [2 4]
 [2 2]
 [2 0]
 [2 2]
 [2 2]
 [2 3]
 [2 4]
 [2 6]
 [2 7]
 [2 4]
 [0 4]
 [2 0]
 [2 2]
 [2 0]
 [2 4]
 [2 7]
 [0 8]
 [2 0]
 [2 2]
 [2 2]
 [2 0]
 [0 2]
 [0 0]
 [2 2]
 [2 0]
 [2 2]
 [2 2]
 [0 7]
 [2 6]
 [2 4]
 [2 0]
 [0 7]
 [2 1]
 [2 0]
 [4 5]
 [0 2]
 [0 2]
 [2 2]
 [2 2]
 [2 4]
 [2 4]
 [4 7]
 [0 2]
 [2 0]
 [2 2]
 [2 7]
 [4 2]
 [2 2]
 [2 8]
 [0 4]
 [0 0]
 [2 0]
 [2 2]
 [2 2]
 [0 7]
 [2 2]
 [0 7]
 [2 2]
 [0 0]
 [0 4]
 [2 0]
 [2 2]
 [0 1]
 [2 0]
 [2 8]
 [2 2]
 [2 2]
 [2 7]
 [2 2]

In [315]:
cm = confusion_matrix(y_test, y_pred_gnb)
print(cm)
accuracy_score(y_test, y_pred_gnb)

[[24  0 19  0  0  0  0  0  0  0]
 [ 4  0  0  0  0  0  0  0  0  0]
 [25  0 20  0  0  0  0  0  0  0]
 [ 5  0  1  0  0  0  0  0  0  0]
 [ 9  0 14  0  0  0  0  0  0  0]
 [ 1  0  1  0  0  0  0  0  0  0]
 [ 3  0  2  0  0  0  0  0  0  0]
 [ 6  0 12  0  0  0  0  0  0  0]
 [ 6  0  2  0  0  0  0  0  0  0]
 [ 2  0  1  0  0  0  0  0  0  0]]


0.2802547770700637

The GaussianNB model was accurate 28% of the time

### Artificial Neural Network

We then decide to use Neural Network.

In [343]:
import tensorflow as tf 

tf.__version__

'2.10.0'

In [416]:
# data preparation for the neural network
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

In [368]:
X[:2]

array([[2923.76, 2927.33, 2923.54, 2925.51],
       [2919.35, 2919.78, 2901.65, 2903.85]])

In [369]:
y[:2]

array(['BLLHRM', 'BERKCK'], dtype=object)

In [371]:
len(y)

782

In [419]:
# encode y
le = LabelEncoder()
y = le.fit_transform(y)
y[:10]
# change y to categorical targets 
y = tf.keras.utils.to_categorical(y)

In [420]:
### split the dataset using strastify split 

X_train, X_test, y_train, y_test = train_test_split(X, y,  stratify=y, test_size=0.2, random_state=1)

In [421]:
len(y_train) + len(X_test)

782

In [422]:
len(y_train) + len(y_test)

782

In [423]:
### we have to scale all the features 
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [424]:
### ANN
## Building the brain
ann = tf.keras.models.Sequential()

In [425]:
# adding the input layer and first hidden layer
ann.add(tf.keras.layers.Dense(units=6, activation='relu'))

In [426]:
# second layer 
ann.add(tf.keras.layers.Dense(units=6, activation='relu'))

In [427]:
# add output layer 
# we have 10 different targets so we using 10 unit at the output
ann.add(tf.keras.layers.Dense(units=10, activation='softmax'))

In [432]:
## Compiling the ANN

ann.compile(optimizer= 'adam', loss= 'categorical_crossentropy', metrics=['accuracy'])

### Traing the ANN on the Training Set

In [435]:
# training the model
ann_model = ann.fit(X_train, y_train, batch_size=32, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [437]:
# Create a DataFrame with the history dictionary
## Plotting the model loss
df_eva = pd.DataFrame(ann_model.history, index=range(1, len(ann_model.history["loss"]) + 1))

# Plot the loss
df_eva.plot(y="loss")



In [438]:

# Plot the accuracy
df_eva.plot(y="accuracy")

In [439]:
# Evaluate the model 

model_loss, model_accuracy = ann.evaluate(X_test, y_test, verbose=2)

# Display evaluation results
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

5/5 - 0s - loss: 1.8289 - accuracy: 0.3057 - 126ms/epoch - 25ms/step
Loss: 1.8288660049438477, Accuracy: 0.30573248863220215


The Artificial Nueral Network Model perform a bit better than the Naive Bayes models

It was 30% accurate. 


