# 4.0 Modelling Baseline

---

# Contents

- [1.0 ARIMA Model](#1.0-ARIMA-Model)
- [2.0 Daily Data](#2.0-Daily-Data)
    - [1.1 Load Data](#2.1-Load-Data)
    - [1.2 Train Test Split](#2.2-Train-Test-Split)

In [1]:
# !pip install pmdarima

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import datetime
import calendar

from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.arima_model import ARIMA, ARMA, ARMAResults, ARIMAResults
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from sklearn.metrics import r2_score, mean_squared_error
from pmdarima import auto_arima
import plotly.graph_objects as go
import warnings
warnings.filterwarnings("ignore")


from sklearn.metrics import mean_squared_error
from statsmodels.tools.eval_measures import rmse


In [3]:
pd.set_option('display.max_columns', None)

---

# 1.0 ARIMA Model

The ARIMA model has 3 components:

+ Differencing Step - I - Integrated - Check for stationarity
+ Autoregressive Piece - AR - long term trends
+ Moving Average Piece - MA - Modelling sudden fluctuations

Each component corresponds to one order of the model: p, d and q. I will use the results of the earlier Augmented Dickey-Fuller test and the ACF/PACF plots to choose them.

+ D is the order of differencing we found using the Augmented Dickey-Fuller test.
+ P is the number of autoregressive terms in our model. PACF is used to estimate this.
+ Q is the number of moving-average terms in our model. The ACF is used to estimate this.
    + If the ACF has a sharp cut-off and the lag-1 ACF value is negative, choose q to be the lag in the ACF before the cut-off.
    + If the ACF does not have a sharp cut-off or the lag-1 ACF value is not negative, choose q = 0.

Therefore based on the charts before I will use:
    
    + p = 1
    + d = 1
    + q = 0
    
However I will use auto_arima to help decide.

---

In [4]:
results = {'algo':'','name':'','date':'', 'time_frame':'','success':0,'RMSE':0, 'MSE':0, 'classification':'' }

##### 1.1 Load Data Marubozu

In [5]:
daily_maru = pd.read_csv('/Users/stuartdaw/Documents/Capstone_data/data/resampled/eur-usddailyMarubozu.csv', 
                    index_col='date', parse_dates=True)

In [6]:
daily_maru.index

DatetimeIndex(['2000-07-14', '2000-07-17', '2000-07-18', '2000-07-19',
               '2000-07-20', '2000-07-21', '2000-07-24', '2000-07-25',
               '2000-07-26', '2000-07-27',
               ...
               '2019-12-11', '2019-12-12', '2019-12-13', '2019-12-16',
               '2019-12-17', '2019-12-18', '2019-12-19', '2019-12-20',
               '2019-12-23', '2019-12-24'],
              dtype='datetime64[ns]', name='date', length=4612, freq=None)

In [7]:
daily_maru.columns

Index(['open', 'high', 'low', 'close', 'mid', 'wk_mv_avg', 'mnth_mv_avg',
       'volatility_3_day', 'volatility_10_day', 'pct_chge_3_prds',
       'pct_chge_5_prds', 'pct_chge_10_prds', 'height', 'height-1', 'height-2',
       'height-3', 'direction', 'gold_usd', 'gold_euro', 'marubozu',
       'marubozu+1', 'marubozu-1', 'marubozu-2', 'day-1_open', 'day-2_open',
       'day-3_open', 'day-1_high', 'day-2_high', 'day-3_high', 'day-1_low',
       'day-2_low', 'day-3_low', 'day-1_close', 'day-2_close', 'day-3_close',
       'day+1_open', 'day+1_high', 'day+1_low', 'day+1_close', 'day+2_high',
       'day+2_low', 'day+3_high', 'day+3_low', 'day+4_high', 'day+4_low',
       'day+5_high', 'day+5_low', 'exit_price', 'select', 'target', 'date+5'],
      dtype='object')

In [8]:
#daily = daily.resample('B').agg({'open':'first','high':'max','low':'min', 'close':'last'})


In [9]:
daily_maru.index

DatetimeIndex(['2000-07-14', '2000-07-17', '2000-07-18', '2000-07-19',
               '2000-07-20', '2000-07-21', '2000-07-24', '2000-07-25',
               '2000-07-26', '2000-07-27',
               ...
               '2019-12-11', '2019-12-12', '2019-12-13', '2019-12-16',
               '2019-12-17', '2019-12-18', '2019-12-19', '2019-12-20',
               '2019-12-23', '2019-12-24'],
              dtype='datetime64[ns]', name='date', length=4612, freq=None)

In [10]:
daily_maru['date+5'] = pd.to_datetime(daily_maru['date+5'])

In [11]:
daily_maru.loc[daily_maru.index[1],'date+5']

Timestamp('2000-07-24 00:00:00')

In [12]:
type(daily_maru['date+5'][0])

pandas._libs.tslibs.timestamps.Timestamp

In [13]:
### Get correct hyper parameters

In [14]:
## Arima
auto_arima(daily_maru['close'].dropna(), seasonal=False).summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,4612.0
Model:,"SARIMAX(0, 1, 0)",Log Likelihood,15826.945
Date:,"Wed, 05 Aug 2020",AIC,-31651.89
Time:,21:35:38,BIC,-31645.453
Sample:,0,HQIC,-31649.624
,- 4612,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
sigma2,6.11e-05,5.15e-07,118.588,0.000,6.01e-05,6.21e-05

0,1,2,3
Ljung-Box (Q):,61.33,Jarque-Bera (JB):,20105.01
Prob(Q):,0.02,Prob(JB):,0.0
Heteroskedasticity (H):,0.66,Skew:,-0.38
Prob(H) (two-sided):,0.0,Kurtosis:,13.2


---

## 1.1 Get patterns

In [15]:
daily_pattern = pd.read_csv('/Users/stuartdaw/Documents/Capstone_data/patterns/dailyMarubozu.csv', 
                           parse_dates=True)

In [16]:
daily_pattern['pattern_end'] = pd.to_datetime(daily_pattern['pattern_end'])

In [17]:
daily_pattern.loc[1]

pattern_end   2000-10-20
Name: 1, dtype: datetime64[ns]

In [18]:
len(daily_pattern)

64

---

In [19]:
def create_train_test_split(date, time_series, model_info):
    """Cut the series around a pattern date into a training part and a 5-row test tail.

    Parameters
    ----------
    date : label present in ``time_series.index``; the day the pattern completed.
    time_series : DataFrame with 'date+5', 'exit_price' and 'marubozu' columns.
    model_info : dict that is updated in place and also returned.

    Returns
    -------
    dict
        ``model_info`` with 'signal', 'start', 'end', 'train' and 'test' filled in.
        'train' and 'test' carry two extra leading columns, 'signal' and
        'target_price', holding the pattern day's values on every row.
    """
    # Keep every row up to and including the pattern row's 'date+5' stamp.
    horizon_end = time_series.loc[date, 'date+5']
    window = time_series.loc[time_series.index <= horizon_end]

    # The pattern day's exit price becomes the target on every row.
    on_pattern_day = time_series.index == date
    exit_level = time_series.loc[on_pattern_day, 'exit_price'].item()
    window.insert(0, 'target_price', exit_level)

    # Likewise the pattern day's signal.
    model_info['signal'] = time_series.loc[date, 'marubozu']
    window.insert(0, 'signal', model_info['signal'])

    # The last five rows of the window are the test set.
    n_rows = len(window)
    model_info['start'] = n_rows - 5
    model_info['end'] = n_rows - 1

    split_at = model_info['start']
    model_info['train'] = window.iloc[:split_at]
    model_info['test'] = window.iloc[split_at:]

    return model_info

In [20]:
def meet_threshold(row):
    """Return the signal value if the row's price reached the target, else 0.

    A short signal (-1) is met when 'low' is at or below 'target_price';
    a long signal (1) is met when 'high' is at or above 'target_price'.
    Any other signal value yields 0.
    """
    signal = row['signal']
    if signal == -1:
        return -1 if row['low'] <= row['target_price'] else 0
    if signal == 1:
        return 1 if row['high'] >= row['target_price'] else 0
    return 0

In [21]:
def ml_decision(row):
    """Return the direction if the predicted price reaches the target, else 0.

    For direction -1 the prediction must be at or below 'target_price';
    for direction 1 it must be at or above it (this case is also printed).
    """
    direction = row['direction']
    if direction == -1:
        return -1 if row['preds'] <= row['target_price'] else 0
    if direction == 1 and row['preds'] >= row['target_price']:
        print(f"preds: {row['preds']} >= row target: {row['target_price']}" )
        return 1
    return 0

In [22]:
def create_results_outcomes_dataframe(test):
    """Build the per-row outcome table for one test window.

    Copies 'low', 'high' and 'target_price' from ``test``, exposes the
    'signal' column as 'direction', and adds 'correct_call' by applying
    ``meet_threshold`` to every row of ``test``.
    """
    outcomes = pd.DataFrame()

    # (destination column, source column) in the order they should appear.
    column_map = (
        ('low', 'low'),
        ('high', 'high'),
        ('target_price', 'target_price'),
        ('direction', 'signal'),
    )
    for destination, source in column_map:
        outcomes[destination] = test[source]

    outcomes['correct_call'] = test.apply(meet_threshold, axis=1)
    return outcomes

In [23]:
def classify(outcomes):
    """Label one benchmark trade as 'tp', 'fp' or 'ERROR'.

    The benchmark assumes a buy/sell decision is always made, so a trade can
    only be a true positive (target reached on some row) or a false positive
    (target never reached). 'ERROR' is returned when the largest 'direction'
    value is neither 1 nor -1. Returns None if 'correct_call' holds a value
    that is inconsistent with the direction.
    """
    strongest_direction = max(outcomes['direction'])

    if strongest_direction == 1:
        # Long trade: a single row with correct_call == 1 is a hit.
        extreme, hit = max(outcomes['correct_call']), 1
    elif strongest_direction == -1:
        # Short trade: a single row with correct_call == -1 is a hit.
        extreme, hit = min(outcomes['correct_call']), -1
    else:
        return 'ERROR'

    if extreme == 0:
        return 'fp'
    if extreme == hit:
        return 'tp'
    return None
    

In [24]:
# Benchmark for the Marubozu pattern: assume a trade is taken on every pattern
# date and record whether the target price was reached in the test window.
model_info = {"train":None,"test":None,"start":None,"end":None,"signal":None}
benchmark_results = []

for match in daily_pattern['pattern_end']:

    model_info = create_train_test_split(match, daily_maru, model_info)

    # Skip patterns with fewer than 10 training rows before them.
    if len(model_info['train']) < 10:
        continue

    outcomes = create_results_outcomes_dataframe(model_info['test'])

    # One record per evaluated pattern; RMSE/MSE stay None because the
    # benchmark makes no price prediction.
    results_dict = {'name': 'Benchmark: ' + str(match),
                    'pattern': None,
                    'date': match,
                    'time_frame': 'daily',
                    'RMSE': None,
                    'MSE': None,
                    'classification': classify(outcomes),
                    'strategy': 'Marubozu'}

    benchmark_results.append(results_dict)



In [25]:
# arima_results

In [26]:
def create_cm(results):
    """Tally classification labels into a 2x2 confusion matrix.

    ``results`` is an iterable of mappings with a 'classification' entry.
    The returned nested list is laid out as [[tp, fp], [fn, tn]]; entries
    with any other label are not counted.
    """
    # (label, row, column) of the cell each label increments.
    layout = (('tp', 0, 0), ('fp', 0, 1), ('fn', 1, 0), ('tn', 1, 1))

    res_cm = [[0, 0], [0, 0]]
    for result in results:
        outcome = result['classification']
        for label, row, col in layout:
            if outcome == label:
                res_cm[row][col] += 1
                break

    return res_cm

In [27]:
cm = create_cm(benchmark_results)

In [28]:
cm_df = pd.DataFrame(cm, index=['pred_success', 'pred_non_success'], columns=['actual success', 'actual non_success'])
cm_df

Unnamed: 0,actual success,actual non_success
pred_success,40,24
pred_non_success,0,0


In [29]:
def print_metrics(cm):
    """Print accuracy, precision and recall for a 2x2 confusion matrix.

    Parameters
    ----------
    cm : 2x2 nested sequence laid out as [[tp, fp], [fn, tn]].

    A metric whose denominator is zero is printed as ``nan`` instead of
    raising ``ZeroDivisionError`` (e.g. recall when there are no actual
    positives, or every metric for an all-zero matrix).
    """
    tp, fp = cm[0][0], cm[0][1]
    fn, tn = cm[1][0], cm[1][1]

    def _ratio(numerator, denominator):
        # Undefined metrics are reported as nan rather than crashing.
        if not denominator:
            return float('nan')
        return float(numerator) / float(denominator)

    # Accuracy - how many did the model get right
    # Total number of correct predictions / total number of predictions
    acc = _ratio(tp + tn, np.sum(cm))

    # Precision - proportion of positive identifications that were actually correct
    # True positives / (true positives + false positives)
    prec = _ratio(tp, tp + fp)

    # Recall - proportion of actual positives that were correctly identified
    # True positives / (true positives + false negatives)
    rec = _ratio(tp, tp + fn)

    print(f"Accuracy:\t{round(acc,2)}\nPrecision:\t{round(prec,2)}\nRecall:\t\t{round(rec,2)}")

In [30]:
# Display the results
print_metrics(cm)

Accuracy:	0.62
Precision:	0.62
Recall:		1.0


---

## Baseline for Fractals

##### 5 Load Data

In [116]:
daily_fract = pd.read_csv('/Users/stuartdaw/Documents/Capstone_data/data/resampled/eur-usddailyfractals.csv', 
                    index_col='date', parse_dates=True)

In [117]:
daily_fract.index

DatetimeIndex(['2000-07-17', '2000-07-18', '2000-07-19', '2000-07-20',
               '2000-07-21', '2000-07-24', '2000-07-25', '2000-07-26',
               '2000-07-27', '2000-07-28',
               ...
               '2019-12-11', '2019-12-12', '2019-12-13', '2019-12-16',
               '2019-12-17', '2019-12-18', '2019-12-19', '2019-12-20',
               '2019-12-23', '2019-12-24'],
              dtype='datetime64[ns]', name='date', length=4632, freq=None)

In [118]:
daily_fract.columns

Index(['open', 'high', 'low', 'close', 'mid', 'wk_mv_avg', 'mnth_mv_avg',
       'volatility_3_day', 'volatility_10_day', 'pct_chge_3_prds',
       'pct_chge_5_prds', 'pct_chge_10_prds', 'height', 'height-1', 'height-2',
       'height-3', 'direction', 'gold_usd', 'gold_euro', 'day-1_high',
       'day-2_high', 'day-3_high', 'day-4_high', 'day-1_low', 'day-2_low',
       'day-3_low', 'day-4_low', 'day-1_open', 'day-2_open', 'day-3_open',
       'day-4_open', 'day-1_close', 'day-2_close', 'day-3_close',
       'day-4_close', 'day+1_open', 'day+1_high', 'day+1_low', 'day+1_close',
       'day+2_high', 'day+2_low', 'day+3_high', 'day+3_low', 'day+4_high',
       'day+4_low', 'day+5_high', 'day+5_low', '5_day_avg', 'fractal_end',
       'day+1_frac', 'day+2_frac', 'day+3_frac', 'day+4_frac', 'select',
       'exit_price', 'target', 'date+5', 'fractal'],
      dtype='object')

In [119]:
daily_fract.head()

Unnamed: 0_level_0,open,high,low,close,mid,wk_mv_avg,mnth_mv_avg,volatility_3_day,volatility_10_day,pct_chge_3_prds,pct_chge_5_prds,pct_chge_10_prds,height,height-1,height-2,height-3,direction,gold_usd,gold_euro,day-1_high,day-2_high,day-3_high,day-4_high,day-1_low,day-2_low,day-3_low,day-4_low,day-1_open,day-2_open,day-3_open,day-4_open,day-1_close,day-2_close,day-3_close,day-4_close,day+1_open,day+1_high,day+1_low,day+1_close,day+2_high,day+2_low,day+3_high,day+3_low,day+4_high,day+4_low,day+5_high,day+5_low,5_day_avg,fractal_end,day+1_frac,day+2_frac,day+3_frac,day+4_frac,select,exit_price,target,date+5,fractal
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1
2000-07-17,0.9382,0.9402,0.9342,0.936,0.9371,0.94217,0.9471,0.004282,0.00344,-0.009408,-0.016013,-0.014875,0.0022,0.003,0.0062,0.0086,0,98.1,123.3,0.9389,0.9425,0.9517,0.9569,0.9318,0.933,0.9396,0.9496,0.9353,0.9416,0.9503,0.9545,0.9383,0.9354,0.9417,0.9504,0.9361,0.9368,0.9227,0.9256,0.927,0.9193,0.9342,0.9204,0.9384,0.9319,0.9367,0.9313,0.005,0,0.0,0.0,0.0,0.0,0,0.936,0.0,2000-07-24,0
2000-07-18,0.9361,0.9368,0.9227,0.9256,0.93085,0.93785,0.945633,0.003582,0.003797,-0.008151,-0.022678,-0.020519,0.0105,0.0022,0.003,0.0062,0,98.3,123.8,0.9402,0.9389,0.9425,0.9517,0.9342,0.9318,0.933,0.9396,0.9382,0.9353,0.9416,0.9503,0.936,0.9383,0.9354,0.9417,0.9255,0.927,0.9193,0.9246,0.9342,0.9204,0.9384,0.9319,0.9367,0.9313,0.9433,0.9329,0.0061,0,0.0,0.0,0.0,0.0,0,0.9256,0.0,2000-07-25,0
2000-07-19,0.9255,0.927,0.9193,0.9246,0.92505,0.93366,0.944207,0.003915,0.003618,-0.012543,-0.022146,-0.029125,0.0009,0.0105,0.0022,0.003,0,97.0,123.3,0.9368,0.9402,0.9389,0.9425,0.9227,0.9342,0.9318,0.933,0.9361,0.9382,0.9353,0.9416,0.9256,0.936,0.9383,0.9354,0.9245,0.9342,0.9204,0.9325,0.9384,0.9319,0.9367,0.9313,0.9433,0.9329,0.945,0.9391,0.005,0,0.0,0.0,0.0,0.0,0,0.9246,0.0,2000-07-26,0
2000-07-20,0.9245,0.9342,0.9204,0.9325,0.9285,0.93166,0.943221,0.005881,0.004167,-0.009177,-0.010655,-0.024531,0.008,0.0009,0.0105,0.0022,0,97.1,123.3,0.927,0.9368,0.9402,0.9389,0.9193,0.9227,0.9342,0.9318,0.9255,0.9361,0.9382,0.9353,0.9246,0.9256,0.936,0.9383,0.9324,0.9384,0.9319,0.9365,0.9367,0.9313,0.9433,0.9329,0.945,0.9391,0.9444,0.9314,0.005,0,0.0,0.0,0.0,0.0,0,0.9325,0.0,2000-07-27,0
2000-07-21,0.9324,0.9384,0.9319,0.9365,0.93445,0.93119,0.942879,0.00666,0.005033,0.003867,-0.002509,-0.016575,0.0041,0.008,0.0009,0.0105,0,97.1,122.2,0.9342,0.927,0.9368,0.9402,0.9204,0.9193,0.9227,0.9342,0.9245,0.9255,0.9361,0.9382,0.9325,0.9246,0.9256,0.936,0.9366,0.9367,0.9313,0.933,0.9433,0.9329,0.945,0.9391,0.9444,0.9314,0.9338,0.9229,0.00514,0,0.0,0.0,0.0,0.0,0,0.9365,0.0,2000-07-28,0


In [120]:
daily_fract.shape

(4632, 58)

In [121]:
daily_fract.index

DatetimeIndex(['2000-07-17', '2000-07-18', '2000-07-19', '2000-07-20',
               '2000-07-21', '2000-07-24', '2000-07-25', '2000-07-26',
               '2000-07-27', '2000-07-28',
               ...
               '2019-12-11', '2019-12-12', '2019-12-13', '2019-12-16',
               '2019-12-17', '2019-12-18', '2019-12-19', '2019-12-20',
               '2019-12-23', '2019-12-24'],
              dtype='datetime64[ns]', name='date', length=4632, freq=None)

In [122]:
daily_fract['date+5'] = pd.to_datetime(daily_fract['date+5'])

In [123]:
daily_fract.loc[daily_fract.index[1],'date+5']

Timestamp('2000-07-25 00:00:00')

In [124]:
type(daily_fract['date+5'][0])

pandas._libs.tslibs.timestamps.Timestamp

# Get fractal patterns

In [125]:
fractal_pattern = pd.read_csv('/Users/stuartdaw/Documents/Capstone_data/patterns/dailyfractals.csv', 
                           parse_dates=True)

In [126]:
fractal_pattern['pattern_end'] = pd.to_datetime(fractal_pattern['pattern_end'])

In [127]:
fractal_pattern.loc[1]

pattern_end   2000-08-22
Name: 1, dtype: datetime64[ns]

In [128]:
len(fractal_pattern)

299

---

In [129]:
fractal_pattern.loc[290]

pattern_end   2019-05-15
Name: 290, dtype: datetime64[ns]

In [131]:
daily_fract.loc[daily_fract.index == '2019-05-15']

Unnamed: 0_level_0,open,high,low,close,mid,wk_mv_avg,mnth_mv_avg,volatility_3_day,volatility_10_day,pct_chge_3_prds,pct_chge_5_prds,pct_chge_10_prds,height,height-1,height-2,height-3,direction,gold_usd,gold_euro,day-1_high,day-2_high,day-3_high,day-4_high,day-1_low,day-2_low,day-3_low,day-4_low,day-1_open,day-2_open,day-3_open,day-4_open,day-1_close,day-2_close,day-3_close,day-4_close,day+1_open,day+1_high,day+1_low,day+1_close,day+2_high,day+2_low,day+3_high,day+3_low,day+4_high,day+4_low,day+5_high,day+5_low,5_day_avg,fractal_end,day+1_frac,day+2_frac,day+3_frac,day+4_frac,select,exit_price,target,date+5,fractal
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1
2019-05-15,1.12085,1.12245,1.11779,1.12085,1.12085,1.122131,1.121067,0.000943,0.001272,-0.001875,0.001,-0.000321,0.0,0.00309,0.00095,1e-05,1,451.4,473.0,1.12439,1.12635,1.12536,1.12509,1.12007,1.12191,1.12177,1.11734,1.12393,1.12297,1.12295,1.11908,1.12084,1.12392,1.12296,1.12296,1.12086,1.12238,1.11662,1.11773,1.11841,1.11531,1.11748,1.11506,1.1188,1.11419,1.11802,1.1148,0.005,1,0.0,0.0,0.0,0.0,1,1.12585,1.12238,2019-05-22,1


In [96]:
def create_train_test_split(date, time_series, model_info, horizon=5):
    """Split the series at a pattern date into training rows and a test window.

    The training set is every row up to and including ``date``; the test set
    is the next ``horizon`` rows. The split is made immediately after the
    pattern row rather than by counting back from the end of the window, so a
    pattern close to the end of the series gets a shorter (possibly empty)
    test set instead of one that reaches back into the pattern row and the
    rows before it.

    Parameters
    ----------
    date : label that occurs in ``time_series.index``; the day the pattern completed.
    time_series : DataFrame with 'exit_price' and 'fractal_end' columns.
    model_info : dict that is updated in place and also returned.
    horizon : int, default 5
        Number of rows after the pattern row to use as the test window.

    Returns
    -------
    dict
        ``model_info`` with 'signal', 'start', 'end', 'train' and 'test' set.
        'start' is the position of the first test row and 'end' the position
        of the last row of the window. 'train' and 'test' carry two extra
        leading columns, 'signal' and 'target_price', holding the pattern
        day's values on every row.
    """
    # Position of the first row after the pattern; everything before it is training data.
    split_loc = time_series.index.get_loc(date) + 1
    test_end_loc = split_loc + horizon

    # iloc clips at the end of the frame, so the window may be shorter than requested.
    train_test = time_series.iloc[:test_end_loc]

    # The pattern day's exit price is the target for every row of the window.
    target_value = time_series.loc[time_series.index == date,'exit_price'].item()
    train_test.insert(0, 'target_price', target_value)

    model_info['signal'] = time_series.loc[date,'fractal_end']
    train_test.insert(0, 'signal', model_info['signal'])

    model_info['start'] = split_loc
    model_info['end'] = len(train_test)-1

    model_info['train'] = train_test.iloc[:split_loc]
    model_info['test'] = train_test.iloc[split_loc:]

    return model_info

In [97]:
def meet_threshold(row):
    """Return the signal value if the row's price reached the target, else 0.

    A short signal (-1) is met when 'low' is at or below 'target_price';
    a long signal (1) is met when 'high' is at or above 'target_price'.
    Any other signal value yields 0.
    """
    signal = row['signal']
    if signal == -1:
        return -1 if row['low'] <= row['target_price'] else 0
    if signal == 1:
        return 1 if row['high'] >= row['target_price'] else 0
    return 0

In [98]:
# def get_5_day_price(row):
    

In [99]:
def ml_decision(row):
    """Return the direction if the predicted price reaches the target, else 0.

    For direction -1 the prediction must be at or below 'target_price';
    for direction 1 it must be at or above it (this case is also printed).
    """
    direction = row['direction']
    if direction == -1:
        return -1 if row['preds'] <= row['target_price'] else 0
    if direction == 1 and row['preds'] >= row['target_price']:
        print(f"preds: {row['preds']} >= row target: {row['target_price']}" )
        return 1
    return 0

In [111]:
def create_results_outcomes_dataframe(test):
    """Build the per-row outcome table for one test window.

    Copies the price columns and 'target_price' from ``test``, exposes the
    'signal' column as 'direction', and adds 'correct_call' by applying
    ``meet_threshold`` to every row of ``test``.
    """
    outcomes = pd.DataFrame()

    # Columns carried over under their own names, in display order.
    for column in ('low', 'high', '5_day_avg', 'open', 'close', 'target_price'):
        outcomes[column] = test[column]

    outcomes['direction'] = test['signal']
    outcomes['correct_call'] = test.apply(meet_threshold, axis=1)
    return outcomes

In [112]:
def classify(outcomes):
    """Label one benchmark trade as 'tp', 'fp' or 'ERROR'.

    The benchmark assumes a buy/sell decision is always made, so a trade can
    only be a true positive (target reached on some row) or a false positive
    (target never reached). 'ERROR' is returned when the largest 'direction'
    value is neither 1 nor -1. Returns None if 'correct_call' holds a value
    that is inconsistent with the direction.
    """
    strongest_direction = max(outcomes['direction'])

    if strongest_direction == 1:
        # Long trade: a single row with correct_call == 1 is a hit.
        extreme, hit = max(outcomes['correct_call']), 1
    elif strongest_direction == -1:
        # Short trade: a single row with correct_call == -1 is a hit.
        extreme, hit = min(outcomes['correct_call']), -1
    else:
        return 'ERROR'

    if extreme == 0:
        return 'fp'
    if extreme == hit:
        return 'tp'
    return None
    

In [132]:
daily_fract.loc[daily_fract.index == '2000-07-28']

Unnamed: 0_level_0,open,high,low,close,mid,wk_mv_avg,mnth_mv_avg,volatility_3_day,volatility_10_day,pct_chge_3_prds,pct_chge_5_prds,pct_chge_10_prds,height,height-1,height-2,height-3,direction,gold_usd,gold_euro,day-1_high,day-2_high,day-3_high,day-4_high,day-1_low,day-2_low,day-3_low,day-4_low,day-1_open,day-2_open,day-3_open,day-4_open,day-1_close,day-2_close,day-3_close,day-4_close,day+1_open,day+1_high,day+1_low,day+1_close,day+2_high,day+2_low,day+3_high,day+3_low,day+4_high,day+4_low,day+5_high,day+5_low,5_day_avg,fractal_end,day+1_frac,day+2_frac,day+3_frac,day+4_frac,select,exit_price,target,date+5,fractal
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1
2000-07-28,0.932,0.9338,0.9229,0.9241,0.92805,0.93597,0.941517,0.008063,0.005738,-0.009605,-0.006849,-0.00934,0.0079,0.0115,0.0024,0.0083,1,96.6,122.4,0.9444,0.945,0.9433,0.9367,0.9314,0.9391,0.9329,0.9313,0.9434,0.9411,0.9329,0.9366,0.9319,0.9435,0.9412,0.933,0.9241,0.9295,0.9224,0.9274,0.9293,0.9135,0.9192,0.9117,0.9174,0.8997,0.9103,0.9015,0.00674,1,0.0,0.0,0.0,0.0,1,0.93084,0.9295,2000-08-04,1


In [134]:
# Benchmark for the fractal pattern: assume a trade is taken on every pattern
# date and record whether the target price was reached in the test window.
model_info = {"train":None,"test":None,"start":None,"end":None,"signal":None}
benchmark_results = []

for match in fractal_pattern['pattern_end']:

    model_info = create_train_test_split(match, daily_fract, model_info)

    # Skip patterns with fewer than 10 training rows before them.
    if len(model_info['train']) < 10:
        continue

    outcomes = create_results_outcomes_dataframe(model_info['test'])

    # One record per evaluated pattern; RMSE/MSE stay None because the
    # benchmark makes no price prediction. The strategy label names the
    # pattern evaluated in this loop (it previously carried the label
    # copied from the Marubozu benchmark).
    results_dict = {'name': 'Benchmark: ' + str(match),
                    'pattern': None,
                    'date': match,
                    'time_frame': 'daily',
                    'RMSE': None,
                    'MSE': None,
                    'classification': classify(outcomes),
                    'strategy': 'Fractals'}

    benchmark_results.append(results_dict)


In [135]:
def create_cm(results):
    """Tally classification labels into a 2x2 confusion matrix.

    ``results`` is an iterable of mappings with a 'classification' entry.
    The returned nested list is laid out as [[tp, fp], [fn, tn]]; entries
    with any other label are not counted.
    """
    # (label, row, column) of the cell each label increments.
    layout = (('tp', 0, 0), ('fp', 0, 1), ('fn', 1, 0), ('tn', 1, 1))

    res_cm = [[0, 0], [0, 0]]
    for result in results:
        outcome = result['classification']
        for label, row, col in layout:
            if outcome == label:
                res_cm[row][col] += 1
                break

    return res_cm

In [136]:
cm = create_cm(benchmark_results)

In [137]:
cm_df = pd.DataFrame(cm, index=['pred_success', 'pred_non_success'], columns=['actual success', 'actual non_success'])
cm_df

Unnamed: 0,actual success,actual non_success
pred_success,191,108
pred_non_success,0,0


In [138]:
def print_metrics(cm):
    """Print accuracy, precision and recall for a 2x2 confusion matrix.

    Parameters
    ----------
    cm : 2x2 nested sequence laid out as [[tp, fp], [fn, tn]].

    A metric whose denominator is zero is printed as ``nan`` instead of
    raising ``ZeroDivisionError`` (e.g. recall when there are no actual
    positives, or every metric for an all-zero matrix).
    """
    tp, fp = cm[0][0], cm[0][1]
    fn, tn = cm[1][0], cm[1][1]

    def _ratio(numerator, denominator):
        # Undefined metrics are reported as nan rather than crashing.
        if not denominator:
            return float('nan')
        return float(numerator) / float(denominator)

    # Accuracy - how many did the model get right
    # Total number of correct predictions / total number of predictions
    acc = _ratio(tp + tn, np.sum(cm))

    # Precision - proportion of positive identifications that were actually correct
    # True positives / (true positives + false positives)
    prec = _ratio(tp, tp + fp)

    # Recall - proportion of actual positives that were correctly identified
    # True positives / (true positives + false negatives)
    rec = _ratio(tp, tp + fn)

    print(f"Accuracy:\t{round(acc,2)}\nPrecision:\t{round(prec,2)}\nRecall:\t\t{round(rec,2)}")

In [139]:
# Display the results
print_metrics(cm)

Accuracy:	0.64
Precision:	0.64
Recall:		1.0


## 1.2 Train Test Split

In [None]:
type(daily.loc[daily.index == daily_pattern.loc[10]['pattern_end']].index[0])

In [None]:
daily.loc[daily.index == daily_pattern.loc[10]['pattern_end']].index[0]

In [None]:
daily_pattern.index

In [None]:
# Test 1 date out
curr_pattern = daily.loc[daily.index == daily_pattern.loc[10]['pattern_end']].index[0]
curr_pattern

In [None]:
daily.index

In [None]:
test_end_date = daily.loc[daily.loc[daily.index == daily_pattern.loc[10]['pattern_end']].index[0],'date+5']
test_end_date

In [None]:
# daily.loc[daily.index == curr_pattern]

In [None]:
train_test = daily.loc[daily.index <= test_end_date]
# train_test = daily.loc[daily.index <= '2004-2-28 00:00:00']

In [None]:
# daily.loc[daily.index <= end_date]

In [None]:
# daily.loc[daily.index == daily_pattern.loc[10]['pattern_end'],'double_height']

In [None]:
target_value = daily.loc[daily.index == daily_pattern.loc[10,'pattern_end'],'double_height'].item()
target_value

In [None]:
# def choose_exit_price(row, target_price, signal=-1):
#     if signal == -1:
#         return target_price
# #         return row['close'] - (row['height'] * 1)
#     else:
#         return target_price

# #         return row['close'] + (row['height'] * 1)

In [None]:
train_test

In [None]:
# train_test['double_height'] = train_test.apply(choose_exit_price, axis=1)
# train_test['double_height'] = daily.loc[daily.index == daily_pattern.loc[10,'pattern_end'],'double_height'].item()
#train_test.loc['double_height'] = [target_value for x in train_test.loc[:,['double_height']]]
train_test.insert(0, 'target_price', target_value)
# train_test.insert(0, 'signal', signal)

In [None]:
signal = daily.loc[daily.index == daily_pattern.loc[10,'pattern_end'],'marubozu'].item()
signal

In [None]:
train_test.head()

In [None]:
[signal] * (len(train_test)-1)

In [None]:
#train_test.loc[:,['signal']] = [signal] * (len(train_test))
# train_test.loc[:,['signal']] = [signal]
# df.insert(0, 'A', 'foo')
train_test.insert(0, 'signal', signal)

In [None]:
train_test.tail(6)

In [None]:
# start=len(train)
# end=len(train)+len(test)-1
start = len(train_test)-5
end = len(train_test)-1
start, end

In [None]:
# Set for testing
train = train_test.iloc[:start]
test = train_test.iloc[start:]

In [None]:
test.head()

In [None]:
def train_test_plot(train, test):
    """Draw the training series in blue and the test series in orange on one 16x8 figure."""
    plt.figure(figsize=(16, 8))
    for series, colour in ((train, 'blue'), (test, 'orange')):
        plt.plot(series, c=colour)

In [None]:
# This plot confirms that our train test split makes sense
train_test_plot(train['close'], test['close'])

In [None]:
auto_arima(daily['close'].dropna(), seasonal=False).summary()

In [None]:
train

In [None]:
model = ARIMA(train['low'], order=(0,1,0))
results = model.fit()
results.summary()

In [None]:
predictions = results.predict(start=start, end=end, dynamic=False, typ='levels').rename('ARIMA-0-1-0 Predictions')

In [None]:
predictions.values

In [None]:
type(predictions)

In [None]:
def justified(row):
    """Return 1 if the row's price reached the target in the signalled direction, else 0.

    A short signal (-1) is justified when 'low' is at or below 'target_price';
    a long signal (1) when 'high' is at or above it.
    """
    signal = row['signal']
    if signal == -1 and row['low'] <= row['target_price']:
        return 1
    long_hit = signal == 1 and row['high'] >= row['target_price']
    return 1 if long_hit else 0

In [None]:
outcomes = pd.DataFrame()
outcomes['low'] = test['low']
outcomes['high'] = test['high']

outcomes['preds'] = predictions.values
outcomes['target_price'] = test['target_price']
# outcomes['direction'] = test['signal']
outcomes['signal_match'] = test.apply(justified, axis=1)

#daily_pre['target_price'] = daily_pre.apply(choose_exit_price, axis=1)
# outcomes.append(predictions, ignore_index=True)
outcomes

In [None]:
# predictions['date']  = test.index
#predictions.reset_index(test.index)

In [None]:
# predictions.reindex(test.index)

In [None]:
type(predictions)

In [None]:
test.head()['close'].isnull().sum()

In [None]:
train.head()['close'].isnull().sum()

In [None]:
outcomes['low'].plot(legend=True, figsize=(12,8))
outcomes['preds'].plot(legend=True);
outcomes['target_price'].plot(legend=True);

# predictions.plot(legend=True)

In [None]:

# The ARIMA model above is fitted on train['low'], so its forecasts are scored
# against the held-out 'low' prices (scoring them against 'close' would mix
# two different series).
# NOTE(review): if the model was meant to forecast 'close', change the fit
# column instead and keep both in sync.
error = mean_squared_error(test['low'], predictions)
print(f'ARIMA(0,1,0) MSE Error: {error:11.10}')


error = rmse(test['low'], predictions)
print(f'ARIMA(0,1,0) RMSE Error: {error:11.10}')

In [None]:
results = {'algo':'','name':'','date':'', 'time_frame':'','success':'','RMSE':'', 'MSE':'', 'classification':'' }


In [None]:
daily.columns

---

# SARIMAX


In [None]:
# daily = daily.resample('B').agg({'open':'first','high':'max',
#                                         'low':'min', 'close':'last'})

In [None]:
daily.index

In [None]:
# Drop rows with a missing 'close' on the frame itself. Calling
# dropna(inplace=True) on the selected column only acts on that temporary
# selection and is not guaranteed to change `daily`.
# NOTE(review): `daily` is not created in any cell visible in this notebook - confirm where it is loaded.
daily = daily.dropna(subset=['close'])

In [None]:
result = seasonal_decompose(daily['close'], model='add', period=400 )
result.plot();

In [None]:
%%time
auto_arima(daily['close'], seasonal=True, maxiter=10000).summary()

In [None]:
model = SARIMAX(train['close'], order=(0,1,0), seasonal_order=(1,0,1,12))

In [None]:
len(train)

In [None]:
train.columns

In [None]:
# Grid-search the seasonal orders (P, D, Q) of a SARIMA model on the training
# 'close' series, with the non-seasonal order and the seasonal period held
# fixed, and keep the combination with the lowest in-sample MSE.
ORDER = (0, 1, 0)      # (p, d, q)
SEASONAL_PERIOD = 12   # S

# Starting MSE and (P, D, Q).
mse = 99 * (10 ** 16)
final_P = 0
final_D = 0
final_Q = 0

train_close = train['close']

for P in range(3):
    for Q in range(3):
        for D in range(3):
            label = f'SARIMA{ORDER}x({P},{D},{Q},{SEASONAL_PERIOD})'
            try:
                # Instantiate SARIMA model.
                sarima = SARIMAX(endog = train_close,
                                 order = ORDER,                              # (p, d, q)
                                 seasonal_order = (P, D, Q, SEASONAL_PERIOD)) # (P, D, Q, S)

                # Fit SARIMA model.
                model = sarima.fit()

                # Generate in-sample predictions for every training row, so
                # their length always matches the series they are scored against.
                preds = model.predict(start=0, end=len(train_close) - 1)

                # Evaluate predictions (computed once and reused below).
                candidate_mse = mean_squared_error(train_close, preds)

            except Exception as exc:
                # Some order combinations cannot be fitted; report why and move on
                # instead of hiding the reason (and without trapping Ctrl-C).
                print(f'{label} could not be fitted or evaluated: {exc}')
                continue

            print(f'The MSE for {label} is: {candidate_mse}')

            # Save for final report.
            if candidate_mse < mse:
                mse = candidate_mse
                final_P = P
                final_D = D
                final_Q = Q

print(f'Our model that minimizes MSE on the training data is the SARIMA{ORDER}x({final_P},{final_D},{final_Q},{SEASONAL_PERIOD}).')
print(f'This model has an MSE of {mse}.')