# 4.0 Modelling Baseline

Work book summary:

This work book looks at creating a benchmark for the two strategies. I will use a confusion matrix to categorise the results.

The presence of a pattern is supposed to indicate a market reversal. For the benchmark I therefore assume that every detected pattern triggers a trade. If the market then moves beyond the threshold, the trade is classified as a true positive. If the threshold is not met, it is recorded as a false positive.

Marubozu: 
+ Accuracy:	    0.62
+ Precision:	0.62

Fractals: 
+ Accuracy:	    0.67
+ Precision:	0.67

---

# Contents

- [1.0 Marubuzo Model](#1.0-Marubuzo-Model)
    - [1.1 Load Data](#1.1-Load-Data)
    - [1.2 Get patterns](#1.2-Get-patterns)
    - [1.3 Train Test Split](#1.3-Train-Test-Split)
    - [1.4 Loop through patterns](#1.4-Loop-through-patterns)
    - [1.5 Process results](#1.5-Process-results)

- [2.0 Fractal Model ](#2.0-Fractal-Model)
    - [2.1 Load Data](#2.1-Load-Data)
    - [2.2 Get patterns](#2.2-Get-patterns)
    - [2.3 Train Test Split](#2.3-Train-Test-Split)
    - [2.4 Loop through patterns](#2.4-Loop-through-patterns)
    - [2.5 Process results](#2.5-Process-results)


In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import datetime
import calendar

from statsmodels.tsa.stattools import adfuller
# from statsmodels.tsa.seasonal import seasonal_decompose
# from statsmodels.tsa.statespace.sarimax import SARIMAX
# from statsmodels.tsa.arima_model import ARIMA, ARMA, ARMAResults, ARIMAResults
# from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from sklearn.metrics import r2_score, mean_squared_error
from pmdarima import auto_arima
# import plotly.graph_objects as go
import warnings
warnings.filterwarnings("ignore")


# from sklearn.metrics import mean_squared_error
from statsmodels.tools.eval_measures import rmse


In [2]:
# Show every column when a wide DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

---

# 1.0 Marubuzo Model

## 1.1 Load Data

In [4]:
# Load the resampled daily Marubozu CSV, using the 'date' column as a parsed datetime index.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
daily_maru = pd.read_csv('/Users/stuartdaw/Documents/Capstone_data/data/resampled/eur-usddailyMarubozu.csv', 
                    index_col='date', parse_dates=True)

In [5]:
# Convert 'date+5' to datetime so it can be compared against the datetime index
# when the train/test window is cut.
daily_maru['date+5'] = pd.to_datetime(daily_maru['date+5'])

In [6]:
# Spot-check: the 'date+5' value stored on the second row.
daily_maru.loc[daily_maru.index[1],'date+5']

Timestamp('2000-07-24 00:00:00')

In [7]:
# Confirm the conversion produced Timestamp objects. Use .iloc for positional access:
# plain [0] on a datetime-indexed Series relies on a deprecated positional fallback.
type(daily_maru['date+5'].iloc[0])

pandas._libs.tslibs.timestamps.Timestamp

---

## 1.2 Get patterns

In [8]:
# Load the dates on which a daily Marubozu pattern ended.
# NOTE(review): with no index_col, parse_dates=True presumably leaves 'pattern_end'
# unparsed here (it is converted explicitly in the next cell) — confirm.
daily_pattern = pd.read_csv('/Users/stuartdaw/Documents/Capstone_data/patterns/dailyMarubozu.csv', 
                           parse_dates=True)

In [9]:
# Convert 'pattern_end' to datetime so each value can be used as a lookup key
# into the date-indexed price series.
daily_pattern['pattern_end'] = pd.to_datetime(daily_pattern['pattern_end'])

In [10]:
# Spot-check the second pattern row.
daily_pattern.loc[1]

pattern_end   2000-10-20
Name: 1, dtype: datetime64[ns]

In [11]:
# Number of Marubozu pattern dates that will be evaluated.
len(daily_pattern)

64

---

## 1.3 Train Test Split

In [12]:
def create_train_test_split(date, time_series, model_info):
    """Cut the series at the pattern's 'date+5' and split off the last five rows as the test set.

    Parameters
    ----------
    date : index label of the row on which the pattern ended.
    time_series : DataFrame indexed by date; must provide the 'date+5',
        'exit_price' and 'marubozu' columns.
    model_info : dict that is updated in place with the keys 'signal',
        'start', 'end', 'train' and 'test'.

    Returns
    -------
    The same ``model_info`` dict that was passed in.
    """
    # Everything up to and including the pattern's 'date+5' belongs to this window.
    window_end = time_series.loc[date, 'date+5']
    in_window = time_series.index <= window_end
    window = time_series.loc[in_window]

    # The exit price on the pattern row is the threshold every row is compared against.
    exit_price = time_series.loc[time_series.index == date, 'exit_price'].item()
    window.insert(0, 'target_price', exit_price)

    # Broadcast the pattern's direction onto every row of the window.
    model_info['signal'] = time_series.loc[date, 'marubozu']
    window.insert(0, 'signal', model_info['signal'])

    # The final five rows form the test set; all earlier rows are training history.
    n_rows = len(window)
    split_at = n_rows - 5
    model_info['start'] = split_at
    model_info['end'] = n_rows - 1
    model_info['train'] = window.iloc[:split_at]
    model_info['test'] = window.iloc[split_at:]

    return model_info

In [13]:
def meet_threshold(row):
    """Report whether a row's price range reached the target in the signalled direction.

    Returns -1 when the signal is -1 and the row's low is at or below the target,
    1 when the signal is 1 and the row's high is at or above the target, and 0 otherwise.
    """
    signal = row['signal']

    if signal == -1:
        return -1 if row['low'] <= row['target_price'] else 0

    if signal == 1:
        return 1 if row['high'] >= row['target_price'] else 0

    return 0

In [15]:
def create_results_outcomes_dataframe(test):
    """Collect the test-window columns needed to judge a pattern, plus a per-row hit flag.

    ``test`` must provide 'low', 'high', 'target_price' and 'signal'. The result
    keeps the index of ``test``; 'signal' is exposed as 'direction' and
    'correct_call' holds the row-wise result of ``meet_threshold``.
    """
    # output column -> source column in `test`
    column_sources = {
        'low': 'low',
        'high': 'high',
        'target_price': 'target_price',
        'direction': 'signal',
    }

    outcomes = pd.DataFrame()
    for out_col, src_col in column_sources.items():
        outcomes[out_col] = test[src_col]

    outcomes['correct_call'] = test.apply(meet_threshold, axis=1)

    return outcomes

In [16]:
def classify(outcomes):
    """Label one pattern's test window as a true or false positive.

    This is the benchmark, so a trade is assumed for every pattern: the window
    is 'tp' when any row reached the target in the signalled direction and
    'fp' when none did. Returns 'ERROR' when the direction is neither 1 nor -1,
    and None when 'correct_call' holds values that do not fit the direction.
    """
    signal = max(outcomes['direction'])

    if signal == 1:
        # Long signal: any row flagged 1 makes the call a success.
        best_call = max(outcomes['correct_call'])
        if best_call == 0:
            return 'fp'
        if best_call == 1:
            return 'tp'
        return None

    if signal == -1:
        # Short signal: any row flagged -1 makes the call a success.
        best_call = min(outcomes['correct_call'])
        if best_call == 0:
            return 'fp'
        if best_call == -1:
            return 'tp'
        return None

    return 'ERROR'
    

## 1.4 Loop through patterns

In [17]:
# For every Marubozu pattern date: cut the train/test window, score the
# test rows against the target price and store the classification.

model_info = dict.fromkeys(("train", "test", "start", "end", "signal"))
benchmark_results = []

for match in daily_pattern['pattern_end']:

    results_dict = {
        'name': 'Bechmark: ' + str(match),
        'pattern': None,
        'date': None,
        'time_frame': 'daily',
        'RMSE': None,
        'MSE': None,
        'classification': None,
        'strategy': 'Maribozu',
    }

    model_info = create_train_test_split(match, daily_maru, model_info)

    # Patterns with fewer than 10 training rows are left out of the benchmark.
    if len(model_info['train']) >= 10:
        outcomes = create_results_outcomes_dataframe(model_info['test'])
        results_dict['classification'] = classify(outcomes)
        benchmark_results.append(results_dict)



## 1.5 Process results

In [18]:
def create_cm(results):
    """Tally classifications into a 2x2 confusion matrix ``[[tp, fp], [fn, tn]]``.

    ``results`` is an iterable of dicts with a 'classification' entry.
    Entries whose classification is not one of 'tp', 'fp', 'fn' or 'tn'
    are not counted.
    """
    # (label, row, column) of the matrix cell each recognised label increments
    cells = (('tp', 0, 0), ('fp', 0, 1), ('fn', 1, 0), ('tn', 1, 1))

    res_cm = [[0, 0],
              [0, 0]]

    for result in results:
        label = result['classification']
        for name, row, col in cells:
            if label == name:
                res_cm[row][col] += 1
                break

    return res_cm

In [19]:
# Build the confusion matrix from the Marubozu benchmark results
cm = create_cm(benchmark_results)

In [20]:
# Display the confusion matrix as a labelled table:
# rows are the predicted class, columns are the actual outcome
cm_df = pd.DataFrame(cm, index=['pred_success', 'pred_non_success'], columns=['actual success', 'actual non_success'])
cm_df

Unnamed: 0,actual success,actual non_success
pred_success,40,24
pred_non_success,0,0


In [21]:
def print_metrics(cm):
    """Print accuracy, precision and recall for a 2x2 confusion matrix.

    Parameters
    ----------
    cm : 2x2 nested sequence laid out as ``[[tp, fp], [fn, tn]]``
        (the layout produced by ``create_cm``).

    A metric whose denominator is zero is undefined and is printed as ``nan``
    instead of raising ``ZeroDivisionError``.
    """
    def ratio(numerator, denominator):
        # Guard against empty rows/columns of the matrix.
        return numerator / denominator if denominator else float('nan')

    tp, fp = cm[0][0], cm[0][1]
    fn, tn = cm[1][0], cm[1][1]

    # Accuracy - how many did the model get right
    # Total number of correct predictions / total number of predictions
    acc = ratio(tp + tn, np.sum(cm))

    # Precision - proportion of positive identifications that were actually correct
    # True positives / (true positives + false positives)
    prec = ratio(tp, tp + fp)

    # Recall - proportion of actual positives that were correctly identified
    # True positives / (true positives + false negatives)
    rec = ratio(tp, tp + fn)

    print(f"Accuracy:\t{round(acc,2)}\nPrecision:\t{round(prec,2)}\nRecall:\t\t{round(rec,2)}")

In [22]:
# Print accuracy, precision and recall for the Marubozu benchmark
print_metrics(cm)

Accuracy:	0.62
Precision:	0.62
Recall:		1.0


---

# 2.0 Fractal Model

## 2.1 Load Data

In [23]:
# Load the resampled daily fractals CSV, using the 'date' column as a parsed datetime index.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
daily_fract = pd.read_csv('/Users/stuartdaw/Documents/Capstone_data/data/resampled/eur-usddailyfractals.csv', 
                    index_col='date', parse_dates=True)

In [24]:
# Convert 'date+5' to datetime.
# NOTE(review): the fractal split defined later in this section slices by row
# position and does not appear to read this column — confirm it is still needed.
daily_fract['date+5'] = pd.to_datetime(daily_fract['date+5'])

## 2.2 Get patterns

In [25]:
# Load the dates on which a daily fractal pattern ended.
# NOTE(review): with no index_col, parse_dates=True presumably leaves 'pattern_end'
# unparsed here (it is converted explicitly in the next cell) — confirm.
fractal_pattern = pd.read_csv('/Users/stuartdaw/Documents/Capstone_data/patterns/dailyfractals.csv', 
                           parse_dates=True)

In [26]:
# Convert 'pattern_end' to datetime so each value can be used as a lookup key
# into the date-indexed price series.
fractal_pattern['pattern_end'] = pd.to_datetime(fractal_pattern['pattern_end'])

In [27]:
# Number of fractal pattern dates that will be evaluated.
len(fractal_pattern)

613

---

## 2.3 Train Test Split

In [28]:
def create_train_test_split(date, time_series, model_info, horizon=5, signal_col='fractal_end'):
    """Split the series around a pattern date into training history and a test window.

    The series is cut ``horizon`` rows after the pattern row; the last
    ``horizon`` rows of that cut form the test set and everything before them
    the training set.

    Parameters
    ----------
    date : index label of the row on which the pattern ended (must be unique
        in the index so that ``get_loc`` returns an integer position).
    time_series : DataFrame indexed by date; must provide 'exit_price' and
        the column named by ``signal_col``.
    model_info : dict that is updated in place with the keys 'signal',
        'start', 'end', 'train' and 'test'.
    horizon : number of rows after the pattern row to use as the test window
        (default 5, the previously hard-coded value).
    signal_col : column holding the pattern direction (default 'fractal_end').

    Returns
    -------
    The same ``model_info`` dict that was passed in.

    NOTE(review): when fewer than ``horizon`` rows follow ``date`` the slice is
    truncated, so the last ``horizon`` rows then include the pattern row or
    earlier rows — confirm callers never pass such dates.
    """
    pattern_loc = time_series.index.get_loc(date)

    # +1 because the iloc stop is exclusive: keep the pattern row plus `horizon` rows after it.
    train_test = time_series.iloc[:pattern_loc + horizon + 1]

    # The exit price on the pattern row is the threshold every row is compared against.
    target_value = time_series.loc[time_series.index == date, 'exit_price'].item()
    train_test.insert(0, 'target_price', target_value)

    # Broadcast the pattern's direction onto every row of the window.
    model_info['signal'] = time_series.loc[date, signal_col]
    train_test.insert(0, 'signal', model_info['signal'])

    model_info['start'] = len(train_test) - horizon
    model_info['end'] = len(train_test) - 1

    model_info['train'] = train_test.iloc[:model_info['start']]
    model_info['test'] = train_test.iloc[model_info['start']:]

    return model_info

In [29]:
def meet_threshold(row):
    """Check one row against the target price in the direction of the signal.

    Gives -1 for a sell signal whose low touched or went below the target,
    1 for a buy signal whose high touched or went above the target, else 0.
    """
    sell_hit = row['signal'] == -1 and row['low'] <= row['target_price']
    if sell_hit:
        return -1

    buy_hit = row['signal'] == 1 and row['high'] >= row['target_price']
    return 1 if buy_hit else 0

In [31]:
def create_results_outcomes_dataframe(test):
    """Assemble the outcome table for one test window.

    Copies the price columns and the target price from ``test``, exposes
    'signal' as 'direction' and adds 'correct_call', the row-wise result of
    ``meet_threshold``. The index of ``test`` is kept.
    """
    # Columns carried over from `test` under the same name, in output order.
    carried_over = ['low', 'high', '5_day_avg', 'open', 'close', 'target_price']

    outcomes = pd.DataFrame()
    for column in carried_over:
        outcomes[column] = test[column]

    outcomes['direction'] = test['signal']
    outcomes['correct_call'] = test.apply(meet_threshold, axis=1)

    return outcomes

In [32]:
def classify(outcomes):
    """Classify one pattern's test window for the benchmark.

    The benchmark assumes a buy/sell decision is made for every pattern, so
    the result is 'tp' when some row reached the target in the signalled
    direction and 'fp' when no row did. 'ERROR' is returned for a direction
    other than 1 or -1; None is returned if 'correct_call' holds values that
    do not fit the direction.
    """
    direction = max(outcomes['direction'])

    if direction not in (1, -1):
        return 'ERROR'

    # A buy (1) succeeds when the largest flag is 1; a sell (-1) when the smallest is -1.
    pick_extreme = max if direction == 1 else min
    extreme_call = pick_extreme(outcomes['correct_call'])

    if extreme_call == 0:
        return 'fp'
    if extreme_call == direction:
        return 'tp'
    return None
    

## 2.4 Loop through patterns

In [33]:
# Loop through the fractal patterns: train/test split around each pattern date,
# determine the outcome of the test window and save the result.

model_info = {"train":None,"test":None,"start":None,"end":None,"signal":None}
benchmark_results = []

for match in fractal_pattern['pattern_end']:

    results_dict = {'name':None,'pattern':None,'date':None,
                   'time_frame':None,'RMSE':None,
                   'MSE':None, 'classification':None}

    # NOTE(review): 'Bechmark' spelling kept as-is in case saved results are matched on it.
    results_dict['name'] = 'Bechmark: ' + str(match)
    # This loop evaluates fractal patterns; the label was previously 'Maribozu',
    # copied over from the Marubozu section.
    results_dict['strategy'] = 'Fractal'
    results_dict['time_frame'] = 'daily'

    model_info = create_train_test_split(match, daily_fract, model_info)

    # Skip patterns with fewer than 10 rows of training history.
    if len(model_info['train']) < 10:
        continue

    outcomes = create_results_outcomes_dataframe(model_info['test'])
    results_dict['classification'] = classify(outcomes)

    benchmark_results.append(results_dict)


---

## 2.5 Process results

In [34]:
def create_cm(results):
    """Count classifications into a 2x2 confusion matrix ``[[tp, fp], [fn, tn]]``.

    ``results`` is an iterable of dicts with a 'classification' entry; any
    classification other than 'tp', 'fp', 'fn' or 'tn' is ignored.
    """
    # Flat order matches the matrix read row by row.
    labels = ['tp', 'fp', 'fn', 'tn']
    counts = [0, 0, 0, 0]

    for result in results:
        outcome = result['classification']
        if outcome in labels:
            counts[labels.index(outcome)] += 1

    return [counts[:2], counts[2:]]

In [35]:
# Build the confusion matrix from the fractal benchmark results
cm = create_cm(benchmark_results)

In [36]:
# Display the confusion matrix as a labelled table:
# rows are the predicted class, columns are the actual outcome
cm_df = pd.DataFrame(cm, index=['pred_success', 'pred_non_success'], columns=['actual success', 'actual non_success'])
cm_df

Unnamed: 0,actual success,actual non_success
pred_success,410,202
pred_non_success,0,0


In [37]:
def print_metrics(cm):
    """Print accuracy, precision and recall for a 2x2 confusion matrix.

    Parameters
    ----------
    cm : 2x2 nested sequence laid out as ``[[tp, fp], [fn, tn]]``
        (the layout produced by ``create_cm``).

    When a metric's denominator is zero the metric is undefined; it is printed
    as ``nan`` rather than raising ``ZeroDivisionError``.
    """
    nan = float('nan')

    true_pos, false_pos = cm[0][0], cm[0][1]
    false_neg, true_neg = cm[1][0], cm[1][1]

    total = np.sum(cm)
    predicted_pos = true_pos + false_pos
    actual_pos = true_pos + false_neg

    # Accuracy - how many did the model get right
    # Total number of correct predictions / total number of predictions
    acc = (true_pos + true_neg) / total if total else nan

    # Precision - proportion of positive identifications that were actually correct
    # True positives / (true positives + false positives)
    prec = true_pos / predicted_pos if predicted_pos else nan

    # Recall - proportion of actual positives that were correctly identified
    # True positives / (true positives + false negatives)
    rec = true_pos / actual_pos if actual_pos else nan

    print(f"Accuracy:\t{round(acc,2)}\nPrecision:\t{round(prec,2)}\nRecall:\t\t{round(rec,2)}")

In [38]:
# Print accuracy, precision and recall for the fractal benchmark
print_metrics(cm)

Accuracy:	0.67
Precision:	0.67
Recall:		1.0
