# 4.0 Modelling Baseline

Work book summary:

This workbook creates a benchmark for the two strategies. I will use a confusion matrix to categorise the results.

The presence of a pattern is supposed to indicate a market reversal. Therefore I will assume that every detected pattern triggers a trade. If the market then reaches the target threshold, the trade is classified as a true positive. If the threshold is not met, it is recorded as a false positive.

Marubozu: 
+ Accuracy:	    0.62
+ Precision:	0.62

Fractals: 
+ Accuracy:	    0.67
+ Precision:	0.67

---

# Contents

- [1.0 Marubuzo Model](#1.0-Marubuzo-Model)
    - [1.1 Load Data](#1.1-Load-Data)
    - [1.2 Get patterns](#1.2-Get-patterns)
    - [1.3 Train Test Split](#1.3-Train-Test-Split)
    - [1.4 Loop through patterns](#1.4-Loop-through-patterns)
    - [1.5 Process results](#1.5-Process-results)

- [2.0 Fractal Model](#2.0-Fractal-Model)
    - [2.1 Load Data](#2.1-Load-Data)
    - [2.2 Get patterns](#2.2-Get-patterns)
    - [2.3 Train Test Split](#2.3-Train-Test-Split)
    - [2.4 Loop through patterns](#2.4-Loop-through-patterns)
    - [2.5 Process results](#2.5-Process-results)


In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import datetime
import calendar

from statsmodels.tsa.stattools import adfuller
# from statsmodels.tsa.seasonal import seasonal_decompose
# from statsmodels.tsa.statespace.sarimax import SARIMAX
# from statsmodels.tsa.arima_model import ARIMA, ARMA, ARMAResults, ARIMAResults
# from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from sklearn.metrics import r2_score, mean_squared_error
from pmdarima import auto_arima
# import plotly.graph_objects as go
import warnings
warnings.filterwarnings("ignore")


# from sklearn.metrics import mean_squared_error
from statsmodels.tools.eval_measures import rmse


In [2]:
pd.set_option('display.max_columns', None)

---

# 1.0 Marubuzo Model

## 1.1 Load Data

In [4]:
daily_maru = pd.read_csv('/Users/stuartdaw/Documents/Capstone_data/data/resampled/eur-usddailyMarubozu.csv', 
                    index_col='date', parse_dates=True)

In [5]:
daily_maru['date+5'] = pd.to_datetime(daily_maru['date+5'])

In [6]:
daily_maru.loc[daily_maru.index[1],'date+5']

Timestamp('2000-07-24 00:00:00')

In [7]:
type(daily_maru['date+5'][0])

pandas._libs.tslibs.timestamps.Timestamp

---

## 1.2 Get patterns

In [8]:
daily_pattern = pd.read_csv('/Users/stuartdaw/Documents/Capstone_data/patterns/dailyMarubozu.csv', 
                           parse_dates=True)

In [9]:
daily_pattern['pattern_end'] = pd.to_datetime(daily_pattern['pattern_end'])

In [10]:
daily_pattern.loc[1]

pattern_end   2000-10-20
Name: 1, dtype: datetime64[ns]

In [11]:
len(daily_pattern)

64

---

## 1.3 Train Test Split

In [12]:
# Create train test split based on the pattern date +5 more time frames
def create_train_test_split(date, time_series, model_info):
    """Split the price history around one Marubozu pattern date.

    The history is cut off at the ``date+5`` timestamp stored on the pattern
    row. The last five rows of that window form the test set and every row
    before them forms the training set. Two constant columns are prepended
    to the window: ``signal`` (the pattern row's ``marubozu`` value) and
    ``target_price`` (the pattern row's ``exit_price``).

    Parameters
    ----------
    date
        Index label of the pattern row in ``time_series``.
    time_series : pandas.DataFrame
        Frame whose index is compared against ``date`` and which provides
        the ``date+5``, ``exit_price`` and ``marubozu`` columns.
    model_info : dict
        Updated in place with the keys ``signal``, ``start``, ``end``,
        ``train`` and ``test``.

    Returns
    -------
    dict
        The same ``model_info`` object that was passed in.
    """
    window_end = time_series.loc[date, 'date+5']
    in_window = time_series.index <= window_end
    window = time_series.loc[in_window]

    # .item() raises unless exactly one row matches the pattern date.
    exit_target = time_series.loc[time_series.index == date, 'exit_price'].item()
    window.insert(0, 'target_price', exit_target)

    pattern_signal = time_series.loc[date, 'marubozu']
    model_info['signal'] = pattern_signal
    window.insert(0, 'signal', pattern_signal)

    # The final five rows of the window are held out as the test period.
    row_count = len(window)
    first_test_row = row_count - 5
    model_info['start'] = first_test_row
    model_info['end'] = row_count - 1
    model_info['train'] = window.iloc[:first_test_row]
    model_info['test'] = window.iloc[first_test_row:]

    return model_info

In [13]:
# Does the price cross the threshold?
def meet_threshold(row):
    """Report whether one row's price range reaches the target price.

    Returns ``-1`` when the row carries a sell signal (``signal == -1``) and
    its ``low`` is at or below ``target_price``, ``1`` when it carries a buy
    signal (``signal == 1``) and its ``high`` is at or above ``target_price``,
    and ``0`` in every other case.
    """
    direction = row['signal']

    if direction == -1:
        return -1 if row['low'] <= row['target_price'] else 0

    if direction == 1:
        return 1 if row['high'] >= row['target_price'] else 0

    return 0

In [15]:
# Create dataframe to store outcomes
def create_results_outcomes_dataframe(test):
    """Collect the test-period prices and the per-row threshold check.

    Copies ``low``, ``high`` and ``target_price`` from ``test``, stores the
    ``signal`` column under the name ``direction`` and adds ``correct_call``,
    the result of applying ``meet_threshold`` to every row of ``test``.
    """
    # (destination column, source column in ``test``), in output order.
    column_sources = (
        ('low', 'low'),
        ('high', 'high'),
        ('target_price', 'target_price'),
        ('direction', 'signal'),
    )

    outcomes = pd.DataFrame()
    for destination, source in column_sources:
        outcomes[destination] = test[source]

    outcomes['correct_call'] = test.apply(meet_threshold, axis=1)

    return outcomes

In [16]:
# As this is the benchmark, it is assumed that a buy/sell decision is always made
def classify(outcomes):
    """Label one benchmark trade as a true or false positive.

    The trade direction is the maximum of ``outcomes['direction']``. For a
    buy (``1``) the trade is ``'tp'`` when any ``correct_call`` equals ``1``
    and ``'fp'`` when the largest ``correct_call`` is ``0``. For a sell
    (``-1``) it is ``'tp'`` when any ``correct_call`` equals ``-1`` and
    ``'fp'`` when the smallest ``correct_call`` is ``0``. Any other direction
    yields ``'ERROR'``.

    Returns ``None`` when the direction is valid but the extreme
    ``correct_call`` value is neither ``0`` nor the direction itself.
    """
    trade_direction = max(outcomes['direction'])

    if trade_direction == 1:
        extreme_call = max(outcomes['correct_call'])
    elif trade_direction == -1:
        extreme_call = min(outcomes['correct_call'])
    else:
        return 'ERROR'

    if extreme_call == 0:
        return 'fp'
    if extreme_call == trade_direction:
        return 'tp'
    return None
    

## 1.4 Loop through patterns

In [17]:
# Loop through the Marubozu patterns: build the train/test split for each
# pattern date, classify the outcome of the assumed trade and save the result.

model_info = {"train":None,"test":None,"start":None,"end":None,"signal":None}
benchmark_results = []

for match in daily_pattern['pattern_end']:

    model_info = create_train_test_split(match, daily_maru, model_info)

    # Patterns with fewer than 10 training rows are left out of the benchmark.
    if len(model_info['train']) < 10:
        continue

    outcomes = create_results_outcomes_dataframe(model_info['test'])

    # Build the record only for patterns that are kept. RMSE/MSE stay None
    # because the benchmark makes no price forecast.
    benchmark_results.append({
        'name': 'Benchmark: ' + str(match),
        'pattern': None,
        'date': match,
        'time_frame': 'daily',
        'RMSE': None,
        'MSE': None,
        'classification': classify(outcomes),
        'strategy': 'Marubozu',
    })



## 1.5 Process results

In [18]:
# create the confusion matrix by processing the data
def create_cm(results):
    """Tally classification labels into a 2x2 confusion matrix.

    ``results`` is an iterable of mappings with a ``'classification'`` entry.
    The returned nested list is laid out as ``[[tp, fp], [fn, tn]]``. Entries
    whose label is none of ``'tp'``, ``'fp'``, ``'fn'`` or ``'tn'`` are not
    counted.
    """
    # Label -> (row, column) of the cell it increments.
    cell_for_label = {
        'tp': (0, 0),
        'fp': (0, 1),
        'fn': (1, 0),
        'tn': (1, 1),
    }

    matrix = [[0, 0], [0, 0]]

    for entry in results:
        cell = cell_for_label.get(entry['classification'])
        if cell is None:
            continue
        row, col = cell
        matrix[row][col] += 1

    return matrix

In [19]:
# save confusion matrix
cm = create_cm(benchmark_results)

In [20]:
# display confusion matrix
cm_df = pd.DataFrame(cm, index=['pred_success', 'pred_non_success'], columns=['actual success', 'actual non_success'])
cm_df

Unnamed: 0,actual success,actual non_success
pred_success,40,24
pred_non_success,0,0


In [21]:
def print_metrics(cm):
    """Print accuracy, precision and recall for a 2x2 confusion matrix.

    Parameters
    ----------
    cm
        2x2 nested sequence indexed as ``cm[row][col]`` and laid out as
        ``[[tp, fp], [fn, tn]]``.

    Each metric is rounded to two decimals. A metric whose denominator is
    zero is undefined and is printed as ``nan`` instead of raising
    ``ZeroDivisionError``. Nothing is returned.
    """
    tp, fp = cm[0][0], cm[0][1]
    fn, tn = cm[1][0], cm[1][1]

    def _ratio(numerator, denominator):
        # Undefined ratios (empty denominator) are reported as nan.
        return numerator / denominator if denominator else float('nan')

    # Accuracy - how many did the model get right:
    # total number of correct predictions / total number of predictions.
    acc = _ratio(tp + tn, tp + fp + fn + tn)

    # Precision - proportion of positive identifications that were correct:
    # true positives / (true positives + false positives).
    prec = _ratio(tp, tp + fp)

    # Recall - proportion of actual positives that were identified:
    # true positives / (true positives + false negatives).
    rec = _ratio(tp, tp + fn)

    print(f"Accuracy:\t{round(acc,2)}\nPrecision:\t{round(prec,2)}\nRecall:\t\t{round(rec,2)}")

In [22]:
# Display the results
print_metrics(cm)

Accuracy:	0.62
Precision:	0.62
Recall:		1.0


---

# 2.0 Fractal Model

## 2.1 Load Data

In [23]:
daily_fract = pd.read_csv('/Users/stuartdaw/Documents/Capstone_data/data/resampled/eur-usddailyfractals.csv', 
                    index_col='date', parse_dates=True)

In [24]:
daily_fract['date+5'] = pd.to_datetime(daily_fract['date+5'])

## 2.2 Get patterns

In [25]:
fractal_pattern = pd.read_csv('/Users/stuartdaw/Documents/Capstone_data/patterns/dailyfractals.csv', 
                           parse_dates=True)

In [26]:
fractal_pattern['pattern_end'] = pd.to_datetime(fractal_pattern['pattern_end'])

In [27]:
len(fractal_pattern)

613

---

## 2.3 Train Test Split

In [28]:
# Create train test split based on the pattern date +5 more time frames

def create_train_test_split(date, time_series, model_info):
    """Split the price history around one fractal pattern date.

    The window runs from the first row of ``time_series`` up to and including
    the fifth row after the pattern row. The last five rows of the window
    form the test set and the rows before them the training set. Two constant
    columns are prepended to the window: ``signal`` (the pattern row's
    ``fractal_end`` value) and ``target_price`` (the pattern row's
    ``exit_price``).

    Parameters
    ----------
    date
        Index label of the pattern row in ``time_series``.
    time_series : pandas.DataFrame
        Frame providing the ``exit_price`` and ``fractal_end`` columns.
    model_info : dict
        Updated in place with the keys ``signal``, ``start``, ``end``,
        ``train`` and ``test``.

    Returns
    -------
    dict
        The same ``model_info`` object that was passed in.

    NOTE(review): when fewer than five rows follow the pattern the slice is
    shorter, so the "last five rows" can include the pattern row or earlier
    rows - confirm whether such patterns should be skipped by the caller.
    """
    pattern_position = time_series.index.get_loc(date)
    window = time_series.iloc[:pattern_position + 6]

    # .item() raises unless exactly one row matches the pattern date.
    exit_target = time_series.loc[time_series.index == date, 'exit_price'].item()
    window.insert(0, 'target_price', exit_target)

    pattern_signal = time_series.loc[date, 'fractal_end']
    model_info['signal'] = pattern_signal
    window.insert(0, 'signal', pattern_signal)

    # The final five rows of the window are held out as the test period.
    row_count = len(window)
    first_test_row = row_count - 5
    model_info['start'] = first_test_row
    model_info['end'] = row_count - 1
    model_info['train'] = window.iloc[:first_test_row]
    model_info['test'] = window.iloc[first_test_row:]

    return model_info

In [29]:
# Determine whether the signal is correct and the future price is surpassed
def meet_threshold(row):
    """Report whether one row's price range reaches the target price.

    A buy signal (``signal == 1``) gives ``1`` when the row's ``high`` is at
    or above ``target_price``; a sell signal (``signal == -1``) gives ``-1``
    when the row's ``low`` is at or below ``target_price``. Every other case
    gives ``0``.
    """
    if row['signal'] == 1:
        target_reached = row['high'] >= row['target_price']
        return 1 if target_reached else 0

    if row['signal'] == -1:
        target_reached = row['low'] <= row['target_price']
        return -1 if target_reached else 0

    return 0

In [31]:
# create a dataframe of outcomes
def create_results_outcomes_dataframe(test):
    """Collect the test-period prices and the per-row threshold check.

    Copies ``low``, ``high``, ``5_day_avg``, ``open``, ``close`` and
    ``target_price`` from ``test``, stores the ``signal`` column under the
    name ``direction`` and adds ``correct_call``, the result of applying
    ``meet_threshold`` to every row of ``test``.
    """
    # (destination column, source column in ``test``), in output order.
    column_sources = (
        ('low', 'low'),
        ('high', 'high'),
        ('5_day_avg', '5_day_avg'),
        ('open', 'open'),
        ('close', 'close'),
        ('target_price', 'target_price'),
        ('direction', 'signal'),
    )

    outcomes = pd.DataFrame()
    for destination, source in column_sources:
        outcomes[destination] = test[source]

    outcomes['correct_call'] = test.apply(meet_threshold, axis=1)

    return outcomes

In [32]:
# Determine the classification of the strategy
# As this is the benchmark, it is assumed that a buy/sell decision is always made

def classify(outcomes):
    """Label one benchmark trade as a true or false positive.

    The trade direction is the maximum of ``outcomes['direction']``. A buy
    (``1``) is ``'tp'`` when the largest ``correct_call`` is ``1`` and
    ``'fp'`` when it is ``0``. A sell (``-1``) is ``'tp'`` when the smallest
    ``correct_call`` is ``-1`` and ``'fp'`` when it is ``0``. Any other
    direction yields ``'ERROR'``.

    Returns ``None`` when the direction is valid but the extreme
    ``correct_call`` value is neither ``0`` nor the direction itself.
    """
    trade_direction = max(outcomes['direction'])

    # Buys look at the largest call value, sells at the smallest.
    if trade_direction == 1:
        pick_extreme = max
    elif trade_direction == -1:
        pick_extreme = min
    else:
        return 'ERROR'

    extreme_call = pick_extreme(outcomes['correct_call'])

    if extreme_call == 0:
        return 'fp'
    if extreme_call == trade_direction:
        return 'tp'
    return None
    

## 2.4 Loop through patterns

In [33]:
# Loop through the fractal patterns: build the train/test split for each
# pattern date, classify the outcome of the assumed trade and save the result.

model_info = {"train":None,"test":None,"start":None,"end":None,"signal":None}
benchmark_results = []

for match in fractal_pattern['pattern_end']:

    model_info = create_train_test_split(match, daily_fract, model_info)

    # Patterns with fewer than 10 training rows are left out of the benchmark.
    if len(model_info['train']) < 10:
        continue

    outcomes = create_results_outcomes_dataframe(model_info['test'])

    # Build the record only for patterns that are kept. RMSE/MSE stay None
    # because the benchmark makes no price forecast. The strategy label is
    # the fractal strategy (it was previously copied from the Marubozu loop).
    benchmark_results.append({
        'name': 'Benchmark: ' + str(match),
        'pattern': None,
        'date': match,
        'time_frame': 'daily',
        'RMSE': None,
        'MSE': None,
        'classification': classify(outcomes),
        'strategy': 'Fractal',
    })


---

## 2.5 Process results

In [34]:
# create the confusion matrix by processing the data

def create_cm(results):
    """Tally classification labels into a 2x2 confusion matrix.

    ``results`` is an iterable of mappings with a ``'classification'`` entry.
    The returned nested list is laid out as ``[[tp, fp], [fn, tn]]``. Entries
    whose label is none of ``'tp'``, ``'fp'``, ``'fn'`` or ``'tn'`` are not
    counted.
    """
    # Label -> (row, column) of the cell it increments.
    cell_for_label = {
        'tp': (0, 0),
        'fp': (0, 1),
        'fn': (1, 0),
        'tn': (1, 1),
    }

    matrix = [[0, 0], [0, 0]]

    for entry in results:
        cell = cell_for_label.get(entry['classification'])
        if cell is None:
            continue
        row, col = cell
        matrix[row][col] += 1

    return matrix

In [35]:
# create confusion matrix
cm = create_cm(benchmark_results)

In [36]:
# print confusion matrix
cm_df = pd.DataFrame(cm, index=['pred_success', 'pred_non_success'], columns=['actual success', 'actual non_success'])
cm_df

Unnamed: 0,actual success,actual non_success
pred_success,410,202
pred_non_success,0,0


In [37]:
def print_metrics(cm):
    """Print accuracy, precision and recall for a 2x2 confusion matrix.

    Parameters
    ----------
    cm
        2x2 nested sequence indexed as ``cm[row][col]`` and laid out as
        ``[[tp, fp], [fn, tn]]``.

    Each metric is rounded to two decimals. A metric whose denominator is
    zero is undefined and is printed as ``nan`` instead of raising
    ``ZeroDivisionError``. Nothing is returned.
    """
    tp, fp = cm[0][0], cm[0][1]
    fn, tn = cm[1][0], cm[1][1]

    def _ratio(numerator, denominator):
        # Undefined ratios (empty denominator) are reported as nan.
        return numerator / denominator if denominator else float('nan')

    # Accuracy - how many did the model get right:
    # total number of correct predictions / total number of predictions.
    acc = _ratio(tp + tn, tp + fp + fn + tn)

    # Precision - proportion of positive identifications that were correct:
    # true positives / (true positives + false positives).
    prec = _ratio(tp, tp + fp)

    # Recall - proportion of actual positives that were identified:
    # true positives / (true positives + false negatives).
    rec = _ratio(tp, tp + fn)

    print(f"Accuracy:\t{round(acc,2)}\nPrecision:\t{round(prec,2)}\nRecall:\t\t{round(rec,2)}")

In [38]:
# Display the results
print_metrics(cm)

Accuracy:	0.67
Precision:	0.67
Recall:		1.0


## 1.2 Train Test Split

In [39]:
# type(daily.loc[daily.index == daily_pattern.loc[10]['pattern_end']].index[0])

In [40]:
daily.loc[daily.index == daily_pattern.loc[10]['pattern_end']].index[0]

NameError: name 'daily' is not defined

In [None]:
daily_pattern.index

In [None]:
# Test 1 date out
curr_pattern = daily.loc[daily.index == daily_pattern.loc[10]['pattern_end']].index[0]
curr_pattern

In [None]:
daily.index

In [None]:
test_end_date = daily.loc[daily.loc[daily.index == daily_pattern.loc[10]['pattern_end']].index[0],'date+5']
test_end_date

In [None]:
# daily.loc[daily.index == curr_pattern]

In [None]:
train_test = daily.loc[daily.index <= test_end_date]
# train_test = daily.loc[daily.index <= '2004-2-28 00:00:00']

In [None]:
# daily.loc[daily.index <= end_date]

In [None]:
# daily.loc[daily.index == daily_pattern.loc[10]['pattern_end'],'double_height']

In [None]:
target_value = daily.loc[daily.index == daily_pattern.loc[10,'pattern_end'],'double_height'].item()
target_value

In [None]:
# def choose_exit_price(row, target_price, signal=-1):
#     if signal == -1:
#         return target_price
# #         return row['close'] - (row['height'] * 1)
#     else:
#         return target_price

# #         return row['close'] + (row['height'] * 1)

In [None]:
train_test

In [None]:
# train_test['double_height'] = train_test.apply(choose_exit_price, axis=1)
# train_test['double_height'] = daily.loc[daily.index == daily_pattern.loc[10,'pattern_end'],'double_height'].item()
#train_test.loc['double_height'] = [target_value for x in train_test.loc[:,['double_height']]]
train_test.insert(0, 'target_price', target_value)
# train_test.insert(0, 'signal', signal)

In [None]:
signal = daily.loc[daily.index == daily_pattern.loc[10,'pattern_end'],'marubozu'].item()
signal

In [None]:
train_test.head()

In [None]:
[signal] * (len(train_test)-1)

In [None]:
#train_test.loc[:,['signal']] = [signal] * (len(train_test))
# train_test.loc[:,['signal']] = [signal]
# df.insert(0, 'A', 'foo')
train_test.insert(0, 'signal', signal)

In [None]:
train_test.tail(6)

In [None]:
# start=len(train)
# end=len(train)+len(test)-1
start = len(train_test)-5
end = len(train_test)-1
start, end

In [None]:
# Set for testing
train = train_test.iloc[:start]
test = train_test.iloc[start:]

In [None]:
test.head()

In [None]:
def train_test_plot(train, test):
    """Draw the training data in blue and the test data in orange on one 16x8 figure."""
    plt.figure(figsize=(16, 8))
    for series, colour in ((train, 'blue'), (test, 'orange')):
        plt.plot(series, c=colour)

In [None]:
# This plot confirms that our train test split makes sense
train_test_plot(train['close'], test['close'])

In [None]:
auto_arima(daily['close'].dropna(), seasonal=False).summary()

In [None]:
train

In [None]:
model = ARIMA(train['low'], order=(0,1,0))
results = model.fit()
results.summary()

In [None]:
predictions = results.predict(start=start, end=end, dynamic=False, typ='levels').rename('ARIMA-0-1-0 Predictions')

In [None]:
predictions.values

In [None]:
type(predictions)

In [None]:
def justified(row):
    """Return 1 when the row's price range reaches the target in the signalled direction, else 0.

    A sell signal (``signal == -1``) is justified when ``low`` is at or below
    ``target_price``; a buy signal (``signal == 1``) when ``high`` is at or
    above ``target_price``.
    """
    sell_reached = row['signal'] == -1 and row['low'] <= row['target_price']
    if sell_reached:
        return 1

    buy_reached = row['signal'] == 1 and row['high'] >= row['target_price']
    return 1 if buy_reached else 0

In [None]:
outcomes = pd.DataFrame()
outcomes['low'] = test['low']
outcomes['high'] = test['high']

outcomes['preds'] = predictions.values
outcomes['target_price'] = test['target_price']
# outcomes['direction'] = test['signal']
outcomes['signal_match'] = test.apply(justified, axis=1)

#daily_pre['target_price'] = daily_pre.apply(choose_exit_price, axis=1)
# outcomes.append(predictions, ignore_index=True)
outcomes

In [None]:
# predictions['date']  = test.index
#predictions.reset_index(test.index)

In [None]:
# predictions.reindex(test.index)

In [None]:
type(predictions)

In [None]:
test.head()['close'].isnull().sum()

In [None]:
train.head()['close'].isnull().sum()

In [None]:
outcomes['low'].plot(legend=True, figsize=(12,8))
outcomes['preds'].plot(legend=True);
outcomes['target_price'].plot(legend=True);

# predictions.plot(legend=True)

In [None]:

error = mean_squared_error(test['close'], predictions)
print(f'ARIMA(0,1,0) MSE Error: {error:11.10}')


error = rmse(test['close'], predictions)
print(f'ARIMA(0,1,0) RMSE Error: {error:11.10}')

In [None]:
results = {'algo':'','name':'','date':'', 'time_frame':'','success':'','RMSE':'', 'MSE':'', 'classification':'' }


In [None]:
daily.columns

---

# SARIMAX


In [None]:
# daily = daily.resample('B').agg({'open':'first','high':'max',
#                                         'low':'min', 'close':'last'})

In [None]:
daily.index

In [None]:
daily['close'].dropna(inplace=True)

In [None]:
result = seasonal_decompose(daily['close'], model='add', period=400 )
result.plot();

In [None]:
%%time
auto_arima(daily['close'], seasonal=True, maxiter=10000).summary()

In [None]:
model = SARIMAX(train['close'], order=(0,1,0), seasonal_order=(1,0,1,12))

In [None]:
len(train)

In [None]:
train.columns

In [None]:
# Grid-search the seasonal (P, D, Q) terms of a SARIMA(0, 1, 0)x(P, D, Q, 12)
# model and keep the combination with the lowest MSE on the training data.
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Starting MSE and (P, D, Q).
mse = 99 * (10 ** 16)
final_P = 0
final_D = 0
final_Q = 0

for P in range(3):
    for Q in range(3):
        for D in range(3):
            try:
                # Instantiate SARIMA model.
                sarima = SARIMAX(endog = train['close'],
                                 order = (0, 1, 0),              # (p, d, q)
                                 seasonal_order = (P, D, Q, 12)) # (P, D, Q, S)

                # Fit SARIMA model.
                model = sarima.fit()

                # Generate in-sample predictions over the whole training set so
                # their length matches train['close'] for the MSE below.
                preds = model.predict(start=0, end=len(train) - 1)

                # Evaluate predictions (computed once and reused).
                current_mse = mean_squared_error(train['close'], preds)
            except Exception as err:
                # Best-effort search: report the failing combination and why,
                # then carry on. KeyboardInterrupt is no longer swallowed.
                print(f"Failed for P: {P}, D: {D}, Q: {Q} -> {err!r}")
                continue

            print(f'The MSE for (0, 1, 0)x({P},{D},{Q},12) is: {current_mse}')

            # Save for final report.
            if current_mse < mse:
                mse = current_mse
                final_P = P
                final_D = D
                final_Q = Q

print(f'Our model that minimizes MSE on the training data is the SARIMA(0, 1, 0)x({final_P},{final_D},{final_Q},12).')
print(f'This model has an MSE of {mse}.')