In [None]:
import pandas as pd
import numpy as np
import time
from helper import *
from datetime import timedelta, date, datetime

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import plotly.graph_objects as go
from plotly.subplots import make_subplots

import seaborn as sns
import datetime as dt
import matplotlib.pyplot as plt

# Line colours for the per-asset performance plots (one entry per asset, in column order).
colors = [
    "#FF0000", "#18E3FF", "#FFA90C", "#D9E501", "#0A2DC2",
    "#05A41B", "#878787", "#935948", "#b372fa", "#FC12DD",
]
# Separate, shorter palette; not referenced by the cells visible in this notebook.
bench_colors = [
    "#000000",
    "#18E3FF",
    "#FFA90C",
    "#05A41B",
]

# Global plotting defaults for seaborn / matplotlib figures.
sns.set_context("notebook")
plt.rcParams["figure.figsize"] = (16, 8)

In [None]:
# Load the raw market data through the project helper module.
# NOTE(review): '1d' is presumably the candle interval - confirm against helper.get_data.
df = get_data('1d')
# Close prices extracted from the raw frame (used below for correlations and stats).
df_close = get_close_list(df)
# Converted prices; plot_performance feeds the same conversion into cumprod(),
# so these are presumably period-over-period ratios - verify in helper.convert_prices.
df_ratio = convert_prices(df_close)

## Split Dataset into Train and Test Periods

The Training Dataset consists of `379 days (9096 hours)`.  

The Validation and Test Datasets each consist of a `28-day (672 hours)` period.  
The first trading period is the **Validation Period**, which is used for parameter tuning. The next 6 months are split into 6 additional 4-week trading periods (the last one is cut short at the end of the data on 2022-12-01).
The training window is kept at a constant size but is moved forward by 28 days for each consecutive trading window.


In [None]:
# Build the walk-forward schedule: a 379-day training window and a 28-day test
# window, both shifted forward by 28 days for each of the 7 iterations.
start_train = datetime.strptime("11.05.2021", "%d.%m.%Y")
start_date = datetime.strptime("25.05.2022", "%d.%m.%Y")
schedule = []
for i in range(7):
    # Windows are inclusive: each one ends one second before the next day starts.
    end_date = start_date + timedelta(days=27, hours=23, minutes=59, seconds=59)
    end_train = start_train + timedelta(days=378, hours=23, minutes=59, seconds=59)
    schedule.append({
        'train_start': start_train,
        'train_end': end_train,
        'test_start': start_date,
        'test_end': end_date,
    })
    start_date = start_date + timedelta(days=28)
    start_train = start_train + timedelta(days=28)
iterations = pd.DataFrame(schedule)

# The data set ends on 2022-12-01, so the last test window is cut short.
# A single .loc call is used because chained indexing
# (`iterations.iloc[6]['test_end'] = ...`) may write to a temporary copy and
# leave the frame unchanged; a Timestamp keeps the column dtype datetime64.
iterations.loc[iterations.index[-1], 'test_end'] = pd.Timestamp('2022-12-01 23:59:59')
iterations

In [None]:
# Write the train/test schedule to dataset_split.csv in the working directory
# (to_csv is called with its defaults, so the row index is written as well).
iterations.to_csv('dataset_split.csv')

## Crypto Dataset for Thesis

```
Start          : 2021-05-11
End            : 2022-12-01
Total          : 570 days (13680 hours)

Train Set      : 9096 hours (379 days)
Start          : 2021-05-11
End            : 2022-05-24

Validation set : 672 hours (28 days)
Start          : 2022-05-25
End            : 2022-06-21

Trading set:
Start          : 2022-06-22
End            : 2022-12-01 (163 days - 5x28 + 1x23)

ratio total data
train            : 0.665
test + val total : 0.335

```

## Original Dataset from Paper

```
Train Start: 2009-01-01
Train End  : 2015-09-30
Days       : 1692 trading days (2463 total)

Validation Start: 2015-10-01
Validation End  : 2015-12-31
Days (US)       : 61 trading days (91 total)

Trading windows: 2016-01-01
End            : 2020-03-30 (roughly)
days           : 1071           

17 trading windows, each 63 days 

total days for training: 1753
days training + 1 trading: 1816
ratio trading vs train : 0.03469
```

In [None]:
def plot_performance(df, title='Period 1', log_scale=False):
    """Plot the cumulative performance of every asset in ``df`` and save it.

    Close prices are extracted with ``get_close_list``, converted with
    ``convert_prices`` and accumulated with ``cumprod``; one line is drawn
    per resulting column. The figure is shown and then written to
    ``plots/<title>.html`` and ``plots/<title>.png`` (spaces in the title
    replaced by underscores, ``_logscale`` appended when ``log_scale`` is set).

    Args:
        df: Market data frame accepted by ``get_close_list``.
        title: Prefix of the figure title; also used to build the file names.
        log_scale: If True, draw the y-axis on a logarithmic scale.
    """
    import os

    ratios = convert_prices(get_close_list(df)).cumprod()

    fig = make_subplots(specs=[[{"secondary_y": True}]])
    for idx, ticker in enumerate(ratios.columns):
        # Wrap around the palette so that more assets than colours do not
        # raise an IndexError.
        line_color = colors[idx % len(colors)]
        fig.add_trace(
            go.Scatter(x=ratios.index, y=ratios[ticker], name=ticker, marker_color=line_color),
            secondary_y=False,
        )

    fig.update_layout(title_text=f'{title} Market Performances', height=600)
    log_str = ''
    if log_scale:
        fig.update_yaxes(type="log")
        log_str = '_logscale'
    fig.show()

    # Make sure the output folder exists before exporting.
    os.makedirs("plots", exist_ok=True)
    fig.write_html(f"plots/{title.replace(' ','_')}{log_str}.html")
    fig.write_image(f"plots/{title.replace(' ','_')}{log_str}.png", width=1200)
    
def plot_correlation(df, filename=''):
    """Draw an annotated correlation heatmap of the close prices in ``df``.

    The heatmap sits in a wide left axes and its colorbar in a narrow right
    axes. ``filename`` doubles as the figure title; when it is not the empty
    string the figure is also saved to ``plots/<filename>_correlations.png``
    (spaces replaced by underscores).

    Args:
        df: Market data frame accepted by ``get_close_list``.
        filename: Figure title and file-name stem; '' disables saving.
    """
    corr_matrix = get_close_list(df).corr()

    fig, (heat_ax, cbar_ax) = plt.subplots(
        ncols=2,
        figsize=(9, 7),
        gridspec_kw={"width_ratios": [6, 0.5]},
    )
    sns.heatmap(
        corr_matrix,
        ax=heat_ax,
        annot=True,
        cbar=False,
        vmin=-1,
        vmax=1,
        square=True,
    )
    # The heatmap is drawn without its own colorbar; attach one to the narrow axes.
    fig.colorbar(heat_ax.collections[0], cax=cbar_ax)
    fig.suptitle(filename, fontsize='large')

    if filename != '':
        fig.savefig(f"plots/{filename.replace(' ','_')}_correlations.png")
    plt.show()
    
    

In [None]:
# Cut the training window of the first iteration out of the full data set
# (both bounds inclusive) and plot its performance on a log scale.
train_start = str(iterations['train_start'].iloc[0])
train_end = str(iterations['train_end'].iloc[0])
train_set = df.loc[(df['date'] >= train_start) & (df['date'] <= train_end)]
set_name = 'Training Dataset'
plot_performance(train_set, set_name, log_scale=True)

In [None]:
# Correlation heatmap of the training window; set_name still holds the label
# assigned in the previous cell and is used as title and file-name stem.
plot_correlation(train_set, set_name)

# Test Dataset

In [None]:
# The complete test set is every row from the start of the first test window
# onward; no upper bound is applied.
complete_test_start = str(iterations['test_start'].iloc[0])
complete_test_set = df.loc[df['date'] >= complete_test_start]
set_name = 'Test Dataset (complete)'
plot_performance(complete_test_set, set_name, log_scale=False)

In [None]:
# Set the seaborn font scale to 1 before drawing the single large heatmap.
sns.set(font_scale=1)
# Correlation heatmap over the complete test set (title / file stem from set_name).
plot_correlation(complete_test_set, set_name)

### Individual Test datasets

In [None]:
# One stand-alone correlation heatmap (and PNG) per test window.
sns.set(font_scale=1)
items = iterations.shape[0]
for i in range(items):
    period = iterations.iloc[i]
    test_start = str(period['test_start'])
    test_end = str(period['test_end'])

    # Keep only the rows that fall inside this window (both bounds inclusive).
    in_window = (df['date'] >= test_start) & (df['date'] <= test_end)
    test_set = df[in_window]

    set_name = f'Test Dataset Period {i}'
    plot_correlation(test_set, set_name)

In [None]:
# Grid of correlation heatmaps, one per test window, with a single shared
# colorbar in a narrow sub-figure on the right.
items = iterations.shape[0]
col_per_row = 3
rows = int(np.ceil(items / col_per_row))

fig = plt.figure(constrained_layout=True, figsize=(12, 12))
subfigs = fig.subfigures(1, 2, wspace=0.01, width_ratios=[14, 1])

# squeeze=False keeps the axes array 2-D even when all periods fit in one row,
# so the [row, col] indexing below always works.
axsLeft = subfigs[0].subplots(rows, col_per_row, squeeze=False)
sns.set(font_scale=0.6)

for i in range(items):
    row, col = divmod(i, col_per_row)
    ax = axsLeft[row, col]

    test_start = str(iterations.iloc[i]['test_start'])
    test_end = str(iterations.iloc[i]['test_end'])
    test_set = df[(df['date'] >= test_start) & (df['date'] <= test_end)]
    test_corr = get_close_list(test_set).corr()
    # Drop the quote-currency suffix to keep the tick labels short.
    test_corr.columns = [x.replace('USDT', '') for x in list(test_corr.columns)]
    test_corr.index = [x.replace('USDT', '') for x in list(test_corr.index)]
    ax.set_title(f"Test Period {i}", fontsize=8)
    h = sns.heatmap(test_corr, annot=True, cbar=False, ax=ax, vmin=-1, vmax=1, square=True)
    h.set_yticklabels(h.get_yticklabels(), rotation = 0, fontsize = 7)
    h.set_xticklabels(h.get_xticklabels(), rotation = 90, fontsize = 7)

# Hide the unused trailing cells of the grid. Counting them as
# rows * col_per_row - items yields 0 when the last row is full; the former
# `col_per_row - items % col_per_row` blanked an entire row of real plots then.
missing = rows * col_per_row - items
for i in range(missing):
    axsLeft[rows-1, col_per_row-i-1].axis('off')

axsRight = subfigs[1].subplots(1, 1, sharex=True)

# All heatmaps share vmin/vmax, so the first one can drive the common colorbar.
subfigs[1].colorbar(axsLeft[0][0].collections[0], cax=axsRight)

fig.suptitle('Asset correlations per 28d test data set', fontsize='xx-large')

plt.show()
fig.savefig('plots/asset_correlations_28d.png')

In [None]:
# items = iterations.shape[0]
# col_per_row = 3
# rows = int(np.ceil(items / col_per_row))

# # fig = plt.figure(constrained_layout=True, figsize=(12, 28 ))  ## for 3col plot
# fig = plt.figure(constrained_layout=True, figsize=(9, 9 ))
# subfigs = fig.subfigures(1, 2, wspace=0.01, width_ratios=[15, 1])

# axsLeft = subfigs[0].subplots(rows, col_per_row)
# # subfigs[0].set_facecolor('0.75')
# sns.set(font_scale=0.6)



# for i in range(items):
#     row = int(np.floor(i / col_per_row))
#     col = i % col_per_row
#     ax = axsLeft[row][col]
#     print(ax)
#     test_start = str(iterations.iloc[i]['test_start'])
#     test_end = str(iterations.iloc[i]['test_end'])
#     test_set = df[(df['date'] >= test_start) & (df['date'] <= test_end)]
#     test_corr = get_close_list(test_set).corr()
#     test_corr.columns = [x.replace('USDT', '') for x in list(test_corr.columns)]
#     test_corr.index = [x.replace('USDT', '') for x in list(test_corr.index)]
#     ax.set_title(f"Test Period {i}", fontsize=8)
#     ax.tick_params(right=False, labelright=False, labelleft=True, labelrotation=0)
#     plt.xticks(rotation = 90)
#     plt.yticks(rotation= 90)
#     h = sns.heatmap(test_corr, annot=True, cbar=False, ax=ax, vmin=-1, vmax=1, square=True)
#     h.set_yticklabels(h.get_yticklabels(), rotation = 0, fontsize = 7)
#     h.set_xticklabels(h.get_xticklabels(), rotation = 90, fontsize = 7)

# # hide empty subplots in grid
# missing = col_per_row - (items % col_per_row)
# for i in range(missing):
#     axsLeft[rows-1, col_per_row-i-1].axis('off')
#     # pass
# # subfigs[0].colorbar(axsLeft[0][0].collections[0], cax=axsLeft[5][3])

# axsRight = subfigs[1].subplots(4, 1, sharex=True)
# # axsRightFirst = axsRight[0]
# # subfigs[1].set_facecolor('0.85')
# subfigs[1].colorbar(axsLeft[0][0].collections[0], cax=axsRight)

# fig.suptitle('Asset correlations per 28d test data set', fontsize='xx-large')
# # plt.xticks(rotation = 0)
# plt.show()
# # fig.savefig('plots/asset_correlations_28d.png')

In [None]:
# Plot the linear-scale market performance of every test window separately.
for i in range(len(iterations)):
    period = iterations.iloc[i]
    test_start = str(period['test_start'])
    test_end = str(period['test_end'])

    # between() is inclusive on both ends, matching the >= / <= filter used elsewhere.
    test_set = df[df['date'].between(test_start, test_end)]
    set_name = f'Test Dataset Period {i}'
    plot_performance(test_set, set_name)

In [None]:
# Grid of correlation heatmaps, one per test window, with a shared colorbar
# on the right (default tick-label styling, narrower colorbar column).
# NOTE(review): this saves to the same path as the earlier grid cell
# ('plots/asset_correlations_28d.png') and therefore overwrites that file -
# confirm which of the two variants should be kept.
items = iterations.shape[0]
col_per_row = 3
rows = int(np.ceil(items / col_per_row))

fig = plt.figure(constrained_layout=True, figsize=(12, 12))
subfigs = fig.subfigures(1, 2, wspace=0.01, width_ratios=[18, 1])

# squeeze=False keeps the axes array 2-D even when all periods fit in one row,
# so the [row, col] indexing below always works.
axsLeft = subfigs[0].subplots(rows, col_per_row, squeeze=False)
sns.set(font_scale=0.6)

for i in range(items):
    row, col = divmod(i, col_per_row)
    ax = axsLeft[row, col]

    test_start = str(iterations.iloc[i]['test_start'])
    test_end = str(iterations.iloc[i]['test_end'])
    test_set = df[(df['date'] >= test_start) & (df['date'] <= test_end)]
    test_corr = get_close_list(test_set).corr()
    # Drop the quote-currency suffix to keep the tick labels short.
    test_corr.columns = [x.replace('USDT', '') for x in list(test_corr.columns)]
    test_corr.index = [x.replace('USDT', '') for x in list(test_corr.index)]
    ax.set_title(f"Test Period {i}", fontsize=8)
    sns.heatmap(test_corr, annot=True, cbar=False, ax=ax, vmin=-1, vmax=1, square=True)

# Hide the unused trailing cells of the grid. Counting them as
# rows * col_per_row - items yields 0 when the last row is full; the former
# `col_per_row - items % col_per_row` blanked an entire row of real plots then.
missing = rows * col_per_row - items
for i in range(missing):
    axsLeft[rows-1, col_per_row-i-1].axis('off')

axsRight = subfigs[1].subplots(1, 1, sharex=True)

# All heatmaps share vmin/vmax, so the first one can drive the common colorbar.
subfigs[1].colorbar(axsLeft[0][0].collections[0], cax=axsRight)

fig.suptitle('Asset correlations per 28d test data set', fontsize='xx-large')

plt.show()
fig.savefig('plots/asset_correlations_28d.png')

In [None]:
# plot_correlation(train_set, set_name)
# plot_correlation(complete_test_set, set_name)

In [None]:
# Side-by-side correlation heatmaps: training set on the left, complete test
# set on the right, with one shared colorbar in the narrow middle sub-figure.
my_train_corr = get_close_list(train_set).corr()
my_test_corr = get_close_list(complete_test_set).corr()

# Identical rendering options for both heatmaps so they share one colour scale.
heatmap_options = dict(annot=True, cbar=False, vmin=-1, vmax=1, square=True)

fig = plt.figure(constrained_layout=True, figsize=(9, 4))
subfigs = fig.subfigures(1, 3, wspace=0.01, width_ratios=[8, 1, 8])

axsLeft = subfigs[0].subplots(1, 1)
axsLeft.set_title("Correlation Training Dataset", fontsize=8)
sns.heatmap(my_train_corr, ax=axsLeft, **heatmap_options)

axsMiddle = subfigs[1].subplots(1, 1, sharex=True)
subfigs[1].colorbar(axsLeft.collections[0], cax=axsMiddle)

axsRight = subfigs[2].subplots(1, 1)
axsRight.set_title("Correlation Test Dataset", fontsize=8)
sns.heatmap(my_test_corr, ax=axsRight, **heatmap_options)
# Move the y tick labels of the right panel to its outer (right) edge.
axsRight.tick_params(right=True, labelright=True, labelleft=False, labelrotation=0)
# pyplot call: applies to the current axes (presumably axsRight, the last one created - verify).
plt.xticks(rotation=90)

plt.show()
fig.savefig("plots/correlations_overview_all.png", dpi=300)

In [None]:
# Display the training slice (rows of df inside the first training window) for inspection.
train_set

In [None]:
# Per-asset summary of the training window: absolute first/high/low/last close
# plus the high, low and last close as percentage change versus the first close.
train_close = get_close_list(train_set)

first_close = train_close.iloc[0]
highest_close = train_close.max()
lowest_close = train_close.min()
last_close = train_close.iloc[-1]

# NOTE(review): "max.dd" is the lowest close relative to the FIRST close, not a
# peak-to-trough drawdown - confirm this is the intended metric.
stat_rows = [
    first_close,
    highest_close,
    lowest_close,
    last_close,
    (((highest_close / first_close) - 1) * 100).round(),
    (((lowest_close / first_close) - 1) * 100).round(),
    (((last_close / first_close) - 1) * 100).round(),
]
columns = ["first", "high", "low", "last", "peak", "max.dd", "end"]
data = [row.tolist() for row in stat_rows]

# One row per statistic, then transpose so that each asset becomes a row.
res = pd.DataFrame(data, columns=train_close.columns).T
res.columns = columns
print("training stats")
res

In [None]:
# Per-asset summary of the complete test set: absolute first/high/low/last
# close plus the high, low and last close as percentage change versus the
# first close.
test_close = get_close_list(complete_test_set)
data = []
columns = ["first", "high", "low", "last", "peak", "max.dd","end" ]
data.append(test_close.iloc[0].tolist())
data.append(test_close.max().tolist())
data.append(test_close.min().tolist())
data.append(test_close.iloc[-1].tolist())
data.append((((test_close.max() / test_close.iloc[0]) -1) * 100).round().tolist())
# NOTE(review): "max.dd" is the lowest close relative to the FIRST close, not a
# peak-to-trough drawdown - confirm this is the intended metric.
data.append((((test_close.min() / test_close.iloc[0]) -1) * 100).round().tolist())
data.append((((test_close.iloc[-1] / test_close.iloc[0]) -1) * 100).round().tolist())
# One row per statistic, then transpose so that each asset becomes a row.
res = pd.DataFrame(data, columns=test_close.columns).T
res.columns = columns
# The table is built from complete_test_set (validation window plus all later
# trading windows), so it is labelled as test stats rather than validation stats.
print("test stats (complete test set)")
res

In [None]:
# Display the complete test set (all rows from the first test window onward) for inspection.
complete_test_set

In [None]:
# Same display of the complete test set as the cell directly above (duplicate cell).
complete_test_set

In [None]:
# NOTE(review): this rebinds complete_test_set to start at the SECOND test
# window (index 1), whereas the earlier cell uses index 0. Any cell re-run
# after this one sees the shorter set - confirm this is intended.
complete_test_start = str(iterations['test_start'].iloc[1])
complete_test_set = df.loc[df['date'] >= complete_test_start]
set_name = 'Test Dataset (complete)'

In [None]:
# Display the train/test schedule (one row per iteration) for reference.
iterations