### Construct the rolling windows for evaluation

This code accompanies the paper "Measuring Financial Time Series Similarity With a View to Identifying Profitable Stock Market Opportunities", published in the proceedings of the International Conference on Case-Based Reasoning (ICCBR) 2021.

For queries please email rian.dolphin@ucdconnect.ie

In [None]:
#-- Imports
import datetime as dt
import json
import os
import random
import time

import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import plotly.graph_objects as go
import yfinance as yf
from plotly.subplots import make_subplots
from tqdm import tqdm
#from scipy.special import softmax

#-- White theme (currently active) - comment these out and uncomment one of the alternatives below to switch
my_template = 'plotly_white'
background_color = 'rgba(255,255,255,1)'

#-- Dark Theme
#my_template = 'plotly_dark'
#background_color = 'rgba(0,0,0,1)'

#-- Dark Transparent background
#my_template = 'plotly_dark'
#background_color = 'rgba(0,0,0,0)'

In [2]:
#-- Define function to sort the train_df by date
def sort_date(df):
    """
    Return a copy of df with parsed date columns, ordered by start_date.

    The start_date and end_date columns are converted with pd.to_datetime
    using unit='ms' and the rows are sorted ascending by start_date.
    The dataframe passed in is left untouched; use the returned dataframe.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns start_date and end_date.

    Returns
    -------
    pd.DataFrame
        New dataframe with converted date columns, sorted by start_date.
        The original index labels are kept.
    """
    #-- assign() works on a copy, so the caller's dataframe is not modified
    converted = df.assign(
        start_date=pd.to_datetime(df['start_date'], unit='ms'),
        end_date=pd.to_datetime(df['end_date'], unit='ms'),
    )
    return converted.sort_values(by=['start_date'])

#-- Read in data we saved from previous notebook
train_df = pd.read_json('train_df_2021.json')

#-- Convert the date columns from epoch milliseconds and order the cases by start_date
train_df = sort_date(train_df)

In [3]:
train_df.head()

Unnamed: 0,start_date,end_date,ticker,historic_returns,next_month,next_year
0,2005-01-31,2006-01-31,AMD,"[-0.262027076, 0.1044304147, -0.07621775730000...",0.366013,-0.49183
27000,2005-01-31,2006-01-31,INF.L,"[0.0860914485, 0.025610507100000002, 0.0011904...",0.00461,0.314445
14580,2005-01-31,2006-01-31,BN.PA,"[0.0362345556, 0.0531448587, 0.019920048200000...",0.015854,0.418722
19440,2005-01-31,2006-01-31,STJ.L,"[0.0483505767, 0.0888595028, 0.0191947502, -0....",0.108208,0.640438
7020,2005-01-31,2006-01-31,BIIB,"[-0.0200633853, -0.405018441, -0.1071152145, 0...",-0.011705,0.06758


In [9]:
train_df.ticker.value_counts()

STJ.L     180
SVT.L     180
ILMN      180
SWKS      180
NTES      180
         ... 
INCY      180
TXN       180
PSON.L    180
OR.PA     180
CS.PA     180
Name: ticker, Length: 160, dtype: int64

In [10]:
train_df.shape

(28800, 6)

### New Similarity Measure

In [11]:
#-- Add cumulative returns for each case to the dataframe
#- This is the e() function in the paper
#- The returns of a case are compounded: the last element of the running product of (1 + r) minus 1
train_df['historic_cumulative'] = np.array([(np.array(xi)+1).cumprod()[-1] for xi in train_df.historic_returns])-1

In [8]:
train_df.head()

Unnamed: 0,start_date,end_date,ticker,historic_returns,next_month,next_year,historic_cumulative
0,2010-01-31,2010-07-31,PAYX,"[-0.049346250800000005, 0.032769975300000004, ...",0.012424,0.145575,-0.139811
14592,2010-01-31,2010-07-31,HIK.L,"[0.0600194955, 0.004566132200000001, 0.1490905...",0.003504,-0.03222,0.390939
14706,2010-01-31,2010-07-31,STAN.L,"[-0.1003718126, 0.0757578091, 0.1708137993, -0...",0.122486,0.01047,0.034439
14820,2010-01-31,2010-07-31,FRES.L,"[-0.1606248267, 0.1176467414, 0.1299133741, -0...",0.052578,0.831115,0.240727
14934,2010-01-31,2010-07-31,SDR.L,"[-0.0777199903, -0.0401285188, 0.1944935022, -...",0.06183,0.37326,-0.088353


### Get rolling dfs and test new similarity

In [9]:
#-- Draw a reproducible random sample of 1000 rows
#- np.random.randint picks row positions with replacement, so a row can appear more than once
np.random.seed(42)
sub_df = train_df.iloc[np.random.randint(0,len(train_df),1000)]
sub_df.head()

Unnamed: 0,start_date,end_date,ticker,historic_returns,next_month,next_year,historic_cumulative
15630,2016-07-31,2017-07-31,JMAT.L,"[0.1695824988, 0.018925496700000002, -0.013181...",-0.021247,0.339621,0.050774
15664,2010-05-31,2011-05-31,SLA.L,"[-0.0930001932, -0.038037147300000004, 0.15873...",-0.07165,-0.038532,0.203406
8234,2012-03-31,2013-03-31,EBAY,"[0.0324567236, 0.1116529279, -0.04461223, 0.07...",-0.008956,0.009687,0.530778
13283,2018-12-31,2019-12-31,VOD.L,"[-0.0949449343, -0.0936559607, -0.0310291735, ...",-0.043036,-0.043036,-0.035893
10103,2014-12-31,2015-12-31,CS.PA,"[-0.010816275, 0.0859151224, 0.0879884934, 0.0...",-0.0123,-0.013308,0.37479


In [12]:
def add_to_df(train_df, test_df, verbose=False, max_train_cases=50000):
    """
    Compare every test case with every training case and attach the results.

    Each row of test_df is compared with the first max_train_cases rows of
    train_df. For every metric the row receives one tuple holding a value per
    training case. All tuples of a row share the same order: ascending by
    training index label (NOT by similarity), so position j of every tuple
    refers to the training case sim_idx_val[j].

    Parameters
    ----------
    train_df : pd.DataFrame
        Cases to compare against. Needs the columns historic_returns,
        historic_cumulative, next_month and next_year.
    test_df : pd.DataFrame
        Query cases. Needs the columns historic_returns and
        historic_cumulative.
    verbose : bool
        If True, print progress and a rough time estimate at roughly every
        quarter of the test cases.
    max_train_cases : int
        Upper bound on the number of training rows (taken from the top of
        train_df) that each test case is compared with.

    Returns
    -------
    pd.DataFrame
        Copy of test_df with these added columns (one tuple per row):
        equal_weight_similarity : correlation + 1/(1 + cumprod_diffs)
        correlations            : Pearson correlation of the return series
        quantdare               : cosine similarity of the return series
        cumprod_diffs           : absolute difference in historic_cumulative
        shape_measure           : number of periods where both returns are
                                  non-zero and share the same sign
        sim_next_month          : next_month of the training cases
        sim_next_year           : next_year of the training cases
        sim_idx_val             : index labels of the training cases
    """
    #-- Only test_df gets new columns; copy it so the caller's frame is untouched
    test_df = test_df.copy()

    n_test = len(test_df)
    n_train = max(0, min(max_train_cases, len(train_df)))

    #-- Pull everything needed from the training cases out of the dataframe once,
    #-  instead of repeating the lookups for every (test, train) pair
    train_labels = train_df.index[:n_train]
    train_returns = [np.array(train_df.historic_returns.iloc[pos]) for pos in range(n_train)]
    train_sq_sums = [sum(returns**2) for returns in train_returns]
    train_cumulative = train_df.historic_cumulative.to_numpy()[:n_train]
    train_next_month = train_df.next_month.to_numpy()[:n_train]
    train_next_year = train_df.next_year.to_numpy()[:n_train]

    #-- Field order of one comparison record; the index label comes first because
    #-  the records are sorted and the label is therefore the primary sort key
    record_fields = ('sim_idx_val', 'equal_weight_similarity', 'correlations', 'cumprod_diffs',
                     'sim_next_month', 'sim_next_year', 'shape_measure', 'quantdare')
    collected = {field: [] for field in record_fields}

    #-- Report progress about four times; max() avoids a modulo by zero
    #-  when there are fewer than four test cases
    progress_step = max(1, n_test//4)
    start = time.time()

    for test_case_idx in range(n_test):
        if verbose and test_case_idx%progress_step==0:
            if test_case_idx==0:
                print('Starting...')
            else:
                elapsed = time.time()-start
                pct = int(test_case_idx/n_test * 100)
                print(f'{pct}% Complete')
                print(f'{round((elapsed/pct * 100 - elapsed)/60,2)} Estimated Minutes Remaining')

        test_returns = np.array(test_df.historic_returns.iloc[test_case_idx])
        test_cumulative = test_df.historic_cumulative.iloc[test_case_idx]
        test_sq_sum = sum(test_returns**2)

        records = []
        for train_case_idx in range(n_train):
            candidate_returns = train_returns[train_case_idx]
            product = test_returns*candidate_returns

            cumprod_diff = abs(test_cumulative - train_cumulative[train_case_idx])
            quantdare = sum(product)/np.sqrt(test_sq_sum*train_sq_sums[train_case_idx])
            correlation = np.corrcoef(test_returns, candidate_returns)[0,1]
            shape_measure = (product>0).sum()

            records.append((
                train_labels[train_case_idx],
                correlation+1/(1+cumprod_diff),
                correlation,
                cumprod_diff,
                train_next_month[train_case_idx],
                train_next_year[train_case_idx],
                shape_measure,
                quantdare,
            ))

        #-- Sort while keeping the relation between the metrics (ascending by index label)
        records.sort()

        #-- Turn the list of records into one tuple per metric;
        #-  with no training cases every metric is an empty tuple
        per_field = tuple(zip(*records)) if records else ((),)*len(record_fields)
        for field, values in zip(record_fields, per_field):
            collected[field].append(values)

    #-- Add the metrics in full, in a fixed column order
    for column in ('equal_weight_similarity', 'correlations', 'quantdare', 'cumprod_diffs',
                   'shape_measure', 'sim_next_month', 'sim_next_year', 'sim_idx_val'):
        test_df[column] = collected[column]

    return test_df


In [8]:
#-- Store the sorted list of unique start dates
#- (create_rolling_windows builds the same list itself from the dataframe it receives)
dates = sorted(list(train_df.start_date.value_counts().index))

In [13]:
#-- Create the rolling windows as described in the paper for evaluation
#- Each rolling window gets its own dataframe
def create_rolling_windows(train_df, window_length=5, test_window_length=2, steps=1):
    """
    Create multiple dataframes using a rolling window approach

    The unique start dates of train_df are sorted and a window is slid over
    them. For window i the cases whose start_date is one of the
    window_length+1 dates beginning at position i are used as the cases to
    compare against, and the cases of the following test_window_length dates
    are the test cases. Both parts are passed to add_to_df and the resulting
    dataframe is collected.

    Parameters
    ----------
    train_df : pd.DataFrame
        All cases. Needs the columns required by sort_date and add_to_df.
    window_length : int
        The comparison window covers window_length+1 consecutive dates.
    test_window_length : int
        Number of consecutive dates, directly after the comparison window,
        that form the test window.
    steps : int
        Number of dates the window is moved forward between iterations
        (1 unless weekly).

    Returns
    -------
    list of pd.DataFrame
        One add_to_df result per window, in chronological order.
    """
    #-- Work on a copy so the dataframe of the caller is left untouched
    train_df = sort_date(train_df.copy())

    dates = sorted(train_df.start_date.value_counts().index)

    rolling_dfs = []
    #- NOTE(review): with this stop value the final date never ends up in a test window
    #- (one more full window would fit) - confirm whether that is intended
    for i in tqdm(range(0, len(dates)-window_length-test_window_length-1, steps)):
        train_end = window_length+1+i
        test_end = train_end+test_window_length

        window_train = train_df[train_df.start_date.isin(dates[i:train_end])]
        window_test = train_df[train_df.start_date.isin(dates[train_end:test_end])]
        rolling_dfs.append(add_to_df(window_train, window_test))

    return rolling_dfs

In [14]:
rolling_dfs = create_rolling_windows(train_df)

100%|██████████| 172/172 [1:38:58<00:00, 34.53s/it]


In [15]:
#-- Save all the rolling_dfs
#- The output folder is created automatically if it does not exist yet
#- Then this code will populate the directory with lots of json files (one for each window)
#- This results in a lot of data (>30GB for the full set)
output_dir = 'Windows_2021_Short'
os.makedirs(output_dir, exist_ok=True)
for count, df in enumerate(rolling_dfs, start=1):
    file_name = f'{output_dir}/window_{count}.json'
    df.to_json(file_name)

In [15]:
#-- Example of one of the rolling DFs
rolling_dfs[4].head(2)

Unnamed: 0,start_date,end_date,ticker,historic_returns,next_month,next_year,historic_cumulative,equal_weight_similarity,correlations,quantdare,cumprod_diffs,shape_measure,sim_next_month,sim_next_year,sim_idx_val
18250,2010-11-30,2011-05-31,SKG.L,"[-0.1151358098, 0.0672516755, 0.2041091848, 0....",-0.071313,-0.439261,0.197283,"(1.005674212834682, 0.9514033225488387, 1.4538...","(0.2176136211789839, 0.09978106382694618, 0.47...","(0.14775439754476177, 0.1252444882427777, 0.56...","(0.2689379605938953, 0.17422952460259955, 0.02...","(4, 5, 4, 6, 5, 4, 5, 3, 5, 4, 4, 4, 4, 5, 4, ...","(0.028839292000000002, 0.0830412685, 0.0451964...","(0.09267688860000001, 0.09853257750000001, 0.0...","(4, 5, 6, 7, 8, 9, 118, 119, 120, 121, 122, 12..."
8104,2010-11-30,2011-05-31,SBUX,"[0.0759719351, 0.0499995493, -0.0186740009, 0....",0.019928,0.543227,0.277981,"(0.8898822254820029, 0.8100239155224968, 0.305...","(0.14894142404838934, 0.013164848271920987, -0...","(0.009872671405590434, 0.07030648419829964, -0...","(0.3496354878353902, 0.2549270518440945, 0.060...","(1, 4, 3, 3, 4, 5, 4, 4, 2, 3, 5, 3, 1, 4, 3, ...","(0.028839292000000002, 0.0830412685, 0.0451964...","(0.09267688860000001, 0.09853257750000001, 0.0...","(4, 5, 6, 7, 8, 9, 118, 119, 120, 121, 122, 12..."
