In [1]:
# install packages and libraries
%matplotlib inline
%matplotlib widget

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#plt.rcParams["figure.figsize"] = (11, 5)  #set default figure size

plt.rcParams["figure.figsize"] = (16, 9) #set default figure size (w, h) 
plt.style.use("ggplot")
import numpy.matlib
import copy
import scipy.sparse as sparse
from numpy.random import default_rng

In [2]:
import sys # importing sys
  
# adding Latest_scripts to the system path
sys.path.insert(0, '../Latest_scripts/')
sys.path.insert(0, '../../../ExternalHP_Codes/SteveMorse/hawkes-master/')

import HP_scripts as HP # import module containing functions for the Masters project
import MHP as MHP # import module containing EM functions from Steve Morse for the Masters project

# Load previous sesssion

In [3]:
import dill
dill.load_session('DJIA_univHP_env_September_version2.db')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

### Load the data discussed and analysed in https://methods.sagepub.com/dataset/howtoguide/multivariate-hawkes-in-djia-2018#i100

In [3]:
# Read data
# Import the data as a dataframe (2D data structure with labelled axes)

df = pd.read_csv('financial_data/dataset-djia-2018-subset2.csv')
dates = pd.to_datetime(df['Date']) # set dates as the Date of closing price column


del df['Date'] # delete Date column

# Fill missing values
df.ffill(inplace=True)
df

Unnamed: 0,AABA,AAPL,AMZN,AXP,BA,CAT,CSCO,CVX,DIS,GE,...,MSFT,NKE,PFE,PG,TRV,UNH,UTX,VZ,WMT,XOM
0,40.91,10.68,47.58,52.58,70.44,57.80,17.45,59.08,24.40,35.37,...,26.84,10.74,23.78,58.78,45.99,61.73,56.53,30.38,46.23,58.47
1,40.97,10.71,47.25,51.95,71.17,59.27,17.85,58.91,23.99,35.32,...,26.97,10.69,24.55,58.89,46.50,61.88,56.19,31.27,46.32,58.57
2,41.53,10.63,47.65,52.50,70.33,59.27,18.35,58.19,24.41,35.23,...,26.99,10.76,24.58,58.70,46.95,61.69,55.98,31.63,45.69,58.28
3,43.21,10.90,47.87,52.68,69.35,60.45,18.77,59.25,24.74,35.47,...,26.91,10.72,24.85,58.64,47.21,62.90,56.16,31.35,45.88,59.43
4,43.42,10.86,47.08,53.99,68.77,61.55,19.06,58.95,25.00,35.38,...,26.86,10.88,24.85,59.08,47.23,61.40,56.80,31.48,45.71,59.40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3015,71.58,175.01,1168.36,98.74,295.10,155.75,38.55,124.98,108.67,17.50,...,85.51,63.29,36.14,92.13,134.39,220.00,127.23,53.19,98.21,83.97
3016,69.86,170.57,1176.76,98.57,295.36,156.44,38.48,125.98,108.12,17.43,...,85.40,63.65,36.21,92.48,134.78,219.60,127.14,53.22,99.16,83.98
3017,70.06,170.60,1182.26,99.13,295.62,157.52,38.56,125.55,107.64,17.38,...,85.71,62.95,36.33,92.10,134.77,220.42,127.58,53.28,99.26,83.90
3018,69.82,171.08,1186.10,99.70,296.35,158.42,38.59,125.58,107.77,17.36,...,85.72,62.95,36.37,92.07,135.66,222.77,128.12,53.43,99.40,84.02


In [4]:
# Google and amazon are not in DJIA list

ten_companies = ['AAPL','MSFT', 'JPM', 'GS', 'PFE', 'MRK','NKE', 'HD',  'GOOGL','AMZN']


ticker_id = [] # collect integer marker of each stock, index correspodns to stock AABA, 1 correponds to stock AAPL
ticker = []
ticker_dict = {} # collect key-value pairs where key represents tikcer of a stock and it's corresponding value an integer marker


for u,col in enumerate(df[ten_companies]):
    ticker_dict[col] = u
    ticker_id += [u]
    ticker += [col]

### Collect relevant data such as event times (largest $10 \%$ price drop, only include days were return was below 10% quantile), markers,...

In [78]:
# Collect event times

t_i=[]
u_i=[] # collect marker for each event times

ticker_id = [] # collect integer marker of each stock, index correspodns to stock AABA, 1 correponds to stock AAPL
ticker = []
ticker_dict = {} # collect key-value pairs where key represents tikcer of a stock and it's corresponding value an integer marker


for u,col in enumerate(df[ten_companies]):
    Tdiff = df[col].diff()
    timestamps=dates[Tdiff<=Tdiff.quantile(0.1)] # return timestamps where subsequent price difference was less than 10% quantile
    t_i.extend((timestamps - pd.Timestamp(2006,1,3)).dt.days.astype(float)) # measure time 
    u_i.extend(np.repeat(u,len(timestamps)))
    ticker_dict[col] = u
    ticker_id += [u]
    ticker += [col]

t_i = np.array(t_i)
#t_i=np.array(t_i)+np.random.rand(len(t_i))
u_i=np.array(u_i)
perm = np.argsort(t_i)
t_i = t_i[perm] # superposed timestamps
u_i = u_i[perm]

In [79]:
tstamps_ten_univ = []
tstamps_ten_univ_dates = []

count = 0
for i in ticker:
    tdiff_i = df[i].diff()
    tstamps_ten_univ_dates += [dates[tdiff_i<=tdiff_i.quantile(0.1)]]
    tstamps_i = (tstamps_ten_univ_dates[count] - pd.Timestamp(2006,1,3)).dt.days.astype(float) # change time units from days to months
    tstamps_ten_univ += [tstamps_i]
    count += 1

In [80]:
ts_i_with_dates = []
for i in range(len(ten_companies)):
    ts_i_with_dates += [pd.DataFrame({'Date': tstamps_ten_univ_dates[i].values, 'event time': tstamps_ten_univ[i].values})]
    #ts_i_with_dates[i].to_csv('ten_stocks_event_times/'+ten_companies[i]+'_ts_with_dates.csv', index=None)

In [83]:
avg1 = 0
avg2 = 0

for i in ticker:
    total_events = t_i[np.where(u_i == ticker_dict[i])].shape[0]
    print(f'number of events of stock '+i+':', total_events)
    print(f'average time between each event of \'type\' '+i+':', t_i[np.where(u_i == ticker_dict[i])][-1]/total_events)
    avg1 += total_events
    avg2 += t_i[np.where(u_i == ticker_dict[i])][-1]/total_events
print('')
print(f'Total number of total events for 10 companies: ',avg1)

print(f'Average number of total events for 10 companies: ',avg1/10)
print(f'Average time between each event for 10 companies: ',avg2/10)

number of events of stock AAPL: 303
average time between each event of 'type' AAPL: 14.448844884488448
number of events of stock MSFT: 305
average time between each event of 'type' MSFT: 14.321311475409836
number of events of stock JPM: 302
average time between each event of 'type' JPM: 14.443708609271523
number of events of stock GS: 303
average time between each event of 'type' GS: 14.415841584158416
number of events of stock PFE: 303
average time between each event of 'type' PFE: 14.41914191419142
number of events of stock MRK: 304
average time between each event of 'type' MRK: 14.351973684210526
number of events of stock NKE: 302
average time between each event of 'type' NKE: 14.490066225165563
number of events of stock HD: 302
average time between each event of 'type' HD: 14.437086092715232
number of events of stock GOOGL: 302
average time between each event of 'type' GOOGL: 14.413907284768213
number of events of stock AMZN: 302
average time between each event of 'type' AMZN: 14.4

### Reshape timestamps and marker data for largest $10 \%$ price-jumps into the form of Steve Morse's EM code, and convert unit of time from days to months, i.e. scale each event times by 30

In [11]:
data = []
counter = 0
for t,u in zip(t_i, u_i):
    data.append([])
    data[counter] += [t/30, np.array([u], dtype='int')]
    counter += 1


data = np.array(data)

In [12]:
data

array([[0.23333333333333334, array([4])],
       [0.4666666666666667, array([4])],
       [0.5, array([8])],
       ...,
       [145.86666666666667, array([6])],
       [145.93333333333334, array([0])],
       [145.93333333333334, array([9])]], dtype=object)

### Collect relevant data such as event times (largest 15% price drop, only include days were return was below 15% quantile), markers,...

In [65]:
# Collect event times

t_i=[]
u_i=[] # collect marker for each event times

ticker_id = [] # collect integer marker of each stock, index correspodns to stock AABA, 1 correponds to stock AAPL
ticker = []
ticker_dict = {} # collect key-value pairs where key represents tikcer of a stock and it's corresponding value an integer marker


for u,col in enumerate(df[ten_companies]):
    Tdiff = df[col].diff()
    timestamps=dates[Tdiff<=Tdiff.quantile(0.15)] # return timestamps where subsequent price difference was less than 10% quantile
    t_i.extend((timestamps - pd.Timestamp(2006,1,3)).dt.days.astype(float)) # measure time 
    u_i.extend(np.repeat(u,len(timestamps)))
    ticker_dict[col] = u
    ticker_id += [u]
    ticker += [col]

t_i = np.array(t_i)
#t_i=np.array(t_i)+np.random.rand(len(t_i))
u_i=np.array(u_i)
perm = np.argsort(t_i)
t_i = t_i[perm] # superposed timestamps
u_i = u_i[perm]

In [66]:
tstamps_ten_univ = []
tstamps_ten_univ_dates = []

count = 0
for i in ticker:
    tdiff_i = df[i].diff()
    tstamps_ten_univ_dates += [dates[tdiff_i<=tdiff_i.quantile(0.15)]]
    tstamps_i = (tstamps_ten_univ_dates[count] - pd.Timestamp(2006,1,3)).dt.days.astype(float) # change time units from days to months
    tstamps_ten_univ += [tstamps_i]
    count += 1

In [67]:
ts_i_with_dates = []
for i in range(len(ten_companies)):
    ts_i_with_dates += [pd.DataFrame({'Date': tstamps_ten_univ_dates[i].values, 'event time': tstamps_ten_univ[i].values})]
    #ts_i_with_dates[i].to_csv('ten_stocks_event_times/'+ten_companies[i]+'_ts_with_dates.csv', index=None)

In [68]:
print(ts_i_with_dates[0]['event time'])

0       380.0
1       567.0
2       574.0
3       583.0
4       602.0
        ...  
453    4353.0
454    4361.0
455    4368.0
456    4375.0
457    4378.0
Name: event time, Length: 458, dtype: float64


In [71]:
avg1 = 0
avg2 = 0

for i in ticker:
    total_events = t_i[np.where(u_i == ticker_dict[i])].shape[0]
    print(f'number of events of stock '+i+':', total_events)
    print(f'average time between each event of \'type\' '+i+':', t_i[np.where(u_i == ticker_dict[i])][-1]/total_events)
    avg1 += total_events
    avg2 += t_i[np.where(u_i == ticker_dict[i])][-1]/total_events
print('')
print(f'Total number of total events for 10 companies: ',avg1)

print(f'Average number of total events for 10 companies: ',avg1/10)
print(f'Average time between each event for 10 companies: ',avg2/10)

number of events of stock AAPL: 458
average time between each event of 'type' AAPL: 9.558951965065502
number of events of stock MSFT: 455
average time between each event of 'type' MSFT: 9.6
number of events of stock JPM: 453
average time between each event of 'type' JPM: 9.664459161147903
number of events of stock GS: 454
average time between each event of 'type' GS: 9.62114537444934
number of events of stock PFE: 453
average time between each event of 'type' PFE: 9.64459161147903
number of events of stock MRK: 453
average time between each event of 'type' MRK: 9.631346578366445
number of events of stock NKE: 455
average time between each event of 'type' NKE: 9.621978021978022
number of events of stock HD: 453
average time between each event of 'type' HD: 9.631346578366445
number of events of stock GOOGL: 453
average time between each event of 'type' GOOGL: 9.660044150110375
number of events of stock AMZN: 453
average time between each event of 'type' AMZN: 9.664459161147903

Total num

### Collect relevant data such as event times (largest 20% price drop, only include days were return was below 20% quantile), markers,...

In [72]:
# Collect event times

t_i=[]
u_i=[] # collect marker for each event times

ticker_id = [] # collect integer marker of each stock, index correspodns to stock AABA, 1 correponds to stock AAPL
ticker = []
ticker_dict = {} # collect key-value pairs where key represents tikcer of a stock and it's corresponding value an integer marker


for u,col in enumerate(df[ten_companies]):
    Tdiff = df[col].diff()
    timestamps=dates[Tdiff<=Tdiff.quantile(0.2)] # return timestamps where subsequent price difference was less than 10% quantile
    t_i.extend((timestamps - pd.Timestamp(2006,1,3)).dt.days.astype(float)) # measure time 
    u_i.extend(np.repeat(u,len(timestamps)))
    ticker_dict[col] = u
    ticker_id += [u]
    ticker += [col]

t_i = np.array(t_i)
#t_i=np.array(t_i)+np.random.rand(len(t_i))
u_i=np.array(u_i)
perm = np.argsort(t_i)
t_i = t_i[perm] # superposed timestamps
u_i = u_i[perm]

In [73]:
tstamps_ten_univ = []
tstamps_ten_univ_dates = []

count = 0
for i in ticker:
    tdiff_i = df[i].diff()
    tstamps_ten_univ_dates += [dates[tdiff_i<=tdiff_i.quantile(0.2)]]
    tstamps_i = (tstamps_ten_univ_dates[count] - pd.Timestamp(2006,1,3)).dt.days.astype(float) # change time units from days to months
    tstamps_ten_univ += [tstamps_i]
    count += 1

In [74]:
ts_i_with_dates = []
for i in range(len(ten_companies)):
    ts_i_with_dates += [pd.DataFrame({'Date': tstamps_ten_univ_dates[i].values, 'event time': tstamps_ten_univ[i].values})]
    #ts_i_with_dates[i].to_csv('ten_stocks_event_times/'+ten_companies[i]+'_ts_with_dates.csv', index=None)

In [75]:
print(ts_i_with_dates[0]['event time'])

0        34.0
1        37.0
2       380.0
3       420.0
4       524.0
        ...  
608    4355.0
609    4361.0
610    4368.0
611    4375.0
612    4378.0
Name: event time, Length: 613, dtype: float64


In [77]:
avg1 = 0
avg2 = 0

for i in ticker:
    total_events = t_i[np.where(u_i == ticker_dict[i])].shape[0]
    print(f'number of events of stock '+i+':', total_events)
    print(f'average time between each event of \'type\' '+i+':', t_i[np.where(u_i == ticker_dict[i])][-1]/total_events)
    avg1 += total_events
    avg2 += t_i[np.where(u_i == ticker_dict[i])][-1]/total_events
print('')

print(f'Total number of total events for 10 companies: ',avg1)

print(f'Average number of total events for 10 companies: ',avg1/10)
print(f'Average time between each event for 10 companies: ',avg2/10)

number of events of stock AAPL: 613
average time between each event of 'type' AAPL: 7.141924959216966
number of events of stock MSFT: 604
average time between each event of 'type' MSFT: 7.231788079470198
number of events of stock JPM: 613
average time between each event of 'type' JPM: 7.141924959216966
number of events of stock GS: 605
average time between each event of 'type' GS: 7.224793388429752
number of events of stock PFE: 610
average time between each event of 'type' PFE: 7.163934426229508
number of events of stock MRK: 604
average time between each event of 'type' MRK: 7.223509933774834
number of events of stock NKE: 611
average time between each event of 'type' NKE: 7.1653027823240585
number of events of stock HD: 606
average time between each event of 'type' HD: 7.1996699669967
number of events of stock GOOGL: 605
average time between each event of 'type' GOOGL: 7.234710743801653
number of events of stock AMZN: 604
average time between each event of 'type' AMZN: 7.24834437086

# I havent saved data where time is measured in months instead of days

In [106]:
pd.DataFrame(data, columns=['event time','node']).to_csv('multivariate_ts_data.csv', index=None)

In [98]:
node = 0
ts_col = [np.array([])]
for i in range(len(ticker)):
    ts_col = np.append(ts_col, data[np.where(data[:,1] == node)][:,0])
    node += 1


In [134]:
node = 0
ts_col = []
for i in range(len(ticker)):
    ts_col += [data[np.where(data[:,1] == node)][:,0].flatten()]
    node += 1

In [135]:
ts_col = np.array(ts_col)

In [142]:
print(ticker)

['AAPL', 'MSFT', 'JPM', 'GS', 'PFE', 'MRK', 'NKE', 'HD', 'GOOGL', 'AMZN']


In [145]:
pd.DataFrame(data[np.where(data[:,1] == 4)][:,0], columns=['event time']).to_csv('Pfizer_ts_data.csv', index=False)

In [8]:
HP.plot_event_times(data, num_of_nodes=10,  Time_horizon=-999, company_ticker=ticker, xaxislabel=r't, number of days since January 3, 2006', show_time_periods=False, labeled=True)
# plt.show()
plt.ion()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

[ 0 -1 -2 -3 -4 -5 -6 -7 -8 -9]
['AAPL', 'MSFT', 'JPM', 'GS', 'PFE', 'MRK', 'NKE', 'HD', 'GOOGL', 'AMZN']


### 20% drop

In [10]:
# Collect event times

t_i20=[]
u_i20=[] # collect marker for each event times

ticker_id = [] # collect integer marker of each stock, index correspodns to stock AABA, 1 correponds to stock AAPL
ticker = []
ticker_dict = {} # collect key-value pairs where key represents tikcer of a stock and it's corresponding value an integer marker


for u,col in enumerate(df[ten_companies]):
    Tdiff = np.log(df[col]).diff()
    timestamps=dates[Tdiff<=Tdiff.quantile(0.2)] # return timestamps where subsequent price difference was less than 0.2
    t_i20.extend((timestamps - pd.Timestamp(2006,1,3)).dt.days.astype(float))
    u_i20.extend(np.repeat(u,len(timestamps)))
    ticker_dict[col] = u
    ticker_id += [u]
    ticker += [col]


t_i20=np.array(t_i20)+np.random.rand(len(t_i20))
u_i20=np.array(u_i20)
perm = np.argsort(t_i20)
t_i20 = t_i20[perm] # superposed timestamps
u_i20 = u_i20[perm]



for i in ticker:
    total_events = t_i20[np.where(u_i20 == ticker_dict[i])].shape[0]
    print(f'number of events of stock '+i+':', total_events)
    avg = t_i20[np.where(u_i20 == ticker_dict[i])][-1]/total_events
    print(f'average time between each event of \'type\' '+i+':')
    print(r'%.3f days' % avg)

number of events of stock AAPL: 604
average time between each event of 'type' AAPL:
7.245 days
number of events of stock MSFT: 604
average time between each event of 'type' MSFT:
7.207 days
number of events of stock JPM: 604
average time between each event of 'type' JPM:
7.223 days
number of events of stock GS: 604
average time between each event of 'type' GS:
7.233 days
number of events of stock PFE: 604
average time between each event of 'type' PFE:
7.234 days
number of events of stock MRK: 604
average time between each event of 'type' MRK:
7.225 days
number of events of stock NKE: 604
average time between each event of 'type' NKE:
7.246 days
number of events of stock HD: 604
average time between each event of 'type' HD:
7.211 days
number of events of stock GOOGL: 604
average time between each event of 'type' GOOGL:
7.208 days
number of events of stock AMZN: 604
average time between each event of 'type' AMZN:
7.249 days


### 90% price jump

In [11]:
# Collect event times

t_i90=[]
u_i90=[] # collect marker for each event times

ticker_id = [] # collect integer marker of each stock, index correspodns to stock AABA, 1 correponds to stock AAPL
ticker = []
ticker_dict = {} # collect key-value pairs where key represents tikcer of a stock and it's corresponding value an integer marker


for u,col in enumerate(df[ten_companies]):
    Tdiff = (df[col]).diff()
    timestamps=dates[Tdiff>=Tdiff.quantile(0.9)] # return timestamps where subsequent price difference was less than 0.2
    t_i90.extend((timestamps - pd.Timestamp(2006,1,3)).dt.days.astype(float))
    u_i90.extend(np.repeat(u,len(timestamps)))
    ticker_dict[col] = u
    ticker_id += [u]
    ticker += [col]


t_i90=np.array(t_i90)+np.random.rand(len(t_i90))
u_i90=np.array(u_i90)
perm = np.argsort(t_i90)
t_i90 = t_i90[perm] # superposed timestamps
u_i90 = u_i90[perm]

for i in ticker:
    total_events = t_i90[np.where(u_i90 == ticker_dict[i])].shape[0]
    print(f'number of events of stock '+i+':', total_events)
    print(f'average time between each event of \'type\' '+i+':', t_i90[np.where(u_i90 == ticker_dict[i])][-1]/total_events)

number of events of stock AAPL: 303
average time between each event of 'type' AAPL: 14.414080194724823
number of events of stock MSFT: 303
average time between each event of 'type' MSFT: 14.403819918473534
number of events of stock JPM: 304
average time between each event of 'type' JPM: 14.375039087571116
number of events of stock GS: 302
average time between each event of 'type' GS: 14.471115837880753
number of events of stock PFE: 306
average time between each event of 'type' PFE: 14.262688718931082
number of events of stock MRK: 303
average time between each event of 'type' MRK: 14.39595108595655
number of events of stock NKE: 304
average time between each event of 'type' NKE: 14.376841098160119
number of events of stock HD: 302
average time between each event of 'type' HD: 14.489696643966676
number of events of stock GOOGL: 302
average time between each event of 'type' GOOGL: 14.461547275605175
number of events of stock AMZN: 302
average time between each event of 'type' AMZN: 14.4

## Use Kolmogorov-Smirnov test to check if the 10 univariates follow homogenous Poisson process

5          7.0
9         14.0
22        31.0
27        38.0
38        56.0
         ...  
2969    4306.0
2975    4314.0
2977    4318.0
3002    4354.0
3013    4369.0
Name: Date, Length: 303, dtype: float64

In [12]:
for u,col in enumerate(df[ten_companies]):
    Tdiff = df[col].diff()
    timestamps=dates[Tdiff<=Tdiff.quantile(0.1)] # return timestamps where subsequent price difference was less than 10% quantile
    t_i.extend((timestamps - pd.Timestamp(2006,1,3)).dt.days.astype(float)) # measure time 
    u_i.extend(np.repeat(u,len(timestamps)))
    ticker_dict[col] = u
    ticker_id += [u]
    ticker += [col]

AttributeError: 'numpy.ndarray' object has no attribute 'extend'

In [10]:
w = np.linspace(1,12,100)

In [15]:
P = MHP.MHP() # instantiate MHP object
P.generate_seq(60) # mu=[0.1], alpha=[[0.5]], and omega=1.0

#P.data = data[np.where(data[:,1] == 0)]
#P.data[:,1] = 0
#print(data[np.where(data[:,1] == 3)])
P.mu = [0.1]
P.alpha = [0.3]
P.omega = 0.5


# P.plot_events()
# plt.show()

seed = 10

rng = np.random.default_rng(seed)
ahat = rng.uniform(0,1, size=(1,1))
mhat = rng.uniform(0,1, size=11)



#w = np.linspace(0.1,4.25,100)
ahat, mhat

Max eigenvalue: 0.50000


(array([[0.95600171]]),
 array([0.20768181, 0.82844489, 0.14928212, 0.51280462, 0.1359196 ,
        0.68903648, 0.84174772, 0.425509  , 0.956926  , 0.82533291,
        0.33821531]))

In [16]:
ahat_arr = []; mhat_arr = []; LL_arr = []; res = []

In [13]:
u_i = np.linspace(0,0,np.shape(tstamps_PFE)[0])

In [14]:
data = []
counter = 0
for t,u in zip(tstamps_PFE, u_i):
    data.append([])
    data[counter] += [t, np.array([u], dtype='int')]
    counter += 1

In [15]:
data = np.array(data)

In [155]:
for w_hyperparam in range(len(w)):
    #print(f'hyperparameter omega: ', w[i])
    P.data = data[0:159,:]
    P.data[:,1] = 0
    res += [P.EM(np.array([ahat[0]]), np.array([mhat[0]]), w[w_hyperparam], verbose=False, seed1=None,seed2=None)]
    ahat_arr += [res[w_hyperparam][0]] 
    mhat_arr += [res[w_hyperparam][1]]
    LL_arr += [-res[w_hyperparam][-1]]

In [156]:
min_index = np.argmin(LL_arr)
omega_index = [min_index]
mhat_vals = mhat_arr[min_index]
ahat_vals = ahat_arr[min_index]

In [157]:
w[omega_index], mhat_vals, ahat_vals

(array([3.11111111]), array([0.07395349]), array([[5.00075732e-17]]))

In [166]:
plt.plot(w,LL_arr)
plt.show()
plt.ion()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [13]:
seed = 100

rng = np.random.default_rng(seed)
ahat = rng.uniform(0,1, size=(1,1))
mhat = rng.uniform(0,1, size=1)



#w = np.linspace(0.1,4.25,100)
ahat, mhat

(array([[0.83498163]]), array([0.59655403]))

In [22]:
mu_val, alpha_tilde_val, beta_val, NLL = HP.EM(mhat, ahat, 3, data[:,0], Maxiter=10, num_of_tstamp = -1)

ValueError: shapes (10,10) and (1,4578103) not aligned: 10 (dim 1) != 1 (dim 0)

In [None]:
mu_val, alpha_tilde_val/beta_val, beta_val, NLL

# Pfizer data outlier investigation

In [94]:
df2 = pd.read_csv('dataset-djia-2018-subset2.csv')
dates = pd.to_datetime(df2['Date']) # set dates as the Date of closing price column

In [96]:
df2[['PFE','MRK']].plot()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<matplotlib.axes._subplots.AxesSubplot at 0x24638151f48>

In [98]:
dates = pd.to_datetime(df2['Date']) # set dates as the Date of closing price column
df2.reset_index().plot(x='Date', y=['PFE','MRK'])
#plt.yscale('log')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<matplotlib.axes._subplots.AxesSubplot at 0x246398df908>

In [None]:
t_i = []
u_i = []

In [114]:
df2 = df2.set_index('Date')

In [134]:
tdiff_PFE = df['PFE'].diff()
ts_PFE = dates[tdiff_PFE<=tdiff_PFE.quantile(0.1)]
tstamps_PFE = (ts_PFE - pd.Timestamp(2006,1,3)).dt.days.astype(float)

In [None]:
ts_PFE_date = pd.DataFrame({'Date': ts_PFE.values, 'event time': tstamps_PFE.values})

# Appled data investigation

In [13]:
tdiff_AAPL = df['AAPL'].diff()
ts_AAPL = dates[tdiff_AAPL<=tdiff_AAPL.quantile(0.1)]
tstamps_AAPL = (ts_AAPL - pd.Timestamp(2006,1,3)).dt.days.astype(float)

In [15]:
AAPL_event_times = pd.DataFrame({'event time': tstamps_AAPL.values})
AAPL_event_times

Unnamed: 0,event time
0,567.0
1,574.0
2,583.0
3,610.0
4,674.0
...,...
298,4348.0
299,4353.0
300,4368.0
301,4375.0


In [16]:
AAPL_event_times.to_csv('AAPL_event_times.dat', header=None, index=None)

# Save event times with dates data of each stock

In [146]:
tstamps_ten_univ = []
tstamps_ten_univ_dates = []

count = 0
for i in ticker:
    tdiff_i = df[i].diff()
    tstamps_ten_univ_dates += [dates[tdiff_i<=tdiff_i.quantile(0.1)]]
    tstamps_i = (tstamps_ten_univ_dates[count] - pd.Timestamp(2006,1,3)).dt.days.astype(float)
    tstamps_ten_univ += [tstamps_i]
    count += 1

In [147]:
ts_i_with_dates = []
for i in range(len(ticker)):
    ts_i_with_dates += [pd.DataFrame({'Date': tstamps_ten_univ_dates[i].values, 'event time': tstamps_ten_univ[i].values})]
    ts_i_with_dates[i].to_csv('ten_stocks_event_times/'+ticker[i]+'_ts_with_dates.csv', index=None)

In [103]:
ts_MRK.iloc[0:100]

21    2006-02-02
65    2006-04-06
93    2006-05-17
95    2006-05-19
101   2006-05-30
         ...    
719   2008-11-10
721   2008-11-12
723   2008-11-14
724   2008-11-17
726   2008-11-19
Name: Date, Length: 100, dtype: datetime64[ns]

In [104]:
tstamps_MRK.iloc[0:100]

21       30.0
65       93.0
93      134.0
95      136.0
101     147.0
        ...  
719    1042.0
721    1044.0
723    1046.0
724    1049.0
726    1051.0
Name: Date, Length: 100, dtype: float64

In [71]:
ts_PFE.iloc[0:160]

5      2006-01-10
9      2006-01-17
22     2006-02-03
27     2006-02-10
38     2006-02-28
          ...    
1468   2011-10-31
1475   2011-11-09
1483   2011-11-21
1485   2011-11-23
1661   2012-08-07
Name: Date, Length: 160, dtype: datetime64[ns]

In [72]:
tstamps_PFE.iloc[0:160]

5          7.0
9         14.0
22        31.0
27        38.0
38        56.0
         ...  
1468    2127.0
1475    2136.0
1483    2148.0
1485    2150.0
1661    2408.0
Name: Date, Length: 160, dtype: float64

5          7.0
9         14.0
22        31.0
27        38.0
38        56.0
         ...  
1464    2121.0
1468    2127.0
1475    2136.0
1483    2148.0
1485    2150.0
Name: Date, Length: 159, dtype: float64

In [47]:
tstamps_PFE_subset = np.array(tstamps_PFE, dtype=float)[0:159]

In [48]:
tstamps_PFE_subset

array([   7.,   14.,   31.,   38.,   56.,   62.,   84.,   94.,  127.,
        134.,  135.,  147.,  167.,  189.,  192.,  211.,  230.,  289.,
        294.,  301.,  309.,  310.,  335.,  385.,  387.,  420.,  434.,
        475.,  492.,  519.,  520.,  533.,  535.,  561.,  569.,  570.,
        583.,  610.,  612.,  625.,  651.,  654.,  667.,  673.,  685.,
        687.,  692.,  707.,  710.,  731.,  742.,  744.,  745.,  752.,
        763.,  764.,  766.,  793.,  799.,  801.,  806.,  826.,  829.,
        835.,  840.,  855.,  885.,  890.,  899.,  905.,  916.,  937.,
        975.,  980.,  986.,  988.,  993., 1000., 1008., 1009., 1010.,
       1011., 1016., 1023., 1025., 1030., 1037., 1038., 1044., 1046.,
       1049., 1051., 1052., 1063., 1066., 1071., 1099., 1107., 1119.,
       1121., 1123., 1128., 1134., 1147., 1149., 1150., 1151., 1154.,
       1168., 1171., 1203., 1205., 1219., 1240., 1245., 1259., 1276.,
       1302., 1359., 1387., 1396., 1434., 1479., 1486., 1492., 1493.,
       1575., 1584.,

In [25]:
from scipy import stats

In [35]:
delta_t = stats.expon.rvs(size=10000)
t = np.cumsum(delta_t)
stats.kstest(t/t.max(), 'uniform')

KstestResult(statistic=0.006935085022182208, pvalue=0.7192655672067916)

In [38]:
plt.hist(t/t.max())
plt.show()
plt.ion()

In [51]:
delta_t = np.diff(tstamps_PFE_check) # waiting times between events
#delta_t = np.diff(data[:,0])
t = np.cumsum(delta_t) # cumulative sum of the waiting times
sample = t/t.max()
sample = np.array(sample, dtype=float)
stats.kstest(sample, 'uniform')

KstestResult(statistic=0.09407708354208988, pvalue=0.00889706899897158)

In [33]:
plt.hist(sample)

(array([26., 39., 52., 16., 25.,  9., 32., 23., 53., 27.]),
 array([0.00160477, 0.10144429, 0.20128381, 0.30112334, 0.40096286,
        0.50080238, 0.60064191, 0.70048143, 0.80032095, 0.90016048,
        1.        ]),
 <a list of 10 Patch objects>)

In [31]:
stats_res = []
counter = 0
print("Kolmogorov-Smirnov test to check if the 10 univariates follow homogenous Poisson process")

for i in ticker:
    delta_t = np.diff(t_i[np.where(u_i == ticker_dict[i])])
    t = np.cumsum(delta_t) # cumulative sum of the waiting times
    sample = t/t.max()
    sample = np.array(sample, dtype=float)
    stats_res += [stats.kstest(sample, 'uniform')]
    print("")
    print("K-S test result of "+i+':')
    print(f'p-value =',stats_res[counter][1])
    counter += 1

Kolmogorov-Smirnov test to check if the 10 univariates follow homogenous Poisson process

K-S test result of AAPL:
p-value = 1.8236325477265985e-16

K-S test result of MSFT:
p-value = 3.629881683931352e-05

K-S test result of JPM:
p-value = 4.479376905440927e-08

K-S test result of GS:
p-value = 1.5646723764133746e-16

K-S test result of PFE:
p-value = 0.00893297940973777

K-S test result of MRK:
p-value = 0.0019848736369328587

K-S test result of NKE:
p-value = 1.2099883981575186e-24

K-S test result of HD:
p-value = 1.6547317663727973e-21

K-S test result of GOOGL:
p-value = 5.2510035312893816e-17

K-S test result of AMZN:
p-value = 8.327460140257444e-35


In [21]:
for i in ticker:
    total_events = t_i20[np.where(u_i20 == ticker_dict[i])].shape[0]
    print(f'number of events of stock '+i+':', total_events)
    avg = t_i20[np.where(u_i20 == ticker_dict[i])][-1]/total_events
    print(f'average time between each event of \'type\' '+i+':')
    print(r'%.3f days' % avg)

number of events of stock AAPL: 604
average time between each event of 'type' AAPL:
7.245 days
number of events of stock MSFT: 604
average time between each event of 'type' MSFT:
7.208 days
number of events of stock JPM: 604
average time between each event of 'type' JPM:
7.223 days
number of events of stock GS: 604
average time between each event of 'type' GS:
7.232 days
number of events of stock PFE: 604
average time between each event of 'type' PFE:
7.234 days
number of events of stock MRK: 604
average time between each event of 'type' MRK:
7.225 days
number of events of stock NKE: 604
average time between each event of 'type' NKE:
7.247 days
number of events of stock HD: 604
average time between each event of 'type' HD:
7.211 days
number of events of stock GOOGL: 604
average time between each event of 'type' GOOGL:
7.208 days
number of events of stock AMZN: 604
average time between each event of 'type' AMZN:
7.248 days


In [35]:
t_i[np.where(u_i == ticker_dict[i])].shape

(302,)

### Use EM to estimate alpha and mu given timestamps of univariate HP that models the events of each company using initial guess of alpha and mu 

### Compute log-likelihoods of each selected stock by tuning hyperparameter omega to find omega that gives the highest value of log-likelihood using EM

In [22]:
P = MHP.MHP() # instantiate MHP object
P.generate_seq(60) # mu=[0.1], alpha=[[0.5]], and omega=1.0

#P.data = data[np.where(data[:,1] == 0)]
#P.data[:,1] = 0
#print(data[np.where(data[:,1] == 3)])
P.mu = [0.1]
P.alpha = [0.3]
P.omega = 0.5


# P.plot_events()
# plt.show()

seed = 1

rng = np.random.default_rng(seed)
ahat = rng.uniform(0,1, size=(10,10))
mhat = rng.uniform(0,1, size=10)



#w = np.linspace(0.1,4.25,100)
ahat, mhat

Max eigenvalue: 0.50000


(array([[0.51182162, 0.9504637 , 0.14415961, 0.94864945, 0.31183145,
         0.42332645, 0.82770259, 0.40919914, 0.54959369, 0.02755911],
        [0.75351311, 0.53814331, 0.32973172, 0.7884287 , 0.30319483,
         0.45349789, 0.1340417 , 0.40311299, 0.20345524, 0.26231334],
        [0.75036467, 0.28040876, 0.48519097, 0.9807372 , 0.96165719,
         0.72478994, 0.54122686, 0.2768912 , 0.16065201, 0.96992541],
        [0.51606859, 0.11586561, 0.62348976, 0.77668311, 0.6130033 ,
         0.9172977 , 0.03959288, 0.52858926, 0.45933588, 0.06234958],
        [0.64132817, 0.85263284, 0.59294102, 0.26009745, 0.83988152,
         0.50949588, 0.51088888, 0.75303021, 0.14792204, 0.81962672],
        [0.68328691, 0.78709694, 0.19161626, 0.80236416, 0.19132393,
         0.08155262, 0.85522697, 0.8612835 , 0.8765371 , 0.47190972],
        [0.27404839, 0.00709183, 0.6457209 , 0.71990938, 0.83556922,
         0.28187783, 0.21521817, 0.63933138, 0.80505483, 0.96367087],
        [0.15052483, 0.4822

In [20]:
w = np.linspace(0.05,4,100) # hyperparameter values chosen for sufficient coverage
w

array([0.05      , 0.08989899, 0.12979798, 0.16969697, 0.20959596,
       0.24949495, 0.28939394, 0.32929293, 0.36919192, 0.40909091,
       0.4489899 , 0.48888889, 0.52878788, 0.56868687, 0.60858586,
       0.64848485, 0.68838384, 0.72828283, 0.76818182, 0.80808081,
       0.8479798 , 0.88787879, 0.92777778, 0.96767677, 1.00757576,
       1.04747475, 1.08737374, 1.12727273, 1.16717172, 1.20707071,
       1.2469697 , 1.28686869, 1.32676768, 1.36666667, 1.40656566,
       1.44646465, 1.48636364, 1.52626263, 1.56616162, 1.60606061,
       1.6459596 , 1.68585859, 1.72575758, 1.76565657, 1.80555556,
       1.84545455, 1.88535354, 1.92525253, 1.96515152, 2.00505051,
       2.04494949, 2.08484848, 2.12474747, 2.16464646, 2.20454545,
       2.24444444, 2.28434343, 2.32424242, 2.36414141, 2.4040404 ,
       2.44393939, 2.48383838, 2.52373737, 2.56363636, 2.60353535,
       2.64343434, 2.68333333, 2.72323232, 2.76313131, 2.8030303 ,
       2.84292929, 2.88282828, 2.92272727, 2.96262626, 3.00252

# Hyperparameter optimisation

### Vary hyperparmaeter $\omega$ to compute log-likelihoods using EM algorithm and find the "best" parametric estimate of $\omega$, i.e., $\omega$ with least value of log-likelihood

In [23]:
omega_index = [] # store index of best value of omega 
ahat_vals = [] 
mhat_vals = []

ahat_arr = []; mhat_arr = []
LL_arr = []
# axes are in a two-dimensional array, indexed by [row, col]
fig, axs = plt.subplots(2, 5)

node = 0
for i in range(2):
    axs[i, 0].set_ylabel('Negative log-likelihood')

    for j in range(5):
        
        ahat_arr += [[]]; mhat_arr += [[]]
        LL_arr += [[]]; res = []
        for w_hyperparam in range(len(w)):
            #print(f'hyperparameter omega: ', w[i])
            P.data = data[np.where(data[:,1] == node)]
            P.data[:,1] = 0
            res += [P.EM(np.array([ahat[node]]), np.array([mhat[node]]), w[w_hyperparam], verbose=False, seed1=99,seed2=100)]
            ahat_arr[node] += [res[w_hyperparam][0]] 
            mhat_arr[node] += [res[w_hyperparam][1]]
            LL_arr[node] += [-res[w_hyperparam][-1]]
            #print(estimates[i])
            #print('\n')
        
        
        
#         while abs(LL_arr[count] - LL_arr[-1]) >= epsilon:
#             count += 1
#         print(abs(LL_arr[count] - LL_arr[-1]))
        min_index = np.argmin(LL_arr[node])
        omega_index += [min_index]
        mhat_vals += [mhat_arr[node][min_index]]
        ahat_vals += [ahat_arr[node][min_index]]
        
        ax = axs[i, j]
        ax.plot(w, LL_arr[node])
        ax.text(0.7, 0.25, ten_companies[node],transform=ax.transAxes,
            fontsize=18,verticalalignment ='bottom',
            horizontalalignment ='right', color ='green',)
        #axs[i, j].axvline(x=w[count], ymin=LL_arr[count], ymax=LL_arr[0], label=r'$\hat{\omega} = $'+str(w[count]), c='b', linestyle='dashed')
        
        ax.scatter(x=w[min_index],y=LL_arr[node][min_index], c='b',s=80,marker='X',linewidths=1.5)
        #ax.plot(w[count],LL_arr[count],color='blue',label=r'$\hat{\omega} = $'+str(w[count]),linewidth=2, markersize=50)

        node += 1
        print('Working')

        #ax[i, j].text(0.5, 0.5, str((i, j)), fontsize=18, ha='center')
for j in range(5):
    axs[1, j].set_xlabel(r'omega, $\omega$')

plt.tight_layout()
#plt.savefig('LL_fig_iter6.png')
plt.show()
plt.ion()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Working
Working
Working
Working
Working
Working
Working
Working
Working
Working


In [24]:
omega_index

[18, 17, 14, 15, 17, 26, 15, 16, 23, 16]

In [52]:
for i in mhat_vals:
    print(r'mu: %.3f' % i[0])

mu: 0.268
mu: 0.712
mu: 0.477
mu: 0.520
mu: 0.863
mu: 0.857
mu: 0.343
mu: 0.439
mu: 0.692
mu: 0.219


In [53]:
for i in ahat_vals:
    print(r'alpha: %.3f' % i[0])

alpha: 0.871
alpha: 0.660
alpha: 0.771
alpha: 0.750
alpha: 0.585
alpha: 0.590
alpha: 0.834
alpha: 0.789
alpha: 0.668
alpha: 0.894


In [54]:
with np.printoptions(precision=3, suppress=True):
    print(w[omega_index])

[0.768 0.728 0.609 0.648 0.728 1.087 0.648 0.688 0.968 0.688]


In [None]:
squad

In [16]:
sqad 

[array([0.0692]), array([0.0698]), array([0.0692]), array([0.0694]), array([0.0693]), array([0.0697]), array([0.069]), array([0.0693]), array([0.0694]), array([0.069])]


### Estimated parameter values:

In [13]:
omega_index, omega_index1

NameError: name 'omega_index1' is not defined

In [24]:
with np.printoptions(precision=3, suppress=True):
    print(omega_index)


[98, 84, 67, 48, 55, 87, 99, 67, 48, 99]


In [None]:
ahat_vals, ahat_vals1

In [21]:
with np.printoptions(precision=4, suppress=True):
    print(mhat_vals)
    print(mhat_vals1)

[array([0.0692]), array([0.0698]), array([0.0692]), array([0.0694]), array([0.0693]), array([0.069]), array([0.0693]), array([0.0691]), array([0.0694]), array([0.069])]
[array([0.0692]), array([0.0698]), array([0.0692]), array([0.0694]), array([0.0693]), array([0.069]), array([0.0693]), array([0.0691]), array([0.0694]), array([0.069])]


In [17]:
ahat_vals, ahat_vals1

([array([[1.7809185e-16]]),
  array([[1.42325103e-16]]),
  array([[5.5695148e-17]]),
  array([[9.97180025e-17]]),
  array([[1.101146e-16]]),
  array([[1.44286708e-16]]),
  array([[2.65169285e-09]]),
  array([[2.53042991e-17]]),
  array([[2.96737679e-17]]),
  array([[4.57304226e-14]])],
 [array([[1.7809185e-16]]),
  array([[1.42325103e-16]]),
  array([[5.5695148e-17]]),
  array([[9.97180025e-17]]),
  array([[1.101146e-16]]),
  array([[1.44286708e-16]]),
  array([[2.65169285e-09]]),
  array([[2.53042991e-17]]),
  array([[2.96737679e-17]]),
  array([[4.57304226e-14]])])

In [None]:
B

In [None]:
HP.plot_event_times(data, num_of_nodes=10,  Time_horizon=-999, company_ticker=ticker, xaxislabel=r't, number of days since January 3, 2006', show_time_periods=False, labeled=True)

In [None]:
seed = 99
ts = np.array([])
for i in range(2):
    np.append(ts, HP.simulate_timestamps_till_horizon(mhat_vals[i], ahat_vals[i], w[i] , Thorizon = np.amax(data[:,0]), seed=seed, node=i, output_rejected_data=False))
ts
    

In [None]:
ts1 = HP.simulate_timestamps_till_horizon(mhat_vals[0], ahat_vals[0], w[0] , Thorizon = np.amax(data[:,0]), seed=None, output_rejected_data=False)
ts2 = HP.simulate_timestamps_till_horizon(mhat_vals[1], ahat_vals[0], w[1] , Thorizon = np.amax(data[:,0]), seed=None, output_rejected_data=False)
ts3 = HP.simulate_timestamps_till_horizon(mhat_vals[2], ahat_vals[2], w[2] , Thorizon = np.amax(data[:,0]), seed=None, output_rejected_data=False)
ts4 = HP.simulate_timestamps_till_horizon(mhat_vals[3], ahat_vals[3], w[3] , Thorizon = np.amax(data[:,0]), seed=None, output_rejected_data=False)
ts5 = HP.simulate_timestamps_till_horizon(mhat_vals[4], ahat_vals[4], w[4] , Thorizon = np.amax(data[:,0]), seed=None, output_rejected_data=False)

#a_hat_1 = 0.00882517; mhat_vals_1 = 0.06859064; w_1 = 1.5848484848484852

plt.plot(ts1,[0]*len(ts1), marker='o')
plt.plot(ts2,[-1]*len(ts2), marker='o')
plt.plot(ts3,[-2]*len(ts3), marker='o')
plt.plot(ts4,[-3]*len(ts4), marker='o')
plt.plot(ts5,[-4]*len(ts5), marker='o')

In [None]:
B

# Save this session and load it back

In [55]:
import dill
dill.dump_session('DJIA_univHP_env_September_version2.db')

In [4]:
#dill.load_session('DJIA_univHP_env_September_version2.db')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …