In [1]:
import pandas as pd
import numpy as np
import re
import pathlib
import functools

In [2]:
# Load the daily price history for AAPL from the local data folder.
df = pd.read_csv("../data/daily/AAPL.csv")

In [3]:
df.columns

Index(['AAPL.Open', 'AAPL.High', 'AAPL.Low', 'AAPL.Close', 'AAPL.Volume',
       'AAPL.Adjusted', 'Date'],
      dtype='object')

In [4]:
def get_real_name(x):
  """Strip the ticker prefix from a column label.

  Labels look like '<TICKER>.<Field>' (e.g. 'AAPL.Close'). Only the text
  after the LAST dot is returned, so a ticker that itself contains a dot
  ('BRK.B.Close') still maps to 'Close'. A label without any dot
  (e.g. 'Date') is returned unchanged.
  """
  # rsplit with maxsplit=1 yields [x] when there is no dot, so [-1] covers both cases
  return x.rsplit('.', 1)[-1]


In [5]:
get_real_name('AAPL.Close')

'Close'

In [6]:
# Map every column label of the frame through get_real_name.
new_columns = list(map(get_real_name, df.columns))
new_columns

['Open', 'High', 'Low', 'Close', 'Volume', 'Adjusted', 'Date']

In [7]:
df.columns = new_columns

In [8]:
df

Unnamed: 0,Open,High,Low,Close,Volume,Adjusted,Date
0,3.081786,3.092143,2.925000,2.992857,1.238320e+09,2.551165,2007-01-03
1,3.001786,3.069643,2.993571,3.059286,8.472604e+08,2.607790,2007-01-04
2,3.063214,3.078571,3.014286,3.037500,8.347416e+08,2.589219,2007-01-05
3,3.070000,3.090357,3.045714,3.052500,7.971068e+08,2.602005,2007-01-08
4,3.087500,3.320714,3.041071,3.306071,3.349298e+09,2.818155,2007-01-09
...,...,...,...,...,...,...,...
4031,126.010002,130.289993,124.889999,129.619995,8.768660e+07,129.619995,2023-01-06
4032,130.470001,133.410004,129.889999,130.149994,7.079080e+07,130.149994,2023-01-09
4033,130.259995,131.259995,128.119995,130.729996,6.389620e+07,130.729996,2023-01-10
4034,131.250000,133.509995,130.460007,133.490005,6.945890e+07,133.490005,2023-01-11


In [9]:
# Put prices and volume on a natural-log scale.
# NOTE: the columns are overwritten in place, so running this cell twice applies the log twice.
df[['Open', 'High', 'Low', 'Close', 'Volume']] = df[['Open', 'High', 'Low', 'Close', 'Volume']].apply(np.log)

In [10]:
# Midpoint of the (log) close and open, followed by its row-over-row change.
df['Mean'] = df['Close'].add(df['Open']).div(2)
df['Mean_diff'] = df['Mean'] - df['Mean'].shift(1)

In [11]:
# Smoothing factor for the exponentially weighted statistics below.
# alpha = 2 / (n + 1) is the value pandas derives from ewm(span=n).
n = 20
alpha = 2 / (n + 1)
alpha

0.09523809523809523

In [12]:

# Exponentially weighted mean of Mean_diff, using the smoothing factor alpha.
df['Mean_diff_EMA'] = df['Mean_diff'].ewm(alpha=alpha).mean()

In [13]:
# Exponentially weighted standard deviation of Mean_diff, same smoothing factor alpha.
df['Mean_diff_EMS'] = df['Mean_diff'].ewm(alpha=alpha).std()

In [14]:
df

Unnamed: 0,Open,High,Low,Close,Volume,Adjusted,Date,Mean,Mean_diff,Mean_diff_EMA,Mean_diff_EMS
0,1.125509,1.128864,1.073294,1.096228,20.937021,2.551165,2007-01-03,1.110869,,,
1,1.099207,1.121561,1.096467,1.118182,20.557519,2.607790,2007-01-04,1.108695,-0.002174,-0.002174,
2,1.119465,1.124466,1.103363,1.111035,20.542633,2.589219,2007-01-05,1.115250,0.006555,0.002409,0.006173
3,1.121678,1.128287,1.113735,1.115961,20.496499,2.602005,2007-01-08,1.118819,0.003569,0.002835,0.004309
4,1.127362,1.200180,1.112210,1.195760,21.932017,2.818155,2007-01-09,1.161561,0.042742,0.014355,0.021204
...,...,...,...,...,...,...,...,...,...,...,...
4031,4.836361,4.869763,4.827433,4.864607,18.289280,129.619995,2023-01-06,4.850484,0.013642,-0.004122,0.014608
4032,4.871143,4.893427,4.866688,4.868688,18.075240,130.149994,2023-01-09,4.869915,0.019431,-0.001879,0.015601
4033,4.869532,4.877180,4.852967,4.873134,17.972770,130.729996,2023-01-10,4.871333,0.001418,-0.001565,0.014873
4034,4.877104,4.894176,4.871067,4.894027,18.056246,133.490005,2023-01-11,4.885565,0.014232,-0.000060,0.014926


In [39]:
# Align each row with the Mean_diff of the row that follows it (shift by -1).
df['Mean_diff_next'] = df['Mean_diff'].shift(-1)

In [40]:
df

Unnamed: 0,Open,High,Low,Close,Volume,Adjusted,Date,Mean,Mean_diff,Mean_diff_EMA,Mean_diff_EMS,Mean_diff_next
0,1.125509,1.128864,1.073294,1.096228,20.937021,2.551165,2007-01-03,1.110869,,,,-0.002174
1,1.099207,1.121561,1.096467,1.118182,20.557519,2.607790,2007-01-04,1.108695,-0.002174,-0.002174,,0.006555
2,1.119465,1.124466,1.103363,1.111035,20.542633,2.589219,2007-01-05,1.115250,0.006555,0.002409,0.006173,0.003569
3,1.121678,1.128287,1.113735,1.115961,20.496499,2.602005,2007-01-08,1.118819,0.003569,0.002835,0.004309,0.042742
4,1.127362,1.200180,1.112210,1.195760,21.932017,2.818155,2007-01-09,1.161561,0.042742,0.014355,0.021204,0.069211
...,...,...,...,...,...,...,...,...,...,...,...,...
4031,4.836361,4.869763,4.827433,4.864607,18.289280,129.619995,2023-01-06,4.850484,0.013642,-0.004122,0.014608,0.019431
4032,4.871143,4.893427,4.866688,4.868688,18.075240,130.149994,2023-01-09,4.869915,0.019431,-0.001879,0.015601,0.001418
4033,4.869532,4.877180,4.852967,4.873134,17.972770,130.729996,2023-01-10,4.871333,0.001418,-0.001565,0.014873,0.014232
4034,4.877104,4.894176,4.871067,4.894027,18.056246,133.490005,2023-01-11,4.885565,0.014232,-0.000060,0.014926,0.009620


In [43]:
df[['Mean_diff_EMA', 'Mean_diff_EMS', 'Mean_diff_next']].corr()

Unnamed: 0,Mean_diff_EMA,Mean_diff_EMS,Mean_diff_next
Mean_diff_EMA,1.0,-0.319356,0.07755
Mean_diff_EMS,-0.319356,1.0,-0.041035
Mean_diff_next,0.07755,-0.041035,1.0


In [44]:
# Ratio of the exponentially weighted mean to the exponentially weighted std of Mean_diff.
df['Mean_diff_EMZ'] = df['Mean_diff_EMA'] / df['Mean_diff_EMS']

In [45]:
df[['Mean_diff_EMA', 'Mean_diff_EMS', 'Mean_diff_EMZ','Mean_diff_next']].corr()

Unnamed: 0,Mean_diff_EMA,Mean_diff_EMS,Mean_diff_EMZ,Mean_diff_next
Mean_diff_EMA,1.0,-0.319356,0.906473,0.07755
Mean_diff_EMS,-0.319356,1.0,-0.313918,-0.041035
Mean_diff_EMZ,0.906473,-0.313918,1.0,0.082061
Mean_diff_next,0.07755,-0.041035,0.082061,1.0


In [55]:
def get_df(name, n=20):
  """Load one daily-price CSV and derive the log-price EMA features.

  Parameters
  ----------
  name : str or path-like
      Location of the CSV file; passed straight to ``pd.read_csv``.
  n : int, default 20
      Span of the exponential window. The smoothing factor handed to
      ``ewm`` is ``2 / (n + 1)``.

  Returns
  -------
  pandas.DataFrame
      The file's columns renamed through ``get_real_name``, with
      Open/High/Low/Close/Volume replaced by their natural logs, plus the
      derived columns Mean, Mean_diff, Close_diff, Close_diff_next,
      Mean_diff_EMA, Mean_diff_EMS and Mean_diff_EMZ.
  """
  log_columns = ['Open', 'High', 'Low', 'Close', 'Volume']

  df = pd.read_csv(name)
  # `column`, not `name`: the loop variable must not shadow the path argument
  df.columns = [get_real_name(column) for column in df.columns]

  # replace 0 with a very small value to avoid log(0)
  df[log_columns] = np.log(df[log_columns].replace(0, 0.00001))

  df['Mean'] = (df['Close'] + df['Open'])/2
  df['Mean_diff'] = df['Mean'].diff()
  df['Close_diff'] = df['Close'].diff()
  # value of the following row; presumably rows are in chronological order - confirm against the data files
  df['Close_diff_next'] = df['Close_diff'].shift(-1)

  alpha = 2/(n+1)
  smoothed = df['Mean_diff'].ewm(alpha=alpha)
  df['Mean_diff_EMA'] = smoothed.mean()
  df['Mean_diff_EMS'] = smoothed.std()
  df['Mean_diff_EMZ'] = df['Mean_diff_EMA'] / df['Mean_diff_EMS']
  return df

In [56]:
df = get_df("../data/daily/A.csv")

In [57]:
df[['Mean_diff_EMA', 'Mean_diff_EMS', 'Mean_diff_EMZ','Close_diff_next']].corr()

Unnamed: 0,Mean_diff_EMA,Mean_diff_EMS,Mean_diff_EMZ,Close_diff_next
Mean_diff_EMA,1.0,-0.331432,0.858342,-0.029859
Mean_diff_EMS,-0.331432,1.0,-0.269178,-0.019753
Mean_diff_EMZ,0.858342,-0.269178,1.0,-0.014131
Close_diff_next,-0.029859,-0.019753,-0.014131,1.0


In [58]:
def concat(df1, df2):
  """Stack df2 underneath df1 and return the combined frame (row labels are kept as-is)."""
  return pd.concat([df1, df2])

In [59]:
# `data_dir` rather than `dir`, so the builtin dir() is not shadowed for the rest of the session
data_dir = "../data/daily/"
# glob order depends on the filesystem; sort so the sample below is the same on every run
files = sorted(x for x in pathlib.Path(data_dir).glob('*.csv') if x.is_file())
# test first 30 stocks
files = files[:30]

# build all per-stock frames first and concatenate once,
# instead of re-copying the growing frame on every iteration
frames = [get_df(file) for file in files]
df = pd.concat(frames) if frames else None
df


Unnamed: 0,Open,High,Low,Close,Volume,Adjusted,Date,Mean,Mean_diff,Close_diff,Close_diff_next,Mean_diff_EMA,Mean_diff_EMS,Mean_diff_EMZ
0,3.575151,3.583519,3.563316,3.572627,14.663779,15.677454,2007-01-03,3.573889,,,-0.000843,,,
1,3.556776,3.574310,3.555348,3.571784,14.814560,15.664246,2007-01-04,3.564280,-0.009609,-0.000843,-0.011306,-0.009609,,
2,3.572065,3.572065,3.550766,3.560478,14.706319,15.488138,2007-01-05,3.566271,0.001992,-0.011306,-0.006559,-0.003519,0.008203,-0.428962
3,3.560762,3.560762,3.550766,3.553918,14.384794,15.386887,2007-01-08,3.557340,-0.008931,-0.006559,0.005137,-0.005506,0.006497,-0.847538
4,3.555062,3.561614,3.551053,3.559055,14.280062,15.466128,2007-01-09,3.557059,-0.000281,0.005137,0.021682,-0.003998,0.005849,-0.683472
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4031,4.250636,4.270816,4.249495,4.266756,14.906158,71.290001,2023-01-06,4.258696,0.016231,0.029177,0.009632,0.000898,0.010117,0.088761
4032,4.262398,4.283449,4.261834,4.276388,14.811647,71.980003,2023-01-09,4.269393,0.010697,0.009632,0.001804,0.001831,0.010066,0.181934
4033,4.274024,4.279302,4.269278,4.278193,14.923248,72.110001,2023-01-10,4.276108,0.006715,0.001804,0.009248,0.002296,0.009687,0.237069
4034,4.280409,4.290048,4.280271,4.287441,14.874585,72.779999,2023-01-11,4.283925,0.007817,0.009248,-0.011470,0.002822,0.009363,0.301429


In [None]:
df[['Mean_diff_EMA', 'Mean_diff_EMS', 'Mean_diff_EMZ','Close_diff_next']].corr()

In [None]:
# Histogram (100 bins) of the EMZ values lying strictly between -3 and 3.
df.loc[(df['Mean_diff_EMZ'] > -3) & (df['Mean_diff_EMZ'] < 3), 'Mean_diff_EMZ'].hist(bins=100)

In [None]:
df[df['Mean_diff_EMZ']<-1][['Mean_diff_EMA', 'Mean_diff_EMS', 'Mean_diff_EMZ','Close_diff_next']].corr()

In [None]:
df

In [None]:
import plotly
import plotly.express as px

In [None]:
# Keep only rows whose EMZ lies strictly between -3 and 3.
df_p = df[(df['Mean_diff_EMZ'] > -3) & (df['Mean_diff_EMZ'] < 3)]
# Scatter of EMZ (x) against the following row's close change (y).
fig = px.scatter(x=df_p['Mean_diff_EMZ'], y=df_p['Close_diff_next'])
fig.show()

In [None]:
df[df['Mean_diff_EMZ']>-3][['Mean_diff_EMZ', 'Close_diff_next']].reset_index(drop=True).plot()

In [None]:
df[df['Mean_diff_EMZ']<-1.2]

In [None]:
# calculate the rank:
# first, join the per-stock tables on their shared 'Date' column
def merge_2_df(df1, df2):
  """Outer-join two frames on 'Date', keeping dates that appear in either one."""
  return pd.merge(df1, df2, how='outer', on='Date')

In [None]:
# `data_dir` rather than `dir`, so the builtin dir() is not shadowed
data_dir = "../data/daily/"
# glob order depends on the filesystem; sort so the 10-file sample is reproducible
files = sorted(x for x in pathlib.Path(data_dir).glob('*.csv') if x.is_file())[:10]
df = None
for file in files:
  # Path.stem removes the directory and the final '.csv' on every OS and keeps
  # dotted tickers whole; splitting str(file) on '/' and '.' did neither
  stock_name = file.stem
  df1 = get_df(file)
  # prefix every column except the join key with the ticker
  df1.columns = [stock_name + '.' + x if x != 'Date' else x for x in df1.columns]
  df = df1 if df is None else merge_2_df(df, df1)
df

In [None]:
# Column labels that contain the literal text 'EMZ' (one per stock after the merge).
emz_columns = df.columns[df.columns.str.contains('EMZ', regex=False)]

In [None]:
emz_columns

In [None]:
df[emz_columns]

In [None]:
# Rank the EMZ columns against each other within every row.
# The original lambda had no body (a SyntaxError); DataFrame.rank does the per-row
# ranking directly. Ranks are 1-based, ties share the average rank, NaN stays NaN.
df[emz_columns].rank(axis='columns')

In [None]:
# Scratch check: positions of s ordered by the value stored at each position.
s = [2, 3, 1, 4, 5, 3]
s1 = sorted(range(len(s)), key=s.__getitem__)
s1

In [None]:
# Invert the ordering in s1: the element at original position `a` receives rank `idx`.
s2 = [0] * 6
for idx, a in enumerate(s1):
  s2[a] = idx


In [None]:
s2

In [None]:
def get_rank(s):
  """Return the 0-based ascending rank of every element of ``s``.

  Parameters
  ----------
  s : iterable of comparable values
      For example a list or a pandas Series (such as one DataFrame row).

  Returns
  -------
  list of int
      ``result[i]`` is the position element ``i`` takes when the values are
      sorted ascending. Equal values keep their original relative order.
      NaN values are ranked after all other values, in original order.
  """
  # Materialise the values so lookups are purely positional; indexing a
  # label-indexed Series with an integer relies on deprecated positional fallback.
  values = list(s)

  def sort_key(position):
    value = values[position]
    missing = bool(value != value)  # only NaN compares unequal to itself
    # NaN never enters a value comparison, so the ordering stays deterministic
    return (missing, 0 if missing else value)

  order = sorted(range(len(values)), key=sort_key)
  ranks = [0] * len(values)
  for rank, position in enumerate(order):
    ranks[position] = rank
  return ranks


In [None]:
get_rank([2, 3, 1, 4, 5, 3])

In [None]:
# Apply get_rank to every row of the EMZ columns.
df[emz_columns].apply(get_rank, axis='columns')

In [None]:
df[emz_columns]