In [None]:
# The aim of this module is to calculate features based on daily price movements

# File name convention
# R_ = raw extract 
# E_ = Entry files into enrichment layer
# C_ = enriched layer with calculation
# I_ = insights layer, designed for model baselines
# V_ = validation


In [1]:
import pandas as pd
import numpy as np
pd.options.mode.chained_assignment = None  # default='warn'

# Load the raw price extract (pipe-delimited CSV).
# Earlier Excel-based load, kept for reference:
# HIST_PRICE_DF = pd.read_excel (r'/Users/joezhou/Downloads/ALL_Prices.xlsx')
HIST_PRICE_DF_RAW = pd.read_csv(
    r'/Users/joezhou/Downloads/R_ALL_Prices.csv',
    sep='|',
)

# To test the code on a smaller sample, swap the assignment below for this filter:
# HIST_PRICE_DF = HIST_PRICE_DF_RAW[HIST_PRICE_DF_RAW['TickName'].isin(['BHP.AX','RIO.AX','TCL.AX','CBA.AX','MQG.AX','CSL.AX','NAB.AX','WBC.AX','SCG.AX','ANZ.AX','FMG.AX','WES.AX','TLS.AX','RMD.AX','WOW.AX','APA.AX','ATM.AX','GMG.AX','STO.AX','AMC.AX','MGR.AX','COL.AX','ALL.AX','NCM.AX','ZIP.AX'])]

# Use the full extract. This binds a second name to the same DataFrame
# object; no copy is made.
HIST_PRICE_DF = HIST_PRICE_DF_RAW

# Base list of distinct tickers (index renumbered from 0), intended as the
# frame onto which the feature-engineered variables are added.
TICKER_LIST = (
    HIST_PRICE_DF['TickName']
    .drop_duplicates()
    .reset_index(drop=True)
)

In [2]:
# Build per-row price-movement flags, then aggregate them to one row per ticker.
#
# NOTE: this cell rebinds HIST_PRICE_DF (adds Last_Close_price and the flag
# columns). Re-run the load cell before re-running this one, otherwise
# Last_Close_price is merged in a second time.

# Find the ends of the data window for each ticker.
# groupby().first() / .last() follow the row order of HIST_PRICE_DF and take the
# first / last non-null value per column. This assumes the extract is already
# ordered oldest-to-newest within each ticker -- TODO confirm against the source.
HIST_PRICE_DF_FIRST = (
    HIST_PRICE_DF.groupby("TickName")
    .first()
    .rename(columns={"Open": "First_Open_price"})
)
HIST_PRICE_DF_LAST = (
    HIST_PRICE_DF.groupby("TickName")
    .last()
    .rename(columns={"Close": "Last_Close_price"})
)

# Attach each ticker's last close to every one of its rows.
HIST_PRICE_DF = HIST_PRICE_DF.merge(
    HIST_PRICE_DF_LAST[['Last_Close_price']], on='TickName', how='left'
)

# Row counter: summing it per ticker gives the number of rows for that ticker.
HIST_PRICE_DF['Freq'] = 1

# Open-to-close movement of each row, computed once and reused below.
intraday_move = HIST_PRICE_DF['Close'] - HIST_PRICE_DF['Open']

HIST_PRICE_DF['DAY_UP'] = np.where(intraday_move > 0, 1, 0)
HIST_PRICE_DF['DAY_DWN'] = np.where(intraday_move < 0, 1, 0)
HIST_PRICE_DF['DAY_FLAT'] = np.where(intraday_move == 0, 1, 0)

HIST_PRICE_DF['DAY_CHANGE_RATE'] = HIST_PRICE_DF['Close'] / HIST_PRICE_DF['Open'] - 1
HIST_PRICE_DF['DAY_CHANGE_NUM'] = intraday_move

# Gap between the ticker's last close and this row's close.
# NOTE(review): DAYS_ABOVE_LCLSE is 1 when Last_Close_price is ABOVE this row's
# Close (i.e. the row closed below the last close), and DAYS_BELOW_LCLSE is the
# mirror image. Confirm this is the intended reading of the column names;
# the original logic is kept unchanged here.
last_close_gap = HIST_PRICE_DF['Last_Close_price'] - HIST_PRICE_DF['Close']
HIST_PRICE_DF['DAYS_ABOVE_LCLSE'] = np.where(last_close_gap > 0, 1, 0)
HIST_PRICE_DF['DAYS_BELOW_LCLSE'] = np.where(last_close_gap < 0, 1, 0)

# Aggregate to ticker level for calcs.
# The columns are selected with a LIST: selecting GroupBy columns with a bare
# tuple of keys was deprecated in pandas 1.0 and raises in pandas 2.0+.
HIST_PRICE_DF_SUM = HIST_PRICE_DF.groupby("TickName")[
    [
        "Freq",
        "DAY_UP",
        "DAY_DWN",
        "DAY_FLAT",
        "DAY_CHANGE_NUM",
        "DAY_CHANGE_RATE",
        "DAYS_ABOVE_LCLSE",
        "DAYS_BELOW_LCLSE",
    ]
].sum()

# Drop the temporaries so they do not linger in the kernel namespace.
del intraday_move, last_close_gap



  HIST_PRICE_DF_SUM = HIST_PRICE_DF.groupby("TickName")["Freq", "DAY_UP","DAY_DWN", "DAY_FLAT", "DAY_CHANGE_NUM", "DAY_CHANGE_RATE", "DAYS_ABOVE_LCLSE", "DAYS_BELOW_LCLSE"].sum()


In [3]:
# Assemble the ticker-level table: the summed counters joined with each
# ticker's first open and last close (left joins keyed on TickName).
DF_PRICE_INFO = (
    HIST_PRICE_DF_SUM[
        ["Freq", "DAY_UP", "DAY_DWN", "DAY_FLAT", "DAYS_ABOVE_LCLSE", "DAYS_BELOW_LCLSE"]
    ]
    .merge(HIST_PRICE_DF_FIRST[['First_Open_price']], on='TickName', how='left')
    .merge(HIST_PRICE_DF_LAST[['Last_Close_price']], on='TickName', how='left')
)

# Ticker-level ratios: each counter divided by the ticker's row count (Freq),
# plus the overall change rate from first open to last close.
DF_PRICE_INFO = DF_PRICE_INFO.assign(
    PROB_HIGH=DF_PRICE_INFO['DAY_UP'].div(DF_PRICE_INFO['Freq']),
    PROB_DWN=DF_PRICE_INFO['DAY_DWN'].div(DF_PRICE_INFO['Freq']),
    PROB_FLAT=DF_PRICE_INFO['DAY_FLAT'].div(DF_PRICE_INFO['Freq']),
    PROB_DAY_ABOVE_LCLSE=DF_PRICE_INFO['DAYS_ABOVE_LCLSE'].div(DF_PRICE_INFO['Freq']),
    PROB_DAY_BELOW_LCLSE=DF_PRICE_INFO['DAYS_BELOW_LCLSE'].div(DF_PRICE_INFO['Freq']),
    CHG_RTE_PRICE=DF_PRICE_INFO['Last_Close_price'].div(DF_PRICE_INFO['First_Open_price']).sub(1),
)

# The intermediate frames are no longer needed once DF_PRICE_INFO is built.
del HIST_PRICE_DF_SUM, HIST_PRICE_DF_FIRST, HIST_PRICE_DF_LAST

In [53]:

# Create session level features
# HIST_PRICE_DF['TickName_Plus'] = HIST_PRICE_DF['TickName'] + HIST_PRICE_DF['C_CloseHigher'].astype(str)

# HIST_PRICE_DF['TickName_Plus'] = HIST_PRICE_DF['TickName'] + HIST_PRICE_DF['C_CloseHigher'].astype(str)
# HIST_PRICE_DF['run_num'] = HIST_PRICE_DF['TickName_Plus'].ne(HIST_PRICE_DF['TickName_Plus'].shift()).cumsum()
# HIST_PRICE_DF['run_length'] = HIST_PRICE_DF.groupby((HIST_PRICE_DF['TickName_Plus'] != HIST_PRICE_DF['TickName_Plus'].shift(1)).cumsum()).cumcount()+1
# HIST_PRICE_DF['vol'] = HIST_PRICE_DF.groupby(HIST_PRICE_DF['TickName_Plus'] != (HIST_PRICE_DF['TickName_Plus'].shift()).sum()

# HIST_PRICE_DF['TickName_PlusSession'] = HIST_PRICE_DF['TickName'] + HIST_PRICE_DF['run_num'].astype(str)


# HIST_PRICE_DF['Sess_movement'] = HIST_PRICE_DF.groupby('TickName_PlusSession')['CLOSEdivideOPEN'].cumsum()
# HIST_PRICE_DF.loc[ HIST_PRICE_DF['C_CloseHigher']  == 1 , ['Sess_movement_high'] ] = HIST_PRICE_DF['Sess_movement']
# HIST_PRICE_DF.loc[ HIST_PRICE_DF['C_CloseLower']  == 1 , ['Sess_movement_low'] ] = HIST_PRICE_DF['Sess_movement'].abs()

# PRICE_MEDIAN = HIST_PRICE_DF.groupby("TickName")["absmvmtOPEN"].median()



In [57]:
# https://stackoverflow.com/questions/48276812/how-to-calculate-running-total-and-reset-when-value-change-with-python


# HIST_PRICE_DF_Sess = HIST_PRICE_DF.groupby(['TickName', 'TickName_PlusSession'])['run_length','Sess_movement_low','Sess_movement_high'].max()

# HIST_PRICE_DF_med = HIST_PRICE_DF.groupby(['TickName'])['HIGHtakeLOW','CLOSEtakeOPEN','HIGHdivideLOW','CLOSEdivideOPEN','V_CloseHigher','V_CloseLower'].median().add_suffix('_med')
# HIST_PRICE_DF_Sum = HIST_PRICE_DF.groupby(['TickName'])['Freq','C_CloseHigher','C_CloseLower','C_LOWaboveOpen'].sum().add_suffix('_sum')
# HIST_PRICE_DF_max = HIST_PRICE_DF.groupby(['TickName'])['run_length','High','Close'].max().add_suffix('_max')


# HIST_PRICE_DF_ADDED = pd.merge(HIST_PRICE_DF_Sum,HIST_PRICE_DF_med,on='TickName',how='left').merge(HIST_PRICE_DF_max,on='TickName',how='left')


# Calculate probability of increase
# HIST_PRICE_DF_ADDED['P_CloseHigher'] = HIST_PRICE_DF_ADDED['C_CloseHigher_sum'] / HIST_PRICE_DF_ADDED['Freq_sum'] 
# HIST_PRICE_DF_ADDED['Mvmt_Likihood'] = (HIST_PRICE_DF_ADDED['P_CloseHigher']*HIST_PRICE_DF_ADDED['V_CloseHigher_med'])+((1-HIST_PRICE_DF_ADDED['V_CloseLower_med'])*HIST_PRICE_DF_ADDED['V_CloseLower_med'])



  HIST_PRICE_DF_med = HIST_PRICE_DF.groupby(['TickName'])['HIGHtakeLOW','CLOSEtakeOPEN','HIGHdivideLOW','CLOSEdivideOPEN','V_CloseHigher','V_CloseLower'].median().add_suffix('_med')
  HIST_PRICE_DF_Sum = HIST_PRICE_DF.groupby(['TickName'])['Freq','C_CloseHigher','C_CloseLower','C_LOWaboveOpen'].sum().add_suffix('_sum')
  HIST_PRICE_DF_max = HIST_PRICE_DF.groupby(['TickName'])['run_length','High','Close'].max().add_suffix('_max')


In [6]:
# Write the ticker-level table out as a pipe-delimited CSV (index included).
DF_PRICE_INFO.to_csv(
    path_or_buf='/Users/joezhou/Downloads/E_ALL_Prices.csv',
    sep='|',
)
