In [19]:
import pandas as pd
import numpy as np

In [62]:
## functions necessary to group the minute level to hours/days
def get_first_element(x):
    return x.iloc[0]

def get_last_element(x):
    return x.iloc[len(x)-1]

# indicate the indicator as positive when the price is within specified threshold 
##ranges compared to the open for the long strategy
def infer_profit_long_indicator(x,threshold=1.02):
    if x['next_1high'] >= threshold*x['Open']:
        return 1
    else:
        return 0

def infer_profit_short_indicator(x,threshold=0.98):
    if x['next_1low'] <= threshold*x['Open']:
        return 1
    else:
        return 0
    
# to normalize the prices of 
def zscore_func_improved(x,window_size=50):
    rolling_mean=x.rolling(window=window_size).mean().bfill()
    rolling_std = x.rolling(window=window_size).std().bfill()
    return (x-rolling_mean)/rolling_std

## Preparing FCPO Data

In [3]:
### pre-process the excel file containing 2007-2017 data

# fcpo_data_10yrs_raw=pd.read_excel('data/FCPO.xlsx')

# fcpo_data_10yrs=fcpo_data_10yrs_raw.drop(columns=['General'])
# fcpo_data_10yrs=fcpo_data_10yrs.rename(columns={'High ':'High','Volume ':'Volume'})

# fcpo_data_10yrs['Date']=fcpo_data_10yrs['Date'].apply(lambda x:str(x))
# fcpo_data_10yrs['Hour']=fcpo_data_10yrs['Time'].apply(lambda x:str(x)[0:2])

# fcpo_data_10yrs_daily=fcpo_data_10yrs.groupby(['Date'],as_index=False).agg({'Open':lambda x: get_first_element(x),'Close': lambda x:get_last_element(x),
#                               'High':lambda x:np.max(x),'Low':lambda x:np.min(x),'Volume':'sum'})
# fcpo_data_10yrs_daily=fcpo_data_10yrs_daily.set_index(pd.to_datetime(fcpo_data_10yrs_daily['Date']))
# # handle the data difference with the 2014-2018 data
# fcpo_data_10yrs_daily[['Open','Close','High','Low']]=fcpo_data_10yrs_daily[['Open','Close','High','Low']].applymap(lambda x: x-178)
# fcpo_data_2009_2013_daily=fcpo_data_10yrs_daily['2009-01-01':'2014-01-01']

# fcpo_data_2009_2013_daily=fcpo_data_2009_2013_daily.drop(columns=['Date'])


### Process Back Adjusted Data

In [32]:
#pre-process the data from 2014-2018 csv file
fcpo_data=pd.read_csv('data/FCPO_2007-2017_backadjusted.csv')

#remove identity placeholder columns
fcpo_data=fcpo_data[fcpo_data['Time']!=1805]

#convert the date time values to string and derive the hour field
fcpo_data['Date']=fcpo_data['Date'].apply(lambda x:str(x))
fcpo_data['Hour']=fcpo_data['Time'].apply(lambda x:str(x)[0:2])

In [33]:
# Compute daily and hourly data frames
fcpo_data_daily=fcpo_data.groupby(['Date'],as_index=False).agg({'Open':lambda x: get_first_element(x),'Close': lambda x:get_last_element(x),
                              'High':lambda x:np.max(x),'Low':lambda x:np.min(x),'Volume':'sum'})

fcpo_data_daily=fcpo_data_daily.set_index(pd.to_datetime(fcpo_data_daily['Date']))

fcpo_data_daily=fcpo_data_daily.drop(columns=['Date'])

In [34]:
#fcpo_data_daily=fcpo_data_2014_2018_daily.copy()
## infer the profit indicators based on the future closing/high or low  prices
# shift the output of next day into the daily data frame
fcpo_data_daily=fcpo_data_daily.assign(next_1close=fcpo_data_daily['Close'].shift(-1),
                      next_1high=fcpo_data_daily['High'].shift(-1),
                      next_1low=fcpo_data_daily['Low'].shift(-1),
                      next_1open=fcpo_data_daily['Open'].shift(-1),
                      prev_1close=fcpo_data_daily['Close'].shift(1),                 
                      )

fcpo_data_daily=fcpo_data_daily.assign(next_open_change_pct=(fcpo_data_daily['Open']/fcpo_data_daily['prev_1close'])*100-100,
                            lprofit_ind=fcpo_data_daily.apply(lambda x:infer_profit_long_indicator(x,1.02),axis=1),
                            sprofit_ind=fcpo_data_daily.apply(lambda x:infer_profit_short_indicator(x,0.98),axis=1))

fcpo_data_daily['next_open_change_pct']=fcpo_data_daily['next_open_change_pct'].bfill()

In [14]:
fcpo_data_hourly=fcpo_data.groupby(['Date','Hour'],as_index=False).agg({'Open':lambda x: get_first_element(x),'Close': lambda x:get_last_element(x),
                              'High':lambda x:np.max(x),'Low':lambda x:np.min(x),'Volume':'sum'})

fcpo_data_hourly=fcpo_data_hourly.set_index(pd.to_datetime(fcpo_data_hourly['Date']+'-'+fcpo_data_hourly['Hour']))
fcpo_data_hourly=fcpo_data_hourly.drop(columns=['Date','Hour'])

In [15]:
#fcpo_data_hourly=fcpo_data_2014_2018_hourly.copy()
## infer the profit indicators based on the future closing prices
# shift the output of next 3 time periods into the daily data frame
fcpo_data_hourly=fcpo_data_hourly.assign(next_1close=fcpo_data_hourly['Close'].shift(-1),
                      next_1high=fcpo_data_hourly['High'].shift(-1),
                      next_1low=fcpo_data_hourly['Low'].shift(-1),
                      next_1open=fcpo_data_hourly['Open'].shift(-1))

fcpo_data_hourly=fcpo_data_hourly.assign(lprofit_ind=fcpo_data_hourly.apply(lambda x:infer_profit_long_indicator(x,1.01),axis=1),
                                       sprofit_ind=fcpo_data_hourly.apply(lambda x:infer_profit_short_indicator(x,0.99),axis=1))

## Preparing Dalian Palm oil data

In [16]:
palmoil_data=pd.read_excel('data/Dalian Palm Olein 1 Mins_updated.xlsx',skiprows=3)
palmoil_data=palmoil_data.drop(0)

palmoil_data['Date']=palmoil_data['Dates'].dt.date
palmoil_data['Hour']=palmoil_data['Dates'].dt.hour

# Compute daily and hourly data frames
palmoil_data_daily=palmoil_data.groupby(['Date'],as_index=False).agg({'Open':lambda x: get_first_element(x),'Close': lambda x:get_last_element(x),
                              'High':lambda x:np.max(x),'Low':lambda x:np.min(x),'Volume':'sum'})
palmoil_data_hourly=palmoil_data.groupby(['Date','Hour'],as_index=False).agg({'Open':lambda x: get_first_element(x),'Close': lambda x:get_last_element(x),
                              'High':lambda x:np.max(x),'Low':lambda x:np.min(x),'Volume':'sum'})

palmoil_data_daily=palmoil_data_daily.set_index(pd.to_datetime(palmoil_data_daily['Date']))

### Store the Processed data in HDF5 format for pandas

In [17]:
hdf_store=pd.HDFStore('processed_dta.h5')

hdf_store.put('fcpo_data_daily',fcpo_data_daily)

hdf_store.put('fcpo_data_hourly',fcpo_data_hourly)

hdf_store.put('palmoil_data_daily',palmoil_data_daily)

hdf_store.close()

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->date,key->block1_values] [items->['Date']]

  if self.run_code(code, result):
