##### Common Alpha Factors

In [1]:
# Parameters
# `top` selects which stored dataset is used: it is interpolated into the HDF5
# keys that are read and written further down (e.g. f'data/top{top}_dataset').
# NOTE(review): the "default value" remark suggests this cell is meant to be
# overridden by a notebook-parameterization tool — confirm.
top = 10  # default value

In [2]:
import warnings
import pandas as pd
# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Let pandas show up to 1000 rows and 1000 columns before truncating output.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 1000)

In [3]:
%matplotlib inline

from pathlib import Path
import numpy as np
import pandas as pd
import pandas_datareader.data as web

from pathlib import Path
import numpy as np
import pandas as pd
import gc

import statsmodels.api as sm
from statsmodels.regression.rolling import RollingOLS
from sklearn.preprocessing import scale
import talib

import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# White grid background for all seaborn/matplotlib figures.
sns.set_style('whitegrid')

# Shorthand for building MultiIndex slices with .loc.
idx = pd.IndexSlice

# Quantile levels 0.1, 0.2, ..., 0.9 (rounded to remove float step noise).
deciles = np.round(np.arange(.1, 1, .1), 1)

##### Load Data

In [5]:
# HDF5 store that holds the input dataset and receives the factor tables
# written later in this notebook.
DATA_STORE = Path('/home/sayem/Desktop/Project/data/assets.h5')

lock_path = "/tmp/assets_h5_file.lock"  # Choose a path for the lock file

# top = 500

from filelock import FileLock

# Take the file lock for the duration of the read, and open the store
# read-only: the default mode ('a') opens it for writing and would silently
# create an empty file if DATA_STORE pointed to the wrong location.
with FileLock(lock_path):
    with pd.HDFStore(DATA_STORE, mode='r') as store:
        data = store[f'data/top{top}_dataset']


In [6]:
# Disabled one-off snippets (store write / column rename), kept for reference only:
# # with pd.HDFStore(DATA_STORE) as store:
# #     store.put('factors/common', df_optimized)
# data.rename(columns={'market cap': \
#     'market_cap'}, inplace=True)

# Print index range, column dtypes and non-null counts of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 25958 entries, ('AAPL', Timestamp('2013-01-03 00:00:00')) to ('NFLX', Timestamp('2023-08-11 00:00:00'))
Data columns (total 31 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   open                25958 non-null  float32
 1   high                25958 non-null  float32
 2   low                 25958 non-null  float32
 3   close               25958 non-null  float32
 4   volume              25958 non-null  float64
 5   market_cap          25958 non-null  float64
 6   sector              25958 non-null  float32
 7   ret_frac_order      25958 non-null  float32
 8   ret_01d             25958 non-null  float32
 9   ret_02d             25958 non-null  float32
 10  ret_03d             25958 non-null  float32
 11  ret_04d             25958 non-null  float32
 12  ret_05d             25958 non-null  float32
 13  ret_10d             25958 non-null  float32
 14  ret_21d             2595

##### TA-Lib: Function Groups

Define the TA-Lib function groups of interest and retrieve the functions available in each group.

In [7]:
# Names of the TA-Lib function groups this notebook works with.
function_groups = [
    'Overlap Studies',
    'Momentum Indicators',
    'Volume Indicators',
    'Volatility Indicators',
    'Price Transform',
    'Cycle Indicators',
    'Pattern Recognition',
    'Statistic Functions',
    'Math Transform',
    'Math Operators',
]

# Mapping returned by TA-Lib: group name -> names of the functions in that group.
talib_grps = talib.get_function_groups()

In [8]:
import pandas as pd
import talib.abstract as ta
import logging
import talib
import numpy as np

# The 5 days can be seen as a very short-term trend.
# The 21 days can be used to determine the medium-term trend.
# The 63 days can be viewed as a longer-term trend.

def compute_talib_indicators(df, function_groups, timeperiods=(5, 21, 63)):
    """
    Compute indicators for the specified function groups using TA-Lib's Abstract API.

    Each indicator of every requested group is evaluated once per entry in
    `timeperiods`, and the result is stored in `df` under an upper-cased column
    name: `<INDICATOR>_<timeperiod>` for single-output functions and
    `<OUTPUT_NAME>_<timeperiod>` for each output of multi-output functions.

    Parameters:
    - df: DataFrame with columns 'open', 'high', 'low', 'close', and optionally 'volume'.
      The new columns are added to this object in place, so pass a copy to keep
      the original untouched. All rows are handed to TA-Lib as one continuous
      series.
    - function_groups: List of function groups to compute (keys of
      `talib.get_function_groups()`); names that are not TA-Lib groups are ignored.
    - timeperiods: Iterable of look-back lengths. Each is passed as 'timeperiod'
      to the indicators that accept that parameter and is used as the column suffix.

    Returns:
    - The same `df` object with the indicator columns added.

    Notes:
    - Indicators without a 'timeperiod' parameter are still evaluated once per
      entry, so their columns hold identical values under every suffix.
    - Multi-output columns are named after the output only; functions that share
      an output name write to the same column and the last one evaluated wins.
    - A failing indicator is logged with `logging.warning` and counted, not raised.
    - Without a 'volume' column, a NaN volume series is supplied (and a warning
      logged) rather than fabricated values, so results stay reproducible.
    """

    if 'volume' in df.columns:
        volume = df['volume'].astype(float).values
    else:
        logging.warning("No 'volume' column found; using NaN volume for volume-based indicators.")
        volume = np.full(len(df), np.nan)

    # Organize data in the format needed for TA-Lib Abstract API
    inputs = {
        'open': df['open'].astype(float).values,
        'high': df['high'].astype(float).values,
        'low': df['low'].astype(float).values,
        'close': df['close'].astype(float).values,
        'volume': volume,
    }

    # Keep only the groups the caller asked for.
    talib_functions = {k: v for k, v in talib.get_function_groups().items() if k in function_groups}

    success_count = 0
    failure_count = 0

    for group, indicators in talib_functions.items():
        for indicator_name in indicators:
            for timeperiod in timeperiods:
                try:
                    indicator_func = ta.Function(indicator_name)

                    parameters = {}
                    if 'timeperiod' in indicator_func.parameters:
                        parameters['timeperiod'] = timeperiod

                    # MAVP reads an extra 'periods' input: one period per row.
                    if indicator_name == 'MAVP':
                        inputs['periods'] = np.full(len(df), timeperiod, dtype=np.float64)

                    outputs = indicator_func(inputs, **parameters)

                    # For multi-output functions, like 'BBANDS'
                    if isinstance(outputs, (list, tuple)):
                        for i, out_name in enumerate(indicator_func.output_names):
                            col_name = f"{out_name}_{timeperiod}".upper()
                            df[col_name] = outputs[i]
                    else:
                        col_name = f"{indicator_name}_{timeperiod}".upper()
                        df[col_name] = outputs

                    success_count += 1
                except Exception as e:
                    # Best effort: one broken indicator must not abort the whole run.
                    logging.warning(f"{indicator_name}_{timeperiod} failed due to: {str(e)}")
                    failure_count += 1

    print(f"{success_count} indicators added successfully.")
    print(f"{failure_count} indicators failed.")

    return df

function_groups = [
    'Overlap Studies',
    'Momentum Indicators',
    'Volume Indicators',
    'Volatility Indicators',
    'Price Transform',
    'Cycle Indicators',
    'Pattern Recognition',
    'Statistic Functions',
    'Math Transform',
    'Math Operators'
]

# `data` is indexed by (ticker, date), and compute_talib_indicators hands the
# rows to TA-Lib as one flat series. Computing ticker by ticker keeps every
# look-back window inside a single ticker; on the full frame, the first rows of
# each ticker would be mixed with the previous ticker's prices.
# The success/failure summary is therefore printed once per ticker.
data_ta = pd.concat([
    compute_talib_indicators(ticker_df.copy(), function_groups)
    for _, ticker_df in data.groupby('ticker', group_keys=False)
])

474 indicators added successfully.
0 indicators failed.


In [9]:
# Summarize the indicator-augmented frame: index range, column count, dtypes, memory.
data_ta.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 25958 entries, ('AAPL', Timestamp('2013-01-03 00:00:00')) to ('NFLX', Timestamp('2023-08-11 00:00:00'))
Columns: 526 entries, open to OBV_63
dtypes: float32(29), float64(299), int32(198)
memory usage: 81.9+ MB


In [10]:
from utils import optimize_dataframe

# Project helper; per its printed report it downcasts columns to reduce memory.
data_ta = optimize_dataframe(data_ta.copy())

# DATA_STORE is read under FileLock(lock_path) when the dataset is loaded, so
# take the same lock for the write; otherwise the lock protects nothing.
# `key` is passed by keyword because recent pandas deprecates passing it positionally.
with FileLock(lock_path):
    data_ta.to_hdf(DATA_STORE, key=f'factor/top{top}_dataset_with_TA',
                   format='table', mode='a')

Column 'COSH_5' not downcasted to float32 due to its range.
Column 'COSH_21' not downcasted to float32 due to its range.
Column 'COSH_63' not downcasted to float32 due to its range.
Column 'EXP_5' not downcasted to float32 due to its range.
Column 'EXP_21' not downcasted to float32 due to its range.
Column 'EXP_63' not downcasted to float32 due to its range.
Column 'SINH_5' not downcasted to float32 due to its range.
Column 'SINH_21' not downcasted to float32 due to its range.
Column 'SINH_63' not downcasted to float32 due to its range.
Data memory before optimization: 81.85 MB
Data memory after optimization: 53.14 MB
Reduced by: 35.08%


In [11]:
# Re-check the frame after optimization; 'deep' makes pandas inspect the actual
# data for its memory figure instead of estimating it.
data_ta.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 25958 entries, ('AAPL', Timestamp('2013-01-03 00:00:00')) to ('NFLX', Timestamp('2023-08-11 00:00:00'))
Columns: 508 entries, open to OBV_63
dtypes: float32(295), float64(27), int32(186)
memory usage: 53.1 MB


In [12]:
# The indicator frame has been written to DATA_STORE; drop the name so the
# object can be garbage-collected before the next section.
del data_ta

##### Rolling Factor Betas

In [13]:
# Download the daily Fama-French five-factor dataset via pandas-datareader.
# The reader returns several tables; entry 0 is the one used here.
ff_tables = web.DataReader('F-F_Research_Data_5_Factors_2x3_daily', 'famafrench',
                           start=2005)

# Give the market excess-return column an identifier-friendly name.
factor_data = ff_tables[0].rename(columns={'Mkt-RF': 'MARKET'})

# Name the index 'date' so it can be merged with the price data on that key.
factor_data.index.names = ['date']

In [14]:
# Regressors for the rolling factor model: every column except the last one.
# The displayed result lists the five factor names, so the excluded last column
# is presumably RF (which is used separately to form excess returns) — verify
# if the dataset layout changes.
factors = factor_data.columns[:-1]
factors

Index(['MARKET', 'SMB', 'HML', 'RMW', 'CMA'], dtype='object')

In [15]:
# `t` only feeds the commented-out alternative return column below.
t = 1
# ret = f'ret_{t:02}d'
ret = 'ret_frac_order'
windows = [21, 63, 252]

# NOTE(review): the Ken French data library publishes factor returns and RF in
# percent; confirm that `ret` is expressed in the same units, otherwise the
# excess return and the estimated ALPHA/betas are mis-scaled.

# The regression inputs depend only on the ticker, not on the window, so build
# them once instead of repeating the merge for every window.
regression_inputs = {}
for ticker, df in data.groupby('ticker', group_keys=False):
    model_data = df[[ret]].merge(factor_data, on='date').dropna()
    model_data[ret] -= model_data.RF  # return in excess of RF
    regression_inputs[ticker] = (model_data[ret], sm.add_constant(model_data[factors]))

for window in windows:
    print(window)
    betas = []
    for ticker, (excess_ret, exog) in regression_inputs.items():
        rolling_ols = RollingOLS(endog=excess_ret, exog=exog, window=window)
        # The intercept of the factor regression is reported as ALPHA.
        factor_model = rolling_ols.fit(params_only=True).params.rename(columns={'const': 'ALPHA'})
        # Restore the (ticker, date) index so the result aligns with `data`.
        result = factor_model.assign(ticker=ticker).set_index('ticker', append=True).swaplevel()
        betas.append(result)
    betas = pd.concat(betas).rename(columns=lambda x: f'{x}_{window:02}')
    # Drop columns left by a previous run of this cell; a plain join would raise
    # on the overlapping names, making the cell impossible to re-run.
    data = data.drop(columns=betas.columns, errors='ignore').join(betas)

21
63
252


##### Size proxy

In [16]:
def _close_relative_to_first(close):
    """Back-fill gaps in a close series, then scale it by its first filled value."""
    # Divide by the *filled* first value: dividing by the raw first value would
    # turn the whole ticker into NaN whenever its first close is missing.
    filled = close.bfill()  # replaces the deprecated fillna(method='bfill')
    return filled.div(filled.iloc[0])


by_ticker = data.groupby('ticker', group_keys=False)

# Close price of each ticker relative to its first available close.
data['size_factor'] = by_ticker.close.apply(_close_relative_to_first)

# Scale market cap by the relative price path and express it in millions.
data['size_proxy'] = data['market_cap'].mul(data.size_factor).div(1e6)

In [17]:
from utils import optimize_dataframe

# Project helper; per its printed report it reduces the frame's memory footprint.
data = optimize_dataframe(data.copy())

# DATA_STORE is read under FileLock(lock_path) when the dataset is loaded, so
# take the same lock for the write; otherwise the lock protects nothing.
# `key` is passed by keyword because recent pandas deprecates passing it positionally.
with FileLock(lock_path):
    data.to_hdf(DATA_STORE,
                key=f'factor/top{top}_dataset_with_rolling_beta_size_proxy',
                format='table', mode='a')

Data memory before optimization: 8.30 MB
Data memory after optimization: 6.51 MB
Reduced by: 21.48%


In [18]:
# Inspect the frame after the rolling betas and size columns were added.
data.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 25958 entries, ('AAPL', Timestamp('2013-01-03 00:00:00')) to ('NFLX', Timestamp('2023-08-11 00:00:00'))
Data columns (total 51 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   open                25958 non-null  float32
 1   high                25958 non-null  float32
 2   low                 25958 non-null  float32
 3   close               25958 non-null  float32
 4   volume              25958 non-null  float64
 5   market_cap          25958 non-null  float64
 6   sector              25958 non-null  float32
 7   ret_frac_order      25958 non-null  float32
 8   ret_01d             25958 non-null  float32
 9   ret_02d             25958 non-null  float32
 10  ret_03d             25958 non-null  float32
 11  ret_04d             25958 non-null  float32
 12  ret_05d             25958 non-null  float32
 13  ret_10d             25958 non-null  float32
 14  ret_21d             2595

In [19]:
# The enriched frame has been written to DATA_STORE; drop the name so the
# object can be garbage-collected.
del data

In [20]:
from utils import clear_large_vars
# Project helper; presumably removes notebook variables larger than the given
# size in MB — confirm against utils.clear_large_vars.
clear_large_vars(threshold_size_in_MB=100)