##### Common Alpha Factors

In [1]:
import warnings
import pandas as pd
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)

In [2]:
%matplotlib inline

from pathlib import Path
import numpy as np
import pandas as pd
import pandas_datareader.data as web

from pathlib import Path
import numpy as np
import pandas as pd
import gc

import statsmodels.api as sm
from statsmodels.regression.rolling import RollingOLS
from sklearn.preprocessing import scale
import talib

import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
sns.set_style('whitegrid')
idx = pd.IndexSlice
deciles = np.arange(.1, 1, .1).round(1)

##### Load Data

In [4]:
DATA_STORE = Path('/home/sayem/Desktop/Project/data/assets.h5')

lock_path = "/tmp/assets_h5_file.lock"  # Choose a path for the lock file

top = 250

from filelock import FileLock

with FileLock(lock_path):
    with pd.HDFStore(DATA_STORE) as store:
        data = store[f'data/top{top}_dataset']


In [5]:
# # with pd.HDFStore(DATA_STORE) as store:
# #     store.put('factors/common', df_optimized)
# data.rename(columns={'market cap': \
#     'market_cap'}, inplace=True)

data.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 671463 entries, ('AA', Timestamp('2013-01-03 00:00:00')) to ('ZTS', Timestamp('2023-08-11 00:00:00'))
Data columns (total 31 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   open                671463 non-null  float32
 1   high                671463 non-null  float32
 2   low                 671463 non-null  float32
 3   close               671463 non-null  float32
 4   volume              671463 non-null  float64
 5   market_cap          671463 non-null  float64
 6   sector              671463 non-null  float32
 7   ret_frac_order      671463 non-null  float32
 8   ret_01d             671463 non-null  float32
 9   ret_02d             671463 non-null  float32
 10  ret_03d             671463 non-null  float32
 11  ret_04d             671463 non-null  float32
 12  ret_05d             671463 non-null  float32
 13  ret_10d             671463 non-null  float32
 14  ret_21d   

##### TA-Lib: Function Groups

List number of available functions by group

In [6]:
function_groups = ['Overlap Studies',
                   'Momentum Indicators',
                   'Volume Indicators',
                   'Volatility Indicators',
                   'Price Transform',
                   'Cycle Indicators',
                   'Pattern Recognition',
                   'Statistic Functions',
                   'Math Transform',
                   'Math Operators']

talib_grps = talib.get_function_groups()

In [7]:
import pandas as pd
import talib.abstract as ta
import logging
import talib
import numpy as np

# The 5 days can be seen as a very short-term trend.
# The 21 days can be used to determine the medium-term trend.
# The 63 days can be viewed as a longer-term trend.

def compute_talib_indicators(df, function_groups, timeperiods=[5, 21, 63]):
    """
    Compute indicators for the specified function groups using TA-Lib's Abstract API.
    
    Parameters:
    - df: DataFrame with columns 'open', 'high', 'low', 'close', and optionally 'volume'.
    - function_groups: List of function groups to compute.
    """
    
    # Organize data in the format needed for TA-Lib Abstract API
    inputs = {
        'open': df['open'].astype(float).values,
        'high': df['high'].astype(float).values,
        'low': df['low'].astype(float).values,
        'close': df['close'].astype(float).values,
        'volume': df['volume'].astype(float).values if \
            'volume' in df.columns else np.random.random(len(df))
    }

    talib_functions = {k: v for k, v in talib.get_function_groups().items() if k in function_groups}

    success_count = 0
    failure_count = 0
    
    for group, indicators in talib_functions.items():
        for indicator_name in indicators:
            for timeperiod in timeperiods:
                try:
                    indicator_func = ta.Function(indicator_name)

                    parameters = {}
                    if 'timeperiod' in indicator_func.parameters:
                        parameters['timeperiod'] = timeperiod

                    if indicator_name == 'MAVP':
                        inputs['periods'] = np.full(len(df), timeperiod, dtype=np.float64)

                    outputs = indicator_func(inputs, **parameters)

                    # For multi-output functions, like 'BBANDS'
                    if isinstance(outputs, (list, tuple)):
                        for i, out_name in enumerate(indicator_func.output_names):
                            col_name = f"{out_name}_{timeperiod}".upper()
                            df[col_name] = outputs[i]
                    else:
                        col_name = f"{indicator_name}_{timeperiod}".upper()
                        df[col_name] = outputs

                    success_count += 1
                except Exception as e:
                    logging.warning(f"{indicator_name}_{timeperiod} failed due to: {str(e)}")
                    failure_count += 1

    print(f"{success_count} indicators added successfully.")
    print(f"{failure_count} indicators failed.")

    return df

function_groups = [
    'Overlap Studies',
    'Momentum Indicators',
    'Volume Indicators',
    'Volatility Indicators',
    'Price Transform',
    'Cycle Indicators',
    'Pattern Recognition',
    'Statistic Functions',
    'Math Transform',
    'Math Operators'
]

# Assuming you've read your dataframe into a variable named 'data'
data_ta = compute_talib_indicators(data.copy(), function_groups)

474 indicators added successfully.
0 indicators failed.


In [8]:
data_ta.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 671463 entries, ('AA', Timestamp('2013-01-03 00:00:00')) to ('ZTS', Timestamp('2023-08-11 00:00:00'))
Columns: 526 entries, open to OBV_63
dtypes: float32(29), float64(299), int32(198)
memory usage: 2.1+ GB


In [9]:
from utils import optimize_dataframe
data_ta = optimize_dataframe(data_ta.copy())
data_ta.to_hdf(DATA_STORE, f'factor/top{top}_dataset_with_TA', \
    format='table', mode='a')

Data memory before optimization: 2115.85 MB
Data memory after optimization: 1414.01 MB
Reduced by: 33.17%


In [10]:
data_ta.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 671463 entries, ('AA', Timestamp('2013-01-03 00:00:00')) to ('ZTS', Timestamp('2023-08-11 00:00:00'))
Columns: 520 entries, open to OBV_63
dtypes: float32(291), float64(31), int32(198)
memory usage: 1.4 GB


In [None]:
data_ta

In [11]:
del data_ta

##### Rolling Factor Betas

In [12]:
factor_data = (web.DataReader('F-F_Research_Data_5_Factors_2x3_daily', 'famafrench', 
                              start=2005)[0].rename(columns={'Mkt-RF': 'MARKET'}))
factor_data.index.names = ['date']

In [13]:
factors = factor_data.columns[:-1]
factors

Index(['MARKET', 'SMB', 'HML', 'RMW', 'CMA'], dtype='object')

In [14]:
t = 1
# ret = f'ret_{t:02}d'
ret = 'ret_frac_order'
windows = [21, 63, 252]
for window in windows:
    print(window)
    betas = []
    for ticker, df in data.groupby('ticker', group_keys=False):
        model_data = df[[ret]].merge(factor_data, on='date').dropna()
        model_data[ret] -= model_data.RF

        rolling_ols = RollingOLS(endog=model_data[ret], 
                                 exog=sm.add_constant(model_data[factors]), window=window)
        factor_model = rolling_ols.fit(params_only=True).params.rename(columns={'const':'ALPHA'})
        result = factor_model.assign(ticker=ticker).set_index('ticker', append=True).swaplevel()
        betas.append(result)
    betas = pd.concat(betas).rename(columns=lambda x: f'{x}_{window:02}')
    data = data.join(betas)

21
63
252


##### Size proxy

In [15]:
by_ticker = data.groupby('ticker', group_keys=False)
data['size_factor'] = by_ticker.close.apply(lambda x: x.fillna(method='bfill').div(x.iloc[0]))
data['size_proxy'] = data['market_cap'].mul(data.size_factor).div(1e6)

In [16]:
from utils import optimize_dataframe
data = optimize_dataframe(data.copy())
data.to_hdf(DATA_STORE, \
    f'factor/top{top}_dataset_with_rolling_beta_size_proxy', \
        format='table', mode='a')

Data memory before optimization: 1140.39 MB
Data memory after optimization: 859.23 MB
Reduced by: 24.66%


In [17]:
data.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 4094783 entries, ('AA', Timestamp('2013-01-03 00:00:00')) to ('ZTS', Timestamp('2023-08-11 00:00:00'))
Data columns (total 51 columns):
 #   Column              Dtype  
---  ------              -----  
 0   open                float32
 1   high                float32
 2   low                 float32
 3   close               float32
 4   volume              float64
 5   market_cap          float64
 6   sector              float32
 7   ret_frac_order      float32
 8   ret_01d             float32
 9   ret_02d             float32
 10  ret_03d             float32
 11  ret_04d             float32
 12  ret_05d             float32
 13  ret_10d             float32
 14  ret_21d             float32
 15  ret_42d             float32
 16  ret_63d             float32
 17  ret_126d            float32
 18  ret_252d            float32
 19  ret_fwd_frac_order  float32
 20  ret_fwd_01d         float32
 21  ret_fwd_02d         float32
 22  ret_fwd_03d      

In [18]:
del data

In [19]:
from utils import clear_large_vars
clear_large_vars(threshold_size_in_MB=100)

NameError: name 'STOP' is not defined

##### Joining the dataframe

In [None]:
import pandas as pd
from pathlib import Path

# Define the path to your data store
DATA_STORE = Path('/home/sayem/Desktop/Project/data/assets.h5')
CHUNK_SIZE = 10 ** 5  # Define the chunk size based on available memory

# Load the secondary dataset just once for efficiency
beta_size_data = pd.read_hdf(DATA_STORE, 'data/top500_dataset_with_rolling_beta_size_proxy')

# Read the primary dataset in chunks
ta_data_chunks = pd.read_hdf(DATA_STORE, 'data/top500_dataset_with_TA', chunksize=CHUNK_SIZE)

# Prepare an empty list to collect processed chunks
processed_data_list = []

# # Display the initial sizes of datasets
# print(f"Initial shape of beta_size_data: {beta_size_data.shape}")
# print(f"Initial shape of each ta_data_chunks: {CHUNK_SIZE} rows")

for chunk in ta_data_chunks:
    
    # Calculate common columns count for this chunk
    common_columns_count = len(set(beta_size_data.columns).intersection(set(chunk.columns)))
    
    # Indicate progress
    start_val = (chunk.index.get_level_values('ticker')[0], chunk.index.get_level_values('date')[0])
    end_val = (chunk.index.get_level_values('ticker')[-1], chunk.index.get_level_values('date')[-1])
    print(f"Processing rows from {start_val} to {end_val}...")
    
    # Extract unique tickers and dates from the current chunk
    tickers_in_chunk = chunk.index.get_level_values('ticker').unique()
    dates_in_chunk = chunk.index.get_level_values('date').unique()
    
    # Filter beta_size_data based on tickers and dates in the current chunk
    filtered_beta_size_data = beta_size_data[
        beta_size_data.index.get_level_values('ticker').isin(tickers_in_chunk) &
        beta_size_data.index.get_level_values('date').isin(dates_in_chunk)
    ]
    
    # Merge chunk with filtered data
    merged_chunk = chunk.merge(
        filtered_beta_size_data, left_index=True, right_index=True, how='inner', suffixes=('', '_y')
    )
    
    # Drop "_y" suffixed columns (as they come from the secondary dataset)
    merged_chunk = merged_chunk.drop(columns=[col for col in merged_chunk if col.endswith('_y')])

    processed_data_list.append(merged_chunk)

# Concatenate all processed chunks
final_data = pd.concat(processed_data_list)

print(f"Shape of the final combined data: {final_data.shape}")
print("Processing completed.")

In [None]:
import pandas as pd
import gc

def static_columns(df):
    """Generator that yields static columns of the dataframe."""
    for col in df.columns:
        if df[col].nunique() == 1:
            yield col

def optimize_dataframe(df):
    print(f"Initial dataframe memory: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

    # Remove static columns
    cols_to_drop = list(static_columns(df))
    df.drop(columns=cols_to_drop, inplace=True)
    print(f"Memory after removing static columns: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

    # Convert float64 to float32
    float64_cols = df.select_dtypes(include=['float64']).columns
    df[float64_cols] = df[float64_cols].apply(pd.to_numeric, downcast='float')
    print(f"Memory after converting float64 columns: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

    # Convert int64 to int32 (or smaller)
    int64_cols = df.select_dtypes(include=['int64']).columns
    df[int64_cols] = df[int64_cols].apply(pd.to_numeric, downcast='integer')
    print(f"Memory after converting int64 columns: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

    # Fill NaN values
    df.fillna(method='ffill', inplace=True)
    df.fillna(method='bfill', inplace=True)
    print(f"Memory after filling NaN values: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

    gc.collect()  # Explicitly run the garbage collector

    return df

# Assuming your dataframe is named df
optimized_df = optimize_dataframe(final_data)

In [None]:
optimized_df.info()

In [None]:
import pandas as pd

def efficient_save_to_hdf(dataframe, file_path, key, chunk_size=10**2, complevel=5, 
                          complib='zlib', overwrite_existing_key=True):

    with pd.HDFStore(file_path, mode='a', complevel=complevel, complib=complib) as store:
        if overwrite_existing_key and key in store:
            del store[key]
        # Chunk-wise conversion and append to HDFStore
        for i in range(0, len(dataframe), chunk_size):
            chunk = dataframe.iloc[i:i+chunk_size].copy()
            
            print(f"Appending chunk {i} to {i+chunk_size}")
            store.append(key, chunk, format='table', data_columns=True)
    
    print(f"Data saved to {file_path} under key {key}")
# Use the function
key = 'factors/common'
efficient_save_to_hdf(optimized_df, DATA_STORE, \
    key, overwrite_existing_key=True)

In [None]:
from utils import clear_large_vars
clear_large_vars(threshold_size_in_MB=100)