##### Imports

In [1]:
import warnings
import pandas as pd
# Suppress every Python warning for the remainder of the session.
warnings.filterwarnings('ignore')

# Let pandas show up to 1000 columns and 1000 rows before truncating output.
pd.set_option('display.max_columns', 1000, 'display.max_rows', 1000)

In [2]:
%matplotlib inline

import numpy as np
import pandas as pd
from sklearn.feature_selection \
        import mutual_info_regression
        
from scipy.stats import spearmanr
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

In [3]:
from pathlib import Path
import pandas as pd
from filelock import FileLock
import dask.dataframe as dd

# Define the path
DATA_STORE = Path('/home/sayem/Desktop/Project/data/assets.h5')
lock_path = "/tmp/assets_h5_file.lock"  # Lock path


def _select_date_window(frame, first_date, last_date):
    """Return the rows of `frame` dated between two timestamps, inclusive.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame whose index has a level named "date".
    first_date, last_date : pandas.Timestamp
        Lower and upper bounds of the window; both bounds are included.

    Returns
    -------
    pandas.DataFrame
        The rows of `frame` whose "date" level lies in the window.
    """
    dates = frame.index.get_level_values("date")
    return frame.loc[(dates >= first_date) & (dates <= last_date)]


# Lock the file and retrieve the data
# NOTE(review): the lock only helps if every other reader/writer of the store
# acquires the same lock file - confirm against the code that writes assets.h5.
with FileLock(lock_path):
    # Use Dask to directly read the HDF5 files; `.compute()` materialises each
    # key as an in-memory frame before the lock is released.
    alpha_101_data_full = dd.read_hdf(DATA_STORE, key='factors/alpha_101').compute()
    common_data_full = dd.read_hdf(DATA_STORE, key='factors/common').compute()

# Window of interest: from five business days before `end_date` up to and
# including `end_date` itself.
end_date = pd.Timestamp('2023-08-11')
start_date = end_date - pd.tseries.offsets.BDay(5)

print(start_date)

# Apply the same date window to both frames.
alpha_101_data = _select_date_window(alpha_101_data_full, start_date, end_date)
common_data = _select_date_window(common_data_full, start_date, end_date)

# Release the full-history frames; only the windowed slices are used afterwards.
del alpha_101_data_full
del common_data_full

2023-08-04 00:00:00


In [4]:
# Number of `common_data` rows handled per pass of the merge loop.
# Adjust it to the memory available on your system.
CHUNK_SIZE = 100_000

# Collects the merged result of every chunk until they are concatenated.
processed_data_list = list()

# Create function to chunk through DataFrame
def chunker(seq, size):
    """Lazily produce consecutive slices of `seq`, each at most `size` long.

    Parameters
    ----------
    seq : sliceable object supporting ``len()``
        The sequence to split.
    size : int
        Maximum length of each slice; the last slice may be shorter.

    Returns
    -------
    generator
        Yields ``seq[offset:offset + size]`` for offset = 0, size, 2*size, ...

    NOTE(review): not called by any of the visible notebook cells.
    """
    offsets = range(0, len(seq), size)
    return (seq[offset:offset + size] for offset in offsets)

# Chunk through `common_data` by row position and inner-join every chunk with
# the rows of `alpha_101_data` that share its (ticker, date) index entries.

# Loop-invariant lookups on the right-hand frame, computed once up front.
alpha_tickers = alpha_101_data.index.get_level_values('ticker')
alpha_dates = alpha_101_data.index.get_level_values('date')

# Columns present in both frames. The copy from `common_data` is the one that
# is kept, so the duplicates are removed from the alpha side *before* merging.
# Dropping by exact name (instead of by a "_y"/"_x" suffix after the merge)
# cannot discard a genuine column whose name merely ends in "_y" or "_x".
duplicate_columns = alpha_101_data.columns.intersection(common_data.columns)

# `max(..., 1)` forces one pass even when `common_data` is empty, so the result
# is an empty frame with the merged columns rather than `pd.concat([])`
# raising "No objects to concatenate".
for start in range(0, max(common_data.shape[0], 1), CHUNK_SIZE):
    # Selecting the current chunk
    common_data_chunk = common_data.iloc[start:start + CHUNK_SIZE]

    # Extract unique tickers and dates from the current chunk
    chunk_index = common_data_chunk.index
    tickers_in_chunk = chunk_index.get_level_values('ticker').unique()
    dates_in_chunk = chunk_index.get_level_values('date').unique()

    # Pre-filter alpha_101_data to the tickers and dates seen in this chunk.
    # This only shrinks the right-hand side; the inner merge below decides
    # which (ticker, date) pairs actually survive.
    in_chunk = alpha_tickers.isin(tickers_in_chunk) & alpha_dates.isin(dates_in_chunk)
    filtered_alpha_101_data = alpha_101_data.loc[in_chunk].drop(columns=duplicate_columns)

    # Merge current chunk with filtered alpha_101_data on the shared index
    merged_chunk = common_data_chunk.merge(
        filtered_alpha_101_data, left_index=True, right_index=True, how='inner'
    )

    processed_data_list.append(merged_chunk)

# Concatenate all processed chunks
final_data = pd.concat(processed_data_list)

print(f"Shape of the final combined data: {final_data.shape}")
print("Processing completed.")

# The per-chunk frames are no longer needed once they are combined.
del processed_data_list

Shape of the final combined data: (3105, 624)
Processing completed.


In [5]:
import pandas as pd

# 1. Capitalize all column names
final_data.columns = list(map(str.upper, final_data.columns))

In [6]:
# Print a summary of the combined frame: index type, column count, dtypes and memory usage.
final_data.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 3105 entries, ('AA', Timestamp('2023-08-04 00:00:00')) to ('ZTS', Timestamp('2023-08-11 00:00:00'))
Columns: 624 entries, OPEN to ALPHA_101
dtypes: float32(384), float64(33), int32(198), int8(9)
memory usage: 7.7+ MB


In [7]:
from utils import save_to_hdf

# Persist the combined frame to the project's HDF5 dataset file.
FILE_PATH = "/home/sayem/Desktop/Project/data/dataset.h5"
# Define key name
KEY_NAME_PREFIX = 'data/YEAR'
# NOTE(review): `save_to_hdf` is a project helper; judging by the recorded
# output it returns the key the frame was stored under (prefix plus the
# covered date range) - confirm in utils.
# The result is bound to a descriptive name because assigning to `_` would
# shadow IPython's "last output" variable for the rest of the session.
saved_key = save_to_hdf(final_data, FILE_PATH, KEY_NAME_PREFIX)
print(f'key is: {saved_key}')

key is: data/YEAR_2023-08-04_to_2023-08-11


In [8]:
from utils import clear_large_vars
# NOTE(review): project helper from utils; presumably releases the large
# DataFrames still held in the kernel namespace - confirm its behaviour there.
clear_large_vars()