In [2]:
import warnings
import pandas as pd
# Suppress every warning for the rest of the session. Note that this also
# hides deprecation notices raised by pandas and other libraries.
warnings.filterwarnings('ignore')

# Let wide/long frames render up to 1000 columns and 1000 rows.
for option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option_name, 1000)

In [3]:
%matplotlib inline

import numpy as np
import pandas as pd
from sklearn.feature_selection \
        import mutual_info_regression
        
from scipy.stats import spearmanr
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

In [4]:
import dask.dataframe as dd


# Location of the HDF5 store and the key of the table stored inside it.
# NOTE(review): this is an absolute, machine-specific path; the notebook
# will not run elsewhere without editing it.
FILE_PATH = "/home/sayem/Desktop/Project/data/dataset.h5"
HDF_KEY = 'data/2018-01-02_to_2023-07-31'

# Describe the table as a Dask dataframe first ...
data = dd.read_hdf(FILE_PATH, HDF_KEY)

# ... then materialise it as an in-memory pandas object.
result = data.compute()

In [5]:
# TARGET is the *next* row's RET_FRAC_ORDER for the same symbol.
# The frame is indexed by (symbol, timestamp), so the shift has to happen
# inside each level-0 group: a plain `.shift(-1)` over the whole column would
# hand the last row of one symbol the first row of the following symbol.
# The last row of every symbol therefore gets NaN.
# NOTE(review): assumes rows are in chronological order within each symbol.
result['TARGET'] = result['RET_FRAC_ORDER'].groupby(level=0).shift(-1)

In [6]:
# Print a summary of the full frame: index range, column count, dtypes and
# memory usage.
result.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 8644608 entries, ('AA', Timestamp('2018-01-02 00:00:00')) to ('ZTS', Timestamp('2023-07-31 00:00:00'))
Columns: 614 entries, OPEN to TARGET
dtypes: float32(374), float64(33), int32(198), int8(9)
memory usage: 20.7+ GB


In [7]:
# Keep every first-level key but only the rows whose second-level timestamp
# falls between 2023-07-21 and 2023-07-31 (both ends inclusive for label slices).
subset_result = result.loc[pd.IndexSlice[:, '2023-07-21':'2023-07-31'], :]

# `null_counts=` was deprecated in pandas 1.2 and removed in 2.0;
# `show_counts=` is its replacement.
subset_result.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 155071 entries, ('AA', Timestamp('2023-07-21 00:00:00')) to ('ZTS', Timestamp('2023-07-31 00:00:00'))
Columns: 614 entries, OPEN to TARGET
dtypes: float32(374), float64(33), int32(198), int8(9)
memory usage: 379.4+ MB


In [8]:
import pandas as pd
import numpy as np
from joblib import Parallel, delayed

def compute_correlation(data1, data2):
    """Return the Pearson correlation coefficient between two samples.

    When either sample has zero standard deviation the coefficient is
    undefined, so NaN is returned without calling ``np.corrcoef``.
    """
    for sample in (data1, data2):
        if np.std(sample) == 0:
            return np.nan
    coefficient_matrix = np.corrcoef(data1, data2)
    return coefficient_matrix[0, 1]

def calculate_ic(dataframe, target_column, corr_threshold=0.85, n_jobs=-1):
    """Rank features by information coefficient (IC) and prune redundant ones.

    Steps:
      1. Forward-fill then backward-fill the frame and drop any row that
         still contains a NaN.
      2. Rank-transform every column, so Pearson correlation on the ranks is
         the Spearman correlation of the values.
      3. Compute each feature's IC as its rank correlation with the target.
      4. Visit features from highest to lowest (signed) IC; each feature that
         is still kept discards every other feature whose absolute rank
         correlation with it exceeds ``corr_threshold``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Features plus the target column. It is not modified.
    target_column : label
        Name of the column to predict.
    corr_threshold : float, default 0.85
        Absolute feature-to-feature rank correlation above which two features
        are treated as redundant.
    n_jobs : int, default -1
        Passed to ``joblib.Parallel`` for the per-feature IC computation.

    Returns
    -------
    tuple
        ``(original_shape, reduced_shape, ic_original, ic_reduced,
        reduced_dataframe)`` where ``ic_original`` holds the IC of every
        feature sorted descending, ``ic_reduced`` is the surviving subset and
        ``reduced_dataframe`` is the cleaned frame restricted to the
        surviving features followed by the target column.

    NOTE(review): the fill runs over the whole frame in row order; if rows of
    several entities are stacked, values are carried across entity boundaries.
    """
    original_shape = dataframe.shape

    # ffill()/bfill()/dropna() each return a new frame, so the caller's
    # object is left untouched without needing inplace mutation.
    cleaned = dataframe.ffill().bfill().dropna(axis=0, how='any')

    # Rank the data
    ranked = cleaned.rank()
    target_ranked = ranked[target_column].values

    features = [column for column in cleaned.columns if column != target_column]

    correlations = Parallel(n_jobs=n_jobs)(
        delayed(compute_correlation)(ranked[column].values, target_ranked)
        for column in features
    )

    ic_original = pd.Series(dict(zip(features, correlations))).sort_values(ascending=False)

    # Feature-to-feature correlations only. Leaving the target in the matrix
    # would make a feature with |IC| > corr_threshold look "correlated" with
    # the target and trigger a KeyError on the IC lookup.
    correlation_matrix = ranked[features].corr()

    dropped_features = set()
    for col in ic_original.index:  # best IC first
        if col in dropped_features:
            continue
        col_corr = correlation_matrix[col]
        is_redundant = (col_corr.abs() > corr_threshold) & (col_corr.index != col)
        # Because features are visited in descending IC order, any neighbour
        # that is still kept has an IC no higher than `col`, so `col` wins.
        # Neighbours that were already dropped must not cause `col` to be
        # dropped as well; re-adding them to the set is a no-op.
        dropped_features.update(col_corr[is_redundant].index)

    ic_reduced = ic_original.drop(labels=list(dropped_features))

    reduced_dataframe = cleaned[ic_reduced.index.tolist() + [target_column]]
    reduced_shape = reduced_dataframe.shape

    return original_shape, reduced_shape, ic_original, ic_reduced, reduced_dataframe

# Run the IC-based selection on the date-restricted slice, predicting 'TARGET'.
(
    original_shape,
    reduced_shape,
    ic_original_values,
    ic_values,
    reduced_dataframe,
) = calculate_ic(subset_result, 'TARGET')

# Report the frame shape before and after the reduction.
print(f"Original shape: {original_shape}")
print(f"Reduced shape: {reduced_shape}")

In [None]:
import pandas as pd
import numpy as np
from joblib import Parallel, delayed

def batched_corr(data, batch_size):
    """Pearson correlation matrix of ``data``'s columns, built block by block.

    Columns are split into consecutive batches of at most ``batch_size``.
    Both the within-batch blocks and the cross-batch blocks are computed, so
    the result matches ``data.corr(method='pearson')`` while each individual
    ``corr`` call only sees at most ``2 * batch_size`` columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric columns to correlate.
    batch_size : int
        Maximum number of columns per batch; must be at least 1.

    Returns
    -------
    pandas.DataFrame
        Square frame indexed and labelled by ``data.columns``.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    n_cols = data.shape[1]
    bounds = [(start, min(start + batch_size, n_cols))
              for start in range(0, n_cols, batch_size)]

    corr = np.eye(n_cols)
    for position, (row_start, row_end) in enumerate(bounds):
        # Diagonal block: correlations among the columns of this batch.
        corr[row_start:row_end, row_start:row_end] = (
            data.iloc[:, row_start:row_end].corr(method='pearson').to_numpy()
        )

        # Off-diagonal blocks: this batch against every later batch. Without
        # them, columns in different batches would look uncorrelated (0).
        width = row_end - row_start
        for col_start, col_end in bounds[position + 1:]:
            pair = data.iloc[:, np.r_[row_start:row_end, col_start:col_end]]
            cross = pair.corr(method='pearson').to_numpy()[:width, width:]
            corr[row_start:row_end, col_start:col_end] = cross
            corr[col_start:col_end, row_start:row_end] = cross.T

    return pd.DataFrame(corr, columns=data.columns, index=data.columns)

def compute_correlation(data1, data2):
    """Pearson correlation of two samples, or NaN when it is undefined.

    A sample with zero standard deviation makes the coefficient undefined,
    so that case is answered with NaN up front.
    """
    has_spread = np.std(data1) != 0 and np.std(data2) != 0
    if not has_spread:
        return np.nan
    return np.corrcoef(data1, data2)[0, 1]

def calculate_ic(dataframe, target_column, corr_threshold=0.85, n_jobs=-1, batch_size=100):
    """Rank features by information coefficient (IC) and prune redundant ones.

    The frame is forward/backward filled, rows still containing NaN are
    dropped and all columns are rank-transformed (Pearson on ranks mimics
    Spearman). Each feature's IC is its rank correlation with the target.
    Features are then visited from highest to lowest (signed) IC, and every
    kept feature discards the other features whose absolute correlation with
    it, as reported by ``batched_corr``, exceeds ``corr_threshold``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Features plus the target column. It is not modified.
    target_column : label
        Name of the column to predict.
    corr_threshold : float, default 0.85
        Absolute feature-to-feature correlation above which two features are
        treated as redundant.
    n_jobs : int, default -1
        Passed to ``joblib.Parallel`` for the per-feature IC computation.
    batch_size : int, default 100
        Forwarded to ``batched_corr``.

    Returns
    -------
    tuple
        ``(original_shape, reduced_shape, ic_original, ic, reduced_dataframe)``
        where ``ic_original`` holds every feature's IC sorted descending,
        ``ic`` the surviving subset, and ``reduced_dataframe`` the cleaned
        frame restricted to the survivors followed by the target column.
    """
    original_shape = dataframe.shape

    # Each call returns a new frame, so no inplace mutation is needed and the
    # caller's object stays untouched.
    cleaned = dataframe.ffill().bfill().dropna(axis=0, how='any')

    # Rank the data to mimic Spearman's rank correlation using Pearson's
    ranked = cleaned.rank()
    target_ranked = ranked[target_column].values

    features = [column for column in cleaned.columns if column != target_column]

    correlations = Parallel(n_jobs=n_jobs)(
        delayed(compute_correlation)(ranked[column].values, target_ranked)
        for column in features
    )

    ic_original = pd.Series(dict(zip(features, correlations))).sort_values(ascending=False)

    correlation_matrix = batched_corr(ranked[ic_original.index], batch_size=batch_size)

    dropped_features = set()
    for col in ic_original.index:  # best IC first
        if col in dropped_features:
            continue
        col_corr = correlation_matrix[col]
        is_redundant = (col_corr.abs() > corr_threshold) & (col_corr.index != col)
        # Visiting in descending IC order means every still-kept neighbour has
        # an IC no higher than `col`, so `col` always wins. A neighbour that
        # was already dropped must not cause `col` to be dropped too.
        dropped_features.update(col_corr[is_redundant].index)

    ic = ic_original.drop(labels=list(dropped_features))

    reduced_dataframe = cleaned[ic.index.tolist() + [target_column]]
    reduced_shape = reduced_dataframe.shape

    return original_shape, reduced_shape, ic_original, ic, reduced_dataframe

# Run the selection on a private copy of the date-restricted slice.
(
    original_shape,
    reduced_shape,
    ic_original_values,
    ic_values,
    reduced_dataframe,
) = calculate_ic(subset_result.copy(), 'TARGET')

# Report the frame shape before and after the reduction.
print(f"Original shape: {original_shape}")
print(f"Reduced shape: {reduced_shape}")

In [None]:
# Print a summary (index, columns, dtypes, memory) of the reduced frame
# returned by calculate_ic.
reduced_dataframe.info()

In [None]:
import pandas as pd
import numpy as np
from joblib import Parallel, delayed

def batched_corr(data, batch_size):
    """Compute the full Pearson correlation matrix in column batches.

    The columns of ``data`` are cut into consecutive batches of at most
    ``batch_size``. Every diagonal block (a batch against itself) and every
    off-diagonal block (a batch against a later batch) is filled in, so the
    output agrees with ``data.corr(method='pearson')``; no single ``corr``
    call handles more than ``2 * batch_size`` columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric columns to correlate.
    batch_size : int
        Maximum number of columns per batch; must be at least 1.

    Returns
    -------
    pandas.DataFrame
        Square frame whose index and columns are ``data.columns``.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    n_cols = data.shape[1]
    batches = [(start, min(start + batch_size, n_cols))
               for start in range(0, n_cols, batch_size)]

    matrix = np.eye(n_cols)
    for batch_no, (lo, hi) in enumerate(batches):
        # Within-batch correlations.
        matrix[lo:hi, lo:hi] = data.iloc[:, lo:hi].corr(method='pearson').to_numpy()

        # Cross-batch correlations; leaving these at 0 would hide every
        # correlated pair whose members sit in different batches.
        span = hi - lo
        for other_lo, other_hi in batches[batch_no + 1:]:
            stacked = data.iloc[:, np.r_[lo:hi, other_lo:other_hi]]
            quadrant = stacked.corr(method='pearson').to_numpy()[:span, span:]
            matrix[lo:hi, other_lo:other_hi] = quadrant
            matrix[other_lo:other_hi, lo:hi] = quadrant.T

    return pd.DataFrame(matrix, columns=data.columns, index=data.columns)

def compute_correlation(data1, data2):
    """Correlate two samples with Pearson's r; NaN if either one is constant."""
    # A zero standard deviation leaves the coefficient undefined.
    degenerate = np.std(data1) == 0 or np.std(data2) == 0
    return np.nan if degenerate else np.corrcoef(data1, data2)[0, 1]

def calculate_ic(dataframe, target_column, corr_threshold=0.85, n_jobs=-1, batch_size=100):
    """Compute per-feature information coefficients and drop redundant features.

    After a forward/backward fill and removal of rows that still hold NaN,
    all columns are rank-transformed so that Pearson correlation on the
    ranks mimics Spearman. The IC of a feature is its rank correlation with
    the target. Walking the features from highest to lowest (signed) IC,
    each kept feature eliminates the others whose absolute correlation with
    it, taken from ``batched_corr``, is above ``corr_threshold``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Features plus the target column. It is not modified.
    target_column : label
        Name of the column to predict.
    corr_threshold : float, default 0.85
        Absolute feature-to-feature correlation above which two features are
        considered redundant.
    n_jobs : int, default -1
        Passed to ``joblib.Parallel`` for the per-feature IC computation.
    batch_size : int, default 100
        Forwarded to ``batched_corr``.

    Returns
    -------
    tuple
        ``(ic_original, ic, reduced_dataframe)``: the IC of every feature
        sorted descending, the IC of the surviving features, and the cleaned
        frame limited to the survivors followed by the target column.
    """
    # Chained calls each return a new frame; the caller's data is not mutated.
    cleaned = dataframe.ffill().bfill().dropna(axis=0, how='any')

    # Rank the data to mimic Spearman's rank correlation using Pearson's
    ranked = cleaned.rank()
    target_ranked = ranked[target_column].values

    features = [column for column in cleaned.columns if column != target_column]

    correlations = Parallel(n_jobs=n_jobs)(
        delayed(compute_correlation)(ranked[column].values, target_ranked)
        for column in features
    )

    ic_original = pd.Series(dict(zip(features, correlations))).sort_values(ascending=False)

    correlation_matrix = batched_corr(ranked[ic_original.index], batch_size=batch_size)

    dropped_features = set()
    for col in ic_original.index:  # best IC first
        if col in dropped_features:
            continue
        col_corr = correlation_matrix[col]
        is_redundant = (col_corr.abs() > corr_threshold) & (col_corr.index != col)
        # In descending-IC order a still-kept neighbour never outranks `col`,
        # so `col` is kept and its neighbours go. A neighbour that was
        # already dropped must not drag `col` out as well.
        dropped_features.update(col_corr[is_redundant].index)

    ic = ic_original.drop(labels=list(dropped_features))

    reduced_dataframe = cleaned[ic.index.tolist() + [target_column]]

    return ic_original, ic, reduced_dataframe

# Run the selection on a private copy of the date-restricted slice.
(
    ic_original_values,
    ic_values,
    reduced_dataframe,
) = calculate_ic(subset_result.copy(), 'TARGET')

In [None]:
# Print a summary (index, columns, dtypes, memory) of the reduced frame
# returned by calculate_ic.
reduced_dataframe.info()

In [None]:
import pandas as pd
import numpy as np
from joblib import Parallel, delayed

def calculate_ic(dataframe, target_column, corr_threshold=0.85, n_jobs=-1):
    """Compute Spearman ICs against the target and prune correlated features.

    The frame is forward-filled then backward-filled (rows are not dropped).
    Each feature's IC is the correlation between its ranks and the target's
    ranks. Features are then visited from highest to lowest (signed) IC and
    every kept feature removes all other features whose absolute Spearman
    correlation with it exceeds ``corr_threshold``, so no surviving pair is
    above the threshold.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Features plus the target column. It is not modified.
    target_column : label
        Name of the column to predict.
    corr_threshold : float, default 0.85
        Absolute feature-to-feature Spearman correlation above which two
        features are considered redundant.
    n_jobs : int, default -1
        Passed to ``joblib.Parallel`` for the per-feature IC computation.

    Returns
    -------
    tuple
        ``(ic_original, ic, reduced_dataframe)``: the IC of every feature
        sorted descending, the IC of the surviving features, and the filled
        frame limited to the survivors followed by the target column.

    The feature counts before and after pruning are printed.
    """
    # Fill NaN values with the previous row's value, then back-fill leading gaps
    filled = dataframe.ffill().bfill()

    # Rank the target data
    target_ranked = filled[target_column].rank().values

    def spearman_corr(feature_data, target_data):
        """Correlate the ranks of one feature with the pre-ranked target."""
        feature_ranked = feature_data.rank().values
        if np.std(feature_ranked) == 0 or np.std(target_data) == 0:
            return np.nan
        return np.corrcoef(feature_ranked, target_data)[0, 1]

    features = [column for column in filled.columns if column != target_column]
    correlations = Parallel(n_jobs=n_jobs)(
        delayed(spearman_corr)(filled[column], target_ranked) for column in features
    )

    ic_original = pd.Series(dict(zip(features, correlations))).sort_values(ascending=False)

    print(f"Number of features before removing correlated ones: {len(ic_original)}")

    correlation_matrix = filled[ic_original.index].corr(method='spearman')

    # Collect the losers in a set instead of shrinking the matrix and the IC
    # series while iterating over them.
    dropped_features = set()
    for col in ic_original.index:  # best IC first
        if col in dropped_features:
            continue
        col_corr = correlation_matrix[col]
        is_redundant = (col_corr.abs() > corr_threshold) & (col_corr.index != col)
        # `col` outranks every still-kept neighbour, so all of its neighbours
        # are removed. Sparing the best neighbour would leave a pair above
        # the threshold in the result.
        dropped_features.update(col_corr[is_redundant].index)

    ic = ic_original.drop(labels=list(dropped_features))

    print(f"Number of features after removing correlated ones: {len(ic)}")

    reduced_dataframe = filled[ic.index.tolist() + [target_column]]

    return ic_original, ic, reduced_dataframe

# Example usage
# ic_original_values, ic_values, reduced_dataframe = calculate_ic(subset_result.copy(), 'TARGET')
# ic_values, ic, reduced_dataframe = calculate_ic(result.copy(), 'TARGET')

In [None]:
# Print a summary (index, columns, dtypes, memory) of the reduced frame.
# NOTE(review): the calculate_ic calls in the preceding cell are commented
# out, so `reduced_dataframe` here is whatever an earlier cell assigned.
reduced_dataframe.info()