This notebook aims to reproduce the motivating examples with binary logit outcomes (AND, OR, or XOR of two binary modalities). In this example, we show how 50% missingness biases the estimation of predictive performance and of the partial information decomposition (PID).

In [1]:
import sys
sys.path.append('../')

from utils_generation import *
from utils_classification import *

from information_decomposition import *
from information_decomposition.utils import *
import torch

# Seed both NumPy and PyTorch with the same value so that reruns are repeatable
np.random.seed(0)
torch.manual_seed(0)

In [3]:
# Helper functions
def split(data, label):
    """
    Partition `data` according to the 'data_split' column of `label`.
    Args:
        data (pd.DataFrame or pd.Series): The data to partition; rows are selected
            with `data.loc[mask]`, so it must be aligned with `label`.
        label (pd.DataFrame): Dataframe with a 'data_split' column whose values
            are 'train', 'val' or 'test'.
    Returns:
        dict: Keys 'train', 'val' and 'test' map to the matching rows of `data`;
            'all' maps to `data` itself (not a copy).
    """
    assignment = label['data_split']
    partitions = {name: data.loc[assignment == name] for name in ('train', 'val', 'test')}
    partitions['all'] = data
    return partitions

def get_feature_dict_from_dataframes(x1_df, x2_df, label_df, standard_scaling = True):
    """
    Builds the per-modality, per-split feature dictionary, optionally standardised.

    Three modalities are produced: 'x1', 'x2' and 'x1_x2' (column-wise
    concatenation of both). Each is split with `split`; when scaling is enabled,
    a separate StandardScaler is fitted on the train split of each modality and
    applied to every split, including 'all'.
    Args:
        x1_df (pd.DataFrame): The first modality data.
        x2_df (pd.DataFrame): The second modality data.
        label_df (pd.DataFrame): The labels indicating the split.
        standard_scaling (bool): Whether to apply standard scaling to the data.
    Returns:
        dict: modality name -> dict with keys 'train', 'val', 'test' and 'all'.
    """
    def _as_frame(values, template):
        # Scalers return bare arrays; restore the index and column labels
        return pd.DataFrame(values, index=template.index, columns=template.columns)

    modalities = {
        'x1': x1_df,
        'x2': x2_df,
        'x1_x2': pd.concat([x1_df, x2_df], axis = 1),
    }

    feature_dict = {}
    for modality_name, modality in modalities.items():
        splits = split(modality, label_df)
        if standard_scaling:
            scaler = StandardScaler()
            # Statistics come from the train split only, then are reused everywhere else
            train_frame = splits['train']
            splits['train'] = _as_frame(scaler.fit_transform(train_frame), train_frame)
            for split_name in ('val', 'test', 'all'):
                frame = splits[split_name]
                splits[split_name] = _as_frame(scaler.transform(frame), frame)

        feature_dict[modality_name] = splits

    return feature_dict

def get_prediction_dict_from_dataframes(feature_dict, label_dict, label_df, sample_weight = None, grid_search = None) :
    """
    Calls `train_mlp_and_get_prediction_probabilities` once per modality and
    splits the returned predictions into train / val / test / all.
    Args:
        feature_dict (dict): A dictionary containing the feature data for each modality
            ('x1', 'x2', 'x1_x2'), each with 'train', 'val' and 'all' entries.
        label_dict (dict): A dictionary containing the labels for each split
            ('train' and 'val' are used here).
        label_df (pd.DataFrame): The labels indicating the split.
        sample_weight (dict): Optional dictionary with 'train' and 'val' sample weights;
            None disables weighting.
        grid_search (dict): Optional grid search parameters forwarded to the training
            helper; None is treated as an empty dictionary.
    Returns:
        dict: modality name -> dict of predictions keyed by 'train', 'val', 'test', 'all'.
    """
    # A fresh dict per call: a `{}` default would be one object shared by every call
    if grid_search is None:
        grid_search = {}

    # The weights do not depend on the modality, so look them up once
    train_weight = sample_weight['train'] if sample_weight is not None else None
    val_weight = sample_weight['val'] if sample_weight is not None else None

    prediction_dict = {}
    for modality_name in ['x1', 'x2', 'x1_x2'] :
        modality_features = feature_dict[modality_name]
        predictions = train_mlp_and_get_prediction_probabilities(X_train = modality_features['train'],
                                                                 y_train = label_dict['train'],
                                                                 X_val = modality_features['val'],
                                                                 y_val = label_dict['val'],
                                                                 X = modality_features['all'],
                                                                 sample_weight = train_weight,
                                                                 weight_val = val_weight,
                                                                 grid_search = grid_search)
        # Predictions are made on 'all' rows, then partitioned like the features
        prediction_dict[modality_name] = split(predictions, label_df)

    return prediction_dict

# Experiment

In [4]:
# Number of synthetic samples generated before any missingness is applied
N_SAMPLES = 10000
# Name of the label column used as the outcome: 'and' for AND, 'or' for OR, 'xor' for XOR of x1 and x2
INFORMATION_SETTING = 'and'

# Whether samples are randomly dropped, with an observation probability that depends on x1
MISSINGNESS = True
# Whether inverse probability weights are passed to training, AUROC, the Q estimator and the PID
CORRECTION = False

In [5]:
# Generate synthetic data: two binary modalities and a random train / val / test assignment
x1_df = pd.DataFrame({'x1': np.random.rand(N_SAMPLES) > 0.5})
x2_df = pd.DataFrame({'x2': np.random.rand(N_SAMPLES) > 0.5})
label_df = pd.DataFrame({'data_split': np.random.choice(['train', 'val', 'test'], N_SAMPLES, p=[0.8, 0.1, 0.1])})

# One candidate outcome column per information setting
label_df['xor'] = (x1_df['x1'] ^ x2_df['x2']).astype(int)
label_df['or'] = (x1_df['x1'] | x2_df['x2']).astype(int)
label_df['and'] = (x1_df['x1'] & x2_df['x2']).astype(int)

# Generate missingness patterns based on the first modality
if MISSINGNESS:
    # Observation probability is 0.8 when x1 is True and 0.2 otherwise
    proba_obs = (x1_df['x1'] * 0.6 + 0.2)
    observed = pd.Series(np.random.binomial(1, proba_obs.values) == 1, index = x1_df.index)

    # Inverse probability weights from the known observation probabilities,
    # scaled by the mean observation probability
    ipw = proba_obs.mean() / proba_obs
else:
    observed = pd.Series(True, index = x1_df.index)
    # Nothing is missing: unit weights keep `ipw_dict` defined, so the later cells
    # do not raise a NameError when CORRECTION is True
    ipw = pd.Series(1.0, index = x1_df.index)
ipw_dict = split(ipw[observed], label_df[observed])

# Split observed data
feature_dict = get_feature_dict_from_dataframes(x1_df[observed], x2_df[observed], label_df[observed], standard_scaling = True)
label_dict = split(label_df[observed][INFORMATION_SETTING], label_df[observed])

In [6]:
# Estimate the predictions obtained from each modality and from their concatenation
grid_search = {'layers': [[32] * 2]}
prediction_dict = get_prediction_dict_from_dataframes(
    feature_dict,
    label_dict,
    label_df[observed],
    sample_weight = ipw_dict if CORRECTION else None,
    grid_search = grid_search,
)

# Measure performance: test AUROC over 100 bootstrap resamples, printed as mean (std)
y_test = label_dict['test']
test_weights = ipw_dict['test'] if CORRECTION else None
for modality_name, modality_predictions in prediction_dict.items():
    y_pred = modality_predictions['test']
    auroc = []
    for _ in range(100):
        # Resample test indices with replacement; labels, scores and weights share them
        sample = np.random.choice(y_test.index, size = len(y_test), replace = True)
        resampled_weights = None if test_weights is None else test_weights.loc[sample]
        score = roc_auc_score(y_true = y_test.loc[sample],
                              y_score = y_pred.loc[sample],
                              sample_weight = resampled_weights)
        auroc.append(score)
    print(f'AUC {modality_name:<10}: {np.mean(auroc):.2f} ({np.std(auroc):.2f})')

In [7]:
# Estimate Q
# Pick the device: the second GPU when there is one, otherwise the first GPU,
# otherwise the CPU. A hard-coded "cuda:1" fails on machines with a single GPU.
if not torch.cuda.is_available():
    device = "cpu"
elif torch.cuda.device_count() > 1:
    device = "cuda:1"
else:
    device = "cuda:0"

# The estimator receives the train / val features of each modality, the matching
# single-modality predictions, and the inverse probability weights when CORRECTION is on
estimator = QEstimator(x1_train = feature_dict['x1']['train'].values,
                       x2_train = feature_dict['x2']['train'].values,
                       x1_val = feature_dict['x1']['val'].values,
                       x2_val = feature_dict['x2']['val'].values,
                       p_y_given_x1_train = prediction_dict['x1']['train'].values,
                       p_y_given_x2_train = prediction_dict['x2']['train'].values,
                       p_y_given_x1_val = prediction_dict['x1']['val'].values,
                       p_y_given_x2_val = prediction_dict['x2']['val'].values,
                       ipw_train = ipw_dict['train'].values if CORRECTION else None,
                       ipw_val = ipw_dict['val'].values if CORRECTION else None,
                       epochs = 100,
                       device = device,
                       grid_search = grid_search)

In [8]:
# Compute the PID on the test split (weighted by the test IPW when CORRECTION is on)
pid = pid_decomposition_batched(
    estimator,
    feature_dict['x1']['test'].values,
    feature_dict['x2']['test'].values,
    prediction_dict['x1']['test'].values,
    prediction_dict['x2']['test'].values,
    prediction_dict['x1_x2']['test'].values,
    label_dict['test'].values,
    ipw_dict['test'].values if CORRECTION else None,
)

# Print each entry next to its '<name>_std' companion; keys containing 'std' are not printed on their own
for component in pid:
    if 'std' not in component:
        print(f'PID {component:<10}: {pid[component]:.2f} ({pid[component + "_std"]:.2f})')