In [1]:
import numpy as np

from causalchamber.datasets import Dataset
from crc.eval import compute_MCC
from scipy import stats
from sklearn.decomposition import FastICA


Fetching list of available datasets from https://causalchamber.s3.eu-central-1.amazonaws.com/downloadables/directory.yaml ... done.
If you use our datasets or models for your work please consider citing:

@article{gamella2024chamber,
  title={The Causal Chambers: Real Physical Systems as a Testbed for AI Methodology},
  author={Gamella, Juan L. and B"uhlmann, Peter and Peters, Jonas},
  journal={arXiv preprint arXiv:2404.11341},
  year={2024}
}



In [2]:
# Reproducibility
seed = 0
rs = np.random.RandomState(seed=seed)

In [3]:
ica_dataset1 = Dataset('lt_walks_v1', root='../data/chamber_downloads', download=True)

ica_experiment1 = ica_dataset1.get_experiment(name='color_mix')
ica_df1 = ica_experiment1.as_pandas_dataframe()

ica_sources1 = ica_df1[["red", "green", "blue"]].to_numpy()
ica_mixtures1 = ica_df1[["ir_1", "ir_2", "ir_3", "vis_1", "vis_2", "vis_3"]].to_numpy()

Dataset lt_walks_v1 found in "../data/chamber_downloads/lt_walks_v1".


In [5]:
# Try both: use all components and take those with highest correlation, or directly use n_ground_truth components
ica1 = FastICA(whiten_solver="eigh", random_state=rs)
ica_tf1 = ica1.fit_transform(ica_mixtures1)

In [6]:
# Get components with highest correlation with gt
gt_latent_dim = ica_sources1.shape[1]
tf_latent_dim = ica_tf1.shape[1]

max_corr_tf_list = []
for i in range(gt_latent_dim):
    corr_list = np.empty(tf_latent_dim)
    for j in range(tf_latent_dim):
        corr_list[j] = stats.pearsonr(ica_sources1[:, i], ica_tf1[:, j]).statistic
    max_corr_tf_list.append(ica_tf1[:, np.argmax(corr_list)])
ica_tf_max_corr1 = np.vstack(max_corr_tf_list).T

In [7]:
mcc1 = compute_MCC(ica_tf_max_corr1, ica_sources1, batch_size=10000)
print(F'MCC score (with max corr. selection): {np.mean(mcc1):.2f}')

MCC score (with max corr. selection): 84.67


In [8]:
ica2 = FastICA(n_components=ica_sources1.shape[1], whiten_solver="eigh", random_state=rs)
ica_tf2 = ica2.fit_transform(ica_mixtures1)

In [9]:
mcc2 = compute_MCC(ica_tf2, ica_sources1, batch_size=10000)
print(F'MCC score (with 3 components): {np.mean(mcc2):.2f}')

MCC score (with 3 components): 70.26


In [24]:
# Wt (L_in, L_out, H) -> P (nonlinear)
ica_dataset2 = Dataset('wt_walks_v1', root='../data/chamber_downloads', download=True)

ica_experiment2 = ica_dataset2.get_experiment(name='loads_hatch_mix_slow_run_1')
ica_df2 = ica_experiment2.as_pandas_dataframe()

ica_sources2 = ica_df2[["load_in", "load_out", "hatch"]].to_numpy()
ica_mixtures2 = ica_df2[["pressure_upwind", "pressure_downwind", "pressure_intake", 'pressure_ambient']].to_numpy()

Dataset wt_walks_v1 found in "../data/chamber_downloads/wt_walks_v1".


In [25]:
ica2 = FastICA(whiten_solver="eigh", random_state=rs)
ica_tf2 = ica2.fit_transform(ica_mixtures2)

In [29]:
# Get components with highest correlation with gt
gt_latent_dim = ica_sources2.shape[1]
tf_latent_dim = ica_tf2.shape[1]

max_corr_tf_list = []
for i in range(gt_latent_dim):
    corr_list = np.empty(tf_latent_dim)
    for j in range(tf_latent_dim):
        corr_list[j] = stats.pearsonr(ica_sources2[:, i], ica_tf2[:, j]).statistic
    max_corr_tf_list.append(ica_tf2[:, np.argmax(corr_list)])
ica_tf_max_corr2 = np.vstack(max_corr_tf_list).T

In [30]:
mcc2 = compute_MCC(ica_tf_max_corr2, ica_sources2, batch_size=10000)
print(F'MCC score (with max corr. selection): {np.mean(mcc2):.2f}')

MCC score (with max corr. selection): 60.53


In [None]:
# Nonlinear ICA (use some time series approach!)