# Validation of (Co)MIGHT with Cancer data Features

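Here, we run basic sanity checks of MIGHT and CoMIGHT on feature sets built from the cancer data: the feature matrix is stacked with an exact copy of itself to form two feature sets, and we check that MIGHT rejects the null while CoMIGHT, which permutes only the first feature set, does not.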

In [3]:
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.special import expit
from sklearn.datasets import (
    make_blobs,
    make_classification,
    make_sparse_spd_matrix,
    make_spd_matrix,
)

from treeple import HonestForestClassifier, RandomForestClassifier, RandomForestRegressor
from treeple.datasets.multiview import make_gaussian_mixture, make_joint_factor_model
from treeple.stats import (
    FeatureImportanceForestClassifier,
    FeatureImportanceForestRegressor,
    PermutationForestRegressor,
)
from treeple.tree import DecisionTreeClassifier, MultiViewDecisionTreeClassifier

# One base seed for the whole notebook; estimators below offset it
# (seed + 1, seed + 2) so that each component gets a distinct random state.
seed = 12345
rng = np.random.default_rng(seed=seed)

In [4]:
# NOTE(review): absolute, user-specific path -- the notebook only runs on a
# machine that has this file at this location.
# NOTE(review): unpickling executes arbitrary code; only load this file if it
# comes from a trusted source.
big_data = pd.read_pickle("/Users/adam2392/Downloads/wise.matrix.pkl")

# Encode the class label numerically: "Cancer" -> 1, "Healthy" -> 0.
# Any other value in the column is left untouched.
status_encoded = big_data["Cancer Status"].replace("Cancer", 1).replace("Healthy", 0)
big_data["Cancer Status"] = status_encoded

preview = big_data.head()
display(preview)
print(big_data.shape)

Unnamed: 0,Run,Sample,Library,Cancer Status,Stage,Tumor type,1:1-5000000,1:5000001-10000000,1:10000001-15000000,1:15000001-20000000,...,22:5000001-10000000,22:10000001-15000000,22:15000001-20000000,22:20000001-25000000,22:25000001-30000000,22:30000001-35000000,22:35000001-40000000,22:40000001-45000000,22:45000001-50000000,22:50000001-55000000
0,S0028,INDI_918_PLS_1A,A387-12,1,IV,Stomach,0.678938,-0.509823,-2.304186,0.0,...,0.0,0.0,0.0,0.0,-2.614654,-2.039435,-0.367407,-0.268217,0.782171,3.144373
1,S0028,INDI_980_PLS_1,A387-13,1,IV,Stomach,0.032226,-0.256921,-3.562167,0.0,...,0.0,0.0,0.0,0.0,-1.028307,-2.176704,0.298438,-1.45108,0.951544,1.574838
2,S0034,INDI_580_PLS_1A,A396-13,1,IV,Colorectal,-2.034504,-3.245896,-3.236149,0.0,...,0.0,0.0,0.0,0.0,-0.654188,-0.510688,-0.330978,-0.202724,0.116389,0.324193
3,S0034,INDI_730_PLS_1A,A396-04,1,IV,Pancreas,-3.441775,0.597126,-3.079712,0.0,...,0.0,0.0,0.0,0.0,-1.02177,-1.960194,-1.50984,-1.048233,2.889281,-2.051596
4,S0034,INDI_481_PLS_1A,A396-10,1,IV,Liver,0.394118,-0.545853,-2.749696,0.0,...,0.0,0.0,0.0,0.0,-2.826012,-2.06629,-0.264483,-1.442093,0.152686,0.433958


(1991, 595)


In [5]:
# Space-delimited, header-less sample list; columns are therefore labelled
# 0 and 1 (column 1 holds the sample identifiers, as the printout shows).
# NOTE(review): absolute, user-specific path.
train_samples_path = "/Users/adam2392/Downloads/StageIVandControls.Train.samples.txt"
ids = pd.read_csv(train_samples_path, sep=" ", header=None)

print(ids)

          0                 1
0     S0035   INDI_509_PLS_1A
1     S0181  INDIA_2884_PLS_1
2    S0134B  INDIA_2650_PLS_1
3     S0149  INDIA_3033_PLS_1
4     S0185  INDIA_2932_PLS_1
..      ...               ...
347   S0287  INDIA_4770_PLS_1
348   S0287  INDIA_4758_PLS_1
349   S0282  INDIA_3156_PLS_1
350   S0283  INDIA_3187_PLS_1
351   S0281  INDIA_3134_PLS_1

[352 rows x 2 columns]


In [6]:
# Keep only the rows whose "Sample" identifier appears in the second column
# of the training-sample list.
in_training_list = big_data["Sample"].isin(ids[1])
big_data = big_data.loc[in_training_list]

display(big_data.head())
print(big_data.shape)

Unnamed: 0,Run,Sample,Library,Cancer Status,Stage,Tumor type,1:1-5000000,1:5000001-10000000,1:10000001-15000000,1:15000001-20000000,...,22:5000001-10000000,22:10000001-15000000,22:15000001-20000000,22:20000001-25000000,22:25000001-30000000,22:30000001-35000000,22:35000001-40000000,22:40000001-45000000,22:45000001-50000000,22:50000001-55000000
1,S0028,INDI_980_PLS_1,A387-13,1,IV,Stomach,0.032226,-0.256921,-3.562167,0.0,...,0.0,0.0,0.0,0.0,-1.028307,-2.176704,0.298438,-1.45108,0.951544,1.574838
2,S0034,INDI_580_PLS_1A,A396-13,1,IV,Colorectal,-2.034504,-3.245896,-3.236149,0.0,...,0.0,0.0,0.0,0.0,-0.654188,-0.510688,-0.330978,-0.202724,0.116389,0.324193
3,S0034,INDI_730_PLS_1A,A396-04,1,IV,Pancreas,-3.441775,0.597126,-3.079712,0.0,...,0.0,0.0,0.0,0.0,-1.02177,-1.960194,-1.50984,-1.048233,2.889281,-2.051596
4,S0034,INDI_481_PLS_1A,A396-10,1,IV,Liver,0.394118,-0.545853,-2.749696,0.0,...,0.0,0.0,0.0,0.0,-2.826012,-2.06629,-0.264483,-1.442093,0.152686,0.433958
5,S0034,INDI_193_PLS_1A,A396-07,1,IV,Esophagus,-1.15697,-0.912828,-1.998913,0.0,...,0.0,0.0,0.0,0.0,-2.64252,-1.311603,-0.015934,-0.810916,0.127616,2.049014


(352, 595)


In [7]:
# The leading columns are per-sample metadata (Run, Sample, Library,
# Cancer Status, Stage, Tumor type in the displayed header); everything from
# position 6 onwards is a genomic-bin feature column.
n_metadata_columns = 6
big_data_buff = big_data.iloc[:, n_metadata_columns:]

print(big_data_buff.shape)

(352, 589)


In [8]:
# Binary target: 1 = "Cancer", 0 = "Healthy" (encoded when the data was loaded).
# Cast explicitly to an integer dtype instead of relying on pandas silently
# downcasting the object column produced by `replace`; that implicit
# downcast is deprecated, and an object-typed target may be rejected by
# classifier target checks. The cast raises if any label was not encoded
# as 0/1, which surfaces unexpected statuses early.
y = big_data["Cancer Status"].astype(int)
print(y.shape)

(352,)


In [9]:
# Sanity check: only the two encoded classes (0 = Healthy, 1 = Cancer)
# should be present in the target.
class_labels = np.unique(y)
print(class_labels)

[0 1]


In [10]:
# Build a two-view design matrix by placing the feature matrix next to an
# exact copy of itself (columns are concatenated, rows are unchanged).
single_view = big_data_buff.to_numpy()
X = np.concatenate([single_view, single_view], axis=1)

print(X.shape)

(352, 1178)


In [11]:
# Number of columns in one feature set (one copy of the feature matrix).
n_features = big_data_buff.shape[1]

# Column index at which each feature set ends: the first copy stops at
# n_features, the second at the total width of X.
feature_set_ends = [n_features, X.shape[1]]
print(feature_set_ends)

[589, 1178]


In [12]:
# Base learner: a multi-view tree that is told where each feature set ends.
multiview_tree = MultiViewDecisionTreeClassifier(
    feature_set_ends=feature_set_ends,
    max_features=1.0,
    apply_max_features_per_feature_set=False,
)

# Honest forest built from the multi-view tree; its random state is offset
# from the notebook seed so it differs from the one used by the test wrapper.
honest_forest = HonestForestClassifier(
    tree_estimator=multiview_tree,
    n_estimators=500,
    honest_fraction=0.5,
    n_jobs=1,
    random_state=seed + 2,
)

# Hypothesis-test wrapper around the forest.
clf = FeatureImportanceForestClassifier(
    estimator=honest_forest,
    test_size=0.2,
    random_state=seed + 1,
)

# MIGHT: test using all columns with the mutual-information metric. The
# features carry information about y, so the null is expected to be rejected
# (small p-value).
stat, pvalue = clf.test(X, y, metric="mi")
print(pvalue)

# Reset the wrapper before running a second test on the same object
# (presumably clears state cached by the first test -- TODO confirm).
clf.reset()

# CoMIGHT: same test, but with covariate_index restricted to the columns of
# the first feature set. It is expected to fail to reject the null (large
# p-value).
# NOTE(review): the original comment attributed this to the information being
# "entirely contained in the first feature set"; since X is two identical
# copies of the same features, the likelier reason is that the second copy
# still carries all the information -- confirm against treeple's
# covariate_index semantics.
first_set_columns = np.arange(n_features)
stat, pvalue = clf.test(X, y, covariate_index=first_set_columns, metric="mi")
print(pvalue)

0.000999000999000999
0.9210789210789211
