# Load Dependencies and Raw Data

In [1]:
import pandas as pd
import numpy as np
import dill, pickle
import copy

from collections import Counter
import itertools
from scipy import stats

import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import FactorAnalysis

from skrebate import ReliefF, MultiSURF, MultiSURFstar
from sklearn.feature_selection import f_classif
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from sklearn import metrics
from sklearn.metrics import adjusted_rand_score, rand_score
from sklearn.metrics.cluster import pair_confusion_matrix

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

In [2]:
data = pd.read_excel('../data/GC-MS_data.xlsx')

In [3]:
# counts

In [4]:
ID_num = np.where(data.ID == 'Healthy', 0, 1)

In [5]:
data.insert(1, 'ID_num', ID_num)

# User Defined Functions

In [6]:
def do_factor_analysis(dataset):
    fa = FactorAnalysis().fit(dataset)
    return fa.mean_, fa.get_covariance()

In [7]:
def bhatt_dist(m1,cov1,m2,cov2):
    cov = (1/2) * (cov1 + cov2)
    
    Term1 = (1/8) * (m1 - m2).T @ np.linalg.inv(cov) @ (m1 - m2)
    Term2 = (1 / 2) * np.log(np.linalg.det(cov) / np.sqrt(np.linalg.det(cov1) * np.linalg.det(cov2)))
    
    return Term1+Term2, Term1, Term2
    

# Full Dataset

We need a metric that is better than randIndex. randIndex is label agnostic. in otherwords, if 2 instances for the negative class are clustered together it is a positive outcome for randIndex even if they are clustered in the same cluster as the positive instances.

Need to try out log transorfm of the data

In [8]:
data.head()

Unnamed: 0,ID,ID_num,M151T1,M83T1,M101T1,M46T1,M49T1,M66T1,M62T1,M80T1,...,M110T26_1,M121T26_1,M175T26_1,M124T26_1,M138T26_2,M85T26_2,M174T26,M123T26_1,M91T26,M94T26
0,Healthy,0,16.048388,6064.292377,781.993514,802.020931,4694.204132,72927.42,29491.232768,16.024791,...,7867.025232,8965.565729,3012.05177,4658.714325,2340.026084,12912.976187,1397.742432,16170.724464,10359.088145,4524.513159
1,Healthy,0,2966.261917,134774.774889,62570.597315,18605.033364,23763.284044,6268676.0,392830.377721,5189.617648,...,278936.880991,323372.411031,79971.903683,215630.572047,79703.41754,524443.694141,58370.37003,714891.705277,330847.552709,229032.80863
2,Healthy,0,8377.442132,54888.286191,7574.412552,24452.053496,8865.881271,812281.6,470305.922774,2883.659359,...,330297.442948,400547.227331,89001.852491,237332.546382,92068.988978,281554.565002,73030.972156,942769.300372,653550.454878,189904.532666
3,Healthy,0,37.609095,10440.984838,26.750624,88.92319,1827.264205,44911.65,8166.159143,1374.437507,...,1090.182759,998.434986,220.276019,648.220851,375.745446,1799.311194,106.666198,1713.346412,2279.341711,655.383157
4,Healthy,0,115.438628,7287.725513,112.578791,28.656514,5843.718655,157313.2,21056.733832,3441.212011,...,9453.592618,17333.718556,1887.790859,8186.847903,3752.941527,10702.205663,2548.9401,33661.377548,16464.790716,7288.223731


In [9]:
Counter(data.ID)

Counter({'Healthy': 29, 'Asthmatic': 14})

In [10]:
data.shape

(43, 2736)

# Scale the data


In [11]:
X_raw_all = data.values[:,2:]
X_scaled_all = StandardScaler().fit_transform(X_raw_all)

In [12]:
data_scaled_all = pd.DataFrame(X_scaled_all, columns = data.columns[2:])

In [13]:
data_scaled_all.insert(0, 'ID', data.ID.values)
data_scaled_all.insert(1, 'ID_num', data.ID_num.values)

In [14]:
data_scaled_all.head()

Unnamed: 0,ID,ID_num,M151T1,M83T1,M101T1,M46T1,M49T1,M66T1,M62T1,M80T1,...,M110T26_1,M121T26_1,M175T26_1,M124T26_1,M138T26_2,M85T26_2,M174T26,M123T26_1,M91T26,M94T26
0,Healthy,0,-0.268167,-0.173282,-0.468532,-0.689375,-0.545787,-0.619796,-0.409468,-0.387656,...,-0.564269,-0.599296,-0.417554,-0.599171,-0.582968,-0.536859,-0.421735,-0.635161,-0.577705,-0.546579
1,Healthy,0,-0.263205,-0.105216,0.307663,0.251702,-0.410207,0.588939,-0.072891,-0.365801,...,1.270077,1.268182,1.298543,1.453292,1.318163,2.372777,1.157498,1.002377,0.508829,1.744742
2,Healthy,0,-0.254105,-0.147462,-0.383205,0.560778,-0.516126,-0.475555,-0.001122,-0.375542,...,1.617637,1.726576,1.499898,1.664422,1.622035,0.9912,1.563878,1.536436,1.60287,1.345401
3,Healthy,0,-0.268131,-0.170967,-0.47802,-0.72707,-0.56617,-0.625262,-0.429223,-0.381917,...,-0.610128,-0.646618,-0.479807,-0.638188,-0.631238,-0.600075,-0.457523,-0.669043,-0.605098,-0.586067
4,Healthy,0,-0.268,-0.172635,-0.476942,-0.730256,-0.537614,-0.603333,-0.417282,-0.373187,...,-0.553533,-0.549592,-0.442624,-0.564847,-0.548247,-0.549434,-0.389825,-0.594169,-0.557005,-0.518373


In [15]:
data_healthy_all_df = data_scaled_all.loc[data_scaled_all.ID == 'Healthy']
data_asthma_all_df = data_scaled_all.loc[data_scaled_all.ID == 'Asthmatic']

In [16]:
data_healthy_all = data_healthy_all_df.values[:,2:]
data_asthma_all = data_asthma_all_df.values[:, 2:]

In [17]:
healthy_mean, healthy_cov = do_factor_analysis(data_healthy_all)

In [18]:
asthma_mean, asthma_cov = do_factor_analysis(data_asthma_all)

In [19]:
dist, t1, t2 = bhatt_dist(healthy_mean, healthy_cov, asthma_mean, asthma_cov)
print(dist)
print(np.linalg.det(healthy_cov))
print(np.linalg.det(asthma_cov))

  """


nan
0.0
0.0


# Relief Methods

In [20]:
data_scaled_df = data_scaled_all

In [21]:
data_scaled_df.head()

Unnamed: 0,ID,ID_num,M151T1,M83T1,M101T1,M46T1,M49T1,M66T1,M62T1,M80T1,...,M110T26_1,M121T26_1,M175T26_1,M124T26_1,M138T26_2,M85T26_2,M174T26,M123T26_1,M91T26,M94T26
0,Healthy,0,-0.268167,-0.173282,-0.468532,-0.689375,-0.545787,-0.619796,-0.409468,-0.387656,...,-0.564269,-0.599296,-0.417554,-0.599171,-0.582968,-0.536859,-0.421735,-0.635161,-0.577705,-0.546579
1,Healthy,0,-0.263205,-0.105216,0.307663,0.251702,-0.410207,0.588939,-0.072891,-0.365801,...,1.270077,1.268182,1.298543,1.453292,1.318163,2.372777,1.157498,1.002377,0.508829,1.744742
2,Healthy,0,-0.254105,-0.147462,-0.383205,0.560778,-0.516126,-0.475555,-0.001122,-0.375542,...,1.617637,1.726576,1.499898,1.664422,1.622035,0.9912,1.563878,1.536436,1.60287,1.345401
3,Healthy,0,-0.268131,-0.170967,-0.47802,-0.72707,-0.56617,-0.625262,-0.429223,-0.381917,...,-0.610128,-0.646618,-0.479807,-0.638188,-0.631238,-0.600075,-0.457523,-0.669043,-0.605098,-0.586067
4,Healthy,0,-0.268,-0.172635,-0.476942,-0.730256,-0.537614,-0.603333,-0.417282,-0.373187,...,-0.553533,-0.549592,-0.442624,-0.564847,-0.548247,-0.549434,-0.389825,-0.594169,-0.557005,-0.518373


## Relief-F

In [22]:
fs = ReliefF(discrete_threshold = 5, n_jobs=1)
fs.fit(data_scaled_df.values[:,2:].astype(float), data_scaled_df.ID_num.values)

feature_scores = fs.feature_importances_
feature_ids = np.where(feature_scores>=0)[0]
selected_features = np.array(data_scaled_df.columns[2:][feature_ids]) 

In [23]:
X_reliefF = data_scaled_df.values[:,2:][:,feature_ids]

In [24]:
X_reliefF.shape

(43, 262)

In [25]:
X_reliefF_df = pd.DataFrame(X_reliefF, columns = selected_features)
X_reliefF_df.insert(0, 'ID', data.ID.values)

In [26]:
X_reliefF_df.head()

Unnamed: 0,ID,M151T1,M83T1,M101T1,M46T1,M51T1,M50T1,M69T1,M41T2_1,M56T2_1,...,M179T17_1,M74T17_2,M45T17,M154T17_4,M141T19_1,M166T19,M103T19,M123T23_2,M160T24_1,M118T25_1
0,Healthy,-0.268167,-0.173282,-0.468532,-0.689375,-0.397119,-0.504227,-0.259295,-0.515416,-0.806464,...,-0.684569,-0.678013,-0.65555,-0.617682,-0.653828,-0.724431,-0.63664,-0.769704,-0.641001,-0.777737
1,Healthy,-0.263205,-0.105216,0.307663,0.251702,-0.0326679,0.403328,0.19694,2.10068,0.829804,...,4.54669,4.13252,4.13552,5.08838,0.706481,-0.00984325,-0.535609,-0.0779283,0.488144,1.91587
2,Healthy,-0.254105,-0.147462,-0.383205,0.560778,0.117618,-0.138625,0.167859,1.00659,0.757935,...,1.47263,-0.631554,-0.678092,-0.557608,0.173148,0.175616,-0.275008,-0.0775667,0.268835,0.782965
3,Healthy,-0.268131,-0.170967,-0.47802,-0.72707,-0.393612,-0.555889,-0.266302,-0.52864,-0.842443,...,-0.596558,-0.70113,-0.707574,-0.620823,-0.6057,-0.854671,-0.661433,-0.830507,-0.811968,-0.801357
4,Healthy,-0.268,-0.172635,-0.476942,-0.730256,-0.384417,-0.485635,-0.266156,-0.461565,-0.811969,...,-0.49071,-0.554093,-0.596447,-0.620428,-0.595307,-0.802005,-0.656975,-0.806074,-0.699333,-0.768384


In [27]:
data_healthy_df = X_reliefF_df.loc[X_reliefF_df.ID == 'Healthy']
data_asthma_df = X_reliefF_df.loc[X_reliefF_df.ID == 'Asthmatic']

In [28]:
data_healthy = data_healthy_df.values[:,1:]
data_asthma = data_asthma_df.values[:, 1:]

In [29]:
healthy_mean, healthy_cov = do_factor_analysis(data_healthy)

In [30]:
asthma_mean, asthma_cov = do_factor_analysis(data_asthma)

In [31]:
dist, t1, t2 = bhatt_dist(healthy_mean, healthy_cov, asthma_mean, asthma_cov)

  """


In [32]:
print(dist)
print(np.linalg.det(healthy_cov))
print(np.linalg.det(asthma_cov))

nan
0.0
0.0


## MultiSURF

In [34]:
fs = MultiSURF(discrete_threshold = 5, n_jobs=1)
fs.fit(data_scaled_df.values[:,2:].astype(float), data_scaled_df.ID_num.values)

feature_scores = fs.feature_importances_
feature_ids = np.where(feature_scores>=0)[0]
selected_features = np.array(data_scaled_df.columns[2:][feature_ids]) 

In [35]:
X_MultiSURF = data_scaled_df.values[:,2:][:,feature_ids]

In [36]:
X_MultiSURF.shape

(43, 896)

In [37]:
X_MultiSURF_df = pd.DataFrame(X_MultiSURF, columns = selected_features)
X_MultiSURF_df.insert(0, 'ID', data.ID.values)

In [38]:
X_MultiSURF_df.head()

Unnamed: 0,ID,M151T1,M83T1,M101T1,M46T1,M49T1,M66T1,M62T1,M135T1,M103T1,...,M125T26_3,M110T26_1,M121T26_1,M175T26_1,M124T26_1,M138T26_2,M85T26_2,M123T26_1,M91T26,M94T26
0,Healthy,-0.268167,-0.173282,-0.468532,-0.689375,-0.545787,-0.619796,-0.409468,-0.781463,-0.508837,...,-0.61655,-0.564269,-0.599296,-0.417554,-0.599171,-0.582968,-0.536859,-0.635161,-0.577705,-0.546579
1,Healthy,-0.263205,-0.105216,0.307663,0.251702,-0.410207,0.588939,-0.0728911,1.35577,3.17152,...,1.37484,1.27008,1.26818,1.29854,1.45329,1.31816,2.37278,1.00238,0.508829,1.74474
2,Healthy,-0.254105,-0.147462,-0.383205,0.560778,-0.516126,-0.475555,-0.00112197,1.5537,3.29832,...,1.63278,1.61764,1.72658,1.4999,1.66442,1.62203,0.9912,1.53644,1.60287,1.3454
3,Healthy,-0.268131,-0.170967,-0.47802,-0.72707,-0.56617,-0.625262,-0.429223,-0.738069,-0.540948,...,-0.645406,-0.610128,-0.646618,-0.479807,-0.638188,-0.631238,-0.600075,-0.669043,-0.605098,-0.586067
4,Healthy,-0.268,-0.172635,-0.476942,-0.730256,-0.537614,-0.603333,-0.417282,-0.792615,-0.470683,...,-0.538728,-0.553533,-0.549592,-0.442624,-0.564847,-0.548247,-0.549434,-0.594169,-0.557005,-0.518373


In [39]:
data_healthy_df = X_MultiSURF_df.loc[X_reliefF_df.ID == 'Healthy']
data_asthma_df = X_MultiSURF_df.loc[X_reliefF_df.ID == 'Asthmatic']

In [40]:
data_healthy = data_healthy_df.values[:,1:]
data_asthma = data_asthma_df.values[:, 1:]

In [41]:
healthy_mean, healthy_cov = do_factor_analysis(data_healthy)

In [42]:
asthma_mean, asthma_cov = do_factor_analysis(data_asthma)

In [43]:
dist, t1, t2 = bhatt_dist(healthy_mean, healthy_cov, asthma_mean, asthma_cov)

  """


In [44]:
print(dist)
print(np.linalg.det(healthy_cov))
print(np.linalg.det(asthma_cov))

nan
0.0
0.0


## MultiSURFStar

In [45]:
fs = MultiSURFstar(discrete_threshold = 5, n_jobs=1)
fs.fit(data_scaled_df.values[:,2:].astype(float), data_scaled_df.ID_num.values)

feature_scores = fs.feature_importances_
feature_ids = np.where(feature_scores>=0)[0]
selected_features = np.array(data_scaled_df.columns[2:][feature_ids]) 

In [46]:
X_MultiSURFStar = data_scaled_df.values[:,2:][:,feature_ids]

In [47]:
X_MultiSURFStar.shape

(43, 5)

In [48]:
X_MultiSURFStar_df = pd.DataFrame(X_MultiSURFStar, columns = selected_features)
X_MultiSURFStar_df.insert(0, 'ID', data.ID.values)

In [49]:
X_MultiSURFStar_df.head()

Unnamed: 0,ID,M98T3_1,M68T5,M70T6_1,M91T6,M109T12_2
0,Healthy,-0.698793,-0.627699,-0.677485,-0.582871,-0.59916
1,Healthy,1.72997,2.95831,0.776644,2.08052,2.50791
2,Healthy,1.70043,1.88355,1.79879,1.91838,0.766022
3,Healthy,-0.413427,-0.586846,-0.989364,-0.662185,-0.811215
4,Healthy,-0.720106,-0.620321,-0.959529,-0.637464,-0.789324


In [50]:
data_healthy_df = X_MultiSURFStar_df.loc[X_MultiSURFStar_df.ID == 'Healthy']
data_asthma_df = X_MultiSURFStar_df.loc[X_MultiSURFStar_df.ID == 'Asthmatic']

In [51]:
data_healthy = data_healthy_df.values[:,1:]
data_asthma = data_asthma_df.values[:, 1:]

In [52]:
healthy_mean, healthy_cov = do_factor_analysis(data_healthy)

In [53]:
asthma_mean, asthma_cov = do_factor_analysis(data_asthma)

In [54]:
dist, t1, t2 = bhatt_dist(healthy_mean, healthy_cov, asthma_mean, asthma_cov)

In [55]:
print(dist)
print(np.linalg.det(healthy_cov))
print(np.linalg.det(asthma_cov))

0.8781896505522983
0.10470358387067949
0.0049995991493347255


# Univariate Statistical Feature Selection

## Anova

In [56]:
f,p = f_classif(data_scaled_df.values[:,2:].astype(float), data.ID_num.values)
feature_ids = np.where(p<=0.05)[0]


selected_features = np.array(data_scaled_df.columns[2:][feature_ids]) 

X_anova = data_scaled_df.values[:,2:][:,feature_ids]

In [57]:
X_anova.shape

(43, 34)

In [58]:
X_anova_df = pd.DataFrame(X_anova, columns = selected_features)
X_anova_df.insert(0, 'ID', data.ID.values)

In [59]:
X_anova_df.head()

Unnamed: 0,ID,M151T1,M67T2_1,M90T2_2,M72T2_2,M82T2_2,M95T2,M91T2_1,M132T2_1,M97T3_1,...,M69T10_3,M94T11_1,M93T11_1,M39T11,M66T11_2,M62T11_1,M38T11,M40T11,M90T12,M110T12_2
0,Healthy,-0.268167,-0.459512,-0.482348,-0.645626,-0.855105,-0.942973,-0.689529,-0.750814,-0.0778075,...,-0.6107,-0.584152,-0.462755,-0.493958,-0.520622,-0.632254,-0.563194,-0.505035,-0.381817,-0.756397
1,Healthy,-0.263205,0.597175,1.88202,-0.944398,2.18404,0.444226,1.51344,1.51043,0.299322,...,1.28865,-0.550068,-0.581811,-0.176697,-0.484444,0.580592,-0.387506,-0.241508,3.28594,2.78272
2,Healthy,-0.254105,2.10178,1.31152,-0.148172,2.09556,2.53259,2.04408,1.88436,0.191514,...,1.76041,-0.549622,-0.538452,-0.280781,-0.498571,-0.193781,-0.0809946,-0.0698067,0.947865,0.878527
3,Healthy,-0.268131,-0.468733,-0.572338,-1.25737,-0.917285,-1.02991,-0.750121,-0.773842,0.131019,...,-0.621618,-0.723417,-0.531289,-0.640424,-0.68664,-0.738926,-0.57296,-0.581728,-0.697853,-0.536689
4,Healthy,-0.268,-0.456096,-0.55483,-1.28042,-1.30846,-1.04277,-0.768133,-0.799584,0.137262,...,-0.629418,-0.759001,-0.606578,-0.669164,-0.717645,-0.778315,-0.618214,-0.61991,-0.586106,-0.775964


In [60]:
data_healthy_df = X_anova_df.loc[X_anova_df.ID == 'Healthy']
data_asthma_df = X_anova_df.loc[X_anova_df.ID == 'Asthmatic']

In [61]:
data_healthy = data_healthy_df.values[:,1:]
data_asthma = data_asthma_df.values[:, 1:]

In [62]:
healthy_mean, healthy_cov = do_factor_analysis(data_healthy)

In [63]:
asthma_mean, asthma_cov = do_factor_analysis(data_asthma)

In [64]:
dist, t1, t2 = bhatt_dist(healthy_mean, healthy_cov, asthma_mean, asthma_cov)

  """


In [66]:
print(dist)
print(np.linalg.det(healthy_cov))
print(np.linalg.det(asthma_cov))

inf
1.2922052962755676e-110
2.6274739360408576e-256


# Combinations

## Anova + Relief-F

In [67]:
X_anova.shape

(43, 34)

In [68]:
fs = ReliefF(discrete_threshold = 5, n_jobs=1)
fs.fit(X_anova.astype(float), data.ID_num.values)

feature_scores = fs.feature_importances_
feature_ids = np.where(feature_scores>=0)[0]
selected_features = selected_features[feature_ids]

In [69]:
X_ano_reliefF = X_anova[:,feature_ids]

In [70]:
X_ano_reliefF.shape

(43, 26)

In [71]:
X_anova_relief_df = pd.DataFrame(X_ano_reliefF, columns = selected_features)
X_anova_relief_df.insert(0, 'ID', data.ID.values)

In [72]:
X_anova_relief_df.head()

Unnamed: 0,ID,M151T1,M72T2_2,M82T2_2,M95T2,M91T2_1,M132T2_1,M97T3_1,M77T3_1,M84T4_1,...,M108T7,M94T11_1,M93T11_1,M39T11,M66T11_2,M62T11_1,M38T11,M40T11,M90T12,M110T12_2
0,Healthy,-0.268167,-0.645626,-0.855105,-0.942973,-0.689529,-0.750814,-0.0778075,-0.655164,-0.874023,...,-0.541696,-0.584152,-0.462755,-0.493958,-0.520622,-0.632254,-0.563194,-0.505035,-0.381817,-0.756397
1,Healthy,-0.263205,-0.944398,2.18404,0.444226,1.51344,1.51043,0.299322,1.38727,0.839667,...,1.5753,-0.550068,-0.581811,-0.176697,-0.484444,0.580592,-0.387506,-0.241508,3.28594,2.78272
2,Healthy,-0.254105,-0.148172,2.09556,2.53259,2.04408,1.88436,0.191514,0.633136,1.78128,...,0.811305,-0.549622,-0.538452,-0.280781,-0.498571,-0.193781,-0.0809946,-0.0698067,0.947865,0.878527
3,Healthy,-0.268131,-1.25737,-0.917285,-1.02991,-0.750121,-0.773842,0.131019,-0.653796,-1.02205,...,-0.579382,-0.723417,-0.531289,-0.640424,-0.68664,-0.738926,-0.57296,-0.581728,-0.697853,-0.536689
4,Healthy,-0.268,-1.28042,-1.30846,-1.04277,-0.768133,-0.799584,0.137262,-0.570038,-1.03825,...,-0.556233,-0.759001,-0.606578,-0.669164,-0.717645,-0.778315,-0.618214,-0.61991,-0.586106,-0.775964


In [73]:
data_healthy_df = X_anova_relief_df.loc[X_anova_relief_df.ID == 'Healthy']
data_asthma_df = X_anova_relief_df.loc[X_anova_relief_df.ID == 'Asthmatic']

In [74]:
data_healthy = data_healthy_df.values[:,1:]
data_asthma = data_asthma_df.values[:, 1:]

In [75]:
healthy_mean, healthy_cov = do_factor_analysis(data_healthy)

In [76]:
asthma_mean, asthma_cov = do_factor_analysis(data_asthma)

In [77]:
dist, t1, t2 = bhatt_dist(healthy_mean, healthy_cov, asthma_mean, asthma_cov)

In [78]:
print(dist)
print(np.linalg.det(healthy_cov))
print(np.linalg.det(asthma_cov))

87.19351047162358
2.3008692548762507e-13
6.307301839056216e-162


# Dataset with linearly correlated features removed

In [21]:
with open('../data/independent_features.pik', "rb") as f:
    independent_features = dill.load(f)

In [22]:
X_no_corr_df = independent_features['X_no_corr_df']

In [23]:
X_no_corr_df.shape

(43, 681)

In [24]:
X_no_corr_df.head()

Unnamed: 0,M151T1,M83T1,M101T1,M46T1,M49T1,M66T1,M62T1,M80T1,M137T1,M135T1,...,M177T24,M179T24_1,M160T24_1,M118T25_1,M149T25_1,M133T25_1,M77T25_3,M107T25,M136T26_1,M135T26_1
0,16.048388,6064.292377,781.993514,802.020931,4694.204132,72927.42,29491.232768,16.024791,1710.311517,610.786947,...,8763.22609,1955.094864,45189.93253,1454.883633,8600.367206,2515.842097,5009.445544,14497.249904,945.040761,6862.351212
1,2966.261917,134774.774889,62570.597315,18605.033364,23763.284044,6268676.0,392830.377721,5189.617648,1061.179026,63189.361143,...,83990.884777,25405.911733,336358.77764,130370.652612,93902.604529,717966.648513,294883.093401,350356.949201,104412.53558,204595.79353
2,8377.442132,54888.286191,7574.412552,24452.053496,8865.881271,812281.6,470305.922774,2883.659359,4927.06092,68984.802047,...,167842.847318,45217.682193,279806.422505,76149.86315,114093.631133,122272.569502,339934.15342,260890.313895,38464.014754,430983.52886
3,37.609095,10440.984838,26.750624,88.92319,1827.264205,44911.65,8166.159143,1374.437507,732.478087,1881.366829,...,104.323716,68.874332,1103.340199,324.393379,193.587466,1167.192877,1612.488181,911.409782,438.807757,5493.967431
4,115.438628,7287.725513,112.578791,28.656514,5843.718655,157313.2,21056.733832,3441.212011,170.831961,284.265511,...,8217.623859,1813.530773,30148.132215,1902.473403,6568.37511,4522.037574,8474.697967,12300.660757,2305.994805,15030.665476


In [25]:
X_scaled = StandardScaler().fit_transform(X_no_corr_df.values)

In [26]:
data_scaled_df = pd.DataFrame(X_scaled, columns = X_no_corr_df.columns)

In [27]:
data_scaled_df.insert(0, 'ID', data.ID.values)
data_scaled_df.insert(1, 'ID_num', data.ID_num.values)

In [28]:
data_scaled_df.head()

Unnamed: 0,ID,ID_num,M151T1,M83T1,M101T1,M46T1,M49T1,M66T1,M62T1,M80T1,...,M177T24,M179T24_1,M160T24_1,M118T25_1,M149T25_1,M133T25_1,M77T25_3,M107T25,M136T26_1,M135T26_1
0,Healthy,0,-0.268167,-0.173282,-0.468532,-0.689375,-0.545787,-0.619796,-0.409468,-0.387656,...,-0.59377,-0.6371,-0.641001,-0.777737,-0.563426,-0.509932,-0.449127,-0.562903,-0.617659,-0.597298
1,Healthy,0,-0.263205,-0.105216,0.307663,0.251702,-0.410207,0.588939,-0.072891,-0.365801,...,0.031987,0.383918,0.488144,1.915872,0.384146,4.395997,0.059717,1.989039,1.890985,0.252684
2,Healthy,0,-0.254105,-0.147462,-0.383205,0.560778,-0.516126,-0.475555,-0.001122,-0.375542,...,0.729482,1.246497,0.268835,0.782965,0.608436,0.311254,0.138799,1.30925,0.292015,1.225839
3,Healthy,0,-0.268131,-0.170967,-0.47802,-0.72707,-0.56617,-0.625262,-0.429223,-0.381917,...,-0.665796,-0.719223,-0.811968,-0.801357,-0.656812,-0.51918,-0.45509,-0.666131,-0.629933,-0.60318
4,Healthy,0,-0.268,-0.172635,-0.476942,-0.730256,-0.537614,-0.603333,-0.417282,-0.373187,...,-0.598309,-0.643263,-0.699333,-0.768384,-0.585998,-0.496175,-0.443044,-0.579593,-0.584662,-0.562185


In [29]:
data_healthy_df = data_scaled_df.loc[data_scaled_df.ID == 'Healthy']
data_asthma_df = data_scaled_df.loc[data_scaled_df.ID == 'Asthmatic']

In [30]:
data_healthy = data_healthy_df.values[:,2:]
data_asthma = data_asthma_df.values[:, 2:]

In [31]:
healthy_mean, healthy_cov = do_factor_analysis(data_healthy)

In [32]:
asthma_mean, asthma_cov = do_factor_analysis(data_asthma)

In [34]:
dist, t1, t2 = bhatt_dist(healthy_mean, healthy_cov, asthma_mean, asthma_cov)

  """


In [35]:
dist

nan

# Relief Methods

In [36]:
data_scaled_df.head()

Unnamed: 0,ID,ID_num,M151T1,M83T1,M101T1,M46T1,M49T1,M66T1,M62T1,M80T1,...,M177T24,M179T24_1,M160T24_1,M118T25_1,M149T25_1,M133T25_1,M77T25_3,M107T25,M136T26_1,M135T26_1
0,Healthy,0,-0.268167,-0.173282,-0.468532,-0.689375,-0.545787,-0.619796,-0.409468,-0.387656,...,-0.59377,-0.6371,-0.641001,-0.777737,-0.563426,-0.509932,-0.449127,-0.562903,-0.617659,-0.597298
1,Healthy,0,-0.263205,-0.105216,0.307663,0.251702,-0.410207,0.588939,-0.072891,-0.365801,...,0.031987,0.383918,0.488144,1.915872,0.384146,4.395997,0.059717,1.989039,1.890985,0.252684
2,Healthy,0,-0.254105,-0.147462,-0.383205,0.560778,-0.516126,-0.475555,-0.001122,-0.375542,...,0.729482,1.246497,0.268835,0.782965,0.608436,0.311254,0.138799,1.30925,0.292015,1.225839
3,Healthy,0,-0.268131,-0.170967,-0.47802,-0.72707,-0.56617,-0.625262,-0.429223,-0.381917,...,-0.665796,-0.719223,-0.811968,-0.801357,-0.656812,-0.51918,-0.45509,-0.666131,-0.629933,-0.60318
4,Healthy,0,-0.268,-0.172635,-0.476942,-0.730256,-0.537614,-0.603333,-0.417282,-0.373187,...,-0.598309,-0.643263,-0.699333,-0.768384,-0.585998,-0.496175,-0.443044,-0.579593,-0.584662,-0.562185


## Relief-F

In [76]:
fs = ReliefF(discrete_threshold = 5, n_jobs=1)
fs.fit(data_scaled_df.values[:,2:].astype(float), data_scaled_df.ID_num.values)

feature_scores = fs.feature_importances_
feature_ids = np.where(feature_scores>=0)[0]
selected_features = np.array(X_no_corr_df.columns[feature_ids]) 

In [77]:
X_reliefF = data_scaled_df.values[:,2:][:,feature_ids]

In [78]:
X_reliefF.shape

(43, 179)

In [79]:
X_reliefF_df = pd.DataFrame(X_reliefF, columns = selected_features)
X_reliefF_df.insert(0, 'ID', data.ID.values)

In [80]:
X_reliefF_df.head()

Unnamed: 0,ID,M151T1,M83T1,M101T1,M46T1,M51T1,M50T1,M69T1,M41T2_1,M56T2_1,...,M86T17_2,M99T17_2,M179T17_1,M154T17_4,M141T19_1,M166T19,M103T19,M123T23_2,M160T24_1,M118T25_1
0,Healthy,-0.268167,-0.173282,-0.468532,-0.689375,-0.397119,-0.504227,-0.259295,-0.515416,-0.806464,...,-0.448508,-0.545108,-0.684569,-0.617682,-0.653828,-0.724431,-0.63664,-0.769704,-0.641001,-0.777737
1,Healthy,-0.263205,-0.105216,0.307663,0.251702,-0.0326679,0.403328,0.19694,2.10068,0.829804,...,2.20794,2.59511,4.54669,5.08838,0.706481,-0.00984325,-0.535609,-0.0779283,0.488144,1.91587
2,Healthy,-0.254105,-0.147462,-0.383205,0.560778,0.117618,-0.138625,0.167859,1.00659,0.757935,...,-0.507757,0.502449,1.47263,-0.557608,0.173148,0.175616,-0.275008,-0.0775667,0.268835,0.782965
3,Healthy,-0.268131,-0.170967,-0.47802,-0.72707,-0.393612,-0.555889,-0.266302,-0.52864,-0.842443,...,-0.682612,-0.695689,-0.596558,-0.620823,-0.6057,-0.854671,-0.661433,-0.830507,-0.811968,-0.801357
4,Healthy,-0.268,-0.172635,-0.476942,-0.730256,-0.384417,-0.485635,-0.266156,-0.461565,-0.811969,...,-0.674445,-0.725629,-0.49071,-0.620428,-0.595307,-0.802005,-0.656975,-0.806074,-0.699333,-0.768384


In [81]:
data_healthy_df = X_reliefF_df.loc[X_reliefF_df.ID == 'Healthy']
data_asthma_df = X_reliefF_df.loc[X_reliefF_df.ID == 'Asthmatic']

In [82]:
data_healthy = data_healthy_df.values[:,1:]
data_asthma = data_asthma_df.values[:, 1:]

In [83]:
healthy_mean, healthy_cov = do_factor_analysis(data_healthy)

In [84]:
asthma_mean, asthma_cov = do_factor_analysis(data_asthma)

In [85]:
dist, t1, t2 = bhatt_dist(healthy_mean, healthy_cov, asthma_mean, asthma_cov)

  """


In [86]:
np.linalg.det(healthy_cov)

0.0

## MultiSURF

In [87]:
fs = MultiSURF(discrete_threshold = 5, n_jobs=1)
fs.fit(data_scaled_df.values[:,2:].astype(float), data_scaled_df.ID_num.values)

feature_scores = fs.feature_importances_
feature_ids = np.where(feature_scores>=0)[0]
selected_features = np.array(X_no_corr_df.columns[feature_ids]) 

In [88]:
X_MultiSURF = data_scaled_df.values[:,2:][:,feature_ids]

In [89]:
X_MultiSURF.shape

(43, 439)

In [90]:
X_MultiSURF_df = pd.DataFrame(X_MultiSURF, columns = selected_features)
X_MultiSURF_df.insert(0, 'ID', data.ID.values)

In [91]:
X_MultiSURF_df.head()

Unnamed: 0,ID,M151T1,M83T1,M101T1,M46T1,M49T1,M66T1,M137T1,M135T1,M113T1,...,M120T23_2,M173T24,M123T24,M179T24_1,M149T25_1,M133T25_1,M77T25_3,M107T25,M136T26_1,M135T26_1
0,Healthy,-0.268167,-0.173282,-0.468532,-0.689375,-0.545787,-0.619796,-0.516503,-0.781463,-0.49384,...,-0.3054,-0.700914,-0.672729,-0.6371,-0.563426,-0.509932,-0.449127,-0.562903,-0.617659,-0.597298
1,Healthy,-0.263205,-0.105216,0.307663,0.251702,-0.410207,0.588939,-0.567528,1.35577,1.59713,...,1.30333,0.416515,2.34364,0.383918,0.384146,4.396,0.0597169,1.98904,1.89098,0.252684
2,Healthy,-0.254105,-0.147462,-0.383205,0.560778,-0.516126,-0.475555,-0.26365,1.5537,2.8474,...,0.263764,0.441958,0.858506,1.2465,0.608436,0.311254,0.138799,1.30925,0.292015,1.22584
3,Healthy,-0.268131,-0.170967,-0.47802,-0.72707,-0.56617,-0.625262,-0.593365,-0.738069,-0.545022,...,-0.42372,-0.733728,-0.786336,-0.719223,-0.656812,-0.51918,-0.45509,-0.666131,-0.629933,-0.60318
4,Healthy,-0.268,-0.172635,-0.476942,-0.730256,-0.537614,-0.603333,-0.637514,-0.792615,-0.445029,...,-0.341951,-0.706817,-0.698413,-0.643263,-0.585998,-0.496175,-0.443044,-0.579593,-0.584662,-0.562185


In [92]:
data_healthy_df = X_MultiSURF_df.loc[X_reliefF_df.ID == 'Healthy']
data_asthma_df = X_MultiSURF_df.loc[X_reliefF_df.ID == 'Asthmatic']

In [93]:
data_healthy = data_healthy_df.values[:,1:]
data_asthma = data_asthma_df.values[:, 1:]

In [94]:
healthy_mean, healthy_cov = do_factor_analysis(data_healthy)

In [95]:
asthma_mean, asthma_cov = do_factor_analysis(data_asthma)

In [96]:
dist, t1, t2 = bhatt_dist(healthy_mean, healthy_cov, asthma_mean, asthma_cov)

  """


In [97]:
dist

nan

## MultiSURFStar

In [98]:
fs = MultiSURFstar(discrete_threshold = 5, n_jobs=1)
fs.fit(data_scaled_df.values[:,2:].astype(float), data_scaled_df.ID_num.values)

feature_scores = fs.feature_importances_
feature_ids = np.where(feature_scores>=0)[0]
selected_features = np.array(X_no_corr_df.columns[feature_ids]) 

In [99]:
X_MultiSURFStar = data_scaled_df.values[:,2:][:,feature_ids]

In [100]:
X_MultiSURFStar.shape

(43, 10)

In [101]:
X_MultiSURFStar_df = pd.DataFrame(X_MultiSURFStar, columns = selected_features)
X_MultiSURFStar_df.insert(0, 'ID', data.ID.values)

In [102]:
X_MultiSURFStar_df.head()

Unnamed: 0,ID,M98T3_1,M68T4,M84T5_4,M97T5_4,M68T5,M67T5_2,M70T6_1,M82T9_3,M82T9_1,M109T12_2
0,Healthy,-0.698793,-0.398447,-0.830918,-0.802285,-0.627699,-0.76541,-0.677485,-0.82505,-0.9305,-0.59916
1,Healthy,1.72997,2.44104,2.13364,2.40625,2.95831,1.79988,0.776644,2.27876,0.792997,2.50791
2,Healthy,1.70043,1.60877,1.38046,2.33114,1.88355,1.80583,1.79879,0.417901,0.490392,0.766022
3,Healthy,-0.413427,-0.67994,-0.828038,-0.820327,-0.586846,-0.843057,-0.989364,-0.930405,-0.932132,-0.811215
4,Healthy,-0.720106,-0.631427,-0.842626,-0.777552,-0.620321,-1.01032,-0.959529,-0.896358,-1.02564,-0.789324


In [103]:
data_healthy_df = X_MultiSURFStar_df.loc[X_MultiSURFStar_df.ID == 'Healthy']
data_asthma_df = X_MultiSURFStar_df.loc[X_MultiSURFStar_df.ID == 'Asthmatic']

In [104]:
data_healthy = data_healthy_df.values[:,1:]
data_asthma = data_asthma_df.values[:, 1:]

In [105]:
healthy_mean, healthy_cov = do_factor_analysis(data_healthy)

In [106]:
asthma_mean, asthma_cov = do_factor_analysis(data_asthma)

In [107]:
dist, t1, t2 = bhatt_dist(healthy_mean, healthy_cov, asthma_mean, asthma_cov)

In [108]:
dist

1.0077670608323313

# Univariate Statistical Feature Selection

## Anova

In [293]:
f,p = f_classif(data_scaled_df.values[:,2:].astype(float), data.ID_num.values)
feature_ids = np.where(p<=0.05)[0]


selected_features = np.array(X_no_corr_df.columns[feature_ids])

X_anova = data_scaled_df.values[:,2:][:,feature_ids]

In [294]:
X_anova.shape

(43, 23)

In [295]:
X_anova_df = pd.DataFrame(X_anova, columns = selected_features)
X_anova_df.insert(0, 'ID', data.ID.values)

In [296]:
X_anova_df.head()

Unnamed: 0,ID,M151T1,M67T2_1,M90T2_2,M72T2_2,M82T2_2,M95T2,M97T3_1,M77T3_1,M84T4_1,...,M127T6,M39T7_1,M63T7_2,M77T7_2,M108T7,M58T7,M94T11_1,M93T11_1,M38T11,M110T12_2
0,Healthy,-0.268167,-0.459512,-0.482348,-0.645626,-0.855105,-0.942973,-0.0778075,-0.655164,-0.874023,...,-0.562474,-0.536436,-0.620496,-0.549814,-0.541696,-0.532667,-0.584152,-0.462755,-0.563194,-0.756397
1,Healthy,-0.263205,0.597175,1.88202,-0.944398,2.18404,0.444226,0.299322,1.38727,0.839667,...,-0.209553,1.61849,2.13032,0.668414,1.5753,0.913865,-0.550068,-0.581811,-0.387506,2.78272
2,Healthy,-0.254105,2.10178,1.31152,-0.148172,2.09556,2.53259,0.191514,0.633136,1.78128,...,-0.391824,0.794053,1.26737,0.124668,0.811305,0.631648,-0.549622,-0.538452,-0.0809946,0.878527
3,Healthy,-0.268131,-0.468733,-0.572338,-1.25737,-0.917285,-1.02991,0.131019,-0.653796,-1.02205,...,-0.608561,-0.556864,-0.648659,-0.559228,-0.579382,-0.55209,-0.723417,-0.531289,-0.57296,-0.536689
4,Healthy,-0.268,-0.456096,-0.55483,-1.28042,-1.30846,-1.04277,0.137262,-0.570038,-1.03825,...,-0.54252,-0.543088,-0.63492,-0.500634,-0.556233,-0.531873,-0.759001,-0.606578,-0.618214,-0.775964


In [297]:
data_healthy_df = X_anova_df.loc[X_anova_df.ID == 'Healthy']
data_asthma_df = X_anova_df.loc[X_anova_df.ID == 'Asthmatic']

In [298]:
data_healthy = data_healthy_df.values[:,1:]
data_asthma = data_asthma_df.values[:, 1:]

In [299]:
healthy_mean, healthy_cov = do_factor_analysis(data_healthy)

In [300]:
asthma_mean, asthma_cov = do_factor_analysis(data_asthma)

In [301]:
dist, t1, t2 = bhatt_dist(healthy_mean, healthy_cov, asthma_mean, asthma_cov)

In [302]:
dist

69.33796335919133

In [303]:
np.linalg.det(healthy_cov)

1.0328911165212588e-09

In [304]:
np.linalg.det(asthma_cov)

1.9571702616627723e-126

# Combinations

## Anova + Relief-F

In [240]:
X_anova.shape

(43, 23)

In [241]:
fs = ReliefF(discrete_threshold = 5, n_jobs=1)
fs.fit(X_anova.astype(float), data.ID_num.values)

feature_scores = fs.feature_importances_
feature_ids = np.where(feature_scores>=0)[0]
selected_features = selected_features[feature_ids]

In [242]:
X_ano_reliefF = X_anova[:,feature_ids]

In [243]:
X_ano_reliefF.shape

(43, 18)

In [244]:
X_anova_relief_df = pd.DataFrame(X_ano_reliefF, columns = selected_features)
X_anova_relief_df.insert(0, 'ID', data.ID.values)

In [245]:
X_anova_relief_df.head()

Unnamed: 0,ID,M151T1,M72T2_2,M82T2_2,M95T2,M97T3_1,M77T3_1,M84T4_1,M100T4_1,M138T4,M84T6_1,M98T6_2,M127T6,M63T7_2,M108T7,M94T11_1,M93T11_1,M38T11,M110T12_2
0,Healthy,-0.268167,-0.645626,-0.855105,-0.942973,-0.0778075,-0.655164,-0.874023,-0.531849,-0.474613,-0.869495,-0.712758,-0.562474,-0.620496,-0.541696,-0.584152,-0.462755,-0.563194,-0.756397
1,Healthy,-0.263205,-0.944398,2.18404,0.444226,0.299322,1.38727,0.839667,2.19339,3.14949,0.746347,-0.42891,-0.209553,2.13032,1.5753,-0.550068,-0.581811,-0.387506,2.78272
2,Healthy,-0.254105,-0.148172,2.09556,2.53259,0.191514,0.633136,1.78128,1.65591,2.52769,0.131106,-0.495078,-0.391824,1.26737,0.811305,-0.549622,-0.538452,-0.0809946,0.878527
3,Healthy,-0.268131,-1.25737,-0.917285,-1.02991,0.131019,-0.653796,-1.02205,-0.806302,-0.670972,-0.807571,-0.644416,-0.608561,-0.648659,-0.579382,-0.723417,-0.531289,-0.57296,-0.536689
4,Healthy,-0.268,-1.28042,-1.30846,-1.04277,0.137262,-0.570038,-1.03825,-1.07789,-0.606536,-0.378076,-0.162183,-0.54252,-0.63492,-0.556233,-0.759001,-0.606578,-0.618214,-0.775964


In [246]:
data_healthy_df = X_anova_relief_df.loc[X_anova_relief_df.ID == 'Healthy']
data_asthma_df = X_anova_relief_df.loc[X_anova_relief_df.ID == 'Asthmatic']

In [247]:
data_healthy = data_healthy_df.values[:,1:]
data_asthma = data_asthma_df.values[:, 1:]

In [281]:
data_scaled_df.head()

Unnamed: 0,ID,ID_num,M151T1,M83T1,M101T1,M46T1,M49T1,M66T1,M62T1,M80T1,...,M177T24,M179T24_1,M160T24_1,M118T25_1,M149T25_1,M133T25_1,M77T25_3,M107T25,M136T26_1,M135T26_1
0,Healthy,0,-0.268167,-0.173282,-0.468532,-0.689375,-0.545787,-0.619796,-0.409468,-0.387656,...,-0.59377,-0.6371,-0.641001,-0.777737,-0.563426,-0.509932,-0.449127,-0.562903,-0.617659,-0.597298
1,Healthy,0,-0.263205,-0.105216,0.307663,0.251702,-0.410207,0.588939,-0.072891,-0.365801,...,0.031987,0.383918,0.488144,1.915872,0.384146,4.395997,0.059717,1.989039,1.890985,0.252684
2,Healthy,0,-0.254105,-0.147462,-0.383205,0.560778,-0.516126,-0.475555,-0.001122,-0.375542,...,0.729482,1.246497,0.268835,0.782965,0.608436,0.311254,0.138799,1.30925,0.292015,1.225839
3,Healthy,0,-0.268131,-0.170967,-0.47802,-0.72707,-0.56617,-0.625262,-0.429223,-0.381917,...,-0.665796,-0.719223,-0.811968,-0.801357,-0.656812,-0.51918,-0.45509,-0.666131,-0.629933,-0.60318
4,Healthy,0,-0.268,-0.172635,-0.476942,-0.730256,-0.537614,-0.603333,-0.417282,-0.373187,...,-0.598309,-0.643263,-0.699333,-0.768384,-0.585998,-0.496175,-0.443044,-0.579593,-0.584662,-0.562185


In [286]:
import random

In [287]:
random_feat_ids = list(range(681))

In [288]:
random.shuffle(random_feat_ids)

In [289]:
data_scaled_df.head()

Unnamed: 0,ID,ID_num,M151T1,M83T1,M101T1,M46T1,M49T1,M66T1,M62T1,M80T1,...,M177T24,M179T24_1,M160T24_1,M118T25_1,M149T25_1,M133T25_1,M77T25_3,M107T25,M136T26_1,M135T26_1
0,Healthy,0,-0.268167,-0.173282,-0.468532,-0.689375,-0.545787,-0.619796,-0.409468,-0.387656,...,-0.59377,-0.6371,-0.641001,-0.777737,-0.563426,-0.509932,-0.449127,-0.562903,-0.617659,-0.597298
1,Healthy,0,-0.263205,-0.105216,0.307663,0.251702,-0.410207,0.588939,-0.072891,-0.365801,...,0.031987,0.383918,0.488144,1.915872,0.384146,4.395997,0.059717,1.989039,1.890985,0.252684
2,Healthy,0,-0.254105,-0.147462,-0.383205,0.560778,-0.516126,-0.475555,-0.001122,-0.375542,...,0.729482,1.246497,0.268835,0.782965,0.608436,0.311254,0.138799,1.30925,0.292015,1.225839
3,Healthy,0,-0.268131,-0.170967,-0.47802,-0.72707,-0.56617,-0.625262,-0.429223,-0.381917,...,-0.665796,-0.719223,-0.811968,-0.801357,-0.656812,-0.51918,-0.45509,-0.666131,-0.629933,-0.60318
4,Healthy,0,-0.268,-0.172635,-0.476942,-0.730256,-0.537614,-0.603333,-0.417282,-0.373187,...,-0.598309,-0.643263,-0.699333,-0.768384,-0.585998,-0.496175,-0.443044,-0.579593,-0.584662,-0.562185


In [292]:
data_healthy = data_scaled_df.loc[data_scaled_df.ID == 'Healthy'].values[:,2:]
data_asthma = data_scaled_df.loc[data_scaled_df.ID == 'Asthmatic'].values[:,2:]

healthy_mean, healthy_cov = do_factor_analysis(data_healthy[:,random_feat_ids[:30]])
asthma_mean, asthma_cov = do_factor_analysis(data_asthma[:,random_feat_ids[:30]])

dist, t1, t2 = bhatt_dist(healthy_mean, healthy_cov, asthma_mean, asthma_cov)
print(dist)
print(np.linalg.det(healthy_cov))
print(np.linalg.det(asthma_cov))

130.1534765457679
1.120548549465393e-58
2.2573361229095305e-211


In [209]:
t2

inf

In [195]:
np.linalg.det(asthma_cov)

0.0