In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

from sklearn.base import clone
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, LeaveOneGroupOut
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegressionCV, RidgeCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.impute import IterativeImputer

from joblib import Parallel, delayed
import seaborn as sns

from hidimstat import CPI

In [None]:
# Subject identifiers read from the 'subject' column, kept in file order;
# they are used below to select and order the rows of the participants table.
features_all_keys = pd.read_csv('./features_all_keys.csv')['subject'].tolist()
features_all_keys[:3]

In [None]:
participants_fname = './participants.tsv'

# Participants table indexed by subject id (without the 'sub-' prefix) and
# reordered to follow features_all_keys, so only subjects listed there remain.
subject_df = (
    pd.read_csv(participants_fname, delimiter='\t')
    .assign(participant_id=lambda d: d['participant_id'].str.replace('sub-', ''))
    .set_index('participant_id')  # participant_id becomes the index and is no longer a column
    .loc[features_all_keys]
)

# Subjects with a known conversion status; this frame is the reference for
# the statistics and for positional look-ups via sample_index further down.
subject_converter_df = subject_df.dropna(subset=['Converters'])

# Conversion label for every subject, with -1 marking an unknown status.
feature_converter = subject_df['Converters'].fillna(-1)

In [None]:
# Number of subjects with at least one missing value among the listed
# covariates, then for education and MMSE on their own. Only the last
# expression is displayed by the notebook.
int(subject_converter_df[['group', 'sex', 'site', 'Edu_years', 'MMSE']].isna().any(axis=1).sum())

int(subject_converter_df[['Edu_years']].isna().any(axis=1).sum())

int(subject_converter_df[['MMSE']].isna().any(axis=1).sum())

In [None]:
# Target labels: conversion status restricted to subjects where it is known
# (entries equal to -1 are the filled-in missing values and are excluded).
known_status = feature_converter != -1
y = feature_converter.loc[known_status]

In [None]:
# Cross-validated predictions, one file per input. The first frame keeps its
# bookkeeping columns; they are dropped from all the others so the frames can
# later be concatenated side by side without duplicating those columns.
_cv_meta_columns = ['fold_idx', 'repeat', 'sample_index']

cv_preds_covs_df = pd.read_csv('./cv_preds_covs.csv')
(
    cv_preds_psd_df,
    cv_preds_wpli_df,
    cv_preds_rplain_df,
    cv_preds_mri_df,
    cv_preds_site_df,
) = (
    pd.read_csv(path).drop(columns=_cv_meta_columns)
    for path in (
        './cv_preds_psd.csv',
        './cv_preds_wpli.csv',
        './cv_preds_rplain.csv',
        './cv_preds_mri.csv',
        './cv_preds_site.csv',
    )
)

In [None]:
# Reload the MRI predictions with every column of the file kept, so that the
# inspection cells that follow can use 'sample_index'.
# NOTE(review): this rebinds cv_preds_mri_df; the version without the
# bookkeeping columns has to be restored before the frames are concatenated.
cv_preds_mri_df = pd.read_csv('./cv_preds_mri.csv')

In [None]:
# Shape of the array of distinct sample indices present in the MRI predictions.
cv_preds_mri_df['sample_index'].unique().shape

In [None]:
# Subject ids whose MRI prediction is missing in at least one row: take the
# sample indices of the rows where 'mri' is NaN and look them up positionally
# in the index of subject_converter_df.
_missing_mri_samples = cv_preds_mri_df.loc[cv_preds_mri_df['mri'].isna(), 'sample_index']
subject_converter_df.index[np.unique(_missing_mri_samples)]

In [None]:
# Read the MRI predictions again, this time without the bookkeeping columns,
# which is the form used when the prediction frames are concatenated.
cv_preds_mri_df = pd.read_csv('./cv_preds_mri.csv').drop(
    columns=['fold_idx', 'repeat', 'sample_index']
)


## Site Data

In [None]:
# Covariates passed to the stratified splitters below. The selection is copied
# explicitly so that the column assignments act on an independent frame rather
# than on a selection of subject_converter_df; without the copy pandas emits
# SettingWithCopyWarning here (copy-on-write is only enabled further down).
X_df_site = subject_converter_df[['age', 'sex', 'Edu_years']].copy()
X_df_site['sex'] = X_df_site['sex'] == 'M'  # categorical sex -> boolean (True for 'M')
y_site = subject_converter_df['site'] == 'CBU'  # site -> boolean (True for 'CBU')
# Missing years of education are replaced by the mean over these subjects.
X_df_site['Edu_years'] = X_df_site['Edu_years'].fillna(X_df_site['Edu_years'].mean())

In [None]:
# Raw MMSE scores as a NumPy array, in the row order of subject_converter_df.
subject_converter_df['MMSE'].to_numpy()

In [None]:
def string_to_array(s):
    """Parse the text form of a 1-D integer array back into a NumPy array.

    Parameters
    ----------
    s : str
        Whitespace-separated integers, optionally wrapped in square brackets
        (for example ``'[0 1 2]'``).

    Returns
    -------
    ndarray of int
        The parsed values.
    """
    # Remove any leading/trailing bracket characters before parsing the numbers.
    unbracketed = s.strip('[]')
    return np.fromstring(unbracketed, dtype=int, sep=' ')

In [None]:
n_splits = 10  # the number of CV splits
n_repeat = 10  # the number of randomized repetitions to get a nicer distribution
# Consider setting n_repeat to 1 for the beginning.

# One row per (repetition, fold): 'train' and 'test' hold the positional
# indices returned by StratifiedKFold, stratified on the site label. Each
# repetition uses its own seed (ii * 7) so the shuffles differ but are
# reproducible.
df_cv_site = pd.concat([
    pd.DataFrame(
        list(StratifiedKFold(n_splits=n_splits, random_state=ii * 7, shuffle=True).split(X_df_site, y_site)),
        columns=['train', 'test']).assign(rep=ii, fold_idx=range(n_splits)) for ii in range(n_repeat)]
)

# Write/read round trip: store the splits on disk and load them back. The
# index arrays are written as their text form, so they are read as strings
# (dtype given as the documented column-name -> dtype mapping) and then parsed
# back into integer arrays.
df_cv_site.to_csv('df_cv_site.csv', index=False)
df_cv_site = pd.read_csv(
    'df_cv_site.csv',
    dtype={'train': str, 'test': str, 'rep': int, 'fold_idx': int},
)
df_cv_site['train'] = df_cv_site['train'].apply(string_to_array)
df_cv_site['test'] = df_cv_site['test'].apply(string_to_array)
    

In [None]:
n_splits = 10  # the number of CV splits
n_repeat = 10  # the number of randomized repetitions to get a nicer distribution
# Consider setting n_repeat to 1 for the beginning.

# One row per (repetition, fold): 'train' and 'test' hold the positional
# indices returned by StratifiedKFold, stratified on the conversion label y.
# Each repetition uses its own seed (ii * 7) so the shuffles differ but are
# reproducible.
df_cv = pd.concat([
    pd.DataFrame(
        list(StratifiedKFold(n_splits=n_splits, random_state=ii * 7, shuffle=True).split(X_df_site, y)),
        columns=['train', 'test']).assign(rep=ii, fold_idx=range(n_splits)) for ii in range(n_repeat)]
)

# Write/read round trip: store the splits on disk and load them back. The
# index arrays are written as their text form, so they are read as strings
# (dtype given as the documented column-name -> dtype mapping) and then parsed
# back into integer arrays.
df_cv.to_csv('df_cv_cases.csv', index=False)
df_cv = pd.read_csv(
    'df_cv_cases.csv',
    dtype={'train': str, 'test': str, 'rep': int, 'fold_idx': int},
)
df_cv['train'] = df_cv['train'].apply(string_to_array)
df_cv['test'] = df_cv['test'].apply(string_to_array)
    

# ML

In [None]:
# Test indices of the second-to-last split, as a quick sanity check.
df_cv.iloc[-2]['test']

In [None]:
# Sample index attached to each row of the covariance-based predictions.
cv_preds_covs_df.loc[:, 'sample_index']

In [None]:
# Place the prediction frames side by side (aligned on the row index) and keep
# the bookkeeping columns plus the prediction columns listed here. The 'site'
# prediction frame takes part in the concatenation but its column is not
# selected.
_stacked_columns = ['fold_idx', 'repeat', 'sample_index', 'covs', 'psd', 'rplain', 'wpli', 'mri']
_prediction_frames = [
    cv_preds_covs_df,
    cv_preds_psd_df,
    cv_preds_wpli_df,
    cv_preds_rplain_df,
    cv_preds_mri_df,
    cv_preds_site_df,
]
X_stacked = pd.concat(_prediction_frames, axis=1, join='outer')[_stacked_columns]
X_stacked

In [None]:
# Non-imaging covariates that enter the full model.
baseline = ['site', 'edu', 'age', 'sex', 'mmse']

# Display name -> list of input columns. Every entry but the last is a
# single-input model; 'Full Model' combines the baseline covariates with the
# four prediction columns.
_single_input_models = [
    ('Site', 'site'),
    ('MMSE', 'mmse'),
    ('Age', 'age'),
    ('Sex', 'sex'),
    ('Covariance', 'covs'),
    ('Pow. Env.', 'rplain'),
    ('dwPLI', 'wpli'),
    ('MRI', 'mri'),
]
feat_combinations = {label: [column] for label, column in _single_input_models}
feat_combinations['Full Model'] = [*baseline, 'covs', 'rplain', 'wpli', 'mri']

In [None]:
# List the columns available in the participants table (inspection only).
subject_converter_df.columns

In [None]:
# Display the years-of-education column (inspection only).
subject_converter_df['Edu_years']

In [None]:
# Turn on pandas copy-on-write for the rest of the session.
# NOTE(review): this is set mid-notebook, so cells executed before it ran
# without copy-on-write; consider moving it next to the imports.
pd.set_option('mode.copy_on_write', True)

In [None]:
recompute = True
# 'refit': score every feature set by refitting the stacking model in a
#          leave-one-fold-out scheme.
# 'raw':   for the single prediction columns, score the stored predictions
#          directly instead of refitting a model on them.
pred_mode = 'refit'
if recompute:
    # Second-level (stacking) model: imputation, scaling, then a logistic
    # regression whose regularisation strength is tuned by internal CV.
    model_grid_search = make_pipeline(
        IterativeImputer(),
        StandardScaler(),
        LogisticRegressionCV(Cs=np.logspace(-3, 5, 50), cv=10, scoring="neg_log_loss")
    )
    print('Recomputing stacking models ...')

    # The design matrix of a repetition does not depend on the feature set,
    # so it is built once per repetition instead of once per (feature set,
    # repetition) pair. The explicit copy keeps the column assignments valid
    # whether or not copy-on-write is enabled.
    X_by_rep = dict()
    for rep in range(10):
        X_ = X_stacked.query(f'repeat=={rep}').copy()
        # sample_index is used as a positional index into subject_converter_df / y.
        sample_idx = X_['sample_index'].values
        X_['mmse'] = subject_converter_df['MMSE'].values[sample_idx]
        X_['site'] = (subject_converter_df['site']
                      .map({'CTB': 0, 'CBU': 1})
                      .values[sample_idx])
        X_['edu'] = subject_converter_df['Edu_years'].values[sample_idx]
        X_['age'] = subject_converter_df['age'].values[sample_idx]
        X_['sex'] = subject_converter_df['sex'].map({'M': 0, 'F': 1}).values[sample_idx]
        X_by_rep[rep] = X_

    raw_score_features = [['covs'], ['rplain'], ['wpli'], ['mri']]
    results = list()
    for name, features in feat_combinations.items():
        for rep in range(10):
            X_ = X_by_rep[rep]
            y_rep = y.values[X_['sample_index'].values]
            if features in raw_score_features and pred_mode == 'raw':
                # Score the stored predictions fold by fold. Previously this
                # result was always overwritten by the refit scores below and
                # was a plain list, which has no .mean()/.std().
                splitter = LeaveOneGroupOut().split(
                    X_[features], y_rep, groups=X_['fold_idx']
                )
                res = np.array([
                    roc_auc_score(
                        y_true=y_rep[test], y_score=X_[features].values[test].ravel()
                    )
                    for _, test in splitter
                ])
            else:
                res = cross_val_score(
                    X=X_[features],
                    groups=X_['fold_idx'],
                    y=y_rep,
                    estimator=model_grid_search,
                    scoring='roc_auc',
                    cv=LeaveOneGroupOut(),
                    n_jobs=4,
                )
            this_result = pd.DataFrame(dict(scores=res))
            this_result['rep'] = rep
            # One score per left-out fold; derive the count from the scores
            # rather than assuming ten folds.
            this_result['split'] = range(len(res))
            this_result['variables'] = name
            results.append(this_result)
            print(f'{name} auc: {res.mean():.3f}+/-{res.std():.3f}')
    results_df = pd.concat(results)
    results_df.to_csv('./stacking_variables_scoring_linear.csv')
else:
    print('Loading precomputed data ...')

# Always read the scores back from disk so that both paths yield the same frame.
results_df_linear = pd.read_csv('./stacking_variables_scoring_linear.csv')

# Some general plotting config

In [None]:
import utils as ut

from utils import CUSTOM_COLORS as colors
from utils import compute_corrected_ttest

# Global figure styling: sans-serif fonts, seaborn 'paper' context with
# thicker lines, then explicit font sizes that override the context defaults.
plt.rc('font', family='sans-serif')
sns.set_context('paper', rc={"lines.linewidth": 2})

SMALL_SIZE, MEDIUM_SIZE, BIGGER_SIZE = 10, 13, 16

plt.rcParams.update({
    'font.size': SMALL_SIZE,           # default text size
    'axes.titlesize': SMALL_SIZE,      # axes title
    'axes.labelsize': MEDIUM_SIZE,     # x and y labels
    'xtick.labelsize': SMALL_SIZE,     # x tick labels
    'ytick.labelsize': SMALL_SIZE,     # y tick labels
    'legend.fontsize': SMALL_SIZE,     # legend text
    'figure.titlesize': BIGGER_SIZE,   # figure title
})

In [None]:
def compare_models(df, model_a, model_b):
    """Return the per-split score difference between two models.

    Parameters
    ----------
    df : DataFrame
        Long-format scores with the columns 'cv_idx' (split identifier),
        'variables' (model name) and 'scores'. Each (cv_idx, variables) pair
        must occur only once, otherwise the pivot fails.
    model_a : str
        Name of the first model, as found in the 'variables' column.
    model_b : str
        Name of the second model, as found in the 'variables' column.

    Returns
    -------
    Series
        ``scores[model_a] - scores[model_b]``, indexed by 'cv_idx'.
    """
    # Wide layout: one row per split, one column per model.
    wide_scores = df.pivot(index='cv_idx', columns='variables', values='scores')
    return wide_scores[model_a].sub(wide_scores[model_b])

In [None]:

def compute_corrected_ttest(differences, df):
    """Right-tailed paired t-test using a variance-corrected standard deviation.

    Parameters
    ----------
    differences : array-like of shape (n_samples,)
        Differences in the score metric between two models, one per
        cross-validation evaluation.
    df : int
        Degrees of freedom of the t distribution.

    Returns
    -------
    t_stat : float
        Mean difference divided by the corrected standard deviation
        (as returned by ``corrected_std``).
    p_val : float
        Right-tail probability of ``t_stat`` under a t distribution with
        ``df`` degrees of freedom.
    """
    spread = corrected_std(differences)
    t_stat = np.mean(differences) / spread
    # Survival function = one-sided (right-tailed) p-value.
    return t_stat, stats.t.sf(t_stat, df)


In [None]:
def corrected_std(differences, n_train=None, n_test=None):
    """Corrects standard deviation using Nadeau and Bengio's approach.

    Parameters
    ----------
    differences : array-like of shape (n_samples,)
        Vector containing the differences in the score metrics of two models.
    n_train : int, optional
        Number of samples in the training set.
    n_test : int, optional
        Number of samples in the testing set.

        When either of ``n_train`` / ``n_test`` is omitted, the test/train
        ratio defaults to ``0.1 / 0.9``, i.e. the ratio of a 10-fold
        cross-validation.

    Returns
    -------
    corrected_std : float
        Variance-corrected standard deviation of the set of differences.
    """
    # kr = k times r, r times repeated k-fold crossvalidation,
    # kr equals the number of times the model was evaluated
    kr = len(differences)
    if n_train is None or n_test is None:
        test_train_ratio = 0.1 / 0.9
    else:
        test_train_ratio = n_test / n_train
    # The sample variance is inflated by the test/train ratio to account for
    # the overlap between training sets across splits.
    corrected_var = np.var(differences, ddof=1) * (1 / kr + test_train_ratio)
    return np.sqrt(corrected_var)

In [None]:
# Plain alias, not a copy: results_df and results_df_linear refer to the same
# DataFrame object.
results_df = results_df_linear

In [None]:
# Identifier of a single evaluation: repetition number followed by split
# number, concatenated as text with no separator.
# NOTE(review): this is only unambiguous while both numbers are single digits.
_rep_label = results_df['rep'].astype(str)
_split_label = results_df['split'].astype(str)
results_df['cv_idx'] = _rep_label + _split_label

In [None]:
# Both pipelines share the same preprocessing (imputation, then scaling) and
# the same logarithmic grid of regularisation values.
_regularisation_grid = np.logspace(-3, 5, 50)

# Classifier whose variable importance is assessed.
estimator_lr_cv = make_pipeline(
    IterativeImputer(),
    StandardScaler(),
    LogisticRegressionCV(Cs=_regularisation_grid, cv=10, scoring="neg_log_loss"),
)

# Regression model handed to CPI as its imputation model.
imputation_rr_cv = make_pipeline(
    IterativeImputer(),
    StandardScaler(),
    RidgeCV(alphas=_regularisation_grid),
)

In [None]:
# Display the stacked prediction table (inspection only).
X_stacked

In [None]:
# Columns of the full model, in the order used for the importance analysis:
# the baseline covariates followed by the four prediction columns.
variables = [*baseline, 'wpli', 'rplain', 'covs', 'mri']
variables

In [None]:
def run_cpi(rep, estimator, imputation_model, variables, impute=True):
    """Compute per-fold CPI variable importances for one CV repetition.

    The rows of ``X_stacked`` belonging to repetition ``rep`` are extended
    with the requested covariates from ``subject_converter_df``. For every
    left-out fold the estimator is fitted on the training rows, its test AUC
    is printed, and a ``CPI`` object is fitted on the training rows and scored
    on the test rows.

    Parameters
    ----------
    rep : int
        Repetition to analyse (value of the 'repeat' column of ``X_stacked``).
    estimator : estimator
        Classifier exposing ``fit`` and ``predict_proba``. It is fitted in
        place, once per fold.
    imputation_model : estimator
        Model passed (cloned) to ``CPI`` as its ``imputation_model``.
    variables : list of str
        Columns forming the design matrix, in this order.
    impute : bool, default=True
        If True, missing values are imputed with an ``IterativeImputer``
        fitted on the training rows of each fold; if False the raw values are
        used for both training and test rows.

    Returns
    -------
    list of DataFrame
        One frame per fold, built from the 'importance' entry of the CPI
        score and extended with 'cv_fold' and 'rep' columns.
    """
    X_sel = X_stacked.query(f'repeat=={rep}').reset_index()
    groups = X_sel['fold_idx'].values
    # sample_index is used as a positional index into y / subject_converter_df.
    sample_idx = X_sel['sample_index'].values
    y_sel = y.values[sample_idx]
    # Every labelled subject must appear in this repetition.
    assert np.unique(sample_idx).shape[0] == len(y)
    if 'mmse' in variables:
        X_sel['mmse'] = subject_converter_df['MMSE'].values[sample_idx]
    if 'site' in variables:
        X_sel['site'] = (
            subject_converter_df['site']
            .map({'CTB': 0, 'CBU': 1})
            .values[sample_idx]
        )
    if 'edu' in variables:
        X_sel['edu'] = subject_converter_df['Edu_years'].values[sample_idx]
    if 'age' in variables:
        X_sel['age'] = subject_converter_df['age'].values[sample_idx]
    if 'sex' in variables:
        X_sel['sex'] = subject_converter_df['sex'].map({'M': 0, 'F': 1}).values[sample_idx]

    X_sel = X_sel[variables]
    X_values = X_sel.values
    logo_splits = list(LeaveOneGroupOut().split(X=X_sel, groups=groups))
    loss_list_cv = list()

    for ii, (train, test) in enumerate(logo_splits):

        if impute:
            # Fit the imputer on the training rows only, then apply it to both.
            imputer = IterativeImputer()
            X_imp = imputer.fit_transform(X_values[train])
            X_imp_test = imputer.transform(X_values[test])
        else:
            X_imp = X_values[train]
            # Previously left unassigned, which raised NameError at scoring time.
            X_imp_test = X_values[test]

        estimator.fit(X_imp, y_sel[train])

        # Diagnostic AUC, computed on the un-imputed test rows as before.
        # NOTE(review): this relies on the estimator handling missing values itself.
        print(
            roc_auc_score(
                y_true=y_sel[test],
                y_score=estimator.predict_proba(X_values[test])[:, 1]
            )
        )

        cpi = CPI(
            estimator=estimator,
            method='predict_proba',
            random_state=42,
            n_permutations=500,
            imputation_model=clone(imputation_model),
            loss=log_loss
        )

        cpi.fit(X_imp, y_sel[train])

        loss_result = cpi.score(X_imp_test, y_sel[test])

        loss_result_df = pd.DataFrame(loss_result['importance']).T
        loss_result_df['cv_fold'] = ii
        loss_result_df['rep'] = rep
        loss_list_cv.append(loss_result_df)
    return loss_list_cv

In [None]:
recompute = True
if recompute:
    # One run_cpi call per repetition; each call returns a list of per-fold frames.
    loss_list_cv_ = Parallel(n_jobs=1)(
        delayed(run_cpi)(
            rep=ii,
            estimator=estimator_lr_cv,
            imputation_model=imputation_rr_cv,
            variables=variables,
        )
        for ii in range(10)
    )
    # Flatten the list of lists, keeping repetition order and then fold order.
    loss_list_cv_lm = [fold_df for rep_folds in loss_list_cv_ for fold_df in rep_folds]
    loss_list_cv_lm_df = pd.concat([pd.DataFrame(fold_df) for fold_df in loss_list_cv_lm])
    # Name the importance columns after the variables they belong to.
    loss_list_cv_lm_df.columns = variables + ['cv_fold', 'rep']
    loss_list_cv_lm_df.to_csv('cpi_linear_models.csv')
else:
    loss_list_cv_lm_df = pd.read_csv('cpi_linear_models.csv')

In [None]:
# Display the table of per-fold variable importances (inspection only).
loss_list_cv_lm_df

In [None]:
def make_plot(results_df, loss_list_cv_df, variables):
    """Draw the two-panel figure: AUC per model (a) and variable importance (b).

    Parameters
    ----------
    results_df : DataFrame
        Cross-validation scores with at least the columns 'variables'
        (model name) and 'scores'.
    loss_list_cv_df : DataFrame
        Per-fold importances with one column per entry of ``variables`` plus
        the columns 'rep' and 'cv_fold'.
    variables : list of str
        Importance columns to test and plot.

    Returns
    -------
    fig : Figure
    axes : dict
        Axes keyed by 'A' (AUC panel) and 'B' (importance panel).

    Notes
    -----
    Prints the mean/std of the scores per model and the table of corrected
    t-test results. Each test uses ``len(loss_list_cv_df) - 1`` degrees of
    freedom.
    """
    # Column name -> label shown on the importance panel.
    display_names = {
        'site': 'Site',
        'covs': 'Cov.',
        'wpli': 'dwPLI',
        'rplain': 'P. Env.',
        'mri': 'MRI',
        'mmse': 'MMSE',
        'age': 'Age',
        'sex': 'Sex',
        'edu': 'Edu.'
    }
    # Marker / error-bar styling shared by both panels.
    point_style = dict(
        palette='viridis',
        errorbar=('ci', 95),
        estimator='median',
        marker='o',
        alpha=1,
        linestyle='',
        markersize=8,
        markerfacecolor='white',
        markeredgecolor=None,
        capsize=.33,
        linewidth=2,
    )

    fig, axes = plt.subplot_mosaic(
        [['A', 'B']],
        figsize=(8, 3.5),
        width_ratios=[1, 1.25],
        sharex=False,
        sharey=False,
        layout="constrained"
    )
    ax_auc = axes['A']
    ax_importance = axes['B']

    # ---- Panel a: models ordered by decreasing median AUC -----------------
    auc_order = (
        results_df.groupby('variables')['scores']
        .median()
        .sort_values(ascending=False)
        .index
    )
    sorted_df = results_df.set_index('variables').loc[auc_order].reset_index()

    ax_auc.axvline(0.5, c='black', linestyle='--')  # chance level
    print(sorted_df.groupby('variables').scores.agg(['mean', 'std']).round(3))

    sns.pointplot(
        sorted_df,
        x="scores",
        y="variables",
        hue='variables',
        ax=ax_auc,
        **point_style
    )
    ax_auc.set_ylabel('')
    ax_auc.set_xlabel('AUC')
    ax_auc.set_xlim(0.4, 1.0)
    ax_auc.annotate('a', xy=(-0.35, 0.99), xycoords='axes fraction', fontsize=BIGGER_SIZE, fontweight='bold')
    ax_auc.set_title('Marginal Models VS Full Model', fontsize=MEDIUM_SIZE, x=0.44)

    # ---- Corrected t-test on the importance of every variable -------------
    n_dof = len(loss_list_cv_df) - 1
    test_rows = list()
    for column in variables:
        tstat, pval = compute_corrected_ttest(loss_list_cv_df[column], n_dof)
        test_rows.append({'stat': tstat, 'pval': pval, 'variable': column})
    stats_res = pd.DataFrame(test_rows)
    stats_res['variable'] = stats_res['variable'].map(display_names)
    stats_res = stats_res.set_index('variable')
    print(stats_res.round(3))

    # ---- Panel b: variables ordered by decreasing median importance -------
    loss_long_df = pd.melt(loss_list_cv_df, id_vars=['rep', 'cv_fold'], value_vars=variables)
    loss_long_df['variable'] = loss_long_df['variable'].map(display_names)

    loss_sorted = (
        loss_long_df.groupby('variable')['value']
        .median()
        .sort_values(ascending=False)
        .index
    )
    loss_long_df_sorted = loss_long_df.set_index('variable').loc[loss_sorted].reset_index()

    ax_importance.axvline(0, c='black', linestyle='--')  # zero importance

    sns.pointplot(
        loss_long_df_sorted,
        x="value",
        y="variable",
        hue='variable',
        ax=ax_importance,
        **point_style
    )
    ax_importance.set_ylabel('')
    ax_importance.set_xlabel(r'Importance ($\Delta$ loss)')
    ax_importance.set_xlim(-0.05, 0.1)

    ax_importance.annotate('b', xy=(-0.25, 0.99), xycoords='axes fraction', fontsize=BIGGER_SIZE, fontweight='bold')
    ax_importance.set_title('Variable Importance (Full Model)', fontsize=MEDIUM_SIZE, x=0.34)

    # p-value next to each row; stars and colour only for positive effects.
    annot_df = stats_res.loc[loss_sorted]
    for ii, (name, row) in enumerate(annot_df.iterrows()):
        pval = row.pval
        significant = pval < 0.05 and row.stat > 0
        label = r'$p=%0.3f$' % pval
        if significant:
            label += r'$^{\ast\ast}$' if pval < 0.01 else r'$^{\ast}$'
        ax_importance.annotate(
            label,
            xy=(-0.04, ii + 0.14),
            color='purple' if significant else 'black'
        )

    sns.despine(trim=True, offset=2)
    return fig, axes

In [None]:
fig, axes = make_plot(results_df=results_df_linear, loss_list_cv_df=loss_list_cv_lm_df, variables=variables)

# Tick labels of the MEG-derived inputs are rewritten with a common 'MEG'
# prefix; both spellings used across the two panels are covered. Any other
# label is passed through unchanged.
_meg_tick_labels = {
    'Cov.': r'MEG$_{(Cov.)}$',
    'Covariance': r'MEG$_{(Cov.)}$',
    'dwPLI': r'MEG$_{(dwPLI.)}$',
    'P. Env.': r'MEG$_{(P. Env.)}$',
    'Pow. Env.': r'MEG$_{(P. Env.)}$',
}
for panel in ['A', 'B']:
    axes[panel].set_yticklabels(
        [_meg_tick_labels.get(ll.get_text(), ll) for ll in axes[panel].get_yticklabels()]
    )

fig.savefig('linear_model_comp.png', dpi=300)
fig.savefig('linear_model_comp.pdf', dpi=300)