In [None]:
import numpy as np
import pandas as pd
from joblib import Parallel, delayed

import h5io
import pandas as pd
import pyriemann
from meeglet import define_frequencies

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

import coffeine

from sklearn.linear_model import RidgeClassifierCV

# Import features

In [None]:
# Directory that holds the meeglet feature files. Set it once here instead of
# editing the placeholder in every path below.
DATA_DIR = '<ADD YOUR PATH HERE>'

# Read the three HDF5 feature files (one CBU file, two CTB files).
features_CBU = h5io.read_hdf5(f'{DATA_DIR}/meeglet_CBU_2023-06-22_10-06.h5')
features_CTB1 = h5io.read_hdf5(f'{DATA_DIR}/meeglet_CTB_2023-06-22_10-46.h5')
features_CTB2 = h5io.read_hdf5(f'{DATA_DIR}/meeglet_CTB_2023-06-23_11-04.h5')

In [None]:
# Combine the three HDF5 feature dictionaries into one mapping
# (key: patient, value: tuple of features such as cov, pow).
# On duplicate keys the right-most dictionary wins: CBU over CTB2 over CTB1.
features_all = {**features_CTB1, **features_CTB2, **features_CBU}

In [None]:
participants_fname = './participants.tsv'
subject_df = (
    pd.read_csv(participants_fname, delimiter='\t')
    # strip the 'sub-' prefix from the participant identifiers
    .assign(participant_id=lambda df: df['participant_id'].str.replace('sub-', ''))
    # move the participant_id column into the index
    .set_index('participant_id')
    # keep one row per participant present in features_all, in the same order
    .loc[features_all.keys()]
)
# Participants for whom the 'Converters' column is known.
subject_converter_df = subject_df.dropna(subset=['Converters'])
# Label vector aligned with features_all; -1 marks a missing 'Converters' value.
feature_converter = subject_df['Converters'].fillna(-1)

In [None]:
# Keep the first element of what meeglet's define_frequencies returns for the
# range 1-64 with bw_oct=0.35 and delta_oct=0.05.
# NOTE(review): `foi` presumably stands for "frequencies of interest" (in Hz) -- confirm.
foi = define_frequencies(foi_start=1, foi_end=64, bw_oct=0.35, delta_oct=0.05)[0]

In [None]:
def make_coffeine_data_frame(
        C: np.ndarray,
        names = None
        ) -> pd.DataFrame:
    """Put covariances in coffeine Data Frame. Copied from coffeine.

    Parameters
    ----------
    C : np.ndarray, shape(n_obs, n_frequencies, n_channels, n_channels)
        A 2D collection of symmetric matrices. First dimension: samples.
        Second dimension: batches within observations (e.g. frequencies).
        Array-likes are converted with ``np.asarray``.
    names : dict or list-like, defaults to None
        A descriptor for the second dimension of `C`. It is used to make
        the columns of the coffeine Data Frame. When given, it must contain
        exactly one name per entry of the second dimension of `C` (for a
        dict, the keys are used). When None, columns are named
        ``c0, c1, ...``.

    Returns
    -------
    C_df : pd.DataFrame
        The DataFrame of object type with lists of covariances accessible
        as columns: one row per observation, one column per name.

    Raises
    ------
    ValueError
        If `C` is not 4-dimensional, if its last two dimensions differ, or
        if the number of `names` does not match ``C.shape[1]``.
    """
    C = np.asarray(C)
    if C.ndim != 4:
        raise ValueError(
            f'Expected input should have 4 dimensions, not {C.ndim}'
        )
    if C.shape[-1] != C.shape[-2]:
        raise ValueError(
            'The 2nd last dimensions should be the same. '
            f'You provided: {C.shape}.'
        )

    n_batches = C.shape[1]
    if names is None:
        names_ = [f'c{cc}' for cc in range(n_batches)]
    else:
        names_ = list(names)
        # Too few names would silently drop trailing batches and too many
        # would fail with an opaque IndexError, so check up front.
        if len(names_) != n_batches:
            raise ValueError(
                f'Expected {n_batches} names (one per entry of the second '
                f'dimension of `C`), got {len(names_)}.'
            )

    # Each column holds, for every observation, the matrix of one batch.
    C_df = pd.DataFrame(
        {name: list(C[:, ii]) for ii, name in enumerate(names_)}
    )
    return C_df

In [None]:
# Layout of each subject's feature tuple: 0 cov, 1 pow, 2 csd, 3, 4, 5 dwpli, 6 rplain.
# Stack entry 5 (dwpli) of every subject, in the iteration order of features_all.
X_dwpli = np.array(
    [subject_features[5] for subject_features in features_all.values()]
)

In [None]:
# 1) replace NaN entries by 1,
# 2) keep only the subjects whose feature_converter value is not the -1 fill value,
# 3) keep every 4th entry along the 4th axis (divides the number of frequencies
#    by 4 to save computation resources).
X_dwpli = np.nan_to_num(X_dwpli, nan=1)[feature_converter != -1][:, :, :, ::4]
X_dwpli.shape

In [None]:
# Stack entry 0 (cov) of every subject's feature tuple.
X_covs = np.array(
    [subject_features[0] for subject_features in features_all.values()]
)
# Keep subjects whose feature_converter value is not -1, then every 4th entry of
# the 4th axis (divides the number of frequencies by 4 to save computation resources).
X_covs = X_covs[feature_converter != -1][:, :, :, ::4]
X_covs.shape

In [None]:
# Stack entry 1 (pow) of every subject's feature tuple (0 cov, 1 pow, 2 csd).
# Unlike the other feature arrays, this one is not filtered by feature_converter.
# NOTE(review): the name `pow` shadows the Python builtin of the same name;
# consider renaming it together with its users.
pow = np.array(
    [subject_features[1] for subject_features in features_all.values()]
)
pow.shape

In [None]:
# Stack entry 6 of every subject's feature tuple (the rplain feature).
X_rplain = np.array(
    [subject_features[6] for subject_features in features_all.values()]
)
# Keep subjects whose feature_converter value is not -1, then every 4th entry of
# the 4th axis (divides the number of frequencies by 4).
X_rplain = X_rplain[feature_converter != -1][:, :, :, ::4]
X_rplain.shape

In [None]:
# Remove the name bound to the merged feature dictionary; the arrays built from
# it above are kept. Re-running earlier cells requires re-creating it first.
del features_all  # save memory

In [None]:
# Move the 4th axis to position 1 so the array matches the
# (n_obs, n_frequencies, n_channels, n_channels) layout expected by
# make_coffeine_data_frame.
X_df_covs = make_coffeine_data_frame(X_covs.transpose(0, 3, 1, 2))

In [None]:
# Iterate over the leading axis of the fully transposed array, pass each
# slice (transposed back) through pyriemann's nearest_sym_pos_def, then swap
# the first two axes of the stacked result before building the data frame.
# NOTE(review): assumes X_dwpli is (n_obs, n_channels, n_channels, n_frequencies) -- confirm.
X_df_dwpli = make_coffeine_data_frame(
    np.array(
        [
            pyriemann.utils.base.nearest_sym_pos_def(freq_slab.T)
            for freq_slab in X_dwpli.T
        ]
    ).transpose((1, 0, 2, 3))
)

In [None]:
# Iterate over the leading axis of the fully transposed array, pass each
# slice (transposed back) through pyriemann's nearest_sym_pos_def, then swap
# the first two axes of the stacked result before building the data frame.
# NOTE(review): assumes X_rplain is (n_obs, n_channels, n_channels, n_frequencies) -- confirm.
X_df_rplain = make_coffeine_data_frame(
    np.array(
        [
            pyriemann.utils.base.nearest_sym_pos_def(freq_slab.T)
            for freq_slab in X_rplain.T
        ]
    ).transpose((1, 0, 2, 3))
)

In [None]:
# Target vector: the 'Converters' values of the subjects that have one
# (entries equal to the -1 fill value are dropped).
y = feature_converter.loc[feature_converter != -1]
y

In [None]:
def string_to_array(s):
    """Parse the text form of a 1D integer array back into a NumPy array.

    Parameters
    ----------
    s : str
        Whitespace-separated integers, optionally wrapped in square brackets,
        e.g. ``'[0 1 2]'``. Line breaks between the numbers are allowed.

    Returns
    -------
    np.ndarray
        1D array of dtype int. An empty input (``'[]'``) gives an empty array.

    Raises
    ------
    ValueError
        If the text contains anything that is not an integer, in particular
        the ``...`` that NumPy inserts when it summarises a long array.
        Parsing such a string would silently yield a truncated index array.
    """
    text = s.strip().strip('[]')
    if '...' in text:
        raise ValueError(
            'The array text was summarised with "..." and cannot be parsed '
            'back into the full array.'
        )
    # int() raises ValueError on any token that is not an integer.
    return np.array([int(token) for token in text.split()], dtype=int)

In [None]:
n_splits = 10  # the number of CV splits
n_repeat = 10  # the number of randomized repetitions to get a nicer distribution
# Consider setting n_repeat to 1 for a first, faster run.

# One row per (repetition, fold): 'train' and 'test' hold the positional
# sample indices of that fold. Every repetition reshuffles with its own seed.
df_cv = pd.concat([
    pd.DataFrame(
        list(StratifiedKFold(n_splits=n_splits, random_state=ii * 7, shuffle=True).split(X_df_rplain, y)),
        columns=['train', 'test']).assign(rep=ii, fold_idx=range(n_splits)) for ii in range(n_repeat)]
)

if True:  # write/read test
    index_columns = ['train', 'test']

    # Write the index arrays as explicit '[i j k ...]' strings. Relying on
    # the array repr would summarise arrays above NumPy's print threshold
    # with '...', which cannot be read back.
    df_cv_out = df_cv.copy()
    for column in index_columns:
        df_cv_out[column] = df_cv_out[column].apply(
            lambda idx: '[' + ' '.join(str(i) for i in idx) + ']'
        )
    df_cv_out.to_csv('df_cv.csv', index=False)

    # read_csv documents `dtype` as a type or a column -> type mapping.
    df_cv = pd.read_csv(
        'df_cv.csv',
        dtype={'train': str, 'test': str, 'rep': int, 'fold_idx': int},
    )
    for column in index_columns:
        df_cv[column] = df_cv[column].apply(string_to_array)
    

In [None]:
# Display the table of CV splits (columns: train, test, rep, fold_idx).
df_cv

In [None]:
def first_level_pred(cv_row, estimator, X, y, method='predict'):
    """Fit `estimator` on one CV fold and return its predictions on the test part.

    Parameters
    ----------
    cv_row : mapping or pd.Series
        One row of the CV table with the entries 'train' and 'test'
        (positional sample indices), 'fold_idx' and 'rep'.
    estimator : object
        Estimator with a ``fit(X, y)`` method and the method named by
        `method`. It is fitted in place.
    X : pd.DataFrame
        Input data; rows are selected by position with ``.iloc``.
    y : pd.Series or np.ndarray
        Targets, in the same row order as `X`. Selected by position as well.
    method : str, defaults to 'predict'
        Name of the estimator method used to produce the predictions,
        e.g. 'predict' or 'decision_function'.

    Returns
    -------
    df_out : pd.DataFrame
        One row per test sample with the columns 'preds', 'fold_idx',
        'repeat' (taken from ``cv_row['rep']``) and 'sample_index'.
    """
    train, test = cv_row['train'], cv_row['test']
    # `train` holds positions, not labels. Plain `y[train]` on a pandas Series
    # is label-based (or a deprecated positional fallback), so use .iloc
    # whenever the target offers it.
    y_train = y.iloc[train] if hasattr(y, 'iloc') else y[train]
    estimator.fit(X.iloc[train], y_train)
    preds = getattr(estimator, method)(X.iloc[test])
    df_out = pd.DataFrame(dict(preds=preds))
    df_out['fold_idx'] = cv_row['fold_idx']
    df_out['repeat'] = cv_row['rep']
    df_out['sample_index'] = test
    return df_out

In [None]:
# Four first-level models. Each one is a filter-bank transformer from
# coffeine, followed by feature standardisation and a ridge classifier whose
# penalty is picked from a log-spaced grid.

# covs: method='riemann' on the covariance columns
mod_covs = make_pipeline(
    coffeine.make_filter_bank_transformer(
        method='riemann',
        names=X_df_covs.columns,
        projection_params={'n_compo': 65, 'reg': 1e-15, 'scale': 1},  # for SSS
    ),
    StandardScaler(),
    RidgeClassifierCV(alphas=np.logspace(-5, 10, 50)),
)

# psds: method='log_diag' on the same covariance columns
mod_psd = make_pipeline(
    coffeine.make_filter_bank_transformer(
        method='log_diag',
        names=X_df_covs.columns,
    ),
    StandardScaler(),
    RidgeClassifierCV(alphas=np.logspace(-5, 10, 50)),
)

# wpli: method='naive' on the dwpli columns
mod_wpli_upper = make_pipeline(
    coffeine.make_filter_bank_transformer(
        method='naive',
        names=X_df_dwpli.columns,
    ),
    StandardScaler(),
    RidgeClassifierCV(alphas=np.logspace(-3, 5, 50)),
)

# rplain: method='riemann' on the rplain columns
mod_rplain = make_pipeline(
    coffeine.make_filter_bank_transformer(
        method='riemann',
        names=X_df_rplain.columns,
        projection_params={'n_compo': 65, 'reg': 1e-15, 'scale': 1},  # for SSS
    ),
    StandardScaler(),
    RidgeClassifierCV(alphas=np.logspace(-3, 5, 50)),
)


In [None]:
# Quick test: ROC AUC of the psd model over one shuffled, stratified
# 10-fold split (fixed seed), averaged across folds.
cv_check_psd = cross_val_score(
    estimator=mod_psd,
    X=X_df_covs,
    y=y,
    scoring='roc_auc',
    cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=10),
)
print(cv_check_psd.mean())

In [None]:
# Same check for the covariance model: mean ROC AUC over a shuffled,
# stratified 10-fold split with a fixed seed.
cv_check_cov = cross_val_score(
    estimator=mod_covs,
    X=X_df_covs,
    y=y,
    scoring='roc_auc',
    cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=10),
)
print(cv_check_cov.mean())

In [None]:
# Same check for the wpli model, with the folds evaluated by 4 parallel jobs.
cv_check_wpli = cross_val_score(
    estimator=mod_wpli_upper,
    X=X_df_dwpli,
    y=y,
    scoring='roc_auc',
    cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=10),
    n_jobs=4,
)
print(cv_check_wpli.mean())
# naive model wpli score: 0.66

In [None]:
# to compute cross-val
cv_preds_covs = Parallel(n_jobs=4)(
    delayed(first_level_pred)(
        cv_row=cv_row, X=X_df_covs, y=y, method='decision_function',
        estimator=mod_covs
    ) for ii, cv_row in df_cv.iterrows()
)

cv_preds_psd = Parallel(n_jobs=4)(
    delayed(first_level_pred)(
        cv_row=cv_row, X=X_df_covs, y=y, method='decision_function',
        estimator=mod_psd
    ) for ii, cv_row in df_cv.iterrows()
)

cv_preds_wpli = Parallel(n_jobs=4)(
    delayed(first_level_pred)(
        cv_row=cv_row, X=X_df_dwpli, y=y, method='decision_function',
        estimator= mod_wpli_upper
    ) for ii, cv_row in df_cv.iterrows()
)

cv_preds_rplain = Parallel(n_jobs=4)(
   delayed(first_level_pred)(
       cv_row=cv_row, X=X_df_rplain, y=y, method='decision_function',
       estimator=mod_rplain
   ) for ii, cv_row in df_cv.iterrows()
)

In [None]:
# Preview the first rows of the first per-fold prediction frame of the rplain model.
cv_preds_rplain[0].head()

In [None]:
## Save to CSV; comment/uncomment the lines below depending on the metric under study.
# Each line concatenates the per-fold frames of one model, renames the 'preds'
# column after the model and writes the result (including the index) to CSV.
pd.concat(cv_preds_covs).rename(columns={'preds': 'covs'}).to_csv('./cv_preds_covs.csv')
pd.concat(cv_preds_psd).rename(columns={'preds': 'psd'}).to_csv('./cv_preds_psd.csv')
pd.concat(cv_preds_wpli).rename(columns={'preds': 'wpli'}).to_csv('./cv_preds_wpli_upper.csv')
pd.concat(cv_preds_rplain).rename(columns={'preds': 'rplain'}).to_csv('./cv_preds_rplain_reg.csv')