In [None]:
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
# IterativeImputer is experimental: this enabling import must run before it is imported.
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer
from sklearn.linear_model import RidgeClassifierCV
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Ordered list of subject identifiers; the tables loaded below are re-ordered with it.
features_all_keys = list(pd.read_csv('./input/features_all_keys.csv')['subject'])
print(len(features_all_keys))

In [None]:
participants_fname = './input/participants.tsv'
subject_df = pd.read_csv(participants_fname, sep='\t')
subject_df = (
    subject_df
    # strip the 'sub-' prefix from the participant identifiers
    .assign(participant_id=subject_df['participant_id'].str.replace('sub-', ''))
    .set_index('participant_id')
    # dataframe in correct order
    .loc[features_all_keys]
)
# Rows with a known 'Converters' value only; a separate frame used JUST for STATS.
subject_converter_df = subject_df[subject_df['Converters'].notna()]
# Missing 'Converters' values are encoded with the sentinel -1.
feature_converter = subject_df['Converters'].fillna(-1)

In [None]:
# Target vector: keep only subjects whose 'Converters' value is known (sentinel -1 = missing).
y = feature_converter.loc[feature_converter.ne(-1)]

In [None]:
# Number of labelled subjects.
print(y.shape[0])

## MRI Data

In [None]:
# Columns kept from the raw MRI table: the subject identifier plus the measures used as features.
mri_columns = [
    'participant_id', 'Left-Lateral-Ventricle', 'Left-Inf-Lat-Vent',
    'Left-Cerebellum-White-Matter', 'Left-Cerebellum-Cortex',
    'Left-Thalamus', 'Left-Caudate', 'Left-Putamen', 'Left-Pallidum',
    '3rd-Ventricle', '4th-Ventricle', 'Brain-Stem', 'LeftHippocampus',
    'LeftAmygdala', 'CSF', 'Left-Accumbens-area', 'Left-VentralDC',
    'Left-vessel', 'Left-choroid-plexus', 'Right-Lateral-Ventricle',
    'Right-Inf-Lat-Vent', 'Right-Cerebellum-White-Matter',
    'Right-Cerebellum-Cortex', 'Right-Thalamus', 'Right-Caudate',
    'Right-Putamen', 'Right-Pallidum', 'RightHippocampus', 'RightAmygdala',
    'Right-Accumbens-area', 'Right-VentralDC', 'Right-vessel',
    'Right-choroid-plexus', '5th-Ventricle', 'WMHyperintensities',
    'Left-WM-hypointensities', 'Right-WM-hypointensities',
    'non-WM-hypointensities', 'Left-non-WM-hypointensities',
    'Right-non-WM-hypointensities', 'Optic-Chiasm', 'CC_Posterior',
    'CC_Mid_Posterior', 'CC_Central', 'CC_Mid_Anterior', 'CC_Anterior',
    'BrainSegVol', 'BrainSegVolNotVent', 'lhCortexVol', 'rhCortexVol',
    'CortexVol', 'lhCerebralWhiteMatterVol', 'rhCerebralWhiteMatterVol',
    'CerebralWhiteMatterVol', 'SubCortGrayVol', 'TotalGrayVol',
    'SupraTentorialVol', 'SupraTentorialVolNotVent', 'MaskVol',
    'BrainSegVol-to-eTIV', 'MaskVol-to-eTIV', 'lhSurfaceHoles',
    'rhSurfaceHoles', 'SurfaceHoles', 'EstimatedTotalIntraCranialVol',
    'ventricles', 'Hippocampus', 'Amygdala', 'TotalBrain',
]
mri_df = (
    pd.read_csv('./input/IRM_raw_data.csv')
    .drop(columns=['Unnamed: 0'])
    .loc[:, mri_columns]
)

# Presumably subjects with no MRI record (TODO confirm): each gets a row holding only
# its identifier, so every feature is NaN and the ordered lookup below finds every key.
missing_subjects = [
    'Sub0070', 'Sub0078', 'Sub0282', 'Sub0285', 'Sub0289', 'Sub0306',
    'Sub0308', 'Sub0318', 'Sub0319', 'Sub0321', 'Sub0004', 'Sub0324',
    'Sub0127', 'Sub0136', 'Sub0144', 'Sub0153', 'Sub0167', 'Sub0195', 'Sub0235',
]
missing_rows = [dict(participant_id=sub) for sub in missing_subjects]

# dataframe in correct order
mri_df = (
    pd.concat([mri_df, pd.DataFrame(missing_rows)])
    .set_index('participant_id')
    .loc[features_all_keys]
)

In [None]:
# Row count of the re-ordered MRI table.
print(mri_df.shape[0])

In [None]:
# Keep the MRI rows of subjects with a known 'Converters' value (sentinel -1 = missing).
has_label = feature_converter != -1
X_df = mri_df[has_label]
X_df.shape

In [None]:
# Sanity check: targets and features must list the same subjects in the same order.
(y.index == X_df.index).all()

In [None]:
# Number of missing values in each feature column.
X_df.isna().sum(axis=0)

In [None]:
def string_to_array(s):
    """Parse the text form of a 1-D integer array back into a NumPy array.

    Parameters
    ----------
    s : str
        Whitespace-separated integers, optionally wrapped in square brackets,
        e.g. ``'[0 1 2]'``.

    Returns
    -------
    numpy.ndarray
        One-dimensional array with dtype ``int``.

    Raises
    ------
    ValueError
        If the text contains ``'...'``. NumPy inserts that ellipsis when it
        prints a long array in summarised form, so such text no longer holds
        every element.
    """
    body = s.strip('[]')
    if '...' in body:
        # Parsing would stop at the ellipsis and return only the leading
        # values; fail loudly rather than hand back a truncated array.
        raise ValueError(
            "array text is truncated ('...' found); it does not contain all elements"
        )
    return np.fromstring(body, sep=' ', dtype=int)

In [None]:
# Convert the feature table to a plain NumPy array (the participant_id index is dropped).
# np.asarray returns an existing array unchanged, so re-running this cell is harmless.
X_df = np.asarray(X_df)

In [None]:
# Display the feature matrix (X_df is rebound to a plain array in the preceding cell).
X_df

In [None]:
def first_level_pred(cv_row, estimator, X, y, method='predict'):
    """Fit ``estimator`` on one CV fold and return its predictions for the test part.

    A sample counts as having no data when its first feature is NaN. Such
    samples are left out of fitting and prediction, and their prediction in the
    output is NaN.

    Parameters
    ----------
    cv_row : mapping or pandas.Series
        Provides ``'train'`` and ``'test'`` (integer position arrays into ``X``
        and ``y``) plus ``'fold_idx'`` and ``'rep'``, which are copied to the output.
    estimator : object
        Must offer ``fit(X, y)`` and the method named by ``method``. It is
        fitted in place.
    X : array-like
        Two-dimensional numeric feature matrix.
    y : array-like
        Targets, one per row of ``X``. A pandas Series is accepted; its index
        labels are ignored and only the order of the values matters.
    method : str, default 'predict'
        Name of the estimator method used to produce the predictions.

    Returns
    -------
    pandas.DataFrame
        One row per test sample with columns ``preds``, ``fold_idx``,
        ``repeat`` and ``sample_index``.
    """
    train = np.asarray(cv_row['train'])
    test = np.asarray(cv_row['test'])
    X = np.asarray(X)
    # 'train' / 'test' are positions, so index y as a plain array: a labelled
    # Series would look them up as index labels (or rely on deprecated fallback).
    y = np.asarray(y)

    # kick out cases with missing values for fitting
    train_has_data = ~np.isnan(X[train, 0])
    train_ = train[train_has_data]
    estimator.fit(X[train_], y[train_])

    # do the same for testing
    test_has_data = ~np.isnan(X[test, 0])
    test_ = test[test_has_data]

    # NaN wherever there was no data to predict from
    preds = np.full(len(test), np.nan)
    if test_.size:
        # skip the call when no test row is usable; estimators may reject empty input
        preds[test_has_data] = getattr(estimator, method)(X[test_])

    df_out = pd.DataFrame(dict(preds=preds))
    df_out['fold_idx'] = cv_row['fold_idx']
    df_out['repeat'] = cv_row['rep']
    df_out['sample_index'] = test
    return df_out

In [None]:
n_splits = 10  # the number of CV splits
n_repeat = 10  # the number of randomized repetitions to get a nicer distribution
# Consider setting n_repeat to 1 for the beginning.
# One row per (repetition, fold): 'train' / 'test' hold integer position arrays.
# Each repetition shuffles with its own fixed seed (ii * 7).
df_cv = pd.concat([
    pd.DataFrame(
        list(StratifiedKFold(n_splits=n_splits, random_state=ii * 7, shuffle=True).split(X_df, y)),
        columns=['train', 'test']).assign(rep=ii, fold_idx=range(n_splits)) for ii in range(n_repeat)]
)

# Write/read round trip of the CV table.
# The index arrays are written as explicit '[i j k ...]' text rather than through
# numpy's own string form, which is summarised with '...' for long arrays and
# would silently lose indices when read back.
index_columns = ['train', 'test']
df_cv_text = df_cv.copy()
for col in index_columns:
    df_cv_text[col] = df_cv_text[col].map(
        lambda idx: '[' + ' '.join(map(str, idx)) + ']'
    )
df_cv_text.to_csv('df_cv.csv', index=False)

# Per-column dtypes are given as a dict, the documented form for read_csv.
df_cv = pd.read_csv('df_cv.csv', dtype={'rep': int, 'fold_idx': int})
for col in index_columns:
    df_cv[col] = df_cv[col].apply(string_to_array)
    

In [None]:
# MRI model: impute missing features, standardise, then fit a ridge classifier whose
# regularisation strength is chosen by its built-in CV from 50 log-spaced values (1e-3 to 1e5).
ridge_alphas = np.logspace(-3, 5, 50)
mod_mri = make_pipeline(
    IterativeImputer(),
    StandardScaler(),
    RidgeClassifierCV(alphas=ridge_alphas),
)

In [None]:
# One prediction frame per row of df_cv (one fold of one repetition), scored with the
# pipeline's decision_function.
fold_jobs = (
    delayed(first_level_pred)(
        cv_row=fold, estimator=mod_mri, X=X_df, y=y, method='decision_function'
    )
    for _, fold in df_cv.iterrows()
)
cv_preds_mri = Parallel(n_jobs=1)(fold_jobs)

In [None]:
# Print the list of per-fold prediction DataFrames returned by the Parallel call.
print(cv_preds_mri)

In [None]:
# Stack the per-fold predictions into one table and name the prediction column 'mri'.
# The frame is bound first and written afterwards: to_csv(path) returns None, so
# chaining it into the assignment would leave the variable holding None.
cv_preds_mri_df = (
    pd.concat(cv_preds_mri)
      .rename(columns={'preds': 'mri'})
)
cv_preds_mri_df.to_csv('./input/cv_preds_mri.csv')