In [1]:
import numpy as np
import pandas as pd

import os
import os.path as op
from glob import glob
from nilearn.input_data import NiftiMasker

In [2]:
# Root directory of the PAC2018 dataset. A PAC_PATH already exported in the
# environment takes precedence; the placeholder below is only used when the
# variable is unset. The trailing ';' keeps the notebook from echoing the value.
os.environ.setdefault('PAC_PATH', '/path/to/pac2018/');

In [3]:
# training data
# Glob pattern for the training NIfTI images under $PAC_PATH/data.
# os.environ[...] raises a KeyError naming the variable if PAC_PATH is unset,
# instead of an opaque TypeError from os.path.join(None, ...).
template = os.path.join(os.environ['PAC_PATH'],
                        'data/PAC2018*nii')
fl = sorted(glob(template))
if not fl:
    # Fail early with a clear message rather than handing an empty list to
    # the masker when PAC_PATH points at the wrong place.
    raise FileNotFoundError('No images match pattern: %s' % template)
# Subject id = file name up to the first dot. op.basename honours the
# platform's path separator, unlike splitting on a literal '/'.
subs = [op.basename(val).split('.')[0] for val in fl]
# Covariates live in a 'covars' folder next to the images. skipfooter drops
# the last 11 lines of the CSV and is only supported by the python engine.
# NOTE(review): what those 11 trailing lines contain is not visible here, and
# nothing checks that the rows of df line up with subs -- confirm.
df = pd.read_csv(op.join(op.dirname(template), 'covars', 'PAC2018_Covariates.csv'),
                 skipfooter=11, engine='python')
df.head()

Unnamed: 0,PAC_ID,Label,Age,Gender,TIV
0,PAC2018_0001,1,56,1,1793.0
1,PAC2018_0002,1,21,1,1565.049438
2,PAC2018_0004,1,33,2,1525.0
3,PAC2018_0005,1,33,1,1800.0
4,PAC2018_0006,1,61,2,1433.959595


In [4]:
# Masker settings gathered in one place: no standardization, smoothing_fwhm=2,
# and 'nilearn_cache' passed as the masker's memory argument.
masker_options = dict(standardize=False, smoothing_fwhm=2, memory='nilearn_cache')
masker = NiftiMasker(**masker_options)
# Fit the masker on the training images and extract them in a single call;
# the result is a 2-D array (see the shape printed in the next cell).
training_data = masker.fit_transform(fl)

In [5]:
# Sanity check: number of subjects, first subject id, masked-array shape.
n_subjects, first_subject = len(subs), subs[0]
print(n_subjects, first_subject, training_data.shape)

1792 PAC2018_0001 (1792, 586507)


In [6]:
# Bundle subject ids, masked training data, covariate values and covariate
# column names into a single .npz archive.
# NOTE(review): df.values mixes strings and numbers (see df.head() above), so
# it is presumably stored as an object array -- loading the archive may then
# require allow_pickle=True; confirm.
training_arrays = dict(subs=subs, training_data=training_data,
                       covars=df.values, keys=df.keys())
np.savez('masked.npz', **training_arrays)

In [7]:
import pickle

# Serialize the masker object to masker.pkl in the working directory.
with open('masker.pkl', 'wb') as masker_file:
    pickle.dump(masker, masker_file)

In [8]:
# test data
# Glob pattern for the test NIfTI images under $PAC_PATH/testdata.
# os.environ[...] raises a KeyError naming the variable if PAC_PATH is unset,
# instead of an opaque TypeError from os.path.join(None, ...).
template = os.path.join(os.environ['PAC_PATH'],
                        'testdata/PAC2018*nii')
fl = sorted(glob(template))
if not fl:
    # Fail early with a clear message rather than handing an empty list to
    # the masker when PAC_PATH points at the wrong place.
    raise FileNotFoundError('No images match pattern: %s' % template)
# Subject id = file name up to the first dot. op.basename honours the
# platform's path separator, unlike splitting on a literal '/'.
subs = [op.basename(val).split('.')[0] for val in fl]
# Covariates live in a 'covars' folder next to the images. skipfooter drops
# the last 11 lines of the CSV and is only supported by the python engine.
# NOTE(review): what those 11 trailing lines contain is not visible here, and
# nothing checks that the rows of df line up with subs -- confirm.
df = pd.read_csv(op.join(op.dirname(template), 'covars', 'PAC2018_Test_Covariates.csv'),
                 skipfooter=11, engine='python')
df.head()

Unnamed: 0,PAC_ID,Scanner,Age,Gender,TIV
0,PAC2018_0003,2,54,2,1349.0
1,PAC2018_0007,1,65,2,1456.259399
2,PAC2018_0014,2,36,2,1388.0
3,PAC2018_0022,2,32,1,1544.0
4,PAC2018_0029,3,30,1,1859.0


In [9]:
# Apply the existing masker (transform only, no refit) to the test image
# list that `fl` now holds, so test data goes through the same masker object
# as the training data.
test_data = masker.transform(fl)

In [10]:
# Sanity check: number of test subjects, first subject id, masked-array shape.
n_subjects, first_subject = len(subs), subs[0]
print(n_subjects, first_subject, test_data.shape)

448 PAC2018_0003 (448, 586507)


In [11]:
# Bundle subject ids, masked test data, covariate values and covariate column
# names into a single .npz archive.
# NOTE(review): df.values mixes strings and numbers (see df.head() above), so
# it is presumably stored as an object array -- loading the archive may then
# require allow_pickle=True; confirm.
test_arrays = dict(subs=subs, test_data=test_data,
                   covars=df.values, keys=df.keys())
np.savez('masked-test.npz', **test_arrays)

In [12]:
from sklearn.cluster import KMeans

# Two-cluster k-means over the stacked train + test rows (train rows first).
# random_state pins the centroid initialization so the saved labels are
# reproducible across runs (otherwise cluster ids 0/1 may swap); n_init=10
# spells out the long-standing default instead of relying on a
# version-dependent one.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0)
# The stacked array is built inline so no extra reference to the large
# temporary outlives the fit. labels_[:, None] yields a column vector with
# one label per row.
clusters = kmeans.fit(np.vstack((training_data, test_data))).labels_[:, None]
np.savez('train+test_cluster.npz', clusters=clusters)