<a href="https://colab.research.google.com/github/yecatstevir/teambrainiac/blob/main/source/Timeseries_Cross_Validation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Adolescent and Young Adult Group Whole Brain Cross Validation
- Go to 'Runtime' in Colab browser bar, select 'Change Runtime Type', select 'High-RAM' from 'Runtime Shape'. 
- Load the local pickle file containing all masked, normalized whole-brain subject data in NumPy matrix format.
- SVM training per group (subject ids of '100XX-XXXXX' for Adolescent (child) and '300XX-XXXXX' for Young Adult)

## Adolescent Time Series Cross Validation
### Organize Adolescent Data for Cross Validation
If we are performing cross validation, we need to get the data without custom splitting

In [1]:
# Mount Google Drive into the Colab filesystem at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')  # optionally pass force_remount=True as a second argument

Mounted at /content/gdrive


In [2]:
# Clone the entire project repository into the current working directory.
# NOTE(review): -l/-s are git's local/shared clone options; confirm they are
# actually needed when cloning from an HTTPS remote.
!git clone -l -s https://github.com/yecatstevir/teambrainiac.git
# Change directory into the cloned repo's source folder so that the project
# modules imported by later cells (process, access_data, train, ...) resolve.
%cd teambrainiac/source
# List the directory contents as a quick check that the clone succeeded.
!ls

Cloning into 'teambrainiac'...
remote: Enumerating objects: 819, done.[K
remote: Counting objects: 100% (819/819), done.[K
remote: Compressing objects: 100% (605/605), done.[K
remote: Total 819 (delta 514), reused 399 (delta 198), pack-reused 0[K
Receiving objects: 100% (819/819), 73.28 MiB | 30.02 MiB/s, done.
Resolving deltas: 100% (514/514), done.
/content/teambrainiac/source
access_data.py			  models
AccuracyMeasures.ipynb		  process.py
analysis.py			  SingleSubjectSVM.ipynb
cross_validation.py		  SubjectVisualization_Models_ZNORM.ipynb
data				  SVM_Group_Adolescent_Whole_brain.ipynb
DataExploration_SingleSubj.ipynb  SVM_Group_YA_Whole_brain.ipynb
DL				  TestMask.ipynb
Explore_data.ipynb		  train.py
Group_All_MASK_SVM.ipynb	  utils.py
helper				  VisualizationPlayground.ipynb
Images				  Visualize_Data.ipynb
__init__.py			  visualize.py


In [3]:
# Prompt for a manual file upload through the Colab widget and report the
# name and size of every file received.
from google.colab import files

uploaded = files.upload()

upload_report = 'User uploaded file "{name}" with length {length} bytes'
for fn, payload in uploaded.items():
  print(upload_report.format(name=fn, length=len(payload)))

Saving path_config.py to path_config.py
User uploaded file "path_config.py" with length 228 bytes


In [9]:
# Import libraries
!pip install boto3 nilearn
import pickle
from visualize import plot_alphas
from train import run_grp_svm_model
from analysis import create_bmaps, get_threshold_image, metrics
from process import *
from access_data import *
from sklearn.experimental import enable_halving_search_cv
from cross_validation import time_series_cv
from sklearn.model_selection import TimeSeriesSplit, HalvingGridSearchCV

from sklearn.svm import SVC
import numpy as np
import pandas as pd
from collections import defaultdict
import nibabel as nib
from nilearn import plotting
import matplotlib.pyplot as plt



In [12]:
%%time
# Get mat file 
mask_type = 'mask' #'mask', 'masksubACC', 'masksubAI', 'masksubNAcc', 'masksubmPFC'
label_type = 'rt_labels' 
psc_norm = False
from_mat = 'unnorm' #unnorm
data_type = "AD_AD"
m_path_ind = 0 #get sub_mask data in mask_data key
l_path_ind = 0 #get data in data from rt_label key

#filename to save data in as a pickle, locally
filename = f"all_data_{mask_type}_{from_mat}_2d.pkl"

#open path dictionary file to get subject ids
dict_path = "data/data_path_dictionary.pkl"
data_path_dict = open_pickle(dict_path)
data_path = "data/"
subject_ids = data_path_dict['subject_ID']

# process data if not in AWS already
obj_name, _, _ = access_aws()
statement = False
for i in obj_name:
  if filename in i.key:
    statement = True

if statement == False:
  print("Masking mat data...")
  data = masked_data_n_labels(mask_type, label_type, data_path_dict, psc_norm, m_path_ind, l_path_ind)
  print(f"Saving data to local path {data_path}{filename}")
  f = open(f"{data_path}{filename}", "wb")
  pickle.dump(data, f)
  f.close()
else:
  print("Loading in masked data")
  data = access_load_data(filename, False)


Loading in masked data
CPU times: user 41.9 s, sys: 1min 9s, total: 1min 50s
Wall time: 1min 19s


In [13]:
%%time 
# Split dataset into Adolescents adults
ad = subject_ids[:33]
print("Number of subjects for Adolescent ", len(ad))

#Function Calls
znorm = "RUNS" 
x_id = ad[:13]
xt_id = ad[13:]
group_sub_ids = (x_id, xt_id)
runs_train = [1,2] # Grab run 2, 3 (0 indexing)
runs_test = [1,2] 


X, y = data_for_cv(data, group_sub_ids, runs_train, runs_test, znorm)

Number of subjects for Adolescent  33
Normalizing Each based on RUNS...
Final X data shape to feed into Time Series Cross Validation (5544, 237979)
Final y data shape to feed into Time Series Cross Validation (5544,)
CPU times: user 34 s, sys: 52.6 s, total: 1min 26s
Wall time: 1min 26s


### Run the Adolescent Time Series Cross Validation

In [15]:
# Settings handed to time_series_cv for the Adolescent group.
# NOTE(review): max_train / test_size / splits presumably map onto the
# TimeSeriesSplit arguments inside time_series_cv -- confirm in cross_validation.py.
splits = 5
max_train = 2500
test_size = None

# Grid of SVC hyper-parameters explored when gd_srch is True
gd_srch = True
param_dict = dict(
    C=[1, 5, 10],
    gamma=[0.01, 'auto', 'scale'],
    max_iter=[1000],
)

# Destination of the search results (alternative: 'metrics/group_svm/subpfc/')
directory = 'metrics/group_svm/whole/'
file_name = f'{directory}{data_type}_{mask_type}_grdsrchcv_1.csv'

time_series_cv(X, y, max_train, test_size, splits, gd_srch, param_dict, file_name)



Uploading gridsearch results to cloud...
upload complete for metrics/group_svm/whole/AD_AD_mask_grdsrchcv_1.csv
Best parameters:  {'C': 10, 'gamma': 'scale'}
Best estimator:  SVC(C=10, class_weight='balanced', max_iter=1000, random_state=42)
Best score:  0.7064935064935065


In [16]:
## Cross Validation accuracy scores recorded
# Manually recorded per-split accuracies (plus their average) for each
# normalization strategy, shown as a table indexed by split.

index_col = "Cross Validation Adolescent Train run 2/Test run 3"
dictionary = {index_col: ["1","2","3","4","5","Average"],
              "Unnormalized": [.50, .51, .50, .50, .50, .50],
              "PSC per run": [.53, .63, .50, .50, .59, .55 ],
              "PSC + Z Norm per Subject": [.63, .72, .62, .58, .52, .61],
              "PSC + Z Norm per Run": [.68, .75, .73, .71, .77, .73],
              "Z Norm per Run (No PSC)": [.62, .71, .72, .65, .63, .67]
              }

# set_index returns a new frame; no in-place mutation needed.
df = pd.DataFrame(dictionary).set_index(index_col)
df

Unnamed: 0_level_0,Unnormalized,PSC per run,PSC + Z Norm per Subject,PSC + Z Norm per Run,Z Norm per Run (No PSC)
Cross Validation Adolescent Train run 2/Test run 3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.5,0.53,0.63,0.68,0.62
2,0.51,0.63,0.72,0.75,0.71
3,0.5,0.5,0.62,0.73,0.72
4,0.5,0.5,0.58,0.71,0.65
5,0.5,0.59,0.52,0.77,0.63
Average,0.5,0.55,0.61,0.73,0.67


## Young Adult Time Series Cross Validation
### Organize Young Adult Data for Cross Validation
If we are performing cross validation, we need to get the data without custom splitting

In [5]:
%%time
# Get mat file 
mask_type = 'mask' #'mask', 'masksubACC', 'masksubAI', 'masksubNAcc', 'masksubmPFC'
label_type = 'rt_labels' 
psc_norm = False
from_mat = 'unnorm' #'psc' 'unnorm'
data_type = "YA_YA"
m_path_ind = 0 #get sub_mask data in mask_data key
l_path_ind = 0 #get data in data from rt_label key



#open path dictionary file to get subject ids
dict_path = "data/data_path_dictionary.pkl"
data_path_dict = open_pickle(dict_path)
data_path = "data/"
filename2 = f"all_data_{mask_type}_{from_mat}_2d.pkl"
subject_ids = data_path_dict['subject_ID']



# process data if not in AWS already
obj_name, _, _ = access_aws()
statement = False
for i in obj_name:
  if filename2 in i.key:
    statement = True

if statement == False:
  print("Masking mat data...")
  data = masked_data_n_labels(mask_type, label_type, data_path_dict, psc_norm, m_path_ind, l_path_ind)
  print(f"Saving data to local path {data_path}{filename2}")
  f = open(f"{data_path}{filename2}", "wb")
  pickle.dump(data, f)
  f.close()
elif statement == True:
  print("Loading in masked data")
  data = access_load_data(filename2, False)


Loading in masked data
CPU times: user 30.7 s, sys: 41.2 s, total: 1min 11s
Wall time: 1min 5s


In [6]:
%%time 
# Split dataset into young adults
ya = subject_ids[33:]
print("Number of subjects for Adolescent ", len(ya))

#Function Calls
znorm = "RUNS" 
x_id = ya[:13]
xt_id = ya[13:]
group_sub_ids = (x_id, xt_id)
runs_train = [1,2] # Grab run 2, 3 (0 indexing)
runs_test = [1,2] 

X, y = data_for_cv(data, group_sub_ids, runs_train, runs_test, znorm)

Number of subjects for Adolescent  19
Normalizing Each based on RUNS...
Final X data shape to feed into Time Series Cross Validation (3192, 237979)
Final y data shape to feed into Time Series Cross Validation (3192,)
CPU times: user 15.8 s, sys: 13.5 s, total: 29.4 s
Wall time: 29.3 s


### Run the Young Adult Time Series Cross Validation

In [11]:
# Settings handed to time_series_cv for the Young Adult group.
# NOTE(review): max_train / test_size / splits presumably map onto the
# TimeSeriesSplit arguments inside time_series_cv -- confirm in cross_validation.py.
splits = 5
max_train = 1000
test_size = None

# Grid of SVC hyper-parameters explored when gd_srch is True
gd_srch = True
param_dict = dict(
    C=[1, 5, 10],
    gamma=[0.01, 'auto', 'scale'],
    max_iter=[1000],
)

# Destination of the search results (alternative: 'metrics/group_svm/subpfc/')
# NOTE(review): the Adolescent run saves under a `.csv` key while this one uses
# `.pkl` -- confirm which format time_series_cv actually writes.
directory = 'metrics/group_svm/whole/'
file_name2 = f'{directory}{data_type}_{mask_type}_grdsrchcv.pkl'

time_series_cv(X, y, max_train, test_size, splits, gd_srch, param_dict, file_name2)



Uploading gridsearch results to cloud...
upload complete for metrics/group_svm/whole/YA_YA_mask_grdsrchcv.pkl
Best parameters:  {'C': 10, 'gamma': 'scale'}
Best estimator:  SVC(C=10, class_weight='balanced', max_iter=1000, random_state=42)
Best score:  0.7408662900188323


In [None]:
## Cross Validation accuracy scores recorded
# Manually recorded per-split accuracies (plus their average) for each
# normalization strategy, shown as a table indexed by split.

index_col = "Cross Validation Young Adult Train run 2/Test run 3"
dictionary = {index_col: ["1","2","3","4","5","Average"],
              "Unnormalized": [.49, .54, .51, .49, .50, .51],
              "PSC per run": [.49, .61, .60, .62, .50, .56 ],
              "PSC + Z Norm per Run": [.77, .76, .82, .79, .80, .79],
              "Z Norm per Run (No PSC)": [.77, .75, .82, .81, .80, .79]}

# set_index returns a new frame; no in-place mutation needed.
df = pd.DataFrame(dictionary).set_index(index_col)
df

In [None]:
# For YA znorm whole mask
# Console log kept for reference as a bare string literal (nothing is executed):
# per-split accuracy of a 5-split cross validation run and the resulting mean.
# NOTE(review): presumably captured from a time_series_cv run without grid
# search -- confirm the exact settings that produced it.
"""
Predicting...
1it [03:17, 197.81s/it]Cross Validation Split 1 Accuracy score: 0.6898496240601504
Fitting Classifier for iteration number 2
Predicting...
2it [10:07, 322.61s/it]Cross Validation Split 2 Accuracy score: 0.8120300751879699
Fitting Classifier for iteration number 3
Predicting...
3it [20:03, 447.25s/it]Cross Validation Split 3 Accuracy score: 0.6917293233082706
Fitting Classifier for iteration number 4
Predicting...
4it [31:52, 550.50s/it]Cross Validation Split 4 Accuracy score: 0.7612781954887218
Fitting Classifier for iteration number 5
Predicting...
5it [46:36, 559.36s/it]Cross Validation Split 5 Accuracy score: 0.7800751879699248
Mean Accuracy: 0.7469924812030075"""