<a href="https://colab.research.google.com/github/yecatstevir/teambrainiac/blob/main/SingleSubjectSVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Whole Brain Support Vector Machine Training
- Go to 'Runtime' in Colab browser bar, select 'Change Runtime Type', select 'High-RAM' from 'Runtime Shape'. 
- load local pickle file containing all masked, normalized Whole Brain subject data in numpy matrix format
- SVM training all subjects
- SVM training per subject

### Mount Google Drive and clone repository
- open to source directory

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')#, force_remount = True)

Mounted at /content/gdrive


In [2]:

from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
# Clone the entire repo.
!git clone -l -s https://github.com/yecatstevir/teambrainiac.git
# Change directory into cloned repo
%cd teambrainiac/source
!ls


fatal: destination path 'teambrainiac' already exists and is not an empty directory.
/content/teambrainiac/source
Access_Load_Data.ipynb		  Mat_to_Numpy.ipynb
AccuracyMeasures.ipynb		  models
All_subject_masked_labeled.ipynb  path_config.py
cross_validation.py		  percent_signal_change.ipynb
data				  process.py
Explore_data.ipynb		  __pycache__
explore.py			  SingleSubjectSVM.ipynb
Group_All_MASK_SVM.ipynb	  SVM_Group_Child_Whole_Brain.ipynb
Images				  SVM_Group_YA_Whole_brain.ipynb
__init__.py			  utils.py
Masking.ipynb			  Visualize_Data.ipynb


### Load path_config.py 
- we are already in source so we can just load this file without chanding directory

In [4]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

Saving path_config.py to path_config (1).py
User uploaded file "path_config.py" with length 228 bytes


### Load the Whole brain normalized masked all subject 2d pickle file 
- go to Drive outside of this notebook
  - create a folder named data
  - upload 'whole_brain_all_norm_2d.pkl' - will take 5 hours but should remain on system without need to upload again
  - once uploaded, drag the file to the repo source/data directory

In [None]:
!ls

 Access_Load_Data.ipynb		   'path_config (2).py'
 All_subject_masked_labeled.ipynb  'path_config (3).py'
 cross_validation.py		    path_config.py
 data				    percent_signal_change.ipynb
 Explore_data.ipynb		    process.py
 explore.py			    __pycache__
 Images				    SVM_Group_Child_Whole_Brain.ipynb
 __init__.py			    SVM_Group_YA_Whole_brain.ipynb
 Masking.ipynb			    teambrainiac
 Mat_to_Numpy.ipynb		    utils.py
 models				    Visualize_Data.ipynb
'path_config (1).py'


### Import libraries


In [7]:

# Import libraries
!pip install boto3 nilearn nibabel
from sklearn.model_selection import GridSearchCV
import pickle
from utils import data_to_nib, load_mat, open_pickle, access_load_data, save_data, create_mask, labels_mask_binary, masking_data, masked_data_n_labels
from sklearn.svm import SVC
import numpy as np
import pandas as pd
import random
import cv2 as cv



### Load Data from AWS and create Train/Val/Test splits

In [7]:
%%time
# pkl_file = "whole_brain_all_norm_2d.pkl" # normalized
# pkl_file_all = 'all_data_dictionary.pkl' #Unnormalized
# bool_mat = False
# data = access_load_data(pkl_file, bool_mat)
# #data_unnorm = access_load_data(pkl_file_all,bool_mat)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.72 µs


In [None]:
#open path dictionary file to get subject ids
# path = "data/data_path_dictionary.pkl"
# data_path_dict = open_pickle(path)
# subject_ids = data_path_dict['subject_ID']

# # Randomly shuffle ids for train test val splits
# random.seed(42)
# random.shuffle(subject_ids)

In [None]:
# mask_data_path = data_path_dict['mask_data'][0]
# mask_type_dict = access_load_data(mask_data_path,True)
# np_array_mask = mask_type_dict['mask']

In [None]:
# indices_mask = np.where(np_array_mask==1)
# indices_mask

(array([ 3,  3,  3, ..., 75, 75, 75]),
 array([36, 36, 36, ..., 47, 47, 47]),
 array([33, 34, 35, ..., 28, 29, 30]))

In [8]:
path = "data/data_path_dictionary.pkl"
data_path_dict = open_pickle(path)
subject_ids = data_path_dict['subject_ID']
subj_paths = data_path_dict['subject_data']
label_data_path = data_path_dict['labels'][0]
label_type = 'rt_labels'
mask_labels_indices, binary_labels = labels_mask_binary(label_data_path, label_type)

mask_data_path = data_path_dict['mask_data'][0]
mask = create_mask(mask_data_path,mask_type='mask')


In [9]:
from utils import *
import numpy as np
from sklearn.preprocessing import StandardScaler
def scale_data_single_subj(sub_data,sub_labels, runs_train,runs_test,norm):
  """
    data         : (1 subject data, keys as subject ID for frmi data or labels)
    runs_train   : tuple , (which run are we using for the training data)
    runs_test    : tuple, (which run are we using for the test data)
    norm         : string, ("RUNS": normalizing separately on each run;
                            "SUBJECT": Normalizing separately by each subject)
    returns      : nd.arrays, Concatenated X data of (time points, x*y*z) x = 79, y = 95, z = 75
                   and Concatenated y labels of (time points,)
    """

    # train and labels
  X = []
  y = []
  ##val and labels
  Xv = []
  yv = []
  # TEST and labels
  Xt = []
  yt = []
  if len(runs_train)>1:
      for run in runs_train:
        X.append(sub_data[run])
        y.append(sub_labels[run])
      
      X = np.concatenate(np.array(X))
      y = np.concatenate(np.array(y))
  else:
      X = sub_data[runs_train[0]-1]
      y = sub_labels[runs_train[0]-1]
  if len(runs_test)> 1:
      Xv = sub_data[runs_test[0]-1]
      yv = sub_labels[runs_test[0]-1]
      Xt = sub_data[runs_test[1]-1]
      yt = sub_labels[runs_test[1]-1]
  else:
      Xt = sub_data[runs_test[0]-1]
      yt = sub_labels[runs_test[0]-1]
  if norm == "RUNS":
      scalar = StandardScaler()
      X = scalar.fit_transform(X)

      scalarT = StandardScaler()
      Xt = scalarT.fit_transform(Xt)
      if len(Xv)>0:
        Xv = scalarT.fit_transform(Xv)
  elif norm == "SUBJECT":
      #print(f"Normalizing Each Subject Data for group {group_sub_ids}")
      scalar = StandardScaler().fit(X)
      X = scalar.transform(X)
      

      Xt = scalar.transform(Xt)
      if len(Xv)>0:
        Xv = scalar.transform(Xv)
  else:
      print('Not doing standardization')      
  return X, y, Xt, yt, Xv, yv

In [24]:
from sklearn.metrics import accuracy_score
def run_single_subject_svm(sub_data,sub_labels,runs_train,runs_test,norm="none",do_cv=False):
  #if norm!="none":
  X_train, y_train, X_test, y_test, X_val, y_val = scale_data_single_subj(sub_data,sub_labels,runs_train,runs_test,norm)
  # else:
  #   X_train = sub_data[1]
  #   y_train = sub_labels[1]
  #   X_val = sub_data[2]
  #   y_val = sub_labels[2]
  #   X_test = sub_data[3]
  #   y_test = sub_labels[3]
  if do_cv:
    c_params = {'C':[0.7, 1, 5, 10],  'random_state':[111,222],'kernel':['linear', 'rbf']}
    svc = SVC()
    clf = GridSearchCV(svc, c_params)
    clf.fit(X_train, y_train)
    if len(X_val)>0:
      y_val_predicts = clf.predict(X_val)
      val_acc = accuracy_score(y_val,y_val_predicts)
    else:
      val_acc = 0
    y_test_predicts = clf.predict(X_test)
    test_acc = accuracy_score(y_test,y_test_predicts)
    return clf,val_acc,test_acc
  else:
    clf = SVC(C=1.0,kernel='linear')
    clf.fit(X_train,y_train)

  return clf,X_train,y_train,X_test,y_test,X_val,y_val

In [11]:
def load_subject_data(idx,dopsc):
  mat_path = subj_paths[idx]
  sub_id = subject_ids[idx]
  data = access_load_data(mat_path,True)
  user_data_dict = {}
  user_data_dict[sub_id], bi_lb = masking_data(data, mask, mask_labels_indices, binary_labels, dopsc)
  user_data_dict[f"{sub_id}_{label_type}"] = bi_lb
  return user_data_dict, sub_id
#mask_labels_indices, binary_labels = labels_mask_binary(label_data_path, label_type)


In [25]:

def run_subject_model(subject_ids, runs_train,runs_test,path,do_psc=True,norm='none'):
  file_name_path = []
  for idx in range(len(subject_ids)):
    model_dict = {}
    user_data_dict, sub_id = load_subject_data(idx,do_psc)
    sub_data = user_data_dict[sub_id]
    sub_labels = user_data_dict[f"{sub_id}_rt_labels"]
    clf,X_train, y_train, X_test, y_test, X_val, y_val = run_single_subject_svm(sub_data,sub_labels,runs_train,runs_test,norm)
    model_dict[sub_id] = {}
    model_dict[sub_id]['model'] = clf
    model_dict[sub_id]['X_train'] = X_train
    model_dict[sub_id]['y_train'] = y_train
    model_dict[sub_id]['X_test'] = X_test
    model_dict[sub_id]['y_test'] = y_test
    if len(X_val)>0:
      model_dict[sub_id]['X_val'] = X_val
      model_dict[sub_id]['y_val'] = y_val
    destination_path = f"{path}/sub_id_one_run_model_psconly{sub_id}.pkl"
    file_name_path.append([sub_id,destination_path])
    f = open(destination_path, "wb")
    pickle.dump(model_dict, f)
    f.close()
  return file_name_path

In [None]:
file_name_pd = pd.DataFrame(file_name_path,columns= ['sub_id','pickle_path'])
file_name_pd.to_csv("/content/drive/My Drive/data/model_one_run_zcoreonly/subject_paths.csv")


In [None]:
file_name_path = run_subject_model(subject_ids,(2,),(3,4),"/content/drive/My Drive/data/model_one_run_psc_only",do_psc=True,norm='none')

In [13]:
c_params = {'C':[0.7, 1, 5, 10],  'random_state':[111,222],'kernel':['linear', 'rbf']}
svc = SVC()
clf = GridSearchCV(svc, c_params)
clf.fit(X_train, y_train)

NameError: ignored

In [26]:
def run_subject_model_cv(subject_ids, runs_train,runs_test,do_psc=True,norm='none'):
  cv_dict = {}
  for idx in range(len(subject_ids)):
    model_dict = {}
    user_data_dict, sub_id = load_subject_data(idx,do_psc)
    sub_data = user_data_dict[sub_id]
    sub_labels = user_data_dict[f"{sub_id}_rt_labels"]
    clf,val_acc,test_acc = run_single_subject_svm(sub_data,sub_labels,runs_train,runs_test,norm,do_cv=True)
    cv_dict[sub_id] = {}
    cv_dict[sub_id]['model'] = clf
    cv_dict[sub_id]['val_acc'] = val_acc
    cv_dict[sub_id]['test_acc'] = test_acc
   
  return cv_dict

In [27]:
cv_results = run_subject_model_cv(subject_ids,(2,),(3,4),do_psc=False,norm='RUNS')

100%|██████████| 4/4 [00:00<00:00,  5.34it/s]
100%|██████████| 4/4 [00:00<00:00,  5.57it/s]
100%|██████████| 4/4 [00:00<00:00,  6.54it/s]
100%|██████████| 4/4 [00:00<00:00,  6.40it/s]
100%|██████████| 4/4 [00:00<00:00,  6.34it/s]
100%|██████████| 4/4 [00:00<00:00,  6.56it/s]
100%|██████████| 4/4 [00:00<00:00,  6.13it/s]
100%|██████████| 4/4 [00:00<00:00,  6.33it/s]
100%|██████████| 4/4 [00:00<00:00,  6.47it/s]
100%|██████████| 4/4 [00:00<00:00,  6.09it/s]
100%|██████████| 4/4 [00:00<00:00,  6.24it/s]
100%|██████████| 4/4 [00:00<00:00,  6.34it/s]
100%|██████████| 4/4 [00:00<00:00,  6.14it/s]
100%|██████████| 4/4 [00:00<00:00,  6.69it/s]
100%|██████████| 4/4 [00:00<00:00,  6.58it/s]
100%|██████████| 4/4 [00:00<00:00,  6.58it/s]
100%|██████████| 4/4 [00:00<00:00,  6.34it/s]
100%|██████████| 4/4 [00:00<00:00,  6.69it/s]
100%|██████████| 4/4 [00:00<00:00,  6.59it/s]
100%|██████████| 4/4 [00:00<00:00,  6.01it/s]
100%|██████████| 4/4 [00:00<00:00,  6.13it/s]
100%|██████████| 4/4 [00:00<00:00,

In [19]:
path = "/content/drive/My Drive/data/cv_results.pkl"
f = open(path, "wb")
pickle.dump(cv_results, f)
f.close()

## Set up SVM Model

#### SKlearn model

In [None]:
final_results = pd.DataFrame()
for sub_id in subject_ids:
  cv_subject_results = cv_results[sub_id]
  clf = cv_subject_results['model']
  sub_df = pd.DataFrame(clf.cv_results_)
  trans_form_df = trans_form_df = sub_df.pivot_table(index='sub_id',columns=['param_C','param_kernel','param_random_state'],values=['mean_test_score'])
  

In [60]:
test_df = cv_results[subject_ids[0]]
sub_model = test_df['model']
sub_df = sub_df[['mean_test_score','param_C','param_kernel','param_random_state']]
trans_form_df = sub_df.pivot_table(index='sub_id',columns=['param_C','param_kernel','param_random_state'],values=['mean_test_score'])

len(tran_form_df)

KeyError: ignored

In [59]:
cv_subject_results = cv_results[sub_id]

NameError: ignored

In [58]:
sub_df['sub_id'] = 
trans_form_df = sub_df.pivot_table(index='sub_id',columns=['param_C','param_kernel','param_random_state'],values=['mean_test_score'])
trans_form_df

Unnamed: 0_level_0,mean_test_score,mean_test_score,mean_test_score,mean_test_score,mean_test_score,mean_test_score,mean_test_score,mean_test_score,mean_test_score,mean_test_score,mean_test_score,mean_test_score,mean_test_score,mean_test_score,mean_test_score,mean_test_score
param_C,0.7,0.7,0.7,0.7,1.0,1.0,1.0,1.0,5.0,5.0,5.0,5.0,10.0,10.0,10.0,10.0
param_kernel,linear,linear,rbf,rbf,linear,linear,rbf,rbf,linear,linear,rbf,rbf,linear,linear,rbf,rbf
param_random_state,111,222,111,222,111,222,111,222,111,222,111,222,111,222,111,222
sub_id,Unnamed: 1_level_4,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4
10004_08693,0.702941,0.702941,0.832353,0.832353,0.702941,0.702941,0.869118,0.869118,0.702941,0.702941,0.845588,0.845588,0.702941,0.702941,0.845588,0.845588


In [55]:
sub_df['val_acc'] = cv_subject_results['val_acc']
sub_df['test_acc'] = cv_subject_results['test_acc']

NameError: ignored