<a href="https://colab.research.google.com/github/yecatstevir/teambrainiac/blob/main/source/SingleSubjectSVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Whole Brain Support Vector Machine Training
- Go to 'Runtime' in Colab browser bar, select 'Change Runtime Type', select 'High-RAM' from 'Runtime Shape'. 
- load local pickle file containing all masked, normalized Whole Brain subject data in numpy matrix format
- SVM training all subjects
- SVM training per subject

### Mount Google Drive and clone repository
- open to source directory

In [1]:
# Mount the user's Google Drive inside the Colab VM at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')  # add force_remount=True to re-mount an existing mount

Mounted at /content/gdrive


In [2]:

# NOTE(review): Drive is mounted a second time here, at /content/drive, in
# addition to the /content/gdrive mount in the previous cell. No code visible in
# this notebook reads from either mount point, so one of the two is probably
# redundant - confirm before removing.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
# Clone the entire repo into the current working directory.
# NOTE(review): -l/-s are git's local-clone options (--local/--shared);
# presumably they have no effect for an https remote - confirm.
!git clone -l -s https://github.com/yecatstevir/teambrainiac.git
# Change directory into the cloned repo's source folder; later cells use paths
# relative to it (e.g. "data/data_path_dictionary.pkl").
%cd teambrainiac/source
# List the folder contents as a quick check that the clone succeeded.
!ls


Cloning into 'teambrainiac'...
remote: Enumerating objects: 630, done.[K
remote: Counting objects: 100% (630/630), done.[K
remote: Compressing objects: 100% (446/446), done.[K
remote: Total 630 (delta 382), reused 343 (delta 168), pack-reused 0[K
Receiving objects: 100% (630/630), 68.38 MiB | 35.78 MiB/s, done.
Resolving deltas: 100% (382/382), done.
/content/teambrainiac/source
AccuracyMeasures.ipynb	  models
analysis.py		  process.py
cross_validation.py	  SingleSubjectSVM.ipynb
data			  SubjectVisualization_Models_ZNORM.ipynb
DL			  SVM_Group_Adolescent_Whole_brain.ipynb
Explore_data.ipynb	  SVM_Group_YA_Whole_brain.ipynb
explore.py		  train.py
Group_All_MASK_SVM.ipynb  utils.py
helper			  VisualizationPlayground.ipynb
Images			  Visualize_Data.ipynb
__init__.py


### Load path_config.py 
- We are already in the `source` directory, so this file can be uploaded without changing directory.

In [4]:
from google.colab import files

# Open Colab's upload dialog; the result is keyed by uploaded file name.
uploaded = files.upload()

# Report the name and size of every file that was uploaded.
message = 'User uploaded file "{name}" with length {length} bytes'
for fn in uploaded:
  print(message.format(name=fn, length=len(uploaded[fn])))

Saving path_config.py to path_config.py
User uploaded file "path_config.py" with length 228 bytes


### Load the Whole brain normalized masked all subject 2d pickle file 
- go to Drive outside of this notebook
  - create a folder named data
  - upload 'whole_brain_all_norm_2d.pkl' — the upload takes about 5 hours, but the file then stays in place and does not need to be uploaded again
  - once uploaded, drag the file to the repo source/data directory

In [None]:
# List the working directory (presumably to confirm that path_config.py and the
# data folder are in place before continuing).
!ls

 Access_Load_Data.ipynb		   'path_config (2).py'
 All_subject_masked_labeled.ipynb  'path_config (3).py'
 cross_validation.py		    path_config.py
 data				    percent_signal_change.ipynb
 Explore_data.ipynb		    process.py
 explore.py			    __pycache__
 Images				    SVM_Group_Child_Whole_Brain.ipynb
 __init__.py			    SVM_Group_YA_Whole_brain.ipynb
 Masking.ipynb			    teambrainiac
 Mat_to_Numpy.ipynb		    utils.py
 models				    Visualize_Data.ipynb
'path_config (1).py'


### Import libraries


In [5]:

# Import libraries
!pip install boto3 nilearn nibabel
from sklearn.model_selection import GridSearchCV
import pickle
from utils import data_to_nib, load_mat, open_pickle, access_load_data, save_data, create_mask, labels_mask_binary, masking_data, masked_data_n_labels
from sklearn.svm import SVC
import numpy as np
import pandas as pd
import random
import cv2 as cv

Collecting boto3
  Downloading boto3-1.21.35-py3-none-any.whl (132 kB)
[?25l[K     |██▌                             | 10 kB 34.8 MB/s eta 0:00:01[K     |█████                           | 20 kB 42.8 MB/s eta 0:00:01[K     |███████▍                        | 30 kB 41.0 MB/s eta 0:00:01[K     |██████████                      | 40 kB 25.7 MB/s eta 0:00:01[K     |████████████▍                   | 51 kB 21.0 MB/s eta 0:00:01[K     |██████████████▉                 | 61 kB 24.3 MB/s eta 0:00:01[K     |█████████████████▎              | 71 kB 24.1 MB/s eta 0:00:01[K     |███████████████████▉            | 81 kB 26.1 MB/s eta 0:00:01[K     |██████████████████████▎         | 92 kB 28.0 MB/s eta 0:00:01[K     |████████████████████████▊       | 102 kB 27.8 MB/s eta 0:00:01[K     |███████████████████████████▏    | 112 kB 27.8 MB/s eta 0:00:01[K     |█████████████████████████████▊  | 122 kB 27.8 MB/s eta 0:00:01[K     |████████████████████████████████| 132 kB 27.8 MB/s 
[?25h

### Load Data from AWS and create Train/Val/Test splits

In [None]:
%%time
# pkl_file = "whole_brain_all_norm_2d.pkl" # normalized
# pkl_file_all = 'all_data_dictionary.pkl' #Unnormalized
# bool_mat = False
# data = access_load_data(pkl_file, bool_mat)
# #data_unnorm = access_load_data(pkl_file_all,bool_mat)

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 5.96 µs


In [None]:
#open path dictionary file to get subject ids
# path = "data/data_path_dictionary.pkl"
# data_path_dict = open_pickle(path)
# subject_ids = data_path_dict['subject_ID']

# # Randomly shuffle ids for train test val splits
# random.seed(42)
# random.shuffle(subject_ids)

In [None]:
# mask_data_path = data_path_dict['mask_data'][0]
# mask_type_dict = access_load_data(mask_data_path,True)
# np_array_mask = mask_type_dict['mask']

In [None]:
# indices_mask = np.where(np_array_mask==1)
# indices_mask

(array([ 3,  3,  3, ..., 75, 75, 75]),
 array([36, 36, 36, ..., 47, 47, 47]),
 array([33, 34, 35, ..., 28, 29, 30]))

In [6]:
# Load the dictionary of data locations stored in the repo's data folder.
path = "data/data_path_dictionary.pkl"
data_path_dict = open_pickle(path)
# Subject identifiers and per-subject data paths. load_subject_data indexes both
# with the same position, so the two sequences are expected to be aligned.
subject_ids = data_path_dict['subject_ID']
subj_paths = data_path_dict['subject_data']
# The first entry under 'labels' is the label file; label_type selects which
# label set inside it is used.
label_data_path = data_path_dict['labels'][0]
label_type = 'rt_labels'
# Presumably the indices of the labelled time points plus their binary class
# labels - TODO confirm against utils.labels_mask_binary.
mask_labels_indices, binary_labels = labels_mask_binary(label_data_path, label_type)

# Build the mask from the first entry under 'mask_data'; it is passed to
# masking_data for every subject.
mask_data_path = data_path_dict['mask_data'][0]
mask = create_mask(mask_data_path,mask_type='mask')


## Set up SVM Model

In [7]:
from utils import *
import numpy as np
from sklearn.preprocessing import StandardScaler
def scale_data_single_subj(sub_data,sub_labels, runs_train,runs_test,norm):
  """
  Assemble the train / validation / test arrays of one subject and optionally
  z-score them.

  Run numbers are 1-based: run ``r`` is read from ``sub_data[r - 1]`` and
  ``sub_labels[r - 1]`` on every code path.

  sub_data     : sequence indexed by run; each entry holds the data of that run
                 (presumably a 2-D (time points, features) array - TODO confirm)
  sub_labels   : sequence indexed by run; each entry holds the labels of that run
  runs_train   : tuple of run numbers used for training; several runs are
                 concatenated along the first axis in the order given
  runs_test    : tuple of held-out run numbers. With two or more entries the
                 first is the validation run and the second the test run
                 (further entries are ignored); with a single entry it is the
                 test run and no validation set is produced
  norm         : string, "RUNS"    : z-score every run on its own statistics;
                         "SUBJECT" : fit one scaler on the training data and
                                     apply it to the validation / test runs;
                         anything else: no standardization
  returns      : X, y, Xt, yt, Xv, yv - train, test and validation data/labels,
                 in that order. Xv and yv are empty lists when there is no
                 validation run.
  raises       : ValueError if runs_train or runs_test is empty
  """
  if len(runs_train) == 0 or len(runs_test) == 0:
    raise ValueError("runs_train and runs_test must each name at least one run")

  # Run numbers are 1-based, positions in sub_data / sub_labels are 0-based.
  train_idx = [run - 1 for run in runs_train]
  train_runs = [sub_data[i] for i in train_idx]
  train_labels = [sub_labels[i] for i in train_idx]

  # Validation stays an empty list unless two held-out runs were requested.
  Xv, yv = [], []
  if len(runs_test) > 1:
    Xv = sub_data[runs_test[0] - 1]
    yv = sub_labels[runs_test[0] - 1]
    Xt = sub_data[runs_test[1] - 1]
    yt = sub_labels[runs_test[1] - 1]
  else:
    Xt = sub_data[runs_test[0] - 1]
    yt = sub_labels[runs_test[0] - 1]

  if norm == "RUNS":
    # Every run gets its own scaler, including each training run, so the
    # training runs are standardized before they are concatenated.
    train_runs = [StandardScaler().fit_transform(run_data) for run_data in train_runs]
    Xt = StandardScaler().fit_transform(Xt)
    if len(Xv) > 0:
      Xv = StandardScaler().fit_transform(Xv)

  if len(train_runs) == 1:
    # A single run is passed through as-is to avoid copying a large array.
    X, y = train_runs[0], train_labels[0]
  else:
    # Concatenating the list directly also works when runs differ in length.
    X = np.concatenate(train_runs)
    y = np.concatenate(train_labels)

  if norm == "SUBJECT":
    # One scaler fitted on the training data only, reused for held-out runs.
    scaler = StandardScaler().fit(X)
    X = scaler.transform(X)
    Xt = scaler.transform(Xt)
    if len(Xv) > 0:
      Xv = scaler.transform(Xv)
  elif norm != "RUNS":
    print('Not doing standardization')
  return X, y, Xt, yt, Xv, yv

In [11]:
from sklearn.metrics import accuracy_score
def run_single_subject_svm(sub_data,sub_labels,runs_train,runs_test,norm="none",do_cv=False):
  """
  Train a support vector classifier on one subject's data.

  The data are first split and (optionally) standardized by
  scale_data_single_subj using runs_train, runs_test and norm.

  sub_data, sub_labels : per-run data and labels of one subject
  runs_train, runs_test: run numbers, forwarded to scale_data_single_subj
  norm                 : normalization mode, forwarded to scale_data_single_subj
  do_cv                : False -> fit a single SVC(C=10);
                         True  -> grid-search C and kernel with GridSearchCV

  returns
    do_cv=False : (clf, X_train, y_train, X_test, y_test, X_val, y_val)
    do_cv=True  : (clf, val_acc, test_acc); val_acc is 0 when there is no
                  validation run, so a 0 here does not necessarily mean the
                  model scored zero.
  NOTE: the two modes return tuples of different length; callers must unpack
  according to the do_cv value they passed.
  """
  X_train, y_train, X_test, y_test, X_val, y_val = scale_data_single_subj(sub_data,sub_labels,runs_train,runs_test,norm)

  if not do_cv:
    clf = SVC(C=10)
    clf.fit(X_train,y_train)
    return clf,X_train,y_train,X_test,y_test,X_val,y_val

  # random_state is not searched: SVC ignores it unless probability=True, so
  # including it only doubled the number of fits without changing any score.
  c_params = {'C':[0.7, 1, 5, 10], 'kernel':['linear', 'rbf']}
  clf = GridSearchCV(SVC(), c_params)
  clf.fit(X_train, y_train)

  if len(X_val)>0:
    val_acc = accuracy_score(y_val, clf.predict(X_val))
  else:
    val_acc = 0
  test_acc = accuracy_score(y_test, clf.predict(X_test))
  return clf,val_acc,test_acc

In [12]:
def load_subject_data(idx,dopsc):
  """
  Load and mask the data of the subject at position idx.

  Reads the module-level subj_paths, subject_ids, mask, mask_labels_indices,
  binary_labels and label_type.

  idx     : position of the subject in subj_paths / subject_ids
  dopsc   : forwarded unchanged to masking_data (presumably toggles a
            percent-signal-change step - TODO confirm in utils.masking_data)
  returns : (user_data_dict, sub_id) where user_data_dict maps sub_id to the
            first value returned by masking_data and "<sub_id>_<label_type>"
            to the second
  """
  mat_path = subj_paths[idx]
  sub_id = subject_ids[idx]
  raw = access_load_data(mat_path,True)
  masked, bi_lb = masking_data(raw, mask, mask_labels_indices, binary_labels, dopsc)
  user_data_dict = {
      sub_id: masked,
      f"{sub_id}_{label_type}": bi_lb,
  }
  return user_data_dict, sub_id


In [23]:

def run_subject_model(subject_ids, runs_train,runs_test,do_psc=True,norm='none'):
  """
  Train one SVM per subject and upload each result as its own pickle.

  subject_ids : sequence of subjects to process.
                NOTE(review): only its length is used here - subjects are
                loaded by position through load_subject_data, which reads the
                module-level subject lists. Passing a subset or a re-ordered
                list would therefore not select those subjects; confirm
                before relying on it.
  runs_train  : run numbers used for training
  runs_test   : run numbers held out (validation / test)
  do_psc      : forwarded to load_subject_data
  norm        : normalization mode, forwarded to run_single_subject_svm

  returns     : (single_model_path, model_dict)
                single_model_path maps every subject ID to the object name its
                pickle was uploaded under; model_dict holds the model and data
                of the LAST subject only (it is rebuilt per subject so each
                uploaded pickle contains exactly one subject). Both are empty
                dicts when subject_ids is empty.
  """
  single_model_path = {}
  # Bound before the loop so an empty subject list returns ({}, {}) instead of
  # failing on an unbound name at the return statement.
  model_dict = {}
  for idx in range(len(subject_ids)):
    user_data_dict, sub_id = load_subject_data(idx,do_psc)
    sub_data = user_data_dict[sub_id]
    sub_labels = user_data_dict[f"{sub_id}_rt_labels"]
    clf,X_train, y_train, X_test, y_test, X_val, y_val = run_single_subject_svm(sub_data,sub_labels,runs_train,runs_test,norm)

    model_dict = {
        sub_id: {
            'model': clf,
            'X_train': X_train,
            'y_train': y_train,
            'X_test': X_test,
            'y_test': y_test,
        }
    }
    # Validation arrays are stored only when a validation run was requested.
    if len(X_val)>0:
      model_dict[sub_id]['X_val'] = X_val
      model_dict[sub_id]['y_val'] = y_val

    object_name = f"models/single/one_run_model_znorm_{sub_id}.pkl"
    single_model_path[sub_id] = object_name
    s3_upload(model_dict,object_name,"pickle")
  return single_model_path, model_dict

In [22]:
# Train one SVM per subject: run 2 is the training run; of runs (3, 4) the first
# is used for validation and the second for testing. norm='RUNS' z-scores each
# run separately; do_psc=False is passed through to masking_data.
file_name_dict,model_dict = run_subject_model(subject_ids,(2,),(3,4),do_psc=False,norm='RUNS')


100%|██████████| 4/4 [00:00<00:00,  6.11it/s]


upload complete for models/single/one_run_model_znorm_10004_08693.pkl


100%|██████████| 4/4 [00:00<00:00,  6.67it/s]


upload complete for models/single/one_run_model_znorm_10008_09924.pkl


100%|██████████| 4/4 [00:00<00:00,  6.60it/s]


upload complete for models/single/one_run_model_znorm_10009_08848.pkl


100%|██████████| 4/4 [00:00<00:00,  7.12it/s]


upload complete for models/single/one_run_model_znorm_10016_09694.pkl


100%|██████████| 4/4 [00:00<00:00,  7.04it/s]


upload complete for models/single/one_run_model_znorm_10017_08894.pkl


100%|██████████| 4/4 [00:00<00:00,  6.73it/s]


upload complete for models/single/one_run_model_znorm_10018_08907.pkl


100%|██████████| 4/4 [00:00<00:00,  5.19it/s]


upload complete for models/single/one_run_model_znorm_10021_08839.pkl


100%|██████████| 4/4 [00:00<00:00,  5.60it/s]


upload complete for models/single/one_run_model_znorm_10022_08854.pkl


100%|██████████| 4/4 [00:00<00:00,  5.20it/s]


upload complete for models/single/one_run_model_znorm_10023_09126.pkl


100%|██████████| 4/4 [00:00<00:00,  6.72it/s]


upload complete for models/single/one_run_model_znorm_10027_09455.pkl


100%|██████████| 4/4 [00:00<00:00,  6.57it/s]


upload complete for models/single/one_run_model_znorm_10033_08871.pkl


100%|██████████| 4/4 [00:00<00:00,  6.29it/s]


upload complete for models/single/one_run_model_znorm_10034_08879.pkl


100%|██████████| 4/4 [00:00<00:00,  5.46it/s]


upload complete for models/single/one_run_model_znorm_10035_08847.pkl


100%|██████████| 4/4 [00:00<00:00,  6.17it/s]


upload complete for models/single/one_run_model_znorm_10036_09800.pkl


100%|██████████| 4/4 [00:00<00:00,  6.94it/s]


upload complete for models/single/one_run_model_znorm_10037_09903.pkl


100%|██████████| 4/4 [00:00<00:00,  5.89it/s]


upload complete for models/single/one_run_model_znorm_10038_09063.pkl


100%|██████████| 4/4 [00:00<00:00,  4.92it/s]


upload complete for models/single/one_run_model_znorm_10039_08941.pkl


100%|██████████| 4/4 [00:00<00:00,  6.39it/s]


upload complete for models/single/one_run_model_znorm_10042_08990.pkl


100%|██████████| 4/4 [00:00<00:00,  5.53it/s]


upload complete for models/single/one_run_model_znorm_10043_09222.pkl


100%|██████████| 4/4 [00:00<00:00,  6.83it/s]


upload complete for models/single/one_run_model_znorm_10045_08968.pkl


100%|██████████| 4/4 [00:00<00:00,  6.69it/s]


upload complete for models/single/one_run_model_znorm_10046_09216.pkl


100%|██████████| 4/4 [00:00<00:00,  6.64it/s]


upload complete for models/single/one_run_model_znorm_10047_09030.pkl


100%|██████████| 4/4 [00:00<00:00,  7.02it/s]


upload complete for models/single/one_run_model_znorm_10050_09079.pkl


100%|██████████| 4/4 [00:00<00:00,  5.33it/s]


upload complete for models/single/one_run_model_znorm_10053_09018.pkl


100%|██████████| 4/4 [00:00<00:00,  6.09it/s]


upload complete for models/single/one_run_model_znorm_10056_09615.pkl


100%|██████████| 4/4 [00:00<00:00,  6.08it/s]


upload complete for models/single/one_run_model_znorm_10057_10124.pkl


100%|██████████| 4/4 [00:00<00:00,  5.81it/s]


upload complete for models/single/one_run_model_znorm_10060_09359.pkl


100%|██████████| 4/4 [00:00<00:00,  5.97it/s]


upload complete for models/single/one_run_model_znorm_10061_09308.pkl


100%|██████████| 4/4 [00:00<00:00,  7.08it/s]


upload complete for models/single/one_run_model_znorm_10065_09587.pkl


100%|██████████| 4/4 [00:00<00:00,  6.47it/s]


upload complete for models/single/one_run_model_znorm_10066_09687.pkl


100%|██████████| 4/4 [00:00<00:00,  5.07it/s]


upload complete for models/single/one_run_model_znorm_10069_09785.pkl


100%|██████████| 4/4 [00:00<00:00,  6.17it/s]


upload complete for models/single/one_run_model_znorm_10080_09931.pkl


100%|██████████| 4/4 [00:00<00:00,  5.50it/s]


upload complete for models/single/one_run_model_znorm_10084_10188.pkl


100%|██████████| 4/4 [00:00<00:00,  6.99it/s]


upload complete for models/single/one_run_model_znorm_30004_08965.pkl


100%|██████████| 4/4 [00:00<00:00,  7.51it/s]


upload complete for models/single/one_run_model_znorm_30008_08981.pkl


100%|██████████| 4/4 [00:00<00:00,  5.11it/s]


upload complete for models/single/one_run_model_znorm_30009_09227.pkl


100%|██████████| 4/4 [00:00<00:00,  5.92it/s]


upload complete for models/single/one_run_model_znorm_30011_09170.pkl


100%|██████████| 4/4 [00:00<00:00,  6.27it/s]


upload complete for models/single/one_run_model_znorm_30012_09102.pkl


100%|██████████| 4/4 [00:00<00:00,  5.96it/s]


upload complete for models/single/one_run_model_znorm_30014_09352.pkl


100%|██████████| 4/4 [00:00<00:00,  7.57it/s]


upload complete for models/single/one_run_model_znorm_30017_09567.pkl


100%|██████████| 4/4 [00:00<00:00,  5.25it/s]


upload complete for models/single/one_run_model_znorm_30020_09236.pkl


100%|██████████| 4/4 [00:00<00:00,  5.09it/s]


upload complete for models/single/one_run_model_znorm_30024_09398.pkl


100%|██████████| 4/4 [00:00<00:00,  5.36it/s]


upload complete for models/single/one_run_model_znorm_30025_09402.pkl


100%|██████████| 4/4 [00:00<00:00,  6.43it/s]


upload complete for models/single/one_run_model_znorm_30026_09430.pkl


100%|██████████| 4/4 [00:00<00:00,  4.82it/s]


upload complete for models/single/one_run_model_znorm_30027_09638.pkl


100%|██████████| 4/4 [00:00<00:00,  7.01it/s]


upload complete for models/single/one_run_model_znorm_30033_09776.pkl


100%|██████████| 4/4 [00:00<00:00,  5.54it/s]


upload complete for models/single/one_run_model_znorm_30035_09836.pkl


100%|██████████| 4/4 [00:00<00:00,  6.64it/s]


upload complete for models/single/one_run_model_znorm_30036_09758.pkl


100%|██████████| 4/4 [00:00<00:00,  6.84it/s]


upload complete for models/single/one_run_model_znorm_30038_09967.pkl


100%|██████████| 4/4 [00:00<00:00,  6.05it/s]


upload complete for models/single/one_run_model_znorm_30044_10095.pkl


100%|██████████| 4/4 [00:00<00:00,  4.69it/s]


upload complete for models/single/one_run_model_znorm_30045_10182.pkl


100%|██████████| 4/4 [00:00<00:00,  5.58it/s]


upload complete for models/single/one_run_model_znorm_30053_10112.pkl


TypeError: ignored

In [21]:
# Show the subject ID -> uploaded object name mapping returned by run_subject_model.
file_name_dict

In [None]:
def run_subject_model_cv(subject_ids, runs_train,runs_test,do_psc=True,norm='none'):
  """
  Grid-search an SVM for every subject and collect the accuracies.

  subject_ids : sequence of subjects; subjects are loaded by position through
                load_subject_data
  runs_train  : run numbers used for training
  runs_test   : run numbers held out (validation / test)
  do_psc      : forwarded to load_subject_data
  norm        : normalization mode, forwarded to run_single_subject_svm

  returns     : dict mapping each subject ID to
                {'model': fitted search object, 'val_acc': ..., 'test_acc': ...}
  """
  cv_dict = {}
  for idx, _ in enumerate(subject_ids):
    user_data_dict, sub_id = load_subject_data(idx,do_psc)
    fitted, val_acc, test_acc = run_single_subject_svm(
        user_data_dict[sub_id],
        user_data_dict[f"{sub_id}_rt_labels"],
        runs_train,
        runs_test,
        norm,
        do_cv=True,
    )
    cv_dict[sub_id] = {'model': fitted, 'val_acc': val_acc, 'test_acc': test_acc}
  return cv_dict

In [None]:
# Grid-search variant of the per-subject training: train on run 2, validate on
# run 3, test on run 4, with each run z-scored separately (norm='RUNS').
cv_results = run_subject_model_cv(subject_ids,(2,),(3,4),do_psc=False,norm='RUNS')