<a href="https://colab.research.google.com/github/yecatstevir/teambrainiac/blob/main/source/SingleSubjectSVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Whole Brain Support Vector Machine Training
- Go to 'Runtime' in Colab browser bar, select 'Change Runtime Type', select 'High-RAM' from 'Runtime Shape'. 
- load local pickle file containing all masked, normalized Whole Brain subject data in numpy matrix format
- SVM training all subjects
- SVM training per subject

### Mount Google Drive and clone repository
- open to source directory

In [56]:
# Mount Google Drive at /content/gdrive.
# NOTE(review): run_subject_model writes its pickles under '/content/drive/My Drive/...',
# which is the mount point used by the following cell, so this /content/gdrive mount
# looks unused in this notebook -- confirm before removing.
from google.colab import drive
drive.mount('/content/gdrive')#, force_remount = True)

Mounted at /content/gdrive


In [57]:

# Mount Google Drive at /content/drive (requesting a forced remount). This is the mount
# point that run_subject_model uses when it saves models to '/content/drive/My Drive/...'.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [58]:
# Clone the entire repo.
!git clone -l -s https://github.com/yecatstevir/teambrainiac.git
# Change directory into cloned repo
%cd teambrainiac/source
!ls


Cloning into 'teambrainiac'...
remote: Enumerating objects: 679, done.[K
remote: Counting objects: 100% (679/679), done.[K
remote: Compressing objects: 100% (495/495), done.[K
remote: Total 679 (delta 415), reused 342 (delta 168), pack-reused 0[K
Receiving objects: 100% (679/679), 68.65 MiB | 30.75 MiB/s, done.
Resolving deltas: 100% (415/415), done.
/content/teambrainiac/source/teambrainiac/source/teambrainiac/source/teambrainiac/source/teambrainiac/source
AccuracyMeasures.ipynb	  models
analysis.py		  process.py
cross_validation.py	  SingleSubjectSVM.ipynb
data			  SubjectVisualization_Models_ZNORM.ipynb
DL			  SVM_Group_Adolescent_Whole_brain.ipynb
Explore_data.ipynb	  SVM_Group_YA_Whole_brain.ipynb
explore.py		  TestMask.ipynb
Group_All_MASK_SVM.ipynb  train.py
helper			  utils.py
Images			  VisualizationPlayground.ipynb
__init__.py		  Visualize_Data.ipynb


### Load path_config.py 
- we are already in source so we can just load this file without changing directory

In [65]:
from google.colab import files

# Open the Colab upload dialog; the result maps each uploaded file name to its content.
uploaded = files.upload()

# Report the name and size of every uploaded file.
for fn, content in uploaded.items():
  print(f'User uploaded file "{fn}" with length {len(content)} bytes')

Saving path_config.py to path_config (1).py
User uploaded file "path_config.py" with length 228 bytes


### Load the Whole brain normalized masked all subject 2d pickle file 
- go to Drive outside of this notebook
  - create a folder named data
  - upload 'whole_brain_all_norm_2d.pkl' - will take 5 hours but should remain on system without need to upload again
  - once uploaded, drag the file to the repo source/data directory

### Import libraries


In [66]:

# Import libraries
!pip install boto3 nilearn nibabel
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import pickle
from utils import *

from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
import random
#import cv2 as cv
#import scipy.io



### Get paths to subject data and grab labels for SVM

In [89]:

## load and open the pickle file that contains paths to all data.
# The path is relative, so this cell must run from the repo's source directory (the clone
# cell changes into it). open_pickle is not defined in this notebook; presumably it is
# provided by the `from utils import *` import -- confirm.
path = "data/data_path_dictionary.pkl"
data_path_dict = open_pickle(path)

In [90]:
def get_subj_information(data_path_dict):
  """
    Pull the subject identifiers and the subject data paths out of the path dictionary.
      data_path_dict : mapping with 'subject_ID' and 'subject_data' entries
      returns        : tuple, (value stored under 'subject_ID', value stored under 'subject_data')
  """
  return data_path_dict['subject_ID'], data_path_dict['subject_data']
def get_labels(data_path_dict):
  """
    Load the label information from the first labels file listed in the path dictionary.
      data_path_dict : mapping whose 'labels' entry is a sequence of label file paths
      returns        : tuple, (mask_labels_indices, binary_labels) as produced by
                       labels_mask_binary for the 'rt_labels' label type
  """
  first_label_path = data_path_dict['labels'][0]
  indices, labels = labels_mask_binary(first_label_path, 'rt_labels')
  return indices, labels

In [91]:
def get_mask(mask_type,data_path_dict,mask_ind):
  """
    Load one mask and flatten it into a one-dimensional mask vector.
      mask_type      : key of the wanted mask inside the loaded mask dictionary
      data_path_dict : mapping whose 'mask_data' entry is a sequence of mask file paths
      mask_ind       : index into data_path_dict['mask_data'] selecting the mask file
      returns        : the selected mask passed through np.ma.make_mask and reshaped to
                       length 79*95*79 in Fortran (column-major) order
  """
  n_voxels = 79 * 95 * 79
  loaded_masks = access_load_data(data_path_dict['mask_data'][mask_ind], True)
  return np.ma.make_mask(loaded_masks[mask_type]).reshape(n_voxels, order='F')

## Set up SVM Model

In [98]:
def load_subject_data(subj_paths,subject_ids,idx,dopsc,mask,mask_labels_indices,binary_labels,label_type='rt_labels'):
  """
    Function to load individual subject data. Calls masking_data (not defined in this
    notebook; expected to come from utils.py).
      subj_paths  : paths to subject data, indexable by idx
      subject_ids : list of subjects, indexable by idx
      idx   : index of current subject
      dopsc : If True, do percent signal change before returning user_data_dict
              (forwarded to masking_data)
      mask  : mask to use on subject
      mask_labels_indices : indices of brain volumes we want to keep (aligning to increase/decrease)
      binary_labels: labels of increase/decrease to append to user dictionary
      label_type   : suffix of the labels key. Defaults to 'rt_labels', the label type used by
                     get_labels and the key suffix read back by run_subject_model. Previously
                     this name was read from an undefined global, which raises NameError on a
                     fresh kernel.
      returns: (user_data_dict, sub_id) where user_data_dict[sub_id] holds the masked data
               returned by masking_data and user_data_dict[f"{sub_id}_{label_type}"] holds
               the labels returned by masking_data
  """
  sub_id = subject_ids[idx]
  data = access_load_data(subj_paths[idx],True)
  masked_runs, bi_lb = masking_data(data, mask, mask_labels_indices, binary_labels, dopsc)
  user_data_dict = {
      sub_id: masked_runs,
      f"{sub_id}_{label_type}": bi_lb,
  }
  return user_data_dict,sub_id

In [75]:
def scale_data_single_subj(sub_data,sub_labels, runs_train,runs_test,norm):
  """
    Split one subject's runs into train / validation / test sets and optionally standardize them.

    Runs are addressed with 1-based run numbers everywhere: run r is read from sub_data[r-1].
    (The multi-run training branch used to index sub_data[r] directly, which selected the
    wrong runs and could index past the last run.)

      sub_data     : per-run data for one subject, indexable by 0-based run position
      sub_labels   : per-run labels aligned with sub_data, indicating which row of the data
                     belongs to the increase/decrease state
      runs_train   : tuple of 1-based run numbers used for training; several runs are
                     concatenated along the first axis
      runs_test    : tuple of 1-based run numbers for held-out data. With two entries the first
                     is the validation run and the second is the test run; with one entry it is
                     the test run and there is no validation set
      norm         : string,
                     "RUNS": the training set, the test run and the validation run are each
                             standardized with their own statistics (several training runs are
                             concatenated first and standardized together);
                     "SUBJECT": a scaler is fitted on the training set and applied to the test
                             and validation runs;
                     anything else: no standardization, a message is printed
      returns      : X, y, Xt, yt, Xv, yv -- train, test and validation data/labels, in that
                     order. Xv and yv are empty lists when there is no validation run.
  """
  # Training data: convert the 1-based run numbers to 0-based positions once.
  train_idx = [run - 1 for run in runs_train]
  if len(train_idx) > 1:
      # Concatenate the list of runs directly; runs need not have equal length.
      X = np.concatenate([sub_data[i] for i in train_idx])
      y = np.concatenate([sub_labels[i] for i in train_idx])
  else:
      X = sub_data[train_idx[0]]
      y = sub_labels[train_idx[0]]

  # Held-out data: validation stays empty unless two held-out runs were given.
  Xv, yv = [], []
  if len(runs_test) > 1:
      Xv = sub_data[runs_test[0]-1]
      yv = sub_labels[runs_test[0]-1]
      Xt = sub_data[runs_test[1]-1]
      yt = sub_labels[runs_test[1]-1]
  else:
      Xt = sub_data[runs_test[0]-1]
      yt = sub_labels[runs_test[0]-1]

  ##run standardization
  if norm == "RUNS":
      # Each set gets its own freshly fitted scaler.
      X = StandardScaler().fit_transform(X)
      Xt = StandardScaler().fit_transform(Xt)
      if len(Xv)>0:
          Xv = StandardScaler().fit_transform(Xv)
  elif norm == "SUBJECT":
      # Training statistics are reused for the held-out runs.
      scalar = StandardScaler().fit(X)
      X = scalar.transform(X)
      Xt = scalar.transform(Xt)
      if len(Xv)>0:
          Xv = scalar.transform(Xv)
  else:
      print('Not doing standardization')
  return X, y, Xt, yt, Xv, yv

In [86]:
def run_single_subject_svm(sub_data,sub_labels,runs_train,runs_test,norm="none",do_cv=False):
  """
    Fit an SVM for a single subject, either as a grid search or as one fixed model.
      sub_data     : per-run data of one subject
      sub_labels   : per-run labels indicating which row of sub_data belongs to the
                     increase/decrease state
      runs_train   : tuple, which runs are used for the training data
      runs_test    : tuple, which runs are used for the held-out data (see scale_data_single_subj)
      norm         : string, forwarded to scale_data_single_subj
                             ("RUNS", "SUBJECT", or anything else for no normalization)
      do_cv        : If True, run a grid search over C and kernel on the training data
      returns      : do_cv=True  -> only the fitted GridSearchCV object
                     do_cv=False -> tuple (model, X_train, y_train, X_test, y_test, X_val, y_val)
                                    with an SVC fitted with C=10
  """
  # Split and (optionally) standardize the subject's runs.
  splits = scale_data_single_subj(sub_data,sub_labels,runs_train,runs_test,norm)
  X_train, y_train, X_test, y_test, X_val, y_val = splits

  if do_cv:
    # Cross-validated search; note that only the search object is returned in this mode.
    param_grid = {'C':[0.7, 1, 5, 10],'kernel':['linear', 'rbf']}
    search = GridSearchCV(SVC(), param_grid)
    search.fit(X_train, y_train)
    return search

  # Single model with a fixed regularization strength.
  model = SVC(C=10)
  model.fit(X_train,y_train)
  return model,X_train,y_train,X_test,y_test,X_val,y_val

In [96]:

def run_subject_model(data_path_dict,runs_train,runs_test,mask_type,mask_ind,do_psc=True,norm='none',
                      output_dir='/content/drive/My Drive/data/model_new_mask'):
  """
    Train one SVM per subject and pickle each subject's model together with its data splits.
      data_path_dict : dictionary of data paths (subject IDs, subject data, labels, mask data)
      runs_train     : tuple, runs used for training (forwarded to run_single_subject_svm)
      runs_test      : tuple, runs used as held-out data (forwarded to run_single_subject_svm)
      mask_type      : key of the mask to use (forwarded to get_mask)
      mask_ind       : index of the mask file to use (forwarded to get_mask)
      do_psc         : forwarded to load_subject_data as its dopsc argument
      norm           : normalization mode forwarded to run_single_subject_svm
      output_dir     : directory that receives one 'one_run_model_znorm_<sub_id>.pkl' per
                       subject; created if missing. The default is the previously
                       hard-coded Drive folder.
      returns        : dict mapping each subject ID to the path of the pickle written for it
                       (this dict used to be returned empty)
  """
  import os  # local import keeps this cell runnable without touching the import cell

  #get subject information
  subject_ids,subj_paths = get_subj_information(data_path_dict)
  mask_labels_indices,binary_labels = get_labels(data_path_dict)
  mask = get_mask(mask_type,data_path_dict,mask_ind)

  # Fail early on an unwritable destination instead of after the first model was trained.
  os.makedirs(output_dir, exist_ok=True)

  single_model_path = {}
  for idx in range(len(subject_ids)):
    user_data_dict, sub_id = load_subject_data(subj_paths,subject_ids,idx,do_psc,mask,mask_labels_indices,binary_labels)
    sub_data = user_data_dict[sub_id]
    sub_labels = user_data_dict[f"{sub_id}_rt_labels"]
    clf,X_train, y_train, X_test, y_test, X_val, y_val = run_single_subject_svm(sub_data,sub_labels,runs_train,runs_test,norm)

    subject_entry = {
        'model': clf,
        'X_train': X_train,
        'y_train': y_train,
        'X_test': X_test,
        'y_test': y_test,
    }
    # Validation data only exists when two held-out runs were requested.
    if len(X_val)>0:
      subject_entry['X_val'] = X_val
      subject_entry['y_val'] = y_val

    destination_path = f'{output_dir}/one_run_model_znorm_{sub_id}.pkl'
    # The context manager closes the file even if pickling fails.
    with open(destination_path,"wb") as filehandler:
      pickle.dump({sub_id: subject_entry},filehandler)
    single_model_path[sub_id] = destination_path
  return single_model_path

In [99]:
# Train one SVM per subject: run 2 is the training run; of the held-out runs (3,4),
# scale_data_single_subj uses the first (run 3) as validation and the second (run 4) as test.
# Uses the 'mask' entry of the first mask file, do_psc=False, and "RUNS" standardization.
file_name_dict = run_subject_model(data_path_dict,(2,),(3,4),'mask',0,do_psc=False,norm='RUNS')


100%|██████████| 4/4 [00:00<00:00,  4.81it/s]
100%|██████████| 4/4 [00:00<00:00,  4.50it/s]
100%|██████████| 4/4 [00:00<00:00,  4.71it/s]
100%|██████████| 4/4 [00:01<00:00,  3.17it/s]
100%|██████████| 4/4 [00:00<00:00,  5.12it/s]
100%|██████████| 4/4 [00:00<00:00,  4.54it/s]
100%|██████████| 4/4 [00:01<00:00,  3.66it/s]
100%|██████████| 4/4 [00:00<00:00,  4.34it/s]
100%|██████████| 4/4 [00:00<00:00,  4.25it/s]
100%|██████████| 4/4 [00:01<00:00,  3.35it/s]
100%|██████████| 4/4 [00:01<00:00,  3.58it/s]
100%|██████████| 4/4 [00:01<00:00,  3.70it/s]
100%|██████████| 4/4 [00:01<00:00,  4.00it/s]
100%|██████████| 4/4 [00:00<00:00,  4.71it/s]
100%|██████████| 4/4 [00:00<00:00,  5.09it/s]
100%|██████████| 4/4 [00:01<00:00,  3.85it/s]
100%|██████████| 4/4 [00:00<00:00,  5.07it/s]
100%|██████████| 4/4 [00:00<00:00,  4.34it/s]
100%|██████████| 4/4 [00:00<00:00,  4.13it/s]
100%|██████████| 4/4 [00:01<00:00,  3.92it/s]
100%|██████████| 4/4 [00:00<00:00,  4.27it/s]
100%|██████████| 4/4 [00:01<00:00,

In [None]:
# Display the dictionary returned by run_subject_model.
file_name_dict

In [None]:
def run_subject_model_cv(subject_ids, runs_train,runs_test,do_psc=True,norm='none',
                         path_dict=None,mask_type='mask',mask_ind=0):
  """
    Run the grid-search SVM for every subject and collect held-out accuracies.
      subject_ids : list of subject IDs; assumed to be index-aligned with
                    path_dict['subject_data'] -- verify against caller
      runs_train  : tuple, runs used for training
      runs_test   : tuple, held-out runs (two entries: validation then test; one entry: test only)
      do_psc      : forwarded to load_subject_data as its dopsc argument
      norm        : normalization mode forwarded to the scaling / SVM helpers
      path_dict   : dictionary of data paths; when None the notebook-level data_path_dict is used
      mask_type   : key of the mask to use (forwarded to get_mask)
      mask_ind    : index of the mask file to use (forwarded to get_mask)
      returns     : dict mapping subject ID -> {'model': fitted GridSearchCV,
                                                'val_acc': accuracy on the validation run, or
                                                           None when there is no validation run,
                                                'test_acc': accuracy on the test run}
  """
  if path_dict is None:
    path_dict = data_path_dict  # notebook-level dictionary loaded from the path pickle

  # load_subject_data needs the data paths, the mask and the label information; the
  # previous call passed only (idx, do_psc) and raised TypeError.
  _, subj_paths = get_subj_information(path_dict)
  mask_labels_indices,binary_labels = get_labels(path_dict)
  mask = get_mask(mask_type,path_dict,mask_ind)

  cv_dict = {}
  for idx in range(len(subject_ids)):
    user_data_dict, sub_id = load_subject_data(subj_paths,subject_ids,idx,do_psc,mask,mask_labels_indices,binary_labels)
    sub_data = user_data_dict[sub_id]
    sub_labels = user_data_dict[f"{sub_id}_rt_labels"]

    # With do_cv=True run_single_subject_svm returns only the fitted search object.
    clf = run_single_subject_svm(sub_data,sub_labels,runs_train,runs_test,norm,do_cv=True)

    # Rebuild the same held-out splits to score the search's refitted best estimator.
    _, _, X_test, y_test, X_val, y_val = scale_data_single_subj(sub_data,sub_labels,runs_train,runs_test,norm)
    cv_dict[sub_id] = {
        'model': clf,
        'val_acc': clf.score(X_val, y_val) if len(X_val)>0 else None,
        'test_acc': clf.score(X_test, y_test),
    }

  return cv_dict

In [None]:
# subject_ids is not defined by any notebook-level cell above, so derive it from the
# path dictionary before starting the per-subject grid search.
subject_ids = get_subj_information(data_path_dict)[0]
cv_results = run_subject_model_cv(subject_ids,(2,),(3,4),do_psc=False,norm='RUNS')