<a href="https://colab.research.google.com/github/yecatstevir/teambrainiac/blob/main/source/DataExploration_SingleSubj.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Whole Brain Support Vector Machine Training
- Go to 'Runtime' in Colab browser bar, select 'Change Runtime Type', select 'High-RAM' from 'Runtime Shape'. 
- load local pickle file containing all masked, normalized Whole Brain subject data in numpy matrix format
- SVM training per subject

### Mount Google Drive and clone repository
- After cloning, change into the repository's `source` directory

In [1]:
# Mount Google Drive at /content/gdrive.
# NOTE(review): the save path used later in this notebook lives under /content/drive
# (mounted by the following cell), so this mount point may be redundant — confirm.
from google.colab import drive
drive.mount('/content/gdrive')#, force_remount = True)

Mounted at /content/gdrive


In [2]:

# Mount Google Drive at /content/drive; results are later written below
# '/content/drive/My Drive/'. force_remount=True requests a fresh mount.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
# Clone the entire repo.
# NOTE(review): -l/-s are git's local-clone options and look unnecessary for an https URL — confirm.
!git clone -l -s https://github.com/yecatstevir/teambrainiac.git
# Change directory into cloned repo
%cd teambrainiac/source
# List the directory contents to check the clone.
!ls


Cloning into 'teambrainiac'...
remote: Enumerating objects: 757, done.[K
remote: Counting objects: 100% (757/757), done.[K
remote: Compressing objects: 100% (554/554), done.[K
remote: Total 757 (delta 470), reused 380 (delta 187), pack-reused 0[K
Receiving objects: 100% (757/757), 72.37 MiB | 30.89 MiB/s, done.
Resolving deltas: 100% (470/470), done.
/content/teambrainiac/source
access_data.py		  process.py
AccuracyMeasures.ipynb	  SingleSubjectSVM.ipynb
analysis.py		  SubjectVisualization_Models_ZNORM.ipynb
cross_validation.py	  SVM_Group_Adolescent_Whole_brain.ipynb
data			  SVM_Group_YA_Whole_brain.ipynb
DL			  TestMask.ipynb
Explore_data.ipynb	  train.py
Group_All_MASK_SVM.ipynb  utils.py
helper			  VisualizationPlayground.ipynb
Images			  Visualize_Data.ipynb
__init__.py		  visualize.py
models


### Load path_config.py 
- We are already in the `source` directory, so `path_config.py` can be uploaded here without changing directory

In [4]:
from google.colab import files

# Open the Colab upload dialog; the result maps each file name to its uploaded content.
uploaded = files.upload()

# Report every uploaded file together with its size in bytes.
for fn, payload in uploaded.items():
  print('User uploaded file "{name}" with length {length} bytes'.format(name=fn, length=len(payload)))

Saving path_config.py to path_config.py
User uploaded file "path_config.py" with length 228 bytes


### Import libraries


In [36]:

# Import libraries
!pip install boto3 nilearn nibabel #for saving data
import pickle
#sklearn packages needed
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, auc, recall_score, precision_score,roc_curve,f1_score
#important utility functions for loading,masking,saving data
from utils import *
#normal python packages we use
import numpy as np
import pandas as pd




### Get paths to subject data and grab labels for SVM

In [8]:

## load and open the pickle file that contains paths to all data.
# open_pickle is presumably provided by the star-import of utils; the resulting dictionary is
# passed to the helper functions defined in the following cells.
path = "data/data_path_dictionary.pkl"
data_path_dict = open_pickle(path)

### Functions to get information about the data needed to run our SVM

In [9]:
def get_subj_information(data_path_dict):
  """
    Look up the subject identifiers and their raw-data paths.
    data_path_dict  : dictionary holding the 'subject_ID' and 'subject_data' entries
    returns: tuple (subject_ids, subj_paths) read from those two entries, in that order
  """
  return data_path_dict['subject_ID'], data_path_dict['subject_data']

def get_labels(data_path_dict):
  """
    Fetch the label information used to build the SVM targets.
    data_path_dict  : dictionary whose 'labels' entry lists label file paths; only the first is used
    returns: tuple (mask_labels_indices, binary_labels, label_type). The first two are whatever
             labels_mask_binary returns for that path; label_type is always 'rt_labels'.
  """
  label_type = 'rt_labels' #the label set requested from labels_mask_binary
  indices, labels = labels_mask_binary(data_path_dict['labels'][0], label_type)
  return indices, labels, label_type

def get_mask(mask_type,data_path_dict,mask_ind):
  """
    Function to return the mask of what brain voxels we want to include in analysis
    mask_type: name of mask we want to use (a key of the loaded mask dictionary, e.g. 'mask')
    data_path_dict  : dictionary containing paths to all data stored on AWS
    mask_ind: index of where the path to the masks are 0: full brain mask plus masks that subtract region
              1: Regions of interest(ROIs) mask out full brain except structure we care about
    returns: 1-D boolean numpy array with one entry per voxel of the mask volume,
             flattened in column-major (Fortran) order
  """
  mask_data_filepath = data_path_dict['mask_data'][mask_ind] #path to masked data
  mask_type_dict = access_load_data(mask_data_filepath, True) #get the mask data dictionary
  np_array_mask = mask_type_dict[mask_type] #get the mask array
  # order='F' means Fortran (column-major) flattening, NOT a Fourier transform.
  # NOTE(review): presumably this matches how the subject volumes are flattened in masking_data — confirm.
  # reshape(-1) flattens a volume of any size (79*95*79 for the current data).
  # shrink=False keeps a full-size array even when no voxel is selected.
  mask = np.ma.make_mask(np_array_mask, shrink=False).reshape(-1, order='F')

  return mask

In [10]:
# Load the first 'mask_data' entry and display its keys to see which masks are available.
mask_data_filepath = data_path_dict['mask_data'][0] #path to masked data
mask_type_dict = access_load_data(mask_data_filepath, True)
mask_type_dict.keys()

dict_keys(['__header__', '__version__', '__globals__', 'mask', 'masksubACC', 'masksubAI', 'masksubNAcc', 'masksubmPFC'])

## Set up SVM Model

In [11]:
def load_subject_data(subj_paths,subject_ids,idx,dopsc,mask,mask_labels_indices,binary_labels,label_type):
  """
    Load and mask the data of one subject. The masking itself is delegated to masking_data.
      subj_paths  : paths to subject data
      subject_ids : list of subjects
      idx   : position of the wanted subject in subj_paths / subject_ids
      dopsc : forwarded to masking_data (True requests percent signal change)
      mask  : voxel mask forwarded to masking_data
      mask_labels_indices : indices of the brain volumes to keep, forwarded to masking_data
      binary_labels: increase/decrease labels, forwarded to masking_data
      label_type: label name used to build the key under which the labels are stored
      returns: (user_data_dict, sub_id) where user_data_dict[sub_id] holds the masked data and
               user_data_dict[f"{sub_id}_{label_type}"] holds the labels returned by masking_data
  """
  mat_path, sub_id = subj_paths[idx], subject_ids[idx]
  raw = access_load_data(mat_path, True) #second argument True because the subject files are .mat
  masked_runs, run_labels = masking_data(raw, mask, mask_labels_indices, binary_labels, dopsc)
  user_data_dict = {
      sub_id: masked_runs,
      f"{sub_id}_{label_type}": run_labels,
  }
  return user_data_dict, sub_id

In [12]:
def scale_data_single_subj(sub_data,sub_labels, runs_train,runs_test,norm):
  """
    Function to split one subject's runs into train/validation/test sets and optionally standardize them.
      sub_data     : per-run data of one subject, indexed 0..n_runs-1
      sub_labels   : per-run labels (increase/decrease state), indexed like sub_data
      runs_train   : tuple of 1-based run numbers used for training; several runs are concatenated
      runs_test    : tuple of 1-based run numbers. With two entries the first run becomes the
                     validation set and the second the test set; with one entry there is no
                     validation set and X_val, y_val are returned as empty lists.
      norm         : string, ("RUNS": every run is standardized on its own;
                              "SUBJECT": a scaler is fitted on the training data and applied to
                                         the validation and test data;
                              anything else: no standardization)
      returns      : X, y, Xt, yt, Xv, yv -> training data/labels, test data/labels,
                     validation data/labels. Data arrays are (time points, features).
  """
  per_run = norm == "RUNS"

  # Run numbers are 1-based while sub_data/sub_labels are 0-based, for every branch alike.
  train_blocks = [sub_data[run-1] for run in runs_train]
  train_labels = [sub_labels[run-1] for run in runs_train]
  if per_run:
    # Standardize each training run before concatenation so "RUNS" really is per run.
    train_blocks = [StandardScaler().fit_transform(block) for block in train_blocks]
  if len(train_blocks) > 1:
    # Concatenate the list directly; wrapping it in np.array first fails for runs of unequal length.
    X = np.concatenate(train_blocks)
    y = np.concatenate(train_labels)
  else:
    X = train_blocks[0]
    y = train_labels[0]

  # Validation (optional) and test runs.
  Xv = []
  yv = []
  if len(runs_test) > 1:
    Xv = sub_data[runs_test[0]-1]
    yv = sub_labels[runs_test[0]-1]
    Xt = sub_data[runs_test[1]-1]
    yt = sub_labels[runs_test[1]-1]
  else:
    Xt = sub_data[runs_test[0]-1]
    yt = sub_labels[runs_test[0]-1]

  ##run standardization
  if per_run:
    Xt = StandardScaler().fit_transform(Xt)
    if len(Xv)>0:
      Xv = StandardScaler().fit_transform(Xv)
  elif norm == "SUBJECT":
    scalar = StandardScaler().fit(X)
    X = scalar.transform(X)
    Xt = scalar.transform(Xt)
    if len(Xv)>0:
      Xv = scalar.transform(Xv)
  else:
    print('Not doing standardization')
  return X, y, Xt, yt, Xv, yv

In [19]:
def run_single_subject_svm(sub_data,sub_labels,runs_train,runs_test,norm="none",do_cv=False):
  """
    Fit an SVM for one subject, either directly or through a grid search.
      sub_data     : per-run data of one subject
      sub_labels   : per-run labels telling which rows belong to the increase/decrease state
      runs_train   : tuple of run numbers used for training
      runs_test    : tuple of run numbers used for validation/testing
                     (a single entry means there is no validation set)
      norm         : string forwarded to scale_data_single_subj
                     ("RUNS", "SUBJECT" or "none" for no normalization)
      do_cv        : If True, run a GridSearchCV over C and kernel on the training data
      returns      : with do_cv=True only the fitted GridSearchCV object;
                     otherwise (model, X_train, y_train, X_test, y_test, X_val, y_val)
  """
  #split and scale the runs
  X_train, y_train, X_test, y_test, X_val, y_val = scale_data_single_subj(sub_data,sub_labels,runs_train,runs_test,norm)

  if not do_cv:
    #plain model with a fixed regularization strength
    model = SVC(C=10)
    model.fit(X_train, y_train)
    return model, X_train, y_train, X_test, y_test, X_val, y_val

  #grid search over regularization strength and kernel
  grid = {'C':[0.7, 1, 5, 10],'kernel':['linear', 'rbf']}
  search = GridSearchCV(SVC(), grid)
  search.fit(X_train, y_train)
  return search

In [26]:
def _binary_metrics(y_true, y_pred):
  """Return (accuracy, AUC, precision, recall, F1) for hard label predictions."""
  # NOTE(review): the ROC curve is built from hard predictions rather than decision scores,
  # so the AUC only has a single operating point — confirm this is intended.
  fpr, tpr, _ = roc_curve(y_true, y_pred)
  return (accuracy_score(y_true, y_pred),
          auc(fpr, tpr),
          precision_score(y_true, y_pred),
          recall_score(y_true, y_pred),
          f1_score(y_true, y_pred))

def get_accuracy_scores(model_dict,subj,normalization_by,normalization_type):
  """
    Compute the metric row for one subject.
      model_dict : dictionary with model_dict[subj] holding 'model', 'X_train', 'y_train',
                   'X_test', 'y_test' and optionally 'X_val', 'y_val'
      subj       : subject id used as key into model_dict
      normalization_by   : value copied into the row as-is
      normalization_type : value copied into the row as-is
      returns    : list [subj, train acc, val acc, val AUC, val precision, val recall, val F1,
                   test acc, test AUC, test precision, test recall, test F1,
                   normalization_by, normalization_type].
                   All five validation entries are 0 when the subject has no validation data.
  """
  entry = model_dict[subj]
  clf = entry['model']
  if 'X_val' in entry:
    val_metrics = _binary_metrics(entry['y_val'], clf.predict(entry['X_val']))
  else:
    # No validation run: fill every validation column with 0 (the value already used for the
    # validation accuracy) so the row can always be built.
    val_metrics = (0, 0, 0, 0, 0)
  tr_acc = accuracy_score(entry['y_train'], clf.predict(entry['X_train']))
  test_metrics = _binary_metrics(entry['y_test'], clf.predict(entry['X_test']))
  subj_list = [subj, tr_acc, *val_metrics, *test_metrics, normalization_by, normalization_type]
  return subj_list

In [60]:

def run_subject_model(data_path_dict,path,file_name,runs_train,runs_test,mask_type,mask_ind,do_psc=True,norm='none',normtype='none',pic_list=None):
  """
    Function loops over subjects to grab data, scale data, run the SVM and collect the metrics.
    data_path_dict  : dictionary containing paths to all data stored on AWS
    path : path prefix for the output files (used as-is, so it should end with a separator).
           The pickles hold the model plus the train/val/test arrays; the original note estimated
           about 20 GB if every subject were pickled.
    file_name : name of file to prepend to subject id
    runs_train: what runs do we want to train on
    runs_test: what runs do we want to test on
    mask_type: which type of brain mask do we want to apply
    mask_ind: to distinguish between ROI regions of masking(1) and full brain(or full_brain minus ROIs)(0)
    do_psc: do we want to apply Percent Signal Change normalization. Default = True,
    norm: Takes three parameters: none(no normalization will be applied),"SUBJECT"(apply normalization per subject)
          "RUNS"(apply normalization per run separately)
    normtype: free-text label written to the "NormType" column of the metrics table
    pic_list: subject ids whose model dictionary is pickled to disk.
              Default (None) keeps the two subjects that were hard-coded before.
    returns: (path of the metrics csv, list of pickle paths that were written)
  """
  if pic_list is None:
    pic_list = ['10047_09030','30017_09567']
  pic_files = []
  sub_acc = []
  #get subject information
  subject_ids,subj_paths = get_subj_information(data_path_dict)
  mask_labels_indices,binary_labels,label_type = get_labels(data_path_dict)
  mask = get_mask(mask_type,data_path_dict,mask_ind)
  for idx in range(len(subject_ids)):
    user_data_dict, sub_id = load_subject_data(subj_paths,subject_ids,idx,do_psc,mask,mask_labels_indices,binary_labels,label_type)
    sub_data = user_data_dict[sub_id]
    # Use the label_type returned by get_labels; load_subject_data builds the key from the same value.
    sub_labels = user_data_dict[f"{sub_id}_{label_type}"]
    clf,X_train, y_train, X_test, y_test, X_val, y_val = run_single_subject_svm(sub_data,sub_labels,runs_train,runs_test,norm)
    # A fresh dictionary per subject keeps each pickle limited to one subject.
    model_dict = {sub_id: {'model': clf,
                           'X_train': X_train,
                           'y_train': y_train,
                           'X_test': X_test,
                           'y_test': y_test}}
    if len(X_val)>0:
      model_dict[sub_id]['X_val'] = X_val
      model_dict[sub_id]['y_val'] = y_val
    sub_acc.append(get_accuracy_scores(model_dict,sub_id,norm,normtype))

    if sub_id in pic_list:
      sub_destination_path = f'{path}{file_name}_{sub_id}.pkl'
      # The context manager closes the file even if pickling fails.
      with open(sub_destination_path,"wb") as filehandler:
        pickle.dump(model_dict,filehandler)
      pic_files.append(sub_destination_path)

  # NOTE(review): an S3 upload of the per-subject models (s3_upload(model_dict, object_name, "pickle"))
  # was sketched here as commented-out code and is not performed.
  destination_path = f'{path}{file_name}.csv'
  sub_acc_df = pd.DataFrame(sub_acc, columns = ["Subject","TrainAcc","ValAcc","ValAUC","ValPrecision","ValRecall","ValF1",
                                                 "TestAcc","TestAUC","TestPrecision","TestRecall","TestF1","NormBy","NormType"])
  sub_acc_df.to_csv(destination_path)
  return destination_path,pic_files

In [59]:
# Collects the output paths returned by run_subject_model; converted to a DataFrame in a later cell.
csv_paths = []

In [None]:

# Train on run 2, validate on run 3, test on run 4, without any normalization.
save_data_path = '/content/drive/My Drive/data/dataexploration/'
file_name = 'NoNORM_run_2_t_3_4'
file_path,sub_files = run_subject_model(data_path_dict,save_data_path,file_name,(2,),(3,4),'mask',0,do_psc=False,norm='none',normtype='NONORM')
# One row per model run: [metrics csv, list of pickle files]. Two separate appends would create
# two misaligned rows once csv_paths is turned into a DataFrame with the columns
# 'metric_scores' and 'pickle_files'.
csv_paths.append([file_path, sub_files])

In [70]:
# Every entry of csv_paths becomes one row with the columns 'metric_scores' and 'pickle_files';
# the table is written next to the other outputs under save_data_path.
csv_path_df = pd.DataFrame(csv_paths,columns=['metric_scores','pickle_files'])
csv_path_df.to_csv(f'{save_data_path}paths_to_metrics_pickle.csv')

In [71]:
def run_subject_model_cv(subject_ids, runs_train,runs_test,do_psc=True,norm='none',data_path_dict=None,mask_type='mask',mask_ind=0):
  """
    Run the grid-search SVM for every subject in subject_ids and score the fitted search.
      subject_ids : iterable of subject ids to process; each must be listed in data_path_dict
      runs_train  : tuple of run numbers used for training
      runs_test   : tuple of run numbers used for validation/testing
      do_psc      : forwarded to load_subject_data (percent signal change)
      norm        : normalization scheme forwarded to the scaling step
      data_path_dict : dictionary containing paths to all data; when None the notebook-level
                       variable of the same name is used
      mask_type   : which brain mask to apply (forwarded to get_mask)
      mask_ind    : index of the mask file (forwarded to get_mask)
      returns     : dict mapping subject id -> {'model', 'val_acc', 'test_acc'};
                    'val_acc' is None when there is no validation run
  """
  if data_path_dict is None:
    # Fall back to the dictionary loaded at notebook level so existing calls keep their signature.
    data_path_dict = globals().get('data_path_dict')
    if data_path_dict is None:
      raise ValueError("data_path_dict must be passed or defined at notebook level")

  # Everything load_subject_data needs besides the subject itself.
  all_ids, all_paths = get_subj_information(data_path_dict)
  path_by_id = dict(zip(all_ids, all_paths))
  mask_labels_indices, binary_labels, label_type = get_labels(data_path_dict)
  mask = get_mask(mask_type, data_path_dict, mask_ind)

  cv_dict = {}
  for wanted_id in subject_ids:
    # Single-element lists with index 0 keep id and path aligned even if subject_ids is a subset.
    user_data_dict, sub_id = load_subject_data([path_by_id[wanted_id]],[wanted_id],0,do_psc,mask,mask_labels_indices,binary_labels,label_type)
    sub_data = user_data_dict[sub_id]
    sub_labels = user_data_dict[f"{sub_id}_{label_type}"]
    result = run_single_subject_svm(sub_data,sub_labels,runs_train,runs_test,norm,do_cv=True)
    # With do_cv=True only the fitted search is returned; tolerate a tuple as well.
    clf = result[0] if isinstance(result, tuple) else result
    # The search result carries no held-out data, so rebuild the same splits for scoring.
    _, _, X_test, y_test, X_val, y_val = scale_data_single_subj(sub_data,sub_labels,runs_train,runs_test,norm)
    val_acc = accuracy_score(y_val, clf.predict(X_val)) if len(X_val)>0 else None
    test_acc = accuracy_score(y_test, clf.predict(X_test))
    cv_dict[sub_id] = {'model': clf, 'val_acc': val_acc, 'test_acc': test_acc}

  return cv_dict

In [None]:
# subject_ids is only a local variable inside the functions above, so fetch it at notebook level first.
subject_ids = get_subj_information(data_path_dict)[0]
cv_results = run_subject_model_cv(subject_ids,(2,),(3,4),do_psc=False,norm='RUNS')