<a href="https://colab.research.google.com/github/yecatstevir/teambrainiac/blob/main/source/SingleSubjectSVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Whole Brain Support Vector Machine Training
- Go to 'Runtime' in Colab browser bar, select 'Change Runtime Type', select 'High-RAM' from 'Runtime Shape'. 
- Load the local pickle file containing all masked, normalized whole-brain subject data in NumPy matrix format.
- Train one SVM per subject.

### Mount Google Drive and clone repository
- Change into the `source` directory of the cloned repository.

In [1]:
# Mount Google Drive inside the Colab VM at /content/gdrive
# (pass force_remount=True to drive.mount to re-mount an existing mount point).
# NOTE(review): the following cell mounts Drive again at /content/drive, which is the
# mount point the save paths later in this notebook use — confirm this mount is still needed.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:

# Mount Google Drive at /content/drive, forcing a re-mount if it is already mounted.
# The output paths used later in this notebook live under '/content/drive/My Drive/'.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Clone the entire repo into the current working directory.
# NOTE(review): -l / -s are git's local-clone options (--local / --shared); they look
# unnecessary when cloning from an https URL — confirm and drop if so.
!git clone -l -s https://github.com/yecatstevir/teambrainiac.git
# Change directory into the cloned repo's source folder; the relative data path and the
# local module imports used further down are resolved from here.
%cd teambrainiac/source
# List the directory contents as a quick sanity check.
!ls


### Load path_config.py 
- we are already in source so we can just load this file without changing directory

In [None]:
from google.colab import files

# Open Colab's upload dialog; the result maps each uploaded file name to its contents.
uploaded = files.upload()

# Report the name and size of every uploaded file.
for fn, contents in uploaded.items():
  message = 'User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(contents))
  print(message)

### Import libraries


In [None]:


# Import libraries
!pip install boto3 nilearn nibabel #for saving data and image visualizations
import pickle
#sklearn packages needed
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, auc, recall_score, precision_score,roc_curve,f1_score
#important utility functions for loading,masking,saving data
#from utils import *
from access_data import *
from process import *
#normal python packages we use
import numpy as np
import pandas as pd
from scipy import signal
from nilearn.signal import clean

### Get paths to subject data and grab labels for SVM

In [7]:

# Load the pickled dictionary that contains the paths to all data.
# The path is relative to the current working directory.
path = "data/data_path_dictionary.pkl"
data_path_dict = open_pickle(path)  # open_pickle is not defined in this notebook — presumably provided by the star imports above; verify

# NOTE(review): the mask dictionary is not loaded in this cell; it is fetched later through get_mask_data().


### Functions to get information about the data needed to run our SVM

In [8]:
def get_data_dict(path):
  """
    Load the data path dictionary from a pickle file.
      params:
        path : str: location of the pickled data path dictionary
      returns: the object produced by open_pickle(path) (expected to be the dictionary of data paths)
  """
  return open_pickle(path)

def get_subj_information(data_path_dict):
  """
    Pull the subject identifiers and the matching subject data paths out of the data path dictionary.
    data_path_dict  : dictionary with (at least) the keys 'subject_ID' and 'subject_data'
    returns:  tuple (subject_ids, subj_paths) = (data_path_dict['subject_ID'], data_path_dict['subject_data'])
  """
  return data_path_dict['subject_ID'], data_path_dict['subject_data']

def get_labels(data_path_dict):
  """
    Fetch the label information used to train and evaluate the SVM.
    data_path_dict  : dictionary containing paths to data; the first entry of data_path_dict['labels'] is used
    returns: tuple (mask_labels_indices, binary_labels, label_type)
             mask_labels_indices and binary_labels come straight from labels_mask_binary();
             label_type is the label set requested ('rt_labels')
  """
  label_type = 'rt_labels' #label set requested from labels_mask_binary
  first_label_path = data_path_dict['labels'][0] #only the first labels path is used
  mask_labels_indices, binary_labels = labels_mask_binary(first_label_path, label_type)
  return mask_labels_indices, binary_labels, label_type

def get_mask_data(data_path_dict,mask_ind):
  """
    Load the dictionary of brain masks stored at one of the configured mask paths.
    Params:
      data_path_dict  : dictionary: containing paths to data (key 'mask_data' holds the mask paths)
      mask_ind: int: which mask path to load
                0: full brain mask plus masks that subtract a region
                1: regions of interest (ROIs), masking out the full brain except the structure of interest
    returns: the object returned by access_load_data for that path (expected: dictionary of mask arrays)
  """
  mask_paths = data_path_dict['mask_data']
  return access_load_data(mask_paths[mask_ind], True)

In [9]:
def make_mask(np_array_mask):
  """
    Function to create a flat boolean mask selecting the voxels we want to keep.
    Params:
      np_array_mask: array-like: volumetric mask (e.g. 79 x 95 x 79); non-zero entries mark voxels to include
    returns: 1-d boolean nd.array with one entry per voxel, flattened in Fortran (column-major) order
  """
  # shrink=False: by default np.ma.make_mask collapses an all-False mask to the scalar
  # `nomask`, which cannot be reshaped into a per-voxel vector.
  bool_mask = np.ma.make_mask(np_array_mask, shrink=False)
  # order='F' means Fortran / column-major flattening (first axis varies fastest); it is not a
  # Fourier transform. Presumably this matches how the subject volumes were vectorised — verify.
  return bool_mask.reshape(-1, order='F')

## Set up SVM Model

In [10]:
def mask_subject_data(data,mask,mask_labels_indices):
  """
    Apply the voxel mask and the time-point selection to each of a subject's four runs.
    Params:
      data: dictionary: subject data with the keys 'run_01_vec' ... 'run_04_vec' (unmasked runs)
      mask: nd.array: 1-d boolean array selecting the voxel columns to keep
      mask_labels_indices: indices of the rows (time points) to keep for the model
    returns: dictionary: {'data': [run 1, run 2, run 3, run 4]} with the masked arrays in run order
    Note: relies on `tqdm` being in scope; it is not imported explicitly in this notebook
          (presumably it arrives through the star imports — verify).
  """
  masked_runs = []
  for run_idx in tqdm.tqdm(range(4)):
      run_key = 'run_0' + str(run_idx + 1) + '_vec'
      kept_voxels = data[run_key][:, mask] #drop voxel columns outside the mask
      masked_runs.append(kept_voxels[mask_labels_indices]) #keep only the requested time points
  return {'data': masked_runs}

In [11]:
def scale_data_single_subj(sub_data,train_runs,test_runs,norm='none',runs_list=None):
  """
    Function to detrend and (optionally) standardize each requested run of one subject.
    Params:
      sub_data     : dictionary: masked subject data; sub_data['data'][k] holds run number k+1
      train_runs   : list of run names used for training, e.g. ['run_02']
      test_runs    : list of run names used for testing, e.g. ['run_03','run_04']
      norm         : str: 'none' only detrends; any other value is passed to clean() as `standardize`
                     (this notebook uses 'zscore' and 'psc')
      runs_list    : optional list of int: 1-based run numbers to process. When omitted, a
                     notebook-level `runs_list` is used if one exists (the behaviour existing callers
                     rely on); otherwise the run numbers are derived from train_runs and test_runs.
    returns      : dictionary mapping run name ('run_0<N>') to the cleaned array of that run
  """
  if runs_list is None:
    # Backward compatibility: callers used to configure a global `runs_list` instead of passing it in.
    runs_list = globals().get('runs_list')
  if runs_list is None:
    # No explicit or global list: process exactly the runs named for training/testing.
    run_names = list(train_runs) + list(test_runs)
    runs_list = sorted({int(str(name).rsplit('_', 1)[-1]) for name in run_names})
  # clean() expects False to skip standardization, otherwise the name of the strategy
  standardize = False if norm == 'none' else norm
  normalized_runs = {}
  for run in runs_list:
    run_name = 'run_0' + str(run)
    run_data = sub_data['data'][run-1] #runs are stored 0-based in the list
    normalized_runs[run_name] = clean(run_data,detrend=True,standardize=standardize,filter=False,standardize_confounds=False)
  return normalized_runs

In [12]:
def get_accuracy_scores(clf,data,X_train,y_train,runs_test,y_labels):
  """
    Score a fitted classifier on its training data and on each test run.
    Params:
      clf: fitted model exposing predict()
      data: dictionary mapping run name to that run's feature array
      X_train: training features the model was fitted on
      y_train: training labels
      runs_test: list of run names (keys of data) to evaluate
      y_labels: true labels, used as the ground truth for every test run
    returns: (accuracy_list, df_columns)
             accuracy_list: [train accuracy, then accuracy and F1 score for each test run, in order]
             df_columns: matching column names ['train_acc', '<run>_acc', '<run>_f1_score', ...]
  """
  train_predictions = clf.predict(X_train)
  scores = [accuracy_score(y_train, train_predictions)]
  column_names = ['train_acc']
  for run_name in runs_test:
    run_predictions = clf.predict(data[run_name])
    column_names += [run_name + '_acc', run_name + '_f1_score']
    scores += [accuracy_score(y_labels, run_predictions), f1_score(y_labels, run_predictions)]
  return scores, column_names

In [20]:
def get_predicts(clf,data,runs_test):
  """
    Collect a fitted classifier's outputs for each test run.
    Params:
      clf: fitted model exposing predict(), predict_proba() and decision_function()
      data: dictionary mapping run name to that run's feature array
      runs_test: list of run names (keys of data) to predict on
    returns: dictionary keyed by run name; each value is a dictionary with the keys
             'predicts', 'proba' and 'decision_function' holding the corresponding model output
  """
  return {
    run_name: {
      'predicts': clf.predict(data[run_name]),
      'proba': clf.predict_proba(data[run_name]),
      'decision_function': clf.decision_function(data[run_name]),
    }
    for run_name in runs_test
  }

In [14]:
def run_single_subject_svm(data,runs_train,train_labels,svc_kernel='rbf',svc_c=1,do_cv=False,params={}):
  """
    Function to fit a single subject SVM, or a cross-validated grid search over SVM parameters.
    Params:
      data         : dictionary mapping run name to that run's 2-d feature array
      runs_train   : list of run names (keys of data) to train on; several runs are stacked row-wise
      train_labels : labels for one run; the same labels are repeated for every training run
      svc_kernel   : kernel for the SVC (ignored when do_cv=True)
      svc_c        : C value for the SVC (ignored when do_cv=True)
      optionals:
        do_cv: boolean: run GridSearchCV over `params` instead of fitting a single SVC: default=False
        params: dictionary: parameter grid for the grid search: default=empty dictionary (never mutated here)
    returns      : tuple (clf, X_train, y_train) in both modes; clf is the fitted SVC, or the fitted
                   GridSearchCV object when do_cv=True
  """
  if len(runs_train)>1:
    # Stack the runs directly from the list; going through np.array() first would fail
    # for runs with different numbers of time points.
    X_train = np.concatenate([data[run] for run in runs_train])
    y_train = np.concatenate([train_labels for _ in runs_train])
  else:
    X_train = data[runs_train[0]]
    y_train = train_labels
  if do_cv:
    #example grid: {'C':[0.7, 1, 5, 10],'kernel':['linear', 'rbf']}
    clf = GridSearchCV(SVC(), params)
  else:
    clf = SVC(C=svc_c,kernel=svc_kernel,probability=True)
  clf.fit(X_train,y_train)
  # Same return shape in both modes so callers can always unpack three values.
  return clf,X_train,y_train

In [None]:

###running model with best params across all masks
##what runs do you want to normalize on
runs_train=['run_02'] #runs we want to train on
runs_test=['run_03','run_04'] #runs we want to test on
runs_list=[2,3,4] #specify runs we want to normalize
norm_type = 'zscore' #specify normalization
svc_kernel='rbf' #specify kernel 
svc_c = 1 #specify c parameter
save_data_path = '/content/drive/My Drive/data/singlesubjectmodels/' #where we want to store our models
#masks we want to run model on, needs to be nested list for cell to run
#(mask_list is indexed by the mask index value, i.e. mask_list[midx])
mask_list = [['mask']] 
#indices of the masks we want 0 = whole brain mask and masks minus ROIs, 1 = ROIs
mask_indices = [0]
#get subject information
subjs_id, subjs_paths = get_subj_information(data_path_dict)
#get mask labels to only retrieve time series we care about
mask_labels_indices,binary_labels,label_type = get_labels(data_path_dict)
#build every boolean mask once up front; the masks do not depend on the subject
masks_to_run = []
for midx in mask_indices:
  mask_dict = get_mask_data(data_path_dict,midx)
  for mask_type in mask_list[midx]:
    masks_to_run.append((mask_type, make_mask(mask_dict[mask_type])))
#one result dictionary per mask type, so each output file only holds models trained with that mask
subj_mask_models = {mask_type: {'data': {}} for mask_type, _ in masks_to_run}
for subj_id, subj_path in zip(subjs_id, subjs_paths):
  subj_data = access_load_data(subj_path,True)
  for mask_type, mask in masks_to_run:
    subj_mask_model = subj_mask_models[mask_type]
    masked_data = mask_subject_data(subj_data,mask,mask_labels_indices)
    #use the configured normalization instead of a hard-coded 'zscore'
    scaled_data = scale_data_single_subj(masked_data,runs_train,runs_test,norm=norm_type)
    clf,X_train,y_train = run_single_subject_svm(scaled_data,runs_train,binary_labels,svc_kernel,svc_c)
    subj_mask_model['data'][subj_id] = {
      'model': clf,
      'X_train': X_train,
      'y_train': y_train,
      'predicts': get_predicts(clf,scaled_data,runs_test),
    }
    #the pickle is rewritten after every subject, so the file on disk holds all subjects finished so far
    full_path_name = f'{save_data_path}{mask_type}_subject_models.pkl'
    with open(full_path_name,"wb") as filehandler: #closes the file even if pickling fails
      pickle.dump(subj_mask_model,filehandler)
  


In [None]:

###data exploration for different normalization strategies
##what runs do you want to normalize on
runs_train=['run_01','run_02'] #runs we want to train on
runs_test=['run_03','run_04'] #runs we want to test on
runs_list=[1,2,3,4] #runs we want to normalize
##for fMRI, we always want to detrend the data
norm_list = ['psc','zscore','none'] #list of normalization strategies you want to test
save_subject_data =  ['10047_09030','30017_09567'] #specify subjects you want to save for normalization visualizations
svc_kernel='rbf' #specify kernel 
svc_c = 1 #specify c parameter
mask_dict = get_mask_data(data_path_dict,0) #get mask_dict
mask = make_mask(mask_dict['mask']) #create mask
#get subject information
subjs_id, subjs_paths = get_subj_information(data_path_dict)
#get mask labels to only retrieve time series we care about
mask_labels_indices,binary_labels,label_type = get_labels(data_path_dict)
###initialize variables to hold results
model_dict = {norm_type: {} for norm_type in norm_list} #one (initially empty) dictionary per normalization
results = []
df_columns = ['subject_id','norm_type']
cols = [] #score column names; stays empty when there are no subjects to process
#NOTE(review): the captured output shows warnings from a division by the mean signal — presumably the 'psc' strategy on voxels with zero mean; verify
#loop over subjects
for subj_id, subj_path in zip(subjs_id, subjs_paths):
  subj_data = access_load_data(subj_path,True) #get subj_data
  masked_data = mask_subject_data(subj_data,mask,mask_labels_indices) #mask subject data
  #loop over normalizations to do
  for norm_type in norm_list:
    scaled_data = scale_data_single_subj(masked_data,runs_train,runs_test,norm=norm_type) #scale the data for runs listed in runs_list
    clf,X_train,y_train = run_single_subject_svm(scaled_data,runs_train,binary_labels,svc_kernel,svc_c) #run the model
    if subj_id in save_subject_data:    #only keep models/data for the subjects we want to visualize
      model_dict[norm_type][subj_id] = {
        'model': clf,
        'X_train': X_train,
        'y_train': y_train,
        'data': scaled_data,
      }
    sub_scores,cols = get_accuracy_scores(clf,scaled_data,X_train,y_train,runs_test,binary_labels) #get accuracy scores for analysis
    results.append([subj_id,norm_type] + list(sub_scores)) #one row per subject and normalization
df_columns.extend(cols) #score columns are the same for every row
results_df = pd.DataFrame(results,columns=df_columns) #create df
#save results for analysis
save_data_path = f'/content/drive/My Drive/data/dataexploration/{svc_kernel}_exploration/rtr_1_2_accuracy_results.csv'
results_df.to_csv(save_data_path)
#save models for visualization
save_models_path = f'/content/drive/My Drive/data/dataexploration/{svc_kernel}_exploration/rtr_1_2_norm_models.pkl'
with open(save_models_path,"wb") as filehandler: #closes the file even if pickling fails
  pickle.dump(model_dict,filehandler)



100%|██████████| 4/4 [00:00<00:00,  4.14it/s]
  signals = (signals - mean_signal) / np.absolute(mean_signal)
100%|██████████| 4/4 [00:00<00:00,  4.77it/s]
100%|██████████| 4/4 [00:00<00:00,  4.71it/s]
  signals = (signals - mean_signal) / np.absolute(mean_signal)
100%|██████████| 4/4 [00:00<00:00,  4.91it/s]
  signals = (signals - mean_signal) / np.absolute(mean_signal)
100%|██████████| 4/4 [00:00<00:00,  5.79it/s]
  signals = (signals - mean_signal) / np.absolute(mean_signal)


In [None]:
cv_dict = {} #initialize the results dictionary
destination_path = '/content/drive/My Drive/data/dataexploration/cross_validation_results/cv_results.pkl' #where to save data
runs_train=['run_02'] #runs we want to train on
runs_list=[2,3,4] #runs we want to normalize for cv
runs_test=['run_03','run_04'] #held-out runs (the ones stored in cv_dict below)
cv_params = {'C':[1, 5, 10,1000],'kernel':['linear', 'rbf'],'gamma': [.0001, .01, 'auto','scale']} #params we want to test
subject_ids,subj_paths = get_subj_information(data_path_dict) #get subject information
mask_labels_indices,binary_labels,label_type = get_labels(data_path_dict) #get labels
mask_dict = get_mask_data(data_path_dict,0) #get mask dictionary containing mask data
mask = make_mask(mask_dict['mask']) #mask we want to use in cv
norm_type = 'zscore' #which normalization we want
#iterate over subjects and perform cv single subject svm
#(loop over the lists fetched in this cell rather than names left over from earlier cells)
for subj_id, subj_path in zip(subject_ids, subj_paths):
  subj_data = access_load_data(subj_path,True)  #load subject data
  masked_data = mask_subject_data(subj_data,mask,mask_labels_indices) #mask the data
  scaled_data = scale_data_single_subj(masked_data,runs_train,runs_test,norm=norm_type) #normalize the data
  cv_result = run_single_subject_svm(scaled_data,runs_train,binary_labels,do_cv=True,params=cv_params) #run cross validation
  if isinstance(cv_result, tuple):
    clf,X_train,y_train = cv_result
  else:
    #only the fitted search object came back: rebuild the training set from the training runs
    clf = cv_result
    X_train = np.concatenate([scaled_data[run] for run in runs_train])
    y_train = np.concatenate([binary_labels for _ in runs_train])
  cv_dict[subj_id] = {
    'model': clf, #save cv model for further analysis
    'X_train': X_train,
    'y_train': y_train,
    'run_03': scaled_data['run_03'],
    'run_04': scaled_data['run_04'],
  }
#save data
with open(destination_path,"wb") as filehandler: #closes the file even if pickling fails
  pickle.dump(cv_dict,filehandler)