<a href="https://colab.research.google.com/github/yecatstevir/teambrainiac/blob/main/source/DataExploration_SingleSubj.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DataExploration for Single Subject SVM
- Go to 'Runtime' in Colab browser bar, select 'Change Runtime Type', select 'High-RAM' from 'Runtime Shape'. 
- load local pickle file containing all masked, normalized Whole Brain subject data in numpy matrix format
- SVM training per subject

### Mount Google Drive and clone repository
- open to source directory

In [1]:
from google.colab import drive
# Mount Google Drive under /content/gdrive (pass force_remount=True to re-mount an existing mount).
drive.mount(mountpoint='/content/gdrive')

Mounted at /content/gdrive


In [2]:

from google.colab import drive
# Mount Google Drive under /content/drive, replacing any mount that is already there.
drive.mount(mountpoint='/content/drive', force_remount=True)

Mounted at /content/drive


In [6]:
# Clone the entire repo into the current working directory.
# NOTE(review): -l/-s are git's --local/--shared clone options; confirm they are needed for an https remote.
!git clone -l -s https://github.com/yecatstevir/teambrainiac.git
# Change directory into the cloned repo's source folder so later cells can import its modules
# and use paths relative to it (e.g. data/...).
%cd teambrainiac/source
# List the folder contents as a quick check that the clone succeeded.
!ls


Cloning into 'teambrainiac'...
remote: Enumerating objects: 784, done.[K
remote: Counting objects: 100% (784/784), done.[K
remote: Compressing objects: 100% (574/574), done.[K
remote: Total 784 (delta 489), reused 392 (delta 194), pack-reused 0[K
Receiving objects: 100% (784/784), 73.27 MiB | 31.48 MiB/s, done.
Resolving deltas: 100% (489/489), done.
/content/teambrainiac/source
access_data.py			  models
AccuracyMeasures.ipynb		  process.py
analysis.py			  SingleSubjectSVM.ipynb
cross_validation.py		  SubjectVisualization_Models_ZNORM.ipynb
data				  SVM_Group_Adolescent_Whole_brain.ipynb
DataExploration_SingleSubj.ipynb  SVM_Group_YA_Whole_brain.ipynb
DL				  TestMask.ipynb
Explore_data.ipynb		  train.py
Group_All_MASK_SVM.ipynb	  utils.py
helper				  VisualizationPlayground.ipynb
Images				  Visualize_Data.ipynb
__init__.py			  visualize.py


### Load path_config.py 
- We are already in the `source` directory, so `path_config.py` can be uploaded here without changing directory.

In [3]:
from google.colab import files

# Opens Colab's file picker; the result maps each uploaded file name to its contents.
uploaded = files.upload()

# Report every uploaded file together with its size.
for fn, contents in uploaded.items():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(contents)))

Saving path_config.py to path_config.py
User uploaded file "path_config.py" with length 228 bytes


### Import libraries


In [7]:

# Import libraries
!pip install boto3 nilearn nibabel #for saving data
import pickle
#sklearn packages needed
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, auc, recall_score, precision_score,roc_curve,f1_score
#important utility functions for loading,masking,saving data
from utils import *
#normal python packages we use
import numpy as np
import pandas as pd




### Get paths to subject data and grab labels for SVM

In [8]:

## Load and open the pickle file that contains paths to all data.
## The path is relative to the current working directory (the repo's source folder after the clone cell).
path = "data/data_path_dictionary.pkl"
## open_pickle comes from utils (star import); data_path_dict is read by the helper functions below.
data_path_dict = open_pickle(path)

### Functions to get information about the data needed to run our SVM

In [9]:
def get_subj_information(data_path_dict):
  """
    Pull the subject identifiers and the subject data paths out of the path dictionary.
    Params:
      data_path_dict  : dictionary containing paths to all data stored on AWS;
                        the keys 'subject_ID' and 'subject_data' are read here
    returns:  tuple (subject_ids, subj_paths) -- the values stored under
              'subject_ID' and 'subject_data', in that order
  """
  return data_path_dict['subject_ID'], data_path_dict['subject_data']

def get_labels(data_path_dict):
  """
    Fetch the label information used to select and label time points.
    Params:
      data_path_dict  : dictionary containing paths to all data stored on AWS;
                        the first entry under 'labels' is used as the label file path
    returns: tuple (mask_labels_indices, binary_labels, label_type) where the first two
             values are whatever labels_mask_binary returns for the 'rt_labels' label set
             (time points to keep and the labels of the two brain states) and
             label_type is the string 'rt_labels'
  """
  label_type = 'rt_labels'  # the label set requested from labels_mask_binary
  labels_path = data_path_dict['labels'][0]  # first entry under 'labels'
  indices, labels = labels_mask_binary(labels_path, label_type)
  return indices, labels, label_type

def get_mask(mask_type,data_path_dict,mask_ind):
  """
    Build the flat boolean voxel mask that selects which brain voxels enter the analysis.
    Params:
      mask_type: name of the mask we want to use (a key of the loaded mask dictionary)
      data_path_dict  : dictionary containing paths to all data stored on AWS
      mask_ind: index into data_path_dict['mask_data'];
              0: full brain mask plus masks that subtract a region
              1: Regions of interest (ROIs) that mask out the full brain except the structure we care about
    returns: 1-D boolean array with 79*95*79 entries
  """
  mask_file = data_path_dict['mask_data'][mask_ind]  # path to the mask file
  masks_by_name = access_load_data(mask_file, True)  # dictionary of mask arrays keyed by mask name
  volume = masks_by_name[mask_type]
  n_voxels = 79*95*79
  # order='F' flattens in Fortran (column-major) order, i.e. the first axis varies fastest.
  # It is a memory-layout choice, not a Fourier transform.
  # NOTE(review): presumably this has to match how the subject volumes are flattened - confirm in utils.
  return np.ma.make_mask(volume).reshape(n_voxels, order='F')

In [None]:
# Exploratory cell: list which mask names are available in the first mask file.
mask_data_filepath = data_path_dict['mask_data'][0] #path to the first mask file (index 0)
mask_type_dict = access_load_data(mask_data_filepath, True) #load the mask dictionary
mask_type_dict.keys() #the non-dunder keys are the names that can be passed as mask_type to get_mask

dict_keys(['__header__', '__version__', '__globals__', 'mask', 'masksubACC', 'masksubAI', 'masksubNAcc', 'masksubmPFC'])

## Set up SVM Model

In [10]:
def load_subject_data(subj_paths,subject_ids,idx,dopsc,mask,mask_labels_indices,binary_labels,label_type):
  """
    Load one subject's data and mask it with masking_data (imported from utils).
    Params:
      subj_paths  : paths to subject data
      subject_ids : list of subjects
      idx   : index of the current subject in subj_paths / subject_ids
      dopsc : forwarded to masking_data; if True, percent signal change is applied
      mask  : voxel mask to apply to the subject
      mask_labels_indices : indices of the brain volumes we want to keep
      binary_labels: increase/decrease labels forwarded to masking_data
      label_type: name of the label set; used to build the key the labels are stored under
    returns: (user_data_dict, sub_id) where user_data_dict[sub_id] holds the masked data and
             user_data_dict[f"{sub_id}_{label_type}"] holds the labels returned by masking_data
  """
  mat_path, sub_id = subj_paths[idx], subject_ids[idx]
  # Second argument True: the subject files are .mat files.
  raw_runs = access_load_data(mat_path,True)
  masked_runs, run_labels = masking_data(raw_runs, mask, mask_labels_indices, binary_labels, dopsc)
  user_data_dict = {
    sub_id: masked_runs,
    f"{sub_id}_{label_type}": run_labels,  # binary labels (0 = decrease, 1 = increase)
  }
  return user_data_dict,sub_id

In [11]:
def scale_data_single_subj(sub_data,sub_labels, runs_train,runs_test,norm):
  """
    Split one subject's runs into train / validation / test sets and optionally standardize them.
    Params:
      sub_data     : per-run data for one subject, indexable by (run number - 1)
      sub_labels   : per-run labels aligned with sub_data (increase/decrease state per row)
      runs_train   : tuple of 1-based run numbers used for training; several runs are
                     concatenated row-wise in the given order
      runs_test    : tuple of 1-based run numbers. With one entry it is the test run and no
                     validation set is produced; with two entries the first is the validation
                     run and the second the test run
      norm         : string, "RUNS": every run is standardized separately with its own scaler;
                             "SUBJECT": one scaler is fit on the training data and applied
                                        to the validation and test data;
                             anything else: no standardization (a message is printed)
    returns      : X, y, Xt, yt, Xv, yv -- train data/labels, test data/labels,
                   validation data/labels (Xv and yv are empty lists when there is no
                   validation run)
  """
  # Run numbers are 1-based; convert to 0-based indices in one place so the
  # single-run and multi-run paths cannot disagree.
  train_runs_X = [sub_data[run - 1] for run in runs_train]
  train_runs_y = [sub_labels[run - 1] for run in runs_train]

  # validation and test splits
  Xv, yv = [], []
  if len(runs_test) > 1:
    Xv = sub_data[runs_test[0] - 1]
    yv = sub_labels[runs_test[0] - 1]
    Xt = sub_data[runs_test[1] - 1]
    yt = sub_labels[runs_test[1] - 1]
  else:
    Xt = sub_data[runs_test[0] - 1]
    yt = sub_labels[runs_test[0] - 1]
  has_val = len(Xv) > 0

  if norm == "RUNS":
    # Every run gets its own scaler, including each individual training run.
    train_runs_X = [StandardScaler().fit_transform(run_X) for run_X in train_runs_X]
    Xt = StandardScaler().fit_transform(Xt)
    if has_val:
      Xv = StandardScaler().fit_transform(Xv)

  # Concatenate the list of runs directly; wrapping it in np.array first breaks
  # when runs have different numbers of time points.
  if len(train_runs_X) > 1:
    X = np.concatenate(train_runs_X)
    y = np.concatenate(train_runs_y)
  else:
    X = train_runs_X[0]
    y = train_runs_y[0]

  if norm == "SUBJECT":
    # Fit on the training data only, then reuse those statistics for val/test.
    scalar = StandardScaler().fit(X)
    X = scalar.transform(X)
    Xt = scalar.transform(Xt)
    if has_val:
      Xv = scalar.transform(Xv)
  elif norm != "RUNS":
    print('Not doing standardization')
  return X, y, Xt, yt, Xv, yv

In [12]:
def run_single_subject_svm(sub_data,sub_labels,runs_train,runs_test,norm="none",do_cv=False):
  """
    Fit an SVM for one subject, either a fixed model or a grid-search cross-validation.
      Params:
        sub_data     : per-run data for one subject
        sub_labels   : per-run labels marking which rows belong to the increase/decrease state
        runs_train   : tuple, which run(s) are used for the training data
        runs_test    : tuple, which run(s) are used for the validation/test data
        norm         : string, forwarded to scale_data_single_subj
                               ("RUNS", "SUBJECT", or "none" for no normalization)
        do_cv        : If True, run a grid search over C and kernel instead of a fixed SVC(C=10)
      returns:
        do_cv=False : tuple (clf, X_train, y_train, X_test, y_test, X_val, y_val)
        do_cv=True  : only the fitted GridSearchCV object. NOTE: the return shape differs
                      between the two modes, so callers must unpack accordingly.
  """
  # split and scale the subject's runs
  X_train, y_train, X_test, y_test, X_val, y_val = scale_data_single_subj(sub_data,sub_labels,runs_train,runs_test,norm)

  if not do_cv:
    # individual model with a fixed regularization strength
    model = SVC(C=10)
    model.fit(X_train,y_train)
    return model,X_train,y_train,X_test,y_test,X_val,y_val

  # cross-validated search over the regularization strength and kernel
  param_grid = {'C':[0.7, 1, 5, 10],'kernel':['linear', 'rbf']}
  search = GridSearchCV(SVC(), param_grid)
  search.fit(X_train, y_train)
  return search

In [13]:
def get_accuracy_scores(model_dict,subj,normalization_by,normalization_type):
  """
    Compute train / validation / test metrics for one subject's model.
    Params:
      model_dict: model_dict[subj] holds 'model', 'X_train', 'y_train', 'X_test', 'y_test'
                  and, when a validation split exists, 'X_val' and 'y_val'
      subj: subject name (key into model_dict)
      normalization_by: options: 'RUNS','SUBJECT','none' how the normalization was done
      normalization_type: options: 'PSC','ZNORM','none' what type of normalization
    returns: subj_list, a 12-item list:
             [subj, train acc, val acc, val precision, val recall, val f1,
              test acc, test precision, test recall, test f1,
              normalization_by, normalization_type]
             All validation metrics are 0 when there is no validation split.
  """
  entry = model_dict[subj]
  clf = entry['model'] #get subject model

  # Default every validation metric so the result row can always be built;
  # previously only val_acc was set when no validation data existed, which
  # raised a NameError on val_precision / val_recall / val_f1.
  val_acc = val_precision = val_recall = val_f1 = 0
  if 'X_val' in entry: #if validation data
    y_val = entry['y_val']
    y_val_predicts = clf.predict(entry['X_val'])
    val_acc = accuracy_score(y_val,y_val_predicts)
    val_precision = precision_score(y_val,y_val_predicts)
    val_recall = recall_score(y_val,y_val_predicts)
    val_f1 = f1_score(y_val,y_val_predicts)

  y_train = entry['y_train']
  y_test = entry['y_test']
  train_predicts = clf.predict(entry['X_train']) #predict train
  y_test_predicts = clf.predict(entry['X_test']) #predict test
  tr_acc = accuracy_score(y_train,train_predicts)
  test_acc = accuracy_score(y_test,y_test_predicts)
  test_precision = precision_score(y_test,y_test_predicts)
  test_recall = recall_score(y_test,y_test_predicts)
  test_f1 = f1_score(y_test,y_test_predicts)
  subj_list = [subj,tr_acc,val_acc,val_precision,val_recall,val_f1, test_acc,test_precision,test_recall,test_f1,normalization_by,normalization_type]
  return subj_list

In [14]:

def run_subject_model(data_path_dict,path,file_name,runs_train,runs_test,mask_type,mask_ind,do_psc=True,norm='none',normtype='none'):
  """
    Loop over subjects to load the data, scale it, fit the SVM, score it and save the results.
    Params:
      data_path_dict  : dictionary containing paths to all data stored on AWS
      path : prefix the output files are written under; it is concatenated directly with
             file_name, so a directory must end with a path separator
      file_name : base name of the output files (the subject id is appended for pickles)
      runs_train: what runs do we want to train on
      runs_test: what runs do we want to test on
      mask_type: which type of brain mask do we want to apply
      mask_ind: to distinguish between ROI regions of masking(1) and full brain(or full_brain minus ROIs)(0)
      do_psc: do we want to apply Percent Signal Change normalization. Default = True
      norm: "none" (no normalization), "SUBJECT" (normalize per subject) or
            "RUNS" (normalize each run separately)
      normtype: label recorded in the metrics table describing the normalization type
    returns: (destination_path, pic_files) -- the path of the metrics csv and the list of
             pickle files written for the subjects in pic_list
  """
  # One name per value returned by get_accuracy_scores (12 values per subject).
  # The previous header also listed "ValAUC" and "TestAUC", which are never computed,
  # so building the DataFrame failed with a column-count mismatch.
  metric_columns = ["Subject","TrainAcc","ValAcc","ValPrecision","ValRecall","ValF1",
                    "TestAcc","TestPrecision","TestRecall","TestF1","NormBy","NormType"]
  pic_list = ['10047_09030','30017_09567'] #who to save pickle file for
  pic_files = [] #paths of the pickle files that get written
  sub_acc = [] #one metrics row per subject
  #get subject information
  subject_ids,subj_paths = get_subj_information(data_path_dict)
  #get mask labels to only retrieve time series we care about
  mask_labels_indices,binary_labels,label_type = get_labels(data_path_dict)
  #get mask to mask out voxels we don't need
  mask = get_mask(mask_type,data_path_dict,mask_ind)
  #loop over subjects
  for idx in range(len(subject_ids)):
    #get masked run data for a subject and their subject id
    user_data_dict, sub_id = load_subject_data(subj_paths,subject_ids,idx,do_psc,mask,mask_labels_indices,binary_labels,label_type)
    sub_data = user_data_dict[sub_id]
    #labels are stored by load_subject_data under "<sub_id>_<label_type>"
    sub_labels = user_data_dict[f"{sub_id}_{label_type}"]
    #run single subject SVM and get data
    clf,X_train, y_train, X_test, y_test, X_val, y_val = run_single_subject_svm(sub_data,sub_labels,runs_train,runs_test,norm)
    #model dictionary for this subject only
    model_dict = {
      sub_id: {
        'model': clf,
        'X_train': X_train,
        'y_train': y_train,
        'X_test': X_test,
        'y_test': y_test,
      }
    }
    #add validation data if collected
    if len(X_val)>0:
      model_dict[sub_id]['X_val'] = X_val
      model_dict[sub_id]['y_val'] = y_val
    #get accuracy scores and append to master subject accuracy list
    sub_acc.append(get_accuracy_scores(model_dict,sub_id,norm,normtype))
    #only save pickle files for the subjects used in visualization
    if sub_id in pic_list:
      sub_destination_path = f'{path}{file_name}_{sub_id}.pkl' #path to save file
      pic_files.append(sub_destination_path)
      #context manager closes the file even if pickling fails
      with open(sub_destination_path,"wb") as filehandler:
        pickle.dump(model_dict,filehandler)

  # NOTE: uploading the per-subject models to S3 (s3_upload) was sketched here but is disabled.
  destination_path = f'{path}{file_name}.csv' #path for metrics
  #create dataframe of all metrics
  sub_acc_df = pd.DataFrame(sub_acc, columns = metric_columns)
  sub_acc_df.to_csv(destination_path) #save data frame
  return destination_path,pic_files

In [None]:
csv_paths = [] #accumulates output paths across pipeline runs; re-running this cell resets the list

In [None]:
###cell to run single svm pipeline
save_data_path = '/content/drive/My Drive/data/dataexploration/' ##file path to save data
file_name = 'NoNORM_run_2_t_3_4' ## name of file to save
file_path,sub_files = run_subject_model(data_path_dict,save_data_path,file_name,(2,),(3,4),'mask',0,do_psc=False,norm='none',normtype='NONORM') #run model
# Record ONE row per pipeline run: [metrics csv path, list of pickle paths].
# Appending the two values as separate rows produced ragged rows that do not line up
# with a two-column ('metric_scores', 'pickle_files') table.
csv_paths.append([file_path, sub_files])

In [None]:
# Each element of csv_paths becomes one row of the table; the two columns are labelled
# 'metric_scores' and 'pickle_files'.
csv_path_df = pd.DataFrame(csv_paths,columns=['metric_scores','pickle_files']) #make path df 
# Written next to the other outputs under save_data_path (set in the pipeline cell).
csv_path_df.to_csv(f'{save_data_path}paths_to_metrics_pickle.csv') # save file

In [23]:
def run_subject_model_cv(data_path_dict, destination_path, runs_train, runs_test, mask_type, mask_ind, do_psc=True, norm='none'):
  """
    Function to run cross validation for data. Note: takes 2 hours to run on all subjects.
    Params:
      data_path_dict  : dictionary containing paths to all data stored on AWS
      destination_path : path of the pickle FILE the results are written to
                         (a file path, not a directory)
      runs_train: what runs do we want to train on
      runs_test: what runs do we want to test on
      mask_type: which type of brain mask do we want to apply
      mask_ind: to distinguish between ROI regions of masking(1) and full brain(or full_brain minus ROIs)(0)
      do_psc: do we want to apply Percent Signal Change normalization. Default = True
      norm: "none" (no normalization), "SUBJECT" (normalize per subject) or
            "RUNS" (normalize each run separately)
    Returns: cv_dict, a dictionary mapping each subject id to {'model': fitted grid search};
             the same dictionary is pickled to destination_path
    Raises: IsADirectoryError if destination_path is an existing directory
  """
  import os

  # Fail fast: opening a directory for writing would only raise after the whole
  # (hours-long) cross-validation loop has finished, losing every result.
  if os.path.isdir(destination_path):
    raise IsADirectoryError(f"destination_path must be a file path, got a directory: {destination_path}")

  cv_dict = {}
  subject_ids,subj_paths = get_subj_information(data_path_dict)
  mask_labels_indices,binary_labels,label_type = get_labels(data_path_dict)
  mask = get_mask(mask_type,data_path_dict,mask_ind)
  for idx in range(len(subject_ids)):
    user_data_dict, sub_id = load_subject_data(subj_paths,subject_ids,idx,do_psc,mask,mask_labels_indices,binary_labels,label_type)
    sub_data = user_data_dict[sub_id]
    #labels are stored by load_subject_data under "<sub_id>_<label_type>"
    sub_labels = user_data_dict[f"{sub_id}_{label_type}"]
    #with do_cv=True only the fitted grid search is returned
    clf = run_single_subject_svm(sub_data,sub_labels,runs_train,runs_test,norm,do_cv=True)
    cv_dict[sub_id] = {'model': clf}
  #context manager closes the file even if pickling fails
  with open(destination_path,"wb") as filehandler:
    pickle.dump(cv_dict,filehandler)
  return cv_dict

In [None]:
save_data_path = '/content/drive/My Drive/data/dataexploration/'
# run_subject_model_cv pickles its results to the path it is given, so it needs a file path.
# Passing the folder itself makes the final open(..., "wb") fail after the long CV loop.
cv_results_path = f'{save_data_path}cv_results_run_2_t_3_4.pkl'
cv_results = run_subject_model_cv(data_path_dict, cv_results_path, (2,), (3,4), 'mask', 0, do_psc=False, norm='RUNS')

100%|██████████| 4/4 [00:00<00:00,  5.03it/s]
100%|██████████| 4/4 [00:00<00:00,  5.23it/s]
100%|██████████| 4/4 [00:00<00:00,  4.52it/s]
100%|██████████| 4/4 [00:00<00:00,  4.41it/s]
100%|██████████| 4/4 [00:01<00:00,  3.74it/s]
100%|██████████| 4/4 [00:00<00:00,  4.16it/s]
100%|██████████| 4/4 [00:01<00:00,  3.78it/s]
100%|██████████| 4/4 [00:00<00:00,  4.52it/s]
100%|██████████| 4/4 [00:01<00:00,  3.71it/s]
100%|██████████| 4/4 [00:00<00:00,  5.33it/s]
100%|██████████| 4/4 [00:00<00:00,  4.45it/s]
100%|██████████| 4/4 [00:00<00:00,  5.21it/s]
100%|██████████| 4/4 [00:00<00:00,  5.12it/s]
100%|██████████| 4/4 [00:00<00:00,  4.81it/s]
100%|██████████| 4/4 [00:00<00:00,  4.72it/s]
100%|██████████| 4/4 [00:00<00:00,  5.17it/s]
100%|██████████| 4/4 [00:00<00:00,  4.94it/s]
100%|██████████| 4/4 [00:00<00:00,  4.90it/s]
100%|██████████| 4/4 [00:00<00:00,  4.87it/s]
100%|██████████| 4/4 [00:01<00:00,  3.73it/s]
100%|██████████| 4/4 [00:00<00:00,  4.26it/s]
100%|██████████| 4/4 [00:00<00:00,