<a href="https://colab.research.google.com/github/yecatstevir/teambrainiac/blob/main/source/SVM_Whole_Brain.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Whole Brain Support Vector Machine Training
- Go to 'Runtime' in Colab browser bar, select 'Change Runtime Type', select 'High-RAM' from 'Runtime Shape'. 
- load local pickle file containing all masked, normalized Whole Brain subject data in numpy matrix format
- SVM training all subjects
- SVM training per subject

### Mount Google Drive and clone repository
- then change into the repository's `source` directory

In [1]:
# Mount Google Drive inside the Colab runtime at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive') # optionally pass force_remount=True as a second argument

Mounted at /content/gdrive


In [2]:
# Clone the teambrainiac repository into the current working directory.
!git clone -l -s https://github.com/yecatstevir/teambrainiac.git
# Move the notebook's working directory into the cloned repo's source folder,
# so later relative paths (e.g. "data/...") resolve from there.
%cd teambrainiac/source
# List the folder contents as a quick check that the clone succeeded.
!ls


Cloning into 'teambrainiac'...
remote: Enumerating objects: 300, done.[K
remote: Counting objects: 100% (300/300), done.[K
remote: Compressing objects: 100% (171/171), done.[K
remote: Total 300 (delta 172), reused 242 (delta 117), pack-reused 0[K
Receiving objects: 100% (300/300), 57.13 MiB | 13.86 MiB/s, done.
Resolving deltas: 100% (172/172), done.
/content/teambrainiac/source
Access_Load_Data.ipynb		  Mat_to_Numpy.ipynb
All_subject_masked_labeled.ipynb  percent_signal_change.ipynb
data				  SVM.ipynb
__init__.py			  utils.py
Masking.ipynb			  Visualize_Data.ipynb


### Load path_config.py 
- we are already in `source`, so we can upload this file here without changing directory

In [3]:
from google.colab import files

# Open Colab's upload dialog; the result is keyed by uploaded filename.
uploaded = files.upload()

# Report each uploaded file together with the length of its contents.
for fn, contents in uploaded.items():
  message = 'User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(contents))
  print(message)

Saving path_config.py to path_config.py
User uploaded file "path_config.py" with length 228 bytes


### Load the Whole brain normalized masked all subject 2d pickle file 
- go to Drive outside of this notebook
  - create a folder named data
  - upload 'whole_brain_all_norm_2d.pkl' - will take 5 hours but should remain on system without need to upload again
  - once uploaded, drag the file to the repo source/data directory

In [4]:
# List the current directory as a sanity check that the uploaded files are in place.
!ls

Access_Load_Data.ipynb		  path_config.py
All_subject_masked_labeled.ipynb  percent_signal_change.ipynb
data				  SVM.ipynb
__init__.py			  utils.py
Masking.ipynb			  Visualize_Data.ipynb
Mat_to_Numpy.ipynb


### Import libraries
- need to load path_config.py to source directory
- load 'whole_brain_all_norm_2d.pkl' to source/data directory

In [None]:
# Import libraries
!pip install boto3 nilearn
import pickle
from utils import *
from sklearn.svm import SVC
import numpy as np
import random

In [11]:
%%time 

# Load the subject data + labels dictionary from a local pickle file.
# NOTE(review): the active path below is the single-subject file; the
# all-subject alternatives are commented out. Confirm which file is intended
# before running the all-subject training.
local_data_path = "data/single_sub_norm_2d.pkl"
# Alternative inputs (uncomment exactly one to switch):
#local_data_path = "data/all_data_dictionary.pkl" # unnormalized
#local_data_path = "data/whole_brain_all_norm_2d.pkl" # normalized
#local_data_path = "data/all_data_norm_dict.pkl" #normalized on subject run
# open_pickle comes from the `from utils import *` import earlier in the notebook.
all_data = open_pickle(local_data_path)


CPU times: user 59.7 ms, sys: 334 ms, total: 394 ms
Wall time: 391 ms


In [12]:
# Open the path dictionary file to get the subject ids.
path = "data/data_path_dictionary.pkl"
data_path_dict = open_pickle(path)

# Work on a copy: random.shuffle reorders its argument in place, and shuffling
# the list stored inside data_path_dict would silently change that dictionary
# for every later cell that reads it.
subject_ids = list(data_path_dict['subject_ID'])

# Shuffle with a fixed seed so the train / val / test split is reproducible.
random.seed(42)
random.shuffle(subject_ids)

In [13]:
# Partition the shuffled ids into train / validation / test by position.
train_ids, val_ids, test_ids = (
    subject_ids[:36],
    subject_ids[36:44],
    subject_ids[44:53],
)

# Print the size of each split, then display the total number of subjects.
for split_ids in (train_ids, val_ids, test_ids):
    print(len(split_ids))
len(subject_ids)

36
8
8


52

In [14]:
def _collect_split(data, ids):
    """Gather the data matrices and labels for a group of subject ids.

    Parameters
    ----------
    data : mapping
        Indexed with each id to get that subject's matrices, and with
        "<id>_rt_labels" to get the matching labels.
    ids : iterable
        Subject ids to collect, in the order they should appear.

    Returns
    -------
    tuple of (list, list)
        Every matrix found under the given ids, and every label entry found
        under the corresponding "<id>_rt_labels" keys, in the order of `ids`.
    """
    matrices, labels = [], []
    for id_ in ids:
        # Each lookup yields a sequence; extend keeps the flat list layout
        # that element-by-element appends would produce.
        matrices.extend(data[id_])
        labels.extend(data[f"{id_}_rt_labels"])
    return matrices, labels


# Build the train / validation / test lists with one shared helper instead of
# repeating the same pair of loops for every split.
X_train, y_train = _collect_split(all_data, train_ids)
X_val, y_val = _collect_split(all_data, val_ids)
X_test, y_test = _collect_split(all_data, test_ids)

In [15]:
# Sanity check: within each split, the X list and the y list should have the
# same number of entries.
# Original note on the expected total across all splits:
#4 runs * 52 subjects = 208
# NOTE(review): that total presumably applies when an all-subject file is loaded — confirm.
print(f"length of Xtrain data: {len(X_train)} and length of ytrain data: {len(y_train)}")
print(f"length of Xval data: {len(X_val)} and length of yval data: {len(y_val)}")
print(f"length of Xtest data: {len(X_test)} and length of ytest data: {len(y_test)}")
#print(f"length of X data: {len(X)} and length of y data: {len(y)}")

length of Xtrain data: 4 and length of ytrain data: 4
length of Xval data: 0 and length of yval data: 0
length of Xtest data: 0 and length of ytest data: 0


In [16]:
# Convert the collected training lists to numpy arrays and report their shapes.
X_train, y_train = np.array(X_train), np.array(y_train)
for caption, arr in (("Xtrain data shape ", X_train), ("ytrain data shape ", y_train)):
    print(caption, arr.shape)


Xtrain data shape  (4, 84, 237979)
ytrain data shape  (4, 84)


In [17]:
# Join the entries of X_train / y_train end to end along the first axis, so the
# leading dimension is merged away.
# NOTE: this rebinds X_train / y_train, so the cell is not safe to re-run on
# its own output; re-run the cells above first.
X_train = np.concatenate(X_train, axis=0)
y_train = np.concatenate(y_train, axis=0)
print("Xtrain data shape after concantenation ", X_train.shape)
print("ytrain data shape after concantenation ", y_train.shape)

Xtrain data shape after concantenation  (336, 237979)
ytrain data shape after concantenation  (336,)


## Set up SVM Model

In [18]:
%%time
# Create a support vector classifier with no arguments (library defaults) and
# fit it on the training matrix and labels.
clf = SVC()
clf.fit(X=X_train, y=y_train)

CPU times: user 37.5 s, sys: 8.51 s, total: 46 s
Wall time: 6.46 s
