<a href="https://colab.research.google.com/github/yecatstevir/teambrainiac/blob/main/source/SVM_Group_Whole_Brain.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Group Whole Brain Support Vector Machine Training
- Go to 'Runtime' in Colab browser bar, select 'Change Runtime Type', select 'High-RAM' from 'Runtime Shape'. 
- Load the local pickle file containing all masked, normalized whole-brain subject data in NumPy matrix format
- Train one SVM per group (subject IDs of the form '100XX-XXXXX' for Child and '300XX-XXXXX' for Youth)

### Mount Google Drive and clone repository
- Change into the repository's `source` directory

In [1]:
# Mount Google Drive at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')  # pass force_remount=True to re-mount an existing mount

Mounted at /content/gdrive


In [2]:
# Clone the entire repo.
# NOTE(review): -l / -s are git's local-clone options (--local / --shared); they
# probably have no effect for an HTTPS remote — confirm and drop them if so.
!git clone -l -s https://github.com/yecatstevir/teambrainiac.git
# Change directory into the cloned repo's source folder and list its contents
%cd teambrainiac/source
!ls


Cloning into 'teambrainiac'...
remote: Enumerating objects: 356, done.[K
remote: Counting objects: 100% (356/356), done.[K
remote: Compressing objects: 100% (212/212), done.[K
remote: Total 356 (delta 211), reused 262 (delta 130), pack-reused 0[K
Receiving objects: 100% (356/356), 57.15 MiB | 47.46 MiB/s, done.
Resolving deltas: 100% (211/211), done.
/content/teambrainiac/source
Access_Load_Data.ipynb		  percent_signal_change.ipynb
All_subject_masked_labeled.ipynb  SVM_Group_Whole_Brain.ipynb
data				  SVM.ipynb
__init__.py			  SVM_Whole_Brain.ipynb
Masking.ipynb			  utils.py
Mat_to_Numpy.ipynb		  Visualize_Data.ipynb
models


### Load path_config.py 
- We are already in `source`, so we can upload this file without changing directory

In [3]:
from google.colab import files

# Open Colab's upload dialog; the result maps each file name to its contents.
uploaded = files.upload()

# Report the name and size of every uploaded file.
for fn, contents in uploaded.items():
  message = 'User uploaded file "{name}" with length {length} bytes'
  print(message.format(name=fn, length=len(contents)))

Saving path_config.py to path_config.py
User uploaded file "path_config.py" with length 228 bytes


### Load the Whole brain normalized masked all subject 2d pickle file 


In [None]:
# List the working directory; the uploaded path_config.py should now appear here.
!ls

Access_Load_Data.ipynb		  path_config.py
All_subject_masked_labeled.ipynb  percent_signal_change.ipynb
data				  SVM.ipynb
__init__.py			  SVM_Whole_Brain.ipynb
Masking.ipynb			  utils.py
Mat_to_Numpy.ipynb		  Visualize_Data.ipynb


#### Import libraries


In [4]:
# Import libraries
!pip install boto3 nilearn
# NOTE: original import order is kept on purpose — names imported after the
# star-import take precedence over anything `utils` happens to export.
import pickle
from utils import *
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import numpy as np
import random
import cv2 as cv

Collecting boto3
  Downloading boto3-1.21.23-py3-none-any.whl (132 kB)
[?25l[K     |██▌                             | 10 kB 31.6 MB/s eta 0:00:01[K     |█████                           | 20 kB 33.5 MB/s eta 0:00:01[K     |███████▍                        | 30 kB 39.4 MB/s eta 0:00:01[K     |██████████                      | 40 kB 26.2 MB/s eta 0:00:01[K     |████████████▍                   | 51 kB 20.5 MB/s eta 0:00:01[K     |██████████████▉                 | 61 kB 23.3 MB/s eta 0:00:01[K     |█████████████████▍              | 71 kB 23.4 MB/s eta 0:00:01[K     |███████████████████▉            | 81 kB 24.9 MB/s eta 0:00:01[K     |██████████████████████▎         | 92 kB 27.0 MB/s eta 0:00:01[K     |████████████████████████▊       | 102 kB 25.9 MB/s eta 0:00:01[K     |███████████████████████████▎    | 112 kB 25.9 MB/s eta 0:00:01[K     |█████████████████████████████▊  | 122 kB 25.9 MB/s eta 0:00:01[K     |████████████████████████████████| 132 kB 25.9 MB/s 
[?25h

### Load Data from AWS and create Train/Val/Test splits

In [5]:
%%time
# Pickle to load: the normalized whole-brain data.
pkl_file = "whole_brain_all_norm_2d.pkl" # normalized
# Alternative (unnormalized) file: 'all_data_dictionary.pkl'
# Flag forwarded to access_load_data — presumably marks whether the file is a
# .mat file; verify against utils.py.
bool_mat = False
# `data` is later indexed by subject id (signal) and by f"{id}_rt_labels" (labels).
data = access_load_data(pkl_file, bool_mat)

CPU times: user 1min 2s, sys: 1min 15s, total: 2min 17s
Wall time: 2min 4s


In [6]:
 #open path dictionary file to get subject ids
# Pickled dictionary of data paths; only its 'subject_ID' entry is used here.
path = "data/data_path_dictionary.pkl"
data_path_dict = open_pickle(path)
# Sequence of subject ids, sliced into groups by position in later cells.
subject_ids = data_path_dict['subject_ID']



In [7]:
# Split dataset into young children and older children/young adults.
# The first 33 subject ids form the child group, the remainder the teen+ group
# (assumes subject_ids lists the child ids first — TODO confirm).
N_CHILD_SUBJECTS = 33
child, teen_plus = subject_ids[:N_CHILD_SUBJECTS], subject_ids[N_CHILD_SUBJECTS:]
print("Number of subjects for child and teens", len(child), len(teen_plus))

Number of subjects for child and teens 33 19


#### Get Child data: train/validation/test split

In [8]:
# Partition the child subject ids into train / validation / test lists.
c_train_ids = child[:23]
c_val_ids = child[23:28]
# Open-ended slice: the previous end bound (34) ran past the end of the list.
c_test_ids = child[28:]

# Every child subject must land in exactly one split.
assert len(c_train_ids) + len(c_val_ids) + len(c_test_ids) == len(child)

print("Number of child training examples: ", len(c_train_ids))
print("Number of child validation examples: ", len(c_val_ids))
print("Number of child testing examples: ", len(c_test_ids))
len(child)

Number of child training examples:  23
Number of child validation examples:  5
Number of child testing examples:  5


33

In [9]:
# Gather the signal and the labels for every child split.
# Each resulting list holds one entry per subject.
run = 1 # Get data from run #2 (zero-based index)

# Training split: X from data[id], y from data["<id>_rt_labels"]
X_trainc = [data[id_][run] for id_ in c_train_ids]
y_trainc = [data[f"{id_}_rt_labels"][run] for id_ in c_train_ids]

# Validation split
X_valc = [data[id_][run] for id_ in c_val_ids]
y_valc = [data[f"{id_}_rt_labels"][run] for id_ in c_val_ids]

# Test split
X_testc = [data[id_][run] for id_ in c_test_ids]
y_testc = [data[f"{id_}_rt_labels"][run] for id_ in c_test_ids]

In [10]:
# Sanity check: at this point each list holds one entry per subject.
# Across the whole child group: 1 run * 33 subjects * 84 time points = 2772 total time points
print(f"length of Child Xtrain data: {len(X_trainc)} and length of ytrain data: {len(y_trainc)}")
print(f"length of Child Xval data: {len(X_valc)} and length of yval data: {len(y_valc)}")
print(f"length of Child Xtest data: {len(X_testc)} and length of ytest data: {len(y_testc)}")

length of Child Xtrain data: 23 and length of ytrain data: 23
length of Child Xval data: 5 and length of yval data: 5
length of Child Xtest data: 5 and length of ytest data: 5


In [11]:
%%time
X_trainc = np.array(X_trainc)
y_trainc = np.array(y_trainc)
print( "Child Xtrain data shape ", X_trainc.shape)
print( "Child ytrain data shape ", y_trainc.shape)

X_valc = np.array(X_valc)
y_valc = np.array(y_valc)
print( "Child Xval data shape ", X_valc.shape)
print( "Child yval data shape ", y_valc.shape)

X_testc = np.array(X_testc)
y_testc = np.array(y_testc)
print( "Child Xtest data shape ", X_testc.shape)
print( "Child ytest data shape ", y_testc.shape)

Child Xtrain data shape  (23, 84, 237979)
Child ytrain data shape  (23, 84)
Child Xval data shape  (5, 84, 237979)
Child yval data shape  (5, 84)
Child Xtest data shape  (5, 84, 237979)
Child ytest data shape  (5, 84)
CPU times: user 785 ms, sys: 708 ms, total: 1.49 s
Wall time: 1.48 s


In [12]:
%%time
X_trainc = np.concatenate(X_trainc)
y_trainc = np.concatenate(y_trainc)
print( "Child Xtrain data shape after concantenation ", X_trainc.shape)
print( "Child ytrain data shape after concantenation ", y_trainc.shape)

X_valc = np.concatenate(X_valc)
y_valc = np.concatenate(y_valc)
print( "Child Xval data shape after concantenation ", X_valc.shape)
print( "Child yval data shape after concantenation ", y_valc.shape)

X_testc = np.concatenate(X_testc)
y_testc = np.concatenate(y_testc)
print( "Child Xtest data shape after concantenation ", X_testc.shape)
print( "Child ytest data shape after concantenation ", y_testc.shape)

Child Xtrain data shape after concantenation  (1932, 237979)
Child ytrain data shape after concantenation  (1932,)
Child Xval data shape after concantenation  (420, 237979)
Child yval data shape after concantenation  (420,)
Child Xtest data shape after concantenation  (420, 237979)
Child ytest data shape after concantenation  (420,)
CPU times: user 551 ms, sys: 442 ms, total: 993 ms
Wall time: 983 ms


#### Get Teen+ data: train/validation/test split

In [40]:
# Partition the Youth (teen+) subject ids into train / validation / test lists.
# NOTE: the `y_` prefix on these names stands for "youth", not for labels.
y_train_ids = teen_plus[:13]
y_val_ids = teen_plus[13:16]
# Open-ended slice: the previous hard-coded end bound (20) ran past the end of
# the list and would silently drop subjects if the group ever grew beyond it.
y_test_ids = teen_plus[16:]

# Every Youth subject must land in exactly one split.
assert len(y_train_ids) + len(y_val_ids) + len(y_test_ids) == len(teen_plus)

print("Number of Youth training examples: ", len(y_train_ids))
print("Number of Youth validation examples: ", len(y_val_ids))
print("Number of Youth testing examples: ", len(y_test_ids))
len(teen_plus)

Number of Youth training examples:  13
Number of Youth validation examples:  3
Number of Youth testing examples:  3


19

In [41]:
# Gather the signal and the labels for every Youth split.
# Unlike a per-subject list, each element of a subject's run is added
# individually, so the lists hold one entry per time point.
run = 1 # get run #2 (zero-based index)

# Training split: X from data[id], y from data["<id>_rt_labels"]
X_trainy = [matrix for id_ in y_train_ids for matrix in data[id_][run]]
y_trainy = [label for id_ in y_train_ids for label in data[f"{id_}_rt_labels"][run]]

# Validation split
X_valy = [matrix for id_ in y_val_ids for matrix in data[id_][run]]
y_valy = [label for id_ in y_val_ids for label in data[f"{id_}_rt_labels"][run]]

# Test split
X_testy = [matrix for id_ in y_test_ids for matrix in data[id_][run]]
y_testy = [label for id_ in y_test_ids for label in data[f"{id_}_rt_labels"][run]]
        
      

In [42]:
# Sanity check: here each list holds one entry per time point.
# Across the whole Youth group: 1 run * 19 subjects * 84 time points = 1596 time points total
print(f"length of Youth Xtrain data: {len(X_trainy)} and length of ytrain data: {len(y_trainy)}")
print(f"length of Youth Xval data: {len(X_valy)} and length of yval data: {len(y_valy)}")
print(f"length of Youth Xtest data: {len(X_testy)} and length of ytest data: {len(y_testy)}")

length of Youth Xtrain data: 1092 and length of ytrain data: 1092
length of Youth Xval data: 252 and length of yval data: 252
length of Youth Xtest data: 252 and length of ytest data: 252


In [47]:
%%time
X_trainy = np.array(X_trainy)
y_trainy = np.array(y_trainy)
print( "Youth Xtrain data shape ", X_trainy.shape)
print( "Youth ytrain data shape ", y_trainy.shape)

X_valy = np.array(X_valy)
y_valy = np.array(y_valy)
print( "Youth Xval data shape ", X_valy.shape)
print( "Youth yval data shape ", y_valy.shape)

X_testy = np.array(X_testy)
y_testy = np.array(y_testy)
print( "Youth Xtest data shape ", X_testy.shape)
print( "Youth ytest data shape ", y_testy.shape)

Youth Xtrain data shape  (1092, 237979)
Youth ytrain data shape  (1092,)
Youth Xval data shape  (252, 237979)
Youth yval data shape  (252,)
Youth Xtest data shape  (252, 237979)
Youth ytest data shape  (252,)
CPU times: user 748 ms, sys: 0 ns, total: 748 ms
Wall time: 743 ms


## Set up SVM Model

#### SKlearn model training on Group data for Child

In [22]:
%%time

model_name = "group_child_svm_10000"
clf = SVC(max_iter = 10000, random_state = 42)
clf.fit(X_trainc, y_trainc)
f = open("models/%s.pkl"%model_name,"wb")
pickle.dump(clf, f)
f.close()

CPU times: user 22min 16s, sys: 5min 54s, total: 28min 11s
Wall time: 3min 56s


In [23]:
%%time
# Predict labels for the child validation split with the fitted classifier.
yval_pred = clf.predict(X_valc)


CPU times: user 17min 36s, sys: 44min 21s, total: 1h 1min 58s
Wall time: 7min 49s


In [24]:
# Validation accuracy of the child model (accuracy_score is imported in the
# notebook's import cell).
print("Accuracy:", accuracy_score(y_valc, yval_pred))

Accuracy: 0.5857142857142857


In [25]:
%%time
# Predict labels for the child test split with the fitted classifier.
ytest_pred = clf.predict(X_testc)

# Test-set accuracy of the child model
print("Accuracy:", accuracy_score(y_testc, ytest_pred))

Accuracy: 0.5571428571428572
CPU times: user 17min 48s, sys: 45min 2s, total: 1h 2min 51s
Wall time: 7min 58s


#### LibSVM

#### OpenCV library
- https://docs.opencv.org/3.4/d1/d73/tutorial_introduction_to_svm.html


In [None]:
%%time

# Train the SVM using openCV
svm = cv.ml.SVM_create()
svm.setType(cv.ml.SVM_C_SVC)
svm.setKernel(cv.ml.SVM_LINEAR)
svm.setTermCriteria((cv.TERM_CRITERIA_MAX_ITER, 100, 1e-6))
svm.train(X_train, cv.ml.ROW_SAMPLE, y_train)
