<a href="https://colab.research.google.com/github/yecatstevir/teambrainiac/blob/main/source/Group_All_MASK_SVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Group Masked Brain Support Vector Machine Training
## Young Adult and Adolescent 
### whole brain, subACC, subAI, N.Accumbens, Prefrontal Cortex
- Go to 'Runtime' in Colab browser bar, select 'Change Runtime Type', select 'High-RAM' from 'Runtime Shape'. 
- load local pickle file containing all masked, normalized Whole Brain subject data in numpy matrix format
- SVM training per group (subject ids of '100XX-XXXXX' for Adolescent (child) and '300XX-XXXXX' for Young Adult)

### Mount Google Drive and clone repository
- open to source directory

In [1]:
# Mount Google Drive inside the Colab VM at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')  # optionally pass force_remount=True to force a fresh mount

Mounted at /content/gdrive


In [2]:
# Clone the entire repo.
# NOTE(review): -l/-s are git's local-clone options; presumably they have no effect for an https URL — confirm.
!git clone -l -s https://github.com/yecatstevir/teambrainiac.git
# Change directory into the cloned repo's source folder (later cells use paths relative to it)
%cd teambrainiac/source
# List the folder contents as a quick check that the clone worked
!ls


Cloning into 'teambrainiac'...
remote: Enumerating objects: 534, done.[K
remote: Counting objects: 100% (534/534), done.[K
remote: Compressing objects: 100% (365/365), done.[K
remote: Total 534 (delta 321), reused 322 (delta 155), pack-reused 0[K
Receiving objects: 100% (534/534), 60.88 MiB | 13.35 MiB/s, done.
Resolving deltas: 100% (321/321), done.
/content/teambrainiac/source
Access_Load_Data.ipynb		  Mat_to_Numpy.ipynb
AccuracyMeasures.ipynb		  models
All_subject_masked_labeled.ipynb  percent_signal_change.ipynb
cross_validation.py		  process.py
data				  SingleSubjectSVM.ipynb
Explore_data.ipynb		  SVM_Group_Child_Whole_Brain.ipynb
explore.py			  SVM_Group_YA_Whole_brain.ipynb
Images				  utils.py
__init__.py			  Visualize_Data.ipynb
Masking.ipynb


### Load path_config.py 
- we are already in `source`, so we can load this file without changing directory

In [3]:
from google.colab import files

# Ask the user to pick the local file(s) to upload into the current directory.
uploaded = files.upload()

# Echo every uploaded file name together with its size in bytes.
for fn, payload in uploaded.items():
  message = 'User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(payload))
  print(message)

Saving path_config.py to path_config.py
User uploaded file "path_config.py" with length 228 bytes


#### Import libraries


In [4]:
# Import libraries
!pip install boto3 nilearn
import pickle
from utils import *
from process import *
from cross_validation import *
from sklearn.svm import SVC
import numpy as np
from sklearn.metrics import accuracy_score
import pandas as pd
import tqdm


from botocore.exceptions import ClientError
from tempfile import TemporaryFile
from collections import defaultdict

Collecting boto3
  Downloading boto3-1.21.33-py3-none-any.whl (132 kB)
[?25l[K     |██▌                             | 10 kB 23.9 MB/s eta 0:00:01[K     |█████                           | 20 kB 9.7 MB/s eta 0:00:01[K     |███████▍                        | 30 kB 7.9 MB/s eta 0:00:01[K     |██████████                      | 40 kB 3.5 MB/s eta 0:00:01[K     |████████████▍                   | 51 kB 3.5 MB/s eta 0:00:01[K     |██████████████▉                 | 61 kB 4.2 MB/s eta 0:00:01[K     |█████████████████▎              | 71 kB 4.5 MB/s eta 0:00:01[K     |███████████████████▉            | 81 kB 4.5 MB/s eta 0:00:01[K     |██████████████████████▎         | 92 kB 5.0 MB/s eta 0:00:01[K     |████████████████████████▊       | 102 kB 4.2 MB/s eta 0:00:01[K     |███████████████████████████▏    | 112 kB 4.2 MB/s eta 0:00:01[K     |█████████████████████████████▊  | 122 kB 4.2 MB/s eta 0:00:01[K     |████████████████████████████████| 132 kB 4.2 MB/s 
[?25hCollecting n

# Young Adults
- Create function to upload classifiers to AWS
- Create function to transform the data into train, validation, test sets that are z-score normalized and concatenated to be ready for SVM
- Perform SVM()

In [22]:
# modified from S3 boto3 documentation: https://boto3.amazonaws.com/v1/documentation/api/latest/guide/s3-uploading-files.html

def s3_upload(data, object_name, data_type):
    """Serialize ``data`` to a temporary file and upload it to the 'teambrainiac' S3 bucket.

    :param data: Object to upload. For "pickle" any picklable object; for
        "numpy" anything ``np.save`` accepts; for "csv" an object with a
        ``to_csv`` method (e.g. a pandas DataFrame) or anything
        ``pd.DataFrame`` can be built from (e.g. a dict of equal-length lists).
    :param object_name: S3 object key the file is stored under.
    :param data_type: One of "pickle", "numpy" or "csv"; selects the serializer.
    :return: True if the file was uploaded, False if a ClientError was raised.
    :raises ValueError: If ``data_type`` is not one of the supported formats.
    """
    # Imported locally so the function does not depend on these names leaking
    # in through one of the notebook's star imports.
    import logging
    import os
    import tempfile

    supported = ("pickle", "numpy", "csv")
    if data_type not in supported:
        # Previously an unknown type silently uploaded an empty file.
        raise ValueError(
            f"Unsupported data_type {data_type!r}; expected one of {supported}")

    # Connect to AWS client
    # NOTE(review): `boto3` and `mat_path` are not imported in this notebook
    # directly; presumably they arrive via `from utils import *` — confirm.
    pubkey = mat_path['ACCESS_KEY']
    seckey = mat_path['SECRET_KEY']
    client = boto3.client('s3', aws_access_key_id = pubkey, aws_secret_access_key = seckey)
    bucket_name = 'teambrainiac'

    # delete=False so the file survives being closed and can be uploaded by
    # path; it is removed explicitly in the `finally` clause below.
    temp = tempfile.NamedTemporaryFile(delete=False)
    try:
        with temp:
            if data_type == "pickle":
                pickle.dump(data, temp)
            elif data_type == "numpy":
                np.save(temp, data)
            else:  # "csv"
                # Accept plain dict-like results as well as DataFrames.
                frame = data if hasattr(data, "to_csv") else pd.DataFrame(data)
                # Render to text first: the temp file is opened in binary mode.
                temp.write(frame.to_csv(index=False).encode("utf-8"))

        # The file is closed (and flushed) here, so the upload sees all bytes.
        client.upload_file(temp.name, bucket_name, object_name)
        print(f"upload complete for {object_name}")

    except ClientError as e:
        logging.error(e)
        return False

    finally:
        # Do not leave one stray temp file behind per upload.
        if os.path.exists(temp.name):
            os.remove(temp.name)

    return True

In [12]:
def run_model(data, mask_type, group_sub_ids, runs_train, runs_val, runs_test, norm, svm_type):
  """Fit an SVC on the training split, upload it, and score it on validation and test.

  The six arrays come from ``transform_data``. The fitted classifier is sent
  to S3 through ``s3_upload`` as a pickle whose name is built from
  ``svm_type``, the three run indices and ``mask_type``.

  Returns a ``(val_acc, test_acc)`` tuple of accuracy scores.
  """
  splits = transform_data(data, group_sub_ids, runs_train, runs_val, runs_test, norm)
  X_train, y_train, X_val, y_val, X_test, y_test = splits

  svm = SVC(C = 5.0 , class_weight= 'balanced', max_iter = 1000, random_state = 42) #probability = True
  print(f"Fitting the model for {mask_type}...")
  svm.fit(X_train, y_train)

  # Persist the trained classifier before evaluating it.
  model_name = f"{svm_type}_{runs_train}_{runs_val}_{runs_test}_{mask_type}"
  s3_upload(svm, "%s.pkl"%model_name, 'pickle')

  print("Predicting on Validation set...")
  val_acc = accuracy_score(y_val, svm.predict(X_val))
  print("Validation Accuracy:", val_acc)

  print("Predicting on Test set...")
  test_acc = accuracy_score(y_test, svm.predict(X_test))
  print("Test Accuracy:", test_acc)

  return val_acc, test_acc


## Run all Young Adult Train/Val/Test 
- Loop through all the masks
- fit to train
- upload clf to AWS
- calculate the accuracies on Validation and Test sets
- store accuracies in a dictionary, load in pandas and save as csv

In [18]:
# (pickle file to load, short mask label used in model names and the results table)
file_name = [('whole_brain_all_norm_2d.pkl', "mask"),
            ('all_data_masksubACC_norm_2d.pkl', "subacc"),
            ('all_data_masksubAI_norm_2d.pkl', "subAI"),
            ('all_data_masksubNAcc_norm_2d.pkl', "Naccumb"),
            ('all_data_masksubmPFC_norm_2d.pkl', "PFC")
            ]

#open path dictionary file to get subject ids
path = "data/data_path_dictionary.pkl"
data_path_dict = open_pickle(path)
subject_ids = data_path_dict['subject_ID']


bool_mat = False
# NOTE(review): assumes the first 33 ids are Adolescent and the rest Young Adult — confirm ordering
ya = subject_ids[33:]
print("Number of YA subjects: ", len(ya))
group_sub_ids = (ya, ya, ya) # same subjects for train/val/test; the splits differ by run
svm_type = "YA_svm_runs" #other: "Adolescent_svm_runs"
runs_train = 1 #train on run 2
runs_val = 2 #val on run 3
runs_test = 3 #test on run 4 (zero indexing)
norm = "RUNS"

acc_dict = defaultdict(list)

for pkl_file, mask_type in tqdm.tqdm(file_name):
  print("Running", pkl_file, mask_type)
  data = access_load_data(pkl_file, bool_mat)
  val_acc, test_acc = run_model(data, mask_type, group_sub_ids, runs_train, runs_val, runs_test, norm, svm_type)
  del data #conserve RAM space before the next mask is loaded
  acc_dict[f"{svm_type} Mask Type"].append(mask_type)
  acc_dict["Validation Accuracy"].append(val_acc)
  acc_dict['Test Accuracy'].append(test_acc)
  acc_dict['Normalize on'].append(norm)
  acc_dict['Train/Val/Test Runs'].append(f"run {runs_train}/run {runs_val}/run {runs_test}")


# Upload to S3
s3_upload(acc_dict, f"{svm_type}_acc_score.pkl", "pickle")
# Also store the table as csv: the next cell reads "<svm_type>_acc_score.csv",
# which this cell previously never wrote.
acc_df = pd.DataFrame(acc_dict)
s3_upload(acc_df, f"{svm_type}_acc_score.csv", "csv")

  0%|          | 0/5 [00:00<?, ?it/s]

Running whole_brain_all_norm_2d.pkl mask
Normalizing Each based on RUNS...
X train data shape after concantenation (84, 237979)
y train data shape after concantenation (84,)
X test data shape after concantenation (84, 237979)
y test data shape after concantenation (84,)
X val data shape after concantenation (84, 237979)
y val data shape after concantenation (84,)
Final X Train data shape (84, 237979)
Final y Train data shape  (84,)
Final X Val data shape (84, 237979)
Final y Val data shape  (84,)
Final X Test data shape (84, 237979)
Final y Test data shape  (84,)
Fitting the model for mask...
upload complete for YA_svm_runs_1_2_3_mask.pkl
Predicting on Validation set...
Validation Accuracy: 0.8571428571428571
Predicting on Test set...


 20%|██        | 1/5 [02:51<11:25, 171.41s/it]

Test Accuracy: 0.7619047619047619
Running all_data_masksubACC_norm_2d.pkl subacc
Normalizing Each based on RUNS...
X train data shape after concantenation (84, 235266)
y train data shape after concantenation (84,)
X test data shape after concantenation (84, 235266)
y test data shape after concantenation (84,)
X val data shape after concantenation (84, 235266)
y val data shape after concantenation (84,)
Final X Train data shape (84, 235266)
Final y Train data shape  (84,)
Final X Val data shape (84, 235266)
Final y Val data shape  (84,)
Final X Test data shape (84, 235266)
Final y Test data shape  (84,)
Fitting the model for subacc...
upload complete for YA_svm_runs_1_2_3_subacc.pkl
Predicting on Validation set...
Validation Accuracy: 0.8571428571428571
Predicting on Test set...


 40%|████      | 2/5 [05:54<08:55, 178.54s/it]

Test Accuracy: 0.7619047619047619
Running all_data_masksubAI_norm_2d.pkl subAI
Normalizing Each based on RUNS...
X train data shape after concantenation (84, 237722)
y train data shape after concantenation (84,)
X test data shape after concantenation (84, 237722)
y test data shape after concantenation (84,)
X val data shape after concantenation (84, 237722)
y val data shape after concantenation (84,)
Final X Train data shape (84, 237722)
Final y Train data shape  (84,)
Final X Val data shape (84, 237722)
Final y Val data shape  (84,)
Final X Test data shape (84, 237722)
Final y Test data shape  (84,)
Fitting the model for subAI...
upload complete for YA_svm_runs_1_2_3_subAI.pkl
Predicting on Validation set...
Validation Accuracy: 0.8571428571428571
Predicting on Test set...


 60%|██████    | 3/5 [08:58<06:01, 180.83s/it]

Test Accuracy: 0.7619047619047619
Running all_data_masksubNAcc_norm_2d.pkl Naccumb
Normalizing Each based on RUNS...
X train data shape after concantenation (84, 237252)
y train data shape after concantenation (84,)
X test data shape after concantenation (84, 237252)
y test data shape after concantenation (84,)
X val data shape after concantenation (84, 237252)
y val data shape after concantenation (84,)
Final X Train data shape (84, 237252)
Final y Train data shape  (84,)
Final X Val data shape (84, 237252)
Final y Val data shape  (84,)
Final X Test data shape (84, 237252)
Final y Test data shape  (84,)
Fitting the model for Naccumb...
upload complete for YA_svm_runs_1_2_3_Naccumb.pkl
Predicting on Validation set...
Validation Accuracy: 0.8571428571428571
Predicting on Test set...


 80%|████████  | 4/5 [12:16<03:07, 187.45s/it]

Test Accuracy: 0.7619047619047619
Running all_data_masksubmPFC_norm_2d.pkl PFC
Normalizing Each based on RUNS...
X train data shape after concantenation (84, 237722)
y train data shape after concantenation (84,)
X test data shape after concantenation (84, 237722)
y test data shape after concantenation (84,)
X val data shape after concantenation (84, 237722)
y val data shape after concantenation (84,)
Final X Train data shape (84, 237722)
Final y Train data shape  (84,)
Final X Val data shape (84, 237722)
Final y Val data shape  (84,)
Final X Test data shape (84, 237722)
Final y Test data shape  (84,)
Fitting the model for PFC...
upload complete for YA_svm_runs_1_2_3_PFC.pkl
Predicting on Validation set...
Validation Accuracy: 0.8571428571428571
Predicting on Test set...


100%|██████████| 5/5 [15:26<00:00, 185.36s/it]

Test Accuracy: 0.7619047619047619





In [None]:
# Fetch "<svm_type>_acc_score.csv" through access_load_data and display it;
# the object must already have been uploaded under exactly this name.
access_load_data(f'{svm_type}_acc_score.csv', False)

## Run all Adolescent Train/Val/Test 
- Loop through all the masks
- fit to train
- upload clf to AWS
- calculate the accuracies on Validation and Test sets
- store accuracies in a dictionary, load in pandas and save as csv

In [None]:
# (pickle file to load, short mask label used in model names and the results table)
file_name = [('whole_brain_all_norm_2d.pkl', "mask"),
            ('all_data_masksubACC_norm_2d.pkl', "subacc"),
            ('all_data_masksubAI_norm_2d.pkl', "subAI"),
            ('all_data_masksubNAcc_norm_2d.pkl', "Naccumb"),
            ('all_data_masksubmPFC_norm_2d.pkl', "PFC")
            ]

#open path dictionary file to get subject ids
path = "data/data_path_dictionary.pkl"
data_path_dict = open_pickle(path)
subject_ids = data_path_dict['subject_ID']


bool_mat = False
# NOTE(review): assumes the first 33 ids are the Adolescent subjects — confirm ordering
adol = subject_ids[:33]
print("Number of Adolescent subjects: ", len(adol))
group_sub_ids = (adol, adol, adol) # same subjects for train/val/test; the splits differ by run
svm_type = "Adolescent_svm_runs" #"YA_svm_runs" "Adolescent_svm_runs"
runs_train = 1 #train on run 2
runs_val = 2 #val on run 3
runs_test = 3 #test on run 4 (zero indexing)
norm = "RUNS"

acc_dict = defaultdict(list)

for pkl_file, mask_type in tqdm.tqdm(file_name):
  print("Running", pkl_file, mask_type)
  data = access_load_data(pkl_file, bool_mat)
  val_acc, test_acc = run_model(data, mask_type, group_sub_ids, runs_train, runs_val, runs_test, norm, svm_type)
  del data #conserve RAM space before the next mask is loaded
  acc_dict[f"{svm_type} Mask Type"].append(mask_type)
  acc_dict["Validation Accuracy"].append(val_acc)
  acc_dict['Test Accuracy'].append(test_acc)
  acc_dict['Normalize on'].append(norm)
  acc_dict['Train/Val/Test Runs'].append(f"run {runs_train}/run {runs_val}/run {runs_test}")


# Upload to S3
# acc_dict is a defaultdict and has no .to_csv, so convert it to a DataFrame
# before handing it to the csv branch of s3_upload.
acc_df = pd.DataFrame(acc_dict)
s3_upload(acc_df, f"{svm_type}_acc_score.csv", "csv")

In [5]:
# Fetch "<svm_type>_acc_score.csv" through access_load_data and display it.
# NOTE(review): which group's table is shown depends on the value `svm_type`
# holds when this cell runs — confirm it matches the section it sits under.
access_load_data(f'{svm_type}_acc_score.csv', False)


Unnamed: 0,YA_svm_runs Mask Type,Validation Accuracy,Test Accuracy,Normalize on,Train/Val/Test Runs
0,mask,0.857143,0.761905,RUNS,run 1/run 2/run 3
1,subacc,0.857143,0.761905,RUNS,run 1/run 2/run 3
2,subAI,0.857143,0.761905,RUNS,run 1/run 2/run 3
3,Naccumb,0.857143,0.761905,RUNS,run 1/run 2/run 3
4,PFC,0.857143,0.761905,RUNS,run 1/run 2/run 3


In [None]:
# Cross-validation accuracy scores, recorded as hard-coded values (one entry per mask).
dictionary = {
    "Mask Type Young Adult": [
        "Whole Brain", "SubACC", "SubAI", "Nucleus Accumbens", "Prefrontal Cortex",
    ],
    "Validation": [
        0.753968253968254, 0.753968253968254, 0.753968253968254,
        0.7579365079365079, 0.7619047619047619,
    ],
    "Test": [
        0.800125313283208, 0.8020050125313283, 0.7969924812030075,
        0.7994987468671679, 0.7976190476190477,
    ],
}

# Build the frame and index it by mask name in one expression.
df = pd.DataFrame(dictionary).set_index("Mask Type Young Adult")
df

Unnamed: 0_level_0,Validation,Test
Mask Type Young Adult,Unnamed: 1_level_1,Unnamed: 2_level_1
Whole Brain,0.753968,0.800125
SubACC,0.753968,0.802005
SubAI,0.753968,0.796992
Nucleus Accumbens,0.757937,0.799499
Prefrontal Cortex,0.761905,0.797619
