<a href="https://colab.research.google.com/github/yecatstevir/teambrainiac/blob/main/source/helper/Group_All_MASK_SVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Group Masked Brain Support Vector Machine Training
## Young Adult and Adolescent 
### whole brain, subACC, subAI, N.Accumbens, Prefrontal Cortex
- Go to 'Runtime' in Colab browser bar, select 'Change Runtime Type', select 'High-RAM' from 'Runtime Shape'. 
- load local pickle file containing all masked, normalized Whole Brain subject data in numpy matrix format
- SVM training per group (subject ids of '100XX-XXXXX' for Adolescent (child) and '300XX-XXXXX' for Young Adult)

### Mount Google Drive and clone repository
- open to source directory

In [None]:
# Colab-only setup: `drive` mounts Google Drive at /content/gdrive and `files`
# is used further down for the browser upload widget.
from google.colab import drive, files
drive.mount('/content/gdrive')#, force_remount = True)

Mounted at /content/gdrive


In [None]:
# Clone the entire repo.
# (-l / -s are git's --local / --shared clone options; presumably they have no
# effect for a remote https URL -- verify before relying on them.)
!git clone -l -s https://github.com/yecatstevir/teambrainiac.git
# Change directory into cloned repo
# NOTE(review): the saved output of this cell shows /content/teambrainiac/source,
# while this line targets source/group_svm -- confirm which directory holds
# utils.py, process.py, cross_validation.py and data/ that later cells rely on.
%cd teambrainiac/source/group_svm
# List the working directory so the available modules and notebooks are visible.
!ls


Cloning into 'teambrainiac'...
remote: Enumerating objects: 534, done.[K
remote: Counting objects: 100% (534/534), done.[K
remote: Compressing objects: 100% (365/365), done.[K
remote: Total 534 (delta 321), reused 322 (delta 155), pack-reused 0[K
Receiving objects: 100% (534/534), 60.88 MiB | 13.35 MiB/s, done.
Resolving deltas: 100% (321/321), done.
/content/teambrainiac/source
Access_Load_Data.ipynb		  Mat_to_Numpy.ipynb
AccuracyMeasures.ipynb		  models
All_subject_masked_labeled.ipynb  percent_signal_change.ipynb
cross_validation.py		  process.py
data				  SingleSubjectSVM.ipynb
Explore_data.ipynb		  SVM_Group_Child_Whole_Brain.ipynb
explore.py			  SVM_Group_YA_Whole_brain.ipynb
Images				  utils.py
__init__.py			  Visualize_Data.ipynb
Masking.ipynb


### Load path_config.py 
- we are already in source so we can just load this file without changing directory

In [None]:
# Open Colab's upload widget and report the name and size of each file received.
uploaded = files.upload()
message = 'User uploaded file "{name}" with length {length} bytes'
for fn, contents in uploaded.items():
  print(message.format(name=fn, length=len(contents)))

Saving path_config.py to path_config.py
User uploaded file "path_config.py" with length 228 bytes


#### Import libraries


In [None]:
# Import libraries
!pip install boto3 nilearn
import pickle
from utils import *
from process import *
from cross_validation import *
from sklearn.svm import SVC
import numpy as np
from sklearn.metrics import accuracy_score
import pandas as pd
import tqdm


from botocore.exceptions import ClientError
from tempfile import TemporaryFile
from collections import defaultdict



# Young Adults
- Create function to upload classifiers to AWS
- Create function to transform the data into train, validation, test sets that are z-score normalized and concatenated to be ready for SVM
- Perform SVM()

In [None]:
# modified from S3 boto3 documentation: https://boto3.amazonaws.com/v1/documentation/api/latest/guide/s3-uploading-files.html

def s3_upload(data, object_name, data_type):
    """Serialize ``data`` to a temporary file and upload it to the 'teambrainiac' S3 bucket.

    :param data: object to upload. It is written with ``pickle.dump`` for
        "pickle", ``np.save`` for "numpy", ``data.to_csv(..., index=False)``
        for "csv" and ``nib.save`` for "nifti".
    :param object_name: S3 object key to upload to.
    :param data_type: one of "pickle", "numpy", "csv" or "nifti".
    :return: True if the file was uploaded, else False (upload error or
        unsupported ``data_type``).

    ``boto3``, ``nib`` and ``mat_path`` are not imported in this notebook cell;
    presumably they come from the notebook's star imports -- TODO confirm.
    """
    # Local imports: the notebook's import cell does not provide these names.
    import logging
    import os
    import tempfile

    from boto3.exceptions import S3UploadFailedError

    supported_types = ("pickle", "numpy", "csv", "nifti")
    if data_type not in supported_types:
        # Previously an unknown type silently uploaded an empty object.
        logging.error("Unsupported data_type %r; expected one of %s", data_type, supported_types)
        return False

    # Connect to AWS client
    pubkey = mat_path['ACCESS_KEY']
    seckey = mat_path['SECRET_KEY']
    client = boto3.client('s3', aws_access_key_id=pubkey, aws_secret_access_key=seckey)
    bucket_name = 'teambrainiac'

    # nib.save is given a path ending in .nii, as the original hard-coded path was.
    suffix = ".nii" if data_type == "nifti" else ""
    temp_path = None
    try:
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as temp:
            temp_path = temp.name
            if data_type == "pickle":
                pickle.dump(data, temp)
            elif data_type == "numpy":
                np.save(temp, data)
        # The handle is closed here, so everything written above is flushed to
        # disk before the file is uploaded by path.

        # These writers take a path rather than an open binary handle.
        if data_type == "csv":
            data.to_csv(temp_path, index=False)
        elif data_type == "nifti":
            nib.save(data, temp_path)

        client.upload_file(temp_path, bucket_name, object_name)
        print(f"upload complete for {object_name}")

    except (ClientError, S3UploadFailedError) as e:
        # upload_file reports failed transfers as S3UploadFailedError.
        logging.error(e)
        return False

    finally:
        # delete=False leaves the file behind; remove it once we are done.
        if temp_path is not None and os.path.exists(temp_path):
            os.remove(temp_path)

    return True

In [None]:
def run_grp_svm_model(data, mask_type, group_sub_ids, runs_train, runs_val, runs_test, norm, svm_type):
  """Fit one group SVM for a mask, upload the model and reports, and score it.

  The arguments ``data``, ``group_sub_ids``, ``runs_train``, ``runs_val``,
  ``runs_test`` and ``norm`` are forwarded unchanged to ``transform_data``,
  which returns the train / validation / test features and labels.

  :param data: loaded masked subject data (structure defined by transform_data).
  :param mask_type: short mask label used in log lines and S3 object names.
  :param group_sub_ids: subject-id groups handed to transform_data
      (callers in this notebook pass a 3-tuple -- presumably train/val/test).
  :param runs_train: run index for training; ``+1`` is applied when naming outputs.
  :param runs_val: run index for validation; ``+1`` is applied when naming outputs.
  :param runs_test: run index for testing; ``+1`` is applied when naming outputs.
  :param norm: normalization mode handed to transform_data.
  :param svm_type: prefix for the model and metric object names.
  :return: tuple ``(val_acc, test_acc)`` of accuracy scores.
  """
  # Imported here because the notebook's import cell never imports it explicitly.
  from sklearn.metrics import classification_report

  X, y, X_v, y_v, X_t, y_t = transform_data(data, group_sub_ids, runs_train, runs_val, runs_test, norm)

  # One run tag shared by the model name and the metric names
  # (the metric key used to embed a tuple repr such as "(2, 3, 4)").
  run_tag = f"{runs_train+1}_{runs_val+1}_{runs_test+1}"
  model_name = f"{svm_type}_{run_tag}_{mask_type}"
  clf = SVC(C = 5.0 , class_weight= 'balanced', max_iter = 1000, random_state = 42) #probability = True
  print(f"Fitting the model for {mask_type}...")
  clf.fit(X, y)
  s3_upload(clf, "models/group/%s.pkl"%model_name, 'pickle')

  print("Predicting on Validation set...")
  yval_pred = clf.predict(X_v)
  val_acc = accuracy_score(y_v, yval_pred)
  print("Validation Accuracy:", val_acc)

  print("Predicting on Test set...")
  ytest_pred = clf.predict(X_t)
  test_acc = accuracy_score(y_t, ytest_pred)
  print("Test Accuracy:", test_acc)

  # Save metrics for individual masks.
  # Each report gets its own S3 key and prints its own contents; previously the
  # key contained the whole list of report names, so the test report overwrote
  # the validation report, and the validation report was printed twice.
  reports = {
    'validation_classreport': (y_v, yval_pred),
    'test_classreport': (y_t, ytest_pred),
  }
  for report, (y_true, y_pred) in reports.items():
    class_report = classification_report(y_true, y_pred, output_dict = True)
    s3_upload(class_report, f"metrics/group_svm/{svm_type}_{run_tag}_{mask_type}_{report}_classification_report.pkl", "pickle")
    print(f"Classification report for {mask_type} {report}")
    print(class_report)

  #Return metrics to save in dataframe for all masks
  return val_acc, test_acc


## Run all Young Adult Train/Val/Test 
- Loop through all the masks
- fit to train
- upload clf to AWS
- calculate the accuracies on Validation and Test sets
- store accuracies in a dictionary, upload it to AWS as a pickle, then load it in pandas

In [None]:
# (pickle name passed to access_load_data, short mask label) for each data set
file_name = [
    ('whole_brain_all_norm_2d.pkl', "mask"),
    ('all_data_masksubACC_norm_2d.pkl', "subacc"),
    ('all_data_masksubAI_norm_2d.pkl', "subAI"),
    ('all_data_masksubNAcc_norm_2d.pkl', "Naccumb"),
    ('all_data_masksubmPFC_norm_2d.pkl', "PFC"),
]

# Subject ids are read from the local path dictionary
path = "data/data_path_dictionary.pkl"
data_path_dict = open_pickle(path)
subject_ids = data_path_dict['subject_ID']

# Experiment configuration
bool_mat = False
svm_type = "YA_svm_runs" #other: "Adolescent_svm_runs"
norm = "RUNS"
runs_train, runs_val, runs_test = 1, 2, 3 # zero indexing: train on run 2, val on run 3, test on run 4

# Entries from index 33 onward are used as the Young Adult group
ya = subject_ids[33:]
print("Number of YA subjects: ", len(ya))
group_sub_ids = (ya,) * 3 # the same subjects are used for train, val and test

acc_dict = defaultdict(list)
runs_label = f"run {runs_train+1}/run {runs_val+1}/run {runs_test+1}"

for pkl_file, mask_type in tqdm.tqdm(file_name):
  print("Running", pkl_file, mask_type)
  data = access_load_data(pkl_file, bool_mat)
  val_acc, test_acc = run_grp_svm_model(data, mask_type, group_sub_ids, runs_train, runs_val, runs_test, norm, svm_type)
  data = 0 #conserve RAM space

  # One summary row per mask; key order fixes the column order of the table
  row = {
    f"{svm_type} Mask Type": mask_type,
    "Validation Accuracy": val_acc,
    'Test Accuracy': test_acc,
    'Normalize on': norm,
    'Train/Val/Test Runs': runs_label,
  }
  for column, value in row.items():
    acc_dict[column].append(value)


# Upload to S3
s3_upload(acc_dict, f"metrics/group_svm/{svm_type}_{runs_train + 1, runs_val + 1, runs_test + 1}_acc_score.pkl", "pickle")

Number of YA subjects:  19


  0%|          | 0/5 [00:00<?, ?it/s]

Running whole_brain_all_norm_2d.pkl mask
Normalizing Each based on RUNS...
X train data shape after concantenation (1596, 237979)
y train data shape after concantenation (1596,)
X test data shape after concantenation (1596, 237979)
y test data shape after concantenation (1596,)
X val data shape after concantenation (1596, 237979)
y val data shape after concantenation (1596,)
Final X Train data shape (1596, 237979)
Final y Train data shape  (1596,)
Final X Val data shape (1596, 237979)
Final y Val data shape  (1596,)
Final X Test data shape (1596, 237979)
Final y Test data shape  (1596,)
Fitting the model for mask...




upload complete for YA_svm_runs_1_2_3_mask.pkl
Predicting on Validation set...
Validation Accuracy: 0.8095238095238095
Predicting on Test set...


 20%|██        | 1/5 [50:40<3:22:42, 3040.54s/it]

Test Accuracy: 0.7161654135338346
Running all_data_masksubACC_norm_2d.pkl subacc
Normalizing Each based on RUNS...
X train data shape after concantenation (1596, 235266)
y train data shape after concantenation (1596,)
X test data shape after concantenation (1596, 235266)
y test data shape after concantenation (1596,)
X val data shape after concantenation (1596, 235266)
y val data shape after concantenation (1596,)
Final X Train data shape (1596, 235266)
Final y Train data shape  (1596,)
Final X Val data shape (1596, 235266)
Final y Val data shape  (1596,)
Final X Test data shape (1596, 235266)
Final y Test data shape  (1596,)
Fitting the model for subacc...




upload complete for YA_svm_runs_1_2_3_subacc.pkl
Predicting on Validation set...
Validation Accuracy: 0.8114035087719298
Predicting on Test set...
Test Accuracy: 0.7142857142857143


 40%|████      | 2/5 [1:39:21<2:28:29, 2969.95s/it]

Running all_data_masksubAI_norm_2d.pkl subAI
Normalizing Each based on RUNS...
X train data shape after concantenation (1596, 237722)
y train data shape after concantenation (1596,)
X test data shape after concantenation (1596, 237722)
y test data shape after concantenation (1596,)
X val data shape after concantenation (1596, 237722)
y val data shape after concantenation (1596,)
Final X Train data shape (1596, 237722)
Final y Train data shape  (1596,)
Final X Val data shape (1596, 237722)
Final y Val data shape  (1596,)
Final X Test data shape (1596, 237722)
Final y Test data shape  (1596,)
Fitting the model for subAI...




upload complete for YA_svm_runs_1_2_3_subAI.pkl
Predicting on Validation set...
Validation Accuracy: 0.8107769423558897
Predicting on Test set...
Test Accuracy: 0.7155388471177945


 60%|██████    | 3/5 [2:30:57<1:40:55, 3027.68s/it]

Running all_data_masksubNAcc_norm_2d.pkl Naccumb
Normalizing Each based on RUNS...
X train data shape after concantenation (1596, 237252)
y train data shape after concantenation (1596,)
X test data shape after concantenation (1596, 237252)
y test data shape after concantenation (1596,)
X val data shape after concantenation (1596, 237252)
y val data shape after concantenation (1596,)
Final X Train data shape (1596, 237252)
Final y Train data shape  (1596,)
Final X Val data shape (1596, 237252)
Final y Val data shape  (1596,)
Final X Test data shape (1596, 237252)
Final y Test data shape  (1596,)
Fitting the model for Naccumb...




upload complete for YA_svm_runs_1_2_3_Naccumb.pkl
Predicting on Validation set...
Validation Accuracy: 0.8082706766917294
Predicting on Test set...
Test Accuracy: 0.718671679197995


 80%|████████  | 4/5 [3:20:57<50:16, 3016.79s/it]  

Running all_data_masksubmPFC_norm_2d.pkl PFC
Normalizing Each based on RUNS...
X train data shape after concantenation (1596, 237722)
y train data shape after concantenation (1596,)
X test data shape after concantenation (1596, 237722)
y test data shape after concantenation (1596,)
X val data shape after concantenation (1596, 237722)
y val data shape after concantenation (1596,)
Final X Train data shape (1596, 237722)
Final y Train data shape  (1596,)
Final X Val data shape (1596, 237722)
Final y Val data shape  (1596,)
Final X Test data shape (1596, 237722)
Final y Test data shape  (1596,)
Fitting the model for PFC...




upload complete for YA_svm_runs_1_2_3_PFC.pkl
Predicting on Validation set...
Validation Accuracy: 0.8076441102756893
Predicting on Test set...
Test Accuracy: 0.7167919799498746


100%|██████████| 5/5 [4:12:14<00:00, 3026.89s/it]


upload complete for YA_svm_runs_acc_score.pkl


True

In [None]:
# Read the accuracy summary back through access_load_data and display it as a table
acc_score_key = f"metrics/group_svm/{svm_type}_{runs_train + 1, runs_val + 1, runs_test + 1}_acc_score.pkl"
acc_scores = access_load_data(acc_score_key, False)
df = pd.DataFrame(acc_scores)
df

Unnamed: 0,YA_svm_runs Mask Type,Validation Accuracy,Test Accuracy,Normalize on,Train/Val/Test Runs
0,mask,0.809524,0.716165,RUNS,run 2/run 3/run 4
1,subacc,0.811404,0.714286,RUNS,run 2/run 3/run 4
2,subAI,0.810777,0.715539,RUNS,run 2/run 3/run 4
3,Naccumb,0.808271,0.718672,RUNS,run 2/run 3/run 4
4,PFC,0.807644,0.716792,RUNS,run 2/run 3/run 4


In [None]:
## Train on 16 subjects val on 3 subjects RUN 2; Test on RUN 3

# Hard-coded accuracies, one entry per mask in the order of the label list
mask_labels = ["Whole Brain","SubACC","SubAI","Nucleus Accumbens","Prefrontal Cortex"]
validation_scores = [0.753968253968254, 0.753968253968254, 0.753968253968254, 0.7579365079365079, 0.7619047619047619]
test_scores = [0.800125313283208, 0.8020050125313283 , 0.7969924812030075, 0.7994987468671679, 0.7976190476190477]

dictionary = {
    "Mask Type Young Adult": mask_labels,
    "Validation": validation_scores,
    "Test": test_scores,
}

# Index the table by mask label
df = pd.DataFrame(dictionary).set_index("Mask Type Young Adult")
df

Unnamed: 0_level_0,Validation,Test
Mask Type Young Adult,Unnamed: 1_level_1,Unnamed: 2_level_1
Whole Brain,0.753968,0.800125
SubACC,0.753968,0.802005
SubAI,0.753968,0.796992
Nucleus Accumbens,0.757937,0.799499
Prefrontal Cortex,0.761905,0.797619


## Run all Adolescent Train/Val/Test 
- Loop through all the masks
- fit to train
- upload clf to AWS
- calculate the accuracies on Validation and Test sets
- store accuracies in a dictionary, upload it to AWS as a pickle, then load it in pandas

In [None]:
# (pickle name passed to access_load_data, short mask label) for each data set
file_name = [
    ('whole_brain_all_norm_2d.pkl', "mask"),
    ('all_data_masksubACC_norm_2d.pkl', "subacc"),
    ('all_data_masksubAI_norm_2d.pkl', "subAI"),
    ('all_data_masksubNAcc_norm_2d.pkl', "Naccumb"),
    ('all_data_masksubmPFC_norm_2d.pkl', "PFC"),
]

# Subject ids are read from the local path dictionary
path = "data/data_path_dictionary.pkl"
data_path_dict = open_pickle(path)
subject_ids = data_path_dict['subject_ID']

# Experiment configuration
bool_mat = False
svm_type = "Adolescent_svm_runs" #"YA_svm_runs" "Adolescent_svm_runs"
norm = "RUNS"
runs_train, runs_val, runs_test = 1, 2, 3 # zero indexing: train on run 2, val on run 3, test on run 4

# The first 33 entries are used as the Adolescent group
adol = subject_ids[:33]
print("Number of Adolescent subjects: ", len(adol))
group_sub_ids = (adol,) * 3 # the same subjects are used for train, val and test

acc_dict = defaultdict(list)
runs_label = f"run {runs_train + 1}/run {runs_val + 1}/run {runs_test + 1}"

for pkl_file, mask_type in tqdm.tqdm(file_name):
  print("Running", pkl_file, mask_type)
  data = access_load_data(pkl_file, bool_mat)
  val_acc, test_acc = run_grp_svm_model(data, mask_type, group_sub_ids, runs_train, runs_val, runs_test, norm, svm_type)
  data = 0 #conserve RAM space

  # One summary row per mask; key order fixes the column order of the table
  row = {
    f"{svm_type} Mask Type": mask_type,
    "Validation Accuracy": val_acc,
    'Test Accuracy': test_acc,
    'Normalize on': norm,
    'Train/Val/Test Runs': runs_label,
  }
  for column, value in row.items():
    acc_dict[column].append(value)


# Upload to S3
s3_upload(acc_dict, f"metrics/group_svm/{svm_type}_{runs_train + 1, runs_val + 1, runs_test + 1}_acc_score.pkl", "pickle")

Number of Adolescent subjects:  33


  0%|          | 0/5 [00:00<?, ?it/s]

Running whole_brain_all_norm_2d.pkl mask
Normalizing Each based on RUNS...
X train data shape after concantenation (2772, 237979)
y train data shape after concantenation (2772,)
X test data shape after concantenation (2772, 237979)
y test data shape after concantenation (2772,)
X val data shape after concantenation (2772, 237979)
y val data shape after concantenation (2772,)
Final X Train data shape (2772, 237979)
Final y Train data shape  (2772,)
Final X Val data shape (2772, 237979)
Final y Val data shape  (2772,)
Final X Test data shape (2772, 237979)
Final y Test data shape  (2772,)
Fitting the model for mask...




upload complete for Adolescent_svm_runs_1_2_3_mask.pkl
Predicting on Validation set...
Validation Accuracy: 0.7308802308802309
Predicting on Test set...
Test Accuracy: 0.658008658008658


 20%|██        | 1/5 [1:53:09<7:32:39, 6789.79s/it]

Running all_data_masksubACC_norm_2d.pkl subacc
Normalizing Each based on RUNS...
X train data shape after concantenation (2772, 235266)
y train data shape after concantenation (2772,)
X test data shape after concantenation (2772, 235266)
y test data shape after concantenation (2772,)
X val data shape after concantenation (2772, 235266)
y val data shape after concantenation (2772,)
Final X Train data shape (2772, 235266)
Final y Train data shape  (2772,)
Final X Val data shape (2772, 235266)
Final y Val data shape  (2772,)
Final X Test data shape (2772, 235266)
Final y Test data shape  (2772,)
Fitting the model for subacc...




upload complete for Adolescent_svm_runs_1_2_3_subacc.pkl
Predicting on Validation set...
Validation Accuracy: 0.733044733044733
Predicting on Test set...
Test Accuracy: 0.6583694083694084


 40%|████      | 2/5 [3:43:09<5:33:53, 6677.83s/it]

Running all_data_masksubAI_norm_2d.pkl subAI
Normalizing Each based on RUNS...
X train data shape after concantenation (2772, 237722)
y train data shape after concantenation (2772,)
X test data shape after concantenation (2772, 237722)
y test data shape after concantenation (2772,)
X val data shape after concantenation (2772, 237722)
y val data shape after concantenation (2772,)
Final X Train data shape (2772, 237722)
Final y Train data shape  (2772,)
Final X Val data shape (2772, 237722)
Final y Val data shape  (2772,)
Final X Test data shape (2772, 237722)
Final y Test data shape  (2772,)
Fitting the model for subAI...




upload complete for Adolescent_svm_runs_1_2_3_subAI.pkl
Predicting on Validation set...
Validation Accuracy: 0.7366522366522367
Predicting on Test set...


 60%|██████    | 3/5 [5:33:59<3:42:10, 6665.30s/it]

Test Accuracy: 0.6601731601731602
Running all_data_masksubNAcc_norm_2d.pkl Naccumb
Normalizing Each based on RUNS...
X train data shape after concantenation (2772, 237252)
y train data shape after concantenation (2772,)
X test data shape after concantenation (2772, 237252)
y test data shape after concantenation (2772,)
X val data shape after concantenation (2772, 237252)
y val data shape after concantenation (2772,)
Final X Train data shape (2772, 237252)
Final y Train data shape  (2772,)
Final X Val data shape (2772, 237252)
Final y Val data shape  (2772,)
Final X Test data shape (2772, 237252)
Final y Test data shape  (2772,)
Fitting the model for Naccumb...




upload complete for Adolescent_svm_runs_1_2_3_Naccumb.pkl
Predicting on Validation set...
Validation Accuracy: 0.7337662337662337
Predicting on Test set...
Test Accuracy: 0.6594516594516594


 80%|████████  | 4/5 [7:25:54<1:51:24, 6684.75s/it]

Running all_data_masksubmPFC_norm_2d.pkl PFC
Normalizing Each based on RUNS...
X train data shape after concantenation (2772, 237722)
y train data shape after concantenation (2772,)
X test data shape after concantenation (2772, 237722)
y test data shape after concantenation (2772,)
X val data shape after concantenation (2772, 237722)
y val data shape after concantenation (2772,)
Final X Train data shape (2772, 237722)
Final y Train data shape  (2772,)
Final X Val data shape (2772, 237722)
Final y Val data shape  (2772,)
Final X Test data shape (2772, 237722)
Final y Test data shape  (2772,)
Fitting the model for PFC...




upload complete for Adolescent_svm_runs_1_2_3_PFC.pkl
Predicting on Validation set...
Validation Accuracy: 0.7344877344877345
Predicting on Test set...


100%|██████████| 5/5 [9:19:42<00:00, 6716.58s/it]

Test Accuracy: 0.6601731601731602





upload complete for Adolescent_svm_runs_acc_score.pkl


True

In [None]:
# Read the accuracy summary back through access_load_data and display it as a table
acc_score_key = f"metrics/group_svm/{svm_type}_{runs_train + 1, runs_val + 1, runs_test + 1}_acc_score.pkl"
acc_scores = access_load_data(acc_score_key, False)
df = pd.DataFrame(acc_scores)
df

Unnamed: 0,Adolescent_svm_runs Mask Type,Validation Accuracy,Test Accuracy,Normalize on,Train/Val/Test Runs
0,mask,0.73088,0.658009,RUNS,run 2/run 3/run 4
1,subacc,0.733045,0.658369,RUNS,run 2/run 3/run 4
2,subAI,0.736652,0.660173,RUNS,run 2/run 3/run 4
3,Naccumb,0.733766,0.659452,RUNS,run 2/run 3/run 4
4,PFC,0.734488,0.660173,RUNS,run 2/run 3/run 4
