# Access S3 Storage and Extract Paths and Subject IDs, Load Data
- This notebook demonstrates, step by step, what the function *access_load_data()* in utils.py does
- Requires a local copy of path_config.py in the ./source directory
- Saves a pickled dictionary of the S3 paths to the subject .mat data, mask .mat data, and labels, together with the subject IDs, in the ./source/data directory

In [24]:
from path_config import mat_path
import boto3
from collections import defaultdict
import re
import pickle
from utils import *

#### Access S3 file storage and check buckets and objects needed

In [25]:
# Access the AWS S3 storage that holds the MATLAB files.
# Credentials are read from the local path_config module.
pubkey, seckey = mat_path['ACCESS_KEY'], mat_path['SECRET_KEY']

# Low-level client (used further down to download objects) and high-level
# resource (used to browse buckets and objects) share the same credentials.
client = boto3.client(
    's3',
    aws_access_key_id=pubkey,
    aws_secret_access_key=seckey,
)
s3 = boto3.resource(
    's3',
    aws_access_key_id=pubkey,
    aws_secret_access_key=seckey,
)

In [26]:
# Show the buckets returned for these credentials (displayed as the cell output).
list(s3.buckets.all())

[s3.Bucket(name='aws-cloudtrail-logs-598991111123-85ddbc5c'),
 s3.Bucket(name='clockdrawingbattery'),
 s3.Bucket(name='clockdrawingimages'),
 s3.Bucket(name='clockdrawingimages1'),
 s3.Bucket(name='clockdrawingimages2'),
 s3.Bucket(name='clockdrawingimages3'),
 s3.Bucket(name='clockimages'),
 s3.Bucket(name='healthyhomes591'),
 s3.Bucket(name='teambrainiac'),
 s3.Bucket(name='test-bucket-clockids-aicrowd')]

In [27]:
# Select the project bucket through the high-level S3 resource.
bucket = s3.Bucket('teambrainiac')
# Note: this prints the Bucket object itself (its repr), not just the name string.
print("bucket name: ", bucket)
# Plain string name; passed later as the bucket argument to client.download_file.
bucket_ = bucket.name # 'teambrainiac'

bucket name:  s3.Bucket(name='teambrainiac')


In [6]:
# Materialise the full object listing of the bucket. Despite its name,
# obj_name holds ObjectSummary objects, not strings; use .key for the path.
obj_name = list(bucket.objects.all())
# Key of the first listed object; raises IndexError if the bucket is empty.
obj = obj_name[0].key
print('length of object list', len(obj_name))
print("obj_name", obj_name)


length of object list 148
obj_name [s3.ObjectSummary(bucket_name='teambrainiac', key='all_data.mat'), s3.ObjectSummary(bucket_name='teambrainiac', key='all_data_dictionary.pkl'), s3.ObjectSummary(bucket_name='teambrainiac', key='all_data_masksubACC_norm_2d.pkl'), s3.ObjectSummary(bucket_name='teambrainiac', key='all_data_masksubAI_norm_2d.pkl'), s3.ObjectSummary(bucket_name='teambrainiac', key='all_data_masksubNAcc_norm_2d.pkl'), s3.ObjectSummary(bucket_name='teambrainiac', key='all_data_masksubmPFC_norm_2d.pkl'), s3.ObjectSummary(bucket_name='teambrainiac', key='fake_4d.npy'), s3.ObjectSummary(bucket_name='teambrainiac', key='metrics/'), s3.ObjectSummary(bucket_name='teambrainiac', key='metrics/acc_score.pkl'), s3.ObjectSummary(bucket_name='teambrainiac', key='metrics/accuracy_score.csv'), s3.ObjectSummary(bucket_name='teambrainiac', key='metrics/group_svm/'), s3.ObjectSummary(bucket_name='teambrainiac', key='metrics/group_svm/Adolescent_[2, 3]_whole_brain_test_classreport.csv'), s3.O

#### Extract the path names and subject IDs and store into a dictionary of lists

In [15]:
# Collect the S3 keys of interest, grouped by kind, together with the
# subject IDs parsed out of the subject-data paths.
data_path_dictionary = defaultdict(list)

# Filename fragments that identify each kind of object
substring_data = 'svm_subj_vecs.mat'
substring_mask = 'masks.mat'
substring_label = 'rt_labels.mat'
# Two 5-digit groups joined by an underscore, e.g. 10004_08693
sub_ID_regex = r"(\d{5}_\d{5})"

# Walk the bucket listing once and file each matching key under its category.
for summary in obj_name:
    key = summary.key
    if substring_data in key:
        data_path_dictionary['subject_data'].append(key)
        # Every ID found in the path is recorded (none are added if no match).
        data_path_dictionary['subject_ID'] += re.findall(sub_ID_regex, key)
    if substring_mask in key:
        data_path_dictionary['mask_data'].append(key)
    if substring_label in key:
        data_path_dictionary['labels'].append(key)
        

In [16]:
# Inspect the populated dictionary (displayed as the cell output).
data_path_dictionary

defaultdict(list,
            {'subject_data': ['svm_data/10004_08693/svm_subj_vecs.mat',
              'svm_data/10008_09924/svm_subj_vecs.mat',
              'svm_data/10009_08848/svm_subj_vecs.mat',
              'svm_data/10016_09694/svm_subj_vecs.mat',
              'svm_data/10017_08894/svm_subj_vecs.mat',
              'svm_data/10018_08907/svm_subj_vecs.mat',
              'svm_data/10021_08839/svm_subj_vecs.mat',
              'svm_data/10022_08854/svm_subj_vecs.mat',
              'svm_data/10023_09126/svm_subj_vecs.mat',
              'svm_data/10027_09455/svm_subj_vecs.mat',
              'svm_data/10033_08871/svm_subj_vecs.mat',
              'svm_data/10034_08879/svm_subj_vecs.mat',
              'svm_data/10035_08847/svm_subj_vecs.mat',
              'svm_data/10036_09800/svm_subj_vecs.mat',
              'svm_data/10037_09903/svm_subj_vecs.mat',
              'svm_data/10038_09063/svm_subj_vecs.mat',
              'svm_data/10039_08941/svm_subj_vecs.mat',
              

#### Save the dictionary as a pickle file in source/data


In [19]:
# Persist the path dictionary to disk so it can be reloaded from the pickle
# file instead of being rebuilt from the bucket listing.
# Note: the flag is reset to False each time this cell runs, so the file is
# (re)written on every execution of the cell.
pkl_created = False
if not pkl_created:
    # The context manager closes the file even if pickling raises.
    with open("data/data_path_dictionary.pkl", "wb") as f:
        pickle.dump(data_path_dictionary, f)
    pkl_created = True

In [20]:
# Reload the path dictionary from the pickle file on disk.
# open_pickle is not defined in this notebook; presumably it is provided by
# utils through the star import -- confirm there.
path = "data/data_path_dictionary.pkl"
file_names_dict = open_pickle(path)

print("Dictionary contents: ")
# Trailing expression: displayed as the cell output.
file_names_dict

Dictionary contents: 


defaultdict(list,
            {'subject_data': ['svm_data/10004_08693/svm_subj_vecs.mat',
              'svm_data/10008_09924/svm_subj_vecs.mat',
              'svm_data/10009_08848/svm_subj_vecs.mat',
              'svm_data/10016_09694/svm_subj_vecs.mat',
              'svm_data/10017_08894/svm_subj_vecs.mat',
              'svm_data/10018_08907/svm_subj_vecs.mat',
              'svm_data/10021_08839/svm_subj_vecs.mat',
              'svm_data/10022_08854/svm_subj_vecs.mat',
              'svm_data/10023_09126/svm_subj_vecs.mat',
              'svm_data/10027_09455/svm_subj_vecs.mat',
              'svm_data/10033_08871/svm_subj_vecs.mat',
              'svm_data/10034_08879/svm_subj_vecs.mat',
              'svm_data/10035_08847/svm_subj_vecs.mat',
              'svm_data/10036_09800/svm_subj_vecs.mat',
              'svm_data/10037_09903/svm_subj_vecs.mat',
              'svm_data/10038_09063/svm_subj_vecs.mat',
              'svm_data/10039_08941/svm_subj_vecs.mat',
              

#### Define object data to get from S3, create file path to download subject data

In [18]:
# S3 key of the first subject-data file recorded in the dictionary.
obj = file_names_dict['subject_data'][0] #data object
# Local destination path, named after the first extracted subject ID.
file_name = f"data/subject_{file_names_dict['subject_ID'][0]}_data.mat"
print("local path to subject data we want:", file_name)

local path to subject data we want: data/subject_10004_08693_data.mat


#### Download the data using the bucket name, object key, and local download path, then call the load_mat() function to open the .mat file

In [19]:
# Download the chosen object from the bucket to the local path.
client.download_file(bucket_, obj, file_name)
# load_mat is not defined in this notebook; presumably it is provided by
# utils through the star import -- confirm there.
data = load_mat(file_name)
# Trailing expression: displayed as the cell output.
data

{'__header__': b'MATLAB 5.0 MAT-file, Platform: GLNXA64, Created on: Wed Feb 23 06:31:07 2022',
 '__version__': '1.0',
 '__globals__': [],
 'run_04_vec': array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int16),
 'run_03_vec': array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int16),
 'run_02_vec': array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int16),
 'run_01_vec': array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 

#### All of the above is contained in a single function
- The cell below demonstrates the function that performs all of the steps shown earlier in this notebook:
    - accesses AWS
    - defines the bucket and objects we want
    - loads the mat data into python 
  

In [4]:
# NOTE(review): obj must already hold an S3 object key set by an earlier cell.
# bool_mat presumably tells access_load_data to treat the object as a .mat
# file -- confirm against utils.py.
bool_mat = True
access_load_data(obj, bool_mat) #saves .mat data in temp file after aws access, not locally


{'__header__': b'MATLAB 5.0 MAT-file, Platform: GLNXA64, Created on: Wed Feb 23 06:31:07 2022',
 '__version__': '1.0',
 '__globals__': [],
 'run_04_vec': array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int16),
 'run_03_vec': array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int16),
 'run_02_vec': array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int16),
 'run_01_vec': array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 

'teambrainiac'