## Load MATLAB Data Files from S3 Storage and Convert Them to NumPy Arrays
- This notebook demonstrates how to access our data on AWS S3 using the data path dictionary pickle file and convert the .mat data to NumPy arrays.
- It also shows how to optionally save the NumPy array data in the ./source/data directory.

In [1]:
# Import libraries
import pickle
from utils import *
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Open the pickle file holding the data path dictionary: each key maps to a
# list of file paths (e.g. 'subject_data' -> one .mat path per subject).
# NOTE(review): open_pickle comes from the `utils` star import; presumably it
# wraps pickle.load, so only point it at trusted files — confirm in utils.
path = "data/data_path_dictionary.pkl"
file_names_dict = open_pickle(path)

print("Dictionary contents: ")
# Bare last expression so the notebook renders the dictionary under the label.
file_names_dict

Dictionary contents: 


defaultdict(list,
            {'subject_data': ['svm_data/10004_08693/svm_subj_vecs.mat',
              'svm_data/10008_09924/svm_subj_vecs.mat',
              'svm_data/10009_08848/svm_subj_vecs.mat',
              'svm_data/10016_09694/svm_subj_vecs.mat',
              'svm_data/10017_08894/svm_subj_vecs.mat',
              'svm_data/10018_08907/svm_subj_vecs.mat',
              'svm_data/10021_08839/svm_subj_vecs.mat',
              'svm_data/10022_08854/svm_subj_vecs.mat',
              'svm_data/10023_09126/svm_subj_vecs.mat',
              'svm_data/10027_09455/svm_subj_vecs.mat',
              'svm_data/10033_08871/svm_subj_vecs.mat',
              'svm_data/10034_08879/svm_subj_vecs.mat',
              'svm_data/10035_08847/svm_subj_vecs.mat',
              'svm_data/10036_09800/svm_subj_vecs.mat',
              'svm_data/10037_09903/svm_subj_vecs.mat',
              'svm_data/10038_09063/svm_subj_vecs.mat',
              'svm_data/10039_08941/svm_subj_vecs.mat',
              

In [3]:
## SUBJECT DATA - one subject
# Select the first subject: `obj` is the entry to request from S3 and
# `file_name` is the local path used for the downloaded .mat file.
# NOTE(review): assumes the 'subject_data' and 'subject_ID' lists are
# index-aligned (entry 0 of each describes the same subject) — confirm.

obj = file_names_dict['subject_data'][0]
file_name = f"data/subject_{file_names_dict['subject_ID'][0]}_data.mat"
print("path to subject data we want:", file_name)

path to subject data we want: data/subject_10004_08693_data.mat


In [9]:
%%time
# Fetch the subject's .mat data from AWS, download it to `file_name`, then load
# it and keep the result as `mat_data`.
# NOTE(review): access_load_data is defined in utils and not visible here; the
# recorded output shows it returns a dict with '__header__' plus one
# 'run_XX_vec' array per run.
# The cell is timed because the transfer dominates the runtime.
mat_data = access_load_data(obj, file_name)
mat_data

CPU times: user 5.14 s, sys: 2.01 s, total: 7.15 s
Wall time: 43.5 s


{'__header__': b'MATLAB 5.0 MAT-file, Platform: GLNXA64, Created on: Wed Feb 23 06:31:07 2022',
 '__version__': '1.0',
 '__globals__': [],
 'run_04_vec': array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int16),
 'run_03_vec': array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int16),
 'run_02_vec': array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int16),
 'run_01_vec': array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 

In [10]:
## LABEL DATA
# Define the label object to get from S3 and the local path to download it to,
# then load the labels into `label_data`.
#
# Label-specific names are used on purpose: rebinding the shared `obj` /
# `file_name` globals here would make the subject-loading cell fetch the label
# file instead if that cell were re-run after this one.

label_obj = file_names_dict['labels'][0]
label_file_name = "data/labels.mat"
print("path to label data we want:", label_file_name)

label_data = access_load_data(label_obj, label_file_name)
label_data

path to label data we want: data/labels.mat


{'__header__': b'MATLAB 5.0 MAT-file, Platform: GLNXA64, Created on: Wed Feb 23 10:28:26 2022',
 '__version__': '1.0',
 '__globals__': [],
 'decrease_onsets': array([[ 24,  48,  60,  72,  96, 132]], dtype=uint8),
 'increase_onsets': array([[  0,  12,  36,  84, 108, 120]], dtype=uint8),
 'rest_ons': array([[  8,  20,  32,  44,  56,  68,  80,  92, 104, 116, 128, 140]],
       dtype=uint8),
 'rt_labels': array([[9999],
        [9999],
        [9999],
        [   1],
        [   1],
        [   1],
        [   1],
        [   1],
        [   1],
        [   1],
        [9999],
        [9999],
        [9999],
        [9999],
        [9999],
        [   1],
        [   1],
        [   1],
        [   1],
        [   1],
        [   1],
        [   1],
        [9999],
        [9999],
        [9999],
        [9999],
        [9999],
        [   0],
        [   0],
        [   0],
        [   0],
        [   0],
        [   0],
        [   0],
        [9999],
        [9999],
        [9999],
    

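### Take a look at the Training Data and Labels
- The .mat file contents are now loaded as NumPy arrays (`np.ndarray`).
- Each run is stored as a 2D array; it can be reshaped from 2D to 4D for visualization.
- Reshaping requires knowing the x, y, z dimensions of the image.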

In [25]:
# Pull the first run out of the loaded .mat dictionary and report its type and shape.
img_2d = mat_data['run_01_vec']
print("type: ", type(img_2d))
print(f"shape: {img_2d.shape}, time points = {img_2d.shape[0]}, x,y,z = {img_2d.shape[1]} ")
# The layout message has to agree with the shape line above: axis 0 is time
# points and axis 1 is the flattened x, y, z voxels.
print("This data is represented as time points by voxels, where x, y, z are combined")
img_2d

type:  <class 'numpy.ndarray'>
shape: (144, 592895), time points = 144, x,y,z = 592895 
This data is represented as voxels by time, where x, y, z are combined


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int16)

In [26]:
# Extract the 'rt_labels' array from the loaded label dictionary and report its
# type and shape (one row per time point in the recorded output).
rt_label = label_data['rt_labels']
print("type:", type(rt_label))
print("Label data shape: ", rt_label.shape)

type: <class 'numpy.ndarray'>
Label data shape:  (144, 1)


#### Reshape the 2d image to 4d for visualization

In [None]:
# NOTE(review): disabled. 64*64*30 = 122,880 elements per time point, but img_2d
# has 592,895 columns, so this reshape would raise a ValueError.
# 592,895 = 79*95*79 — TODO confirm the true x, y, z grid, the axis order, and
# whether time must be moved from the first axis to the last before enabling.
#img_4d = np.reshape(img_2d,(64,64,30,img_2d.shape[0]))
#img_4d.shape

In [None]:
# The string below is the cell's only live expression; the plotting calls are disabled.
# NOTE(review): `layer` and `time_point` are not defined in any visible cell and
# `img_4d` only exists if the disabled reshape is enabled — define all three
# before uncommenting.
"Plotting the image after Converting from 2D to 4D"
#plt.imshow(img_4d[:,:,layer, time_point], cmap = 'gray')
#plt.axis('off')

### Take a look at the 4D data
- Print the shapes
- Reshape the 4D data into a 2D array
- Depth represents slices/layers through the brain

In [22]:
# NOTE(review): disabled. When re-enabling, unpack `img_4d.shape` rather than
# `img_4d` itself — unpacking the array would bind sub-arrays, not dimensions.
#height, width, depth, ch =  img_4d
#print(f"Train data shape: height {height}, width {width}, depth {depth}, time points {ch}")

In [None]:
# x, y, z of image
#print("x, y, z, of the image: ", img_4d.shape[:-1])

In [None]:
# num time points
#n_time_points = img_4d.shape[-1]
#n_time_points

In [None]:
# get num of voxels
#num_voxels = np.prod(img_4d.shape[:-1])
#print("number of voxels: ", num_voxels)

In [None]:
#reshape 4d to 2d
#voxels_by_time = img_4d.reshape((num_voxels, n_time_points))
#voxels_by_time

In [None]:
#voxels_by_time.shape

### Save numpy image data locally

In [31]:
# Save the subject's 2D array under data/, with the subject ID in the file name.
path = f"data/subject_{file_names_dict['subject_ID'][0]}_2d.npy"
# The `with` block closes the file on exit, so no explicit f.close() is needed.
with open(path, 'wb') as f:
    np.save(f, img_2d)

In [32]:
# Save the label array next to the subject data.
# The `with` block closes the file on exit, so no explicit f.close() is needed.
with open('data/rt_label.npy', 'wb') as f:
    np.save(f, rt_label)

In [None]:
# Save 4d img for visual exploratory analysis
#path = f"data/subject_{file_names_dict['subject_ID'][0]}_4d.npy"
#with open('img_4d.npy', 'wb') as f:
    #np.save(f, img)
    
#f.close()