In [35]:
import nibabel as nib
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pickle

In [36]:
start_path = 'DATA/BET_BSE_DATA/'

In [37]:
data = pd.read_csv(start_path+'Label_file.csv')

data.head()

Unnamed: 0,Filename,Recognizable-Facial-Feature,Brain-Feature-Loss
0,IXI369-Guys-0924-T1_bet_03.nii,Yes,No
1,IXI448-HH-2393-T1_bet_07.nii,Yes,No
2,IXI252-HH-1693-T1_bet_08.nii,Yes,No
3,IXI188-Guys-0798-T1_bet_17.nii,Yes,No
4,IXI182-Guys-0792-T1_bet_17.nii,Yes,No


In [38]:
# Split data into two sets, one for each output variable
df_recognize = data.iloc[: , :-1]
df_brainLoss = data.drop('Recognizable-Facial-Feature', axis=1)

df_recognize['Recognizable-Facial-Feature'].value_counts()

Yes    692
No     619
Name: Recognizable-Facial-Feature, dtype: int64

In [39]:
df_brainLoss['Brain-Feature-Loss'].value_counts()

No     707
Yes    604
Name: Brain-Feature-Loss, dtype: int64

Both classes are fairly balanced


## Helper Functions

In [40]:
from scipy import ndimage

# Read nifti file
def read_nifti_file(filepath):
    """Read and load volume"""
    # Read file
    scan = nib.load(filepath).get_fdata()
    return scan


# Normalize using min-max normalization
def normalize(volume):
    """Normalize the volume"""
    min = -1000
    max = 400
    volume[volume < min] = min
    volume[volume > max] = max
    volume = (volume - min) / (max - min)
    return volume.astype("float32")


# Resize image to standard size for all of them
def resize_volume(img):
    depth_factor = 1 / (img.shape[-1] / 64)
    width_factor = 1 / (img.shape[0] / 128)
    height_factor = 1 / (img.shape[1] / 128)
    # Rotate
    img = ndimage.rotate(img, 90, reshape=False)
    # Resize across z-axis
    img = ndimage.zoom(img, (width_factor, height_factor, depth_factor), order=1)
    return img


# All 3 functions above put in one
def process_scan(path):
    """Read and resize volume"""
    # Read scan
    volume = read_nifti_file(path)
    # Normalize
    volume = normalize(volume)
    # Resize width, height and depth
    volume = resize_volume(volume)
    return volume

In [41]:
df_recognize.head()

Unnamed: 0,Filename,Recognizable-Facial-Feature
0,IXI369-Guys-0924-T1_bet_03.nii,Yes
1,IXI448-HH-2393-T1_bet_07.nii,Yes
2,IXI252-HH-1693-T1_bet_08.nii,Yes
3,IXI188-Guys-0798-T1_bet_17.nii,Yes
4,IXI182-Guys-0792-T1_bet_17.nii,Yes


In [42]:
df_recognize.tail()

Unnamed: 0,Filename,Recognizable-Facial-Feature
1306,IXI477-IOP-1141-T1_bse_default.nii,Yes
1307,IXI573-IOP-1155-T1_bse_default.nii,Yes
1308,IXI483-HH-2177-T1_bse_default.nii,No
1309,IXI159-HH-1549-T1_bse_default.nii,Yes
1310,IXI470-IOP-1030-T1_bse_default.nii,Yes


In [43]:
df_brainLoss.head()

Unnamed: 0,Filename,Brain-Feature-Loss
0,IXI369-Guys-0924-T1_bet_03.nii,No
1,IXI448-HH-2393-T1_bet_07.nii,No
2,IXI252-HH-1693-T1_bet_08.nii,No
3,IXI188-Guys-0798-T1_bet_17.nii,No
4,IXI182-Guys-0792-T1_bet_17.nii,No


In [44]:
df_brainLoss.tail()

Unnamed: 0,Filename,Brain-Feature-Loss
1306,IXI477-IOP-1141-T1_bse_default.nii,No
1307,IXI573-IOP-1155-T1_bse_default.nii,No
1308,IXI483-HH-2177-T1_bse_default.nii,Yes
1309,IXI159-HH-1549-T1_bse_default.nii,No
1310,IXI470-IOP-1030-T1_bse_default.nii,No


In [45]:
# Grab paths to each image file
from glob import glob

image_file_paths = glob(start_path+'files/*')

In [46]:
# Check
image_file_paths[0]

'DATA/BET_BSE_DATA/files/IXI088-Guys-0758-T1_bse_less_s46_r1.nii.gz'

===========================================================================================================================================

Running **process_scan()** on our image files and storing the processed images into **np_data**

In [83]:
# Make numpy array of processed data
np_data = np.array([process_scan(path) for path in image_file_paths])

In [86]:
len(image_file_paths)

1311

In [85]:
len(np_data)

1311

In [87]:
np_data[0] #1st image file

array([[[0.        , 0.        , 0.        , ..., 0.        ,
         0.        , 0.        ],
        [0.71428573, 0.71428573, 0.71428573, ..., 0.71428573,
         0.71428573, 0.71428573],
        [0.71428573, 0.71428573, 0.71428573, ..., 0.71428573,
         0.71428573, 0.71428573],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        ,
         0.        , 0.        ],
        [0.        , 0.        , 0.        , ..., 0.        ,
         0.        , 0.        ],
        [0.        , 0.        , 0.        , ..., 0.        ,
         0.        , 0.        ]],

       [[0.        , 0.        , 0.        , ..., 0.        ,
         0.        , 0.        ],
        [0.71428573, 0.71428573, 0.71428573, ..., 0.71428573,
         0.71428573, 0.71428573],
        [0.71428573, 0.71428573, 0.71428573, ..., 0.71428573,
         0.71428573, 0.71428573],
        ...,
        [0.71428573, 0.71428573, 0.71428573, ..., 0.71428573,
         0.71428573, 0.71428573],
        [0.7

### Train - Test Split

In [88]:
# Get train (70%) & test (30%) size

print("Train Size: ", round(len(data) * 0.7))
print("Test Size: ", round(len(data) * 0.3))

Train Size:  918
Test Size:  393


In [96]:
data['Recognizable-Facial-Feature'] = data['Recognizable-Facial-Feature'].replace({'Yes': 1, 'No': 0})

===========================================================================================================================================

Problem. Need to assign X & y but the order of the files in the CSV file is not the same as the one in the directory where we grab the files from.

#### For df_recognize

In [53]:
# For df_recognize 
YesScans_recognize = df_recognize[df_recognize['Recognizable-Facial-Feature'] == 'Yes']
NoScans_recognize = df_recognize[df_recognize['Recognizable-Facial-Feature'] == 'No']

In [54]:
YesScans_recognize

Unnamed: 0,Filename,Recognizable-Facial-Feature
0,IXI369-Guys-0924-T1_bet_03.nii,Yes
1,IXI448-HH-2393-T1_bet_07.nii,Yes
2,IXI252-HH-1693-T1_bet_08.nii,Yes
3,IXI188-Guys-0798-T1_bet_17.nii,Yes
4,IXI182-Guys-0792-T1_bet_17.nii,Yes
...,...,...
1305,IXI293-IOP-0876-T1_bse_default.nii,Yes
1306,IXI477-IOP-1141-T1_bse_default.nii,Yes
1307,IXI573-IOP-1155-T1_bse_default.nii,Yes
1309,IXI159-HH-1549-T1_bse_default.nii,Yes


In [55]:
print('Scans with recognizable features: ' + str(len(df_recognize[df_recognize['Recognizable-Facial-Feature'] == 'Yes'])))
print('Scans without recognizable features: ' + str(len(df_recognize[df_recognize['Recognizable-Facial-Feature'] == 'No'])))

Scans with recognizable features: 692
Scans without recognizable features: 619


In [56]:
# Check
# This is the name of each file (we can match by name)
image_file_paths[0][24:]

'IXI088-Guys-0758-T1_bse_less_s46_r1.nii.gz'

In [57]:
# Check
image_file_paths[0]

'DATA/BET_BSE_DATA/files/IXI088-Guys-0758-T1_bse_less_s46_r1.nii.gz'

In [58]:
YesScans_recognize['Filename']

0           IXI369-Guys-0924-T1_bet_03.nii
1             IXI448-HH-2393-T1_bet_07.nii
2             IXI252-HH-1693-T1_bet_08.nii
3           IXI188-Guys-0798-T1_bet_17.nii
4           IXI182-Guys-0792-T1_bet_17.nii
                       ...                
1305    IXI293-IOP-0876-T1_bse_default.nii
1306    IXI477-IOP-1141-T1_bse_default.nii
1307    IXI573-IOP-1155-T1_bse_default.nii
1309     IXI159-HH-1549-T1_bse_default.nii
1310    IXI470-IOP-1030-T1_bse_default.nii
Name: Filename, Length: 692, dtype: object

In [32]:
# Check
image_file_paths[0][24:-3:]

'IXI088-Guys-0758-T1_bse_less_s46_r1.nii'

In [59]:
Yes_file_paths_recognize = []
for path in image_file_paths:
    for filename in YesScans_recognize['Filename']:
        # Check if path is a Yes file or a No file (matching by file_name)
        if path[24:-3:] == filename:
            Yes_file_paths_recognize.append(glob(path))

In [60]:
len(Yes_file_paths_recognize) 

692

In [62]:
Yes_file_paths_recognize[0]

['DATA/BET_BSE_DATA/files/IXI088-Guys-0758-T1_bse_less_s46_r1.nii.gz']

In [63]:
# For No
No_file_paths_recognize = []
for path in image_file_paths:
    for filename in NoScans_recognize['Filename']:
        # Check if path is a Yes file or a No file
        if path[24:-3:] == filename:
            No_file_paths_recognize.append(glob(path))

In [64]:
len(No_file_paths_recognize)

619

In [76]:
No_file_paths_recognize[0]

['DATA/BET_BSE_DATA/files/IXI482-HH-2178-T1_bet_86.nii.gz']

We now call **preprocess_scan** on the image files and sotre the processed files in **Yes_data_recognize** & **No_data_recognize**

In [77]:
# Now we make the numpy arrays

# Yes images
Yes_data_recognize = np.array([process_scan(path[0]) for path in Yes_file_paths_recognize])

In [79]:
# No images
No_data_recognize = np.array([process_scan(path[0]) for path in No_file_paths_recognize])

In [80]:
print(len(Yes_data_recognize), len(No_data_recognize))

692 619


Now we set the labels as 1s & 0s for Yes & NO respectively

**Yes_recognize** gets a value of 1 for every image file in Yes_data_recognize (which is an array of all the processed Yes image files). 

**No_recognize** gets a value of 0 for every image file in No_data_recognize (which is an array of all the processed No image files). 

In [82]:
# We now set the labels
# 1 for recognizing facial features, 0 for not
Yes_recognize = np.array([1 for _ in range(len(Yes_data_recognize))])

No_recognize = np.array([0 for _ in range(len(No_data_recognize))])

In [201]:
#X = np.concatenate((Yes_data_recognize, No_data_recognize), axis=0)
#y = np.concatenate((Yes_recognize, No_recognize), axis=0)

#### Train - Test split the data

In [83]:
# Get train (70%) & test (30%) size

print("Train Size: ", round(len(df_recognize) * 0.7))
print("Test Size: ", round(len(df_recognize) * 0.3))

Train Size:  918
Test Size:  393


In [84]:
print(len(Yes_data_recognize), len(No_data_recognize))

692 619


918 + 393 = 1311 = 692 + 619

459 + 459 = 918

So below, for the train set, we go until the 459th value for the yes data + until the 49th value for the No data = 918 data values
Whatever remains is in the test.


In [85]:
# Train (70%) - test (30%) split


X_train_recognize = np.concatenate((Yes_data_recognize[:459], No_data_recognize[:459]), axis=0)
y_train_recognize = np.concatenate((Yes_recognize[:459], No_recognize[:459]), axis=0)
X_test_recognize = np.concatenate((Yes_data_recognize[459:], No_data_recognize[459:]), axis=0)
y_test_recognize = np.concatenate((Yes_recognize[459:], No_recognize[459:]), axis=0)

print(f"Training dimensions: {X_train_recognize.shape}\nTesting dimensions {X_test_recognize.shape}")

Training dimensions: (918, 128, 128, 64)
Testing dimensions (393, 128, 128, 64)


In [86]:
with open('X_train_recognize.pickle', 'wb') as f:
    pickle.dump(X_train_recognize, f)
with open('y_train_recognize.pickle', 'wb') as f:
    pickle.dump(y_train_recognize, f)
with open('X_test_recognize.pickle', 'wb') as f:
    pickle.dump(X_test_recognize, f)
with open('y_test_recognize.pickle', 'wb') as f:
    pickle.dump(y_test_recognize, f)


#### For df_brainLoss

In [87]:
df_brainLoss.head()

Unnamed: 0,Filename,Brain-Feature-Loss
0,IXI369-Guys-0924-T1_bet_03.nii,No
1,IXI448-HH-2393-T1_bet_07.nii,No
2,IXI252-HH-1693-T1_bet_08.nii,No
3,IXI188-Guys-0798-T1_bet_17.nii,No
4,IXI182-Guys-0792-T1_bet_17.nii,No


In [88]:
# For df_brainLoss
YesScans_brainLoss = df_brainLoss[df_brainLoss['Brain-Feature-Loss'] == 'Yes']
NoScans_brainLoss = df_brainLoss[df_brainLoss['Brain-Feature-Loss'] == 'No']

In [89]:
YesScans_brainLoss

Unnamed: 0,Filename,Brain-Feature-Loss
215,IXI049-HH-1358-T1_bet_18.nii,Yes
322,IXI250-Guys-0836-T1_bet_78.nii,Yes
323,IXI470-IOP-1030-T1_bet_78.nii,Yes
324,IXI089-Guys-0757-T1_bet_77.nii,Yes
325,IXI053-Guys-0727-T1_bet_9.nii,Yes
...,...,...
1269,IXI536-Guys-1059-T1_bse_high_s71_r2.nii,Yes
1292,IXI216-HH-1635-T1_bse_default.nii,Yes
1297,IXI015-HH-1258-T1_bse_default.nii,Yes
1300,IXI397-Guys-0953-T1_bse_default.nii,Yes


In [90]:
print('Scans with brain feature loss: ' + str(len(df_brainLoss[df_brainLoss['Brain-Feature-Loss'] == 'Yes'])))
print('Scans without brain feature loss: ' + str(len(df_brainLoss[df_brainLoss['Brain-Feature-Loss'] == 'No'])))

Scans with brain feature loss: 604
Scans without brain feature loss: 707


In [95]:
# Check
# This is the name of each file (we can match by name)
image_file_paths[0][24:-3:]

'IXI088-Guys-0758-T1_bse_less_s46_r1.nii'

In [96]:
Yes_file_paths_brainLoss = []
for path in image_file_paths:
    for filename in YesScans_brainLoss['Filename']:
        # Check if path is a Yes file or a No file (matching by file_name)
        if path[24:-3:] == filename:
            Yes_file_paths_brainLoss.append(glob(path))

In [97]:
# For No
No_file_paths_brainLoss = []
for path in image_file_paths:
    for filename in NoScans_brainLoss['Filename']:
        # Check if path is a Yes file or a No file
        if path[24:-3:] == filename:
            No_file_paths_brainLoss.append(glob(path))

In [98]:
print(len(Yes_file_paths_brainLoss), len(No_file_paths_brainLoss))

604 707


In [99]:
# Now we make the numpy arrays

# Yes images
Yes_data_brainLoss = np.array([process_scan(path[0]) for path in Yes_file_paths_brainLoss])

In [100]:
# No images
No_data_brainLoss = np.array([process_scan(path[0]) for path in No_file_paths_brainLoss])

Now we set the labels as 0s & 1s for Yes & NO respectively

**Yes_brainLoss** gets a value of 0 for every image file in Yes_data_recognize (which is an array of all the processed Yes image files). 

**No_brainLoss** gets a value of 1 for every image file in No_data_recognize (which is an array of all the processed No image files). 

In [101]:
# We now set the labels
# 1 for no loss of brain features, 0 for loss of brain features
Yes_brainLoss = np.array([0 for _ in range(len(Yes_data_brainLoss))])

No_brainLoss = np.array([1 for _ in range(len(No_data_brainLoss))])

In [102]:
print(len(Yes_brainLoss), len(No_brainLoss))

604 707


In [103]:
# Get train (70%) & test (30%) size

print("Train Size: ", round(len(df_brainLoss) * 0.7))
print("Test Size: ", round(len(df_brainLoss) * 0.3))

Train Size:  918
Test Size:  393


In [104]:
# Train (70%) - test (30%) split (918/2 = 459)


X_train_brainLoss = np.concatenate((Yes_data_brainLoss[:459], No_data_brainLoss[:459]), axis=0)
y_train_brainLoss = np.concatenate((Yes_brainLoss[:459], No_brainLoss[:459]), axis=0)
X_test_brainLoss = np.concatenate((Yes_data_brainLoss[459:], No_data_brainLoss[459:]), axis=0)
y_test_brainLoss = np.concatenate((Yes_brainLoss[459:], No_brainLoss[459:]), axis=0)

print(f"Training dimensions: {X_train_brainLoss.shape}\nTesting dimensions {X_test_brainLoss.shape}")

Training dimensions: (918, 128, 128, 64)
Testing dimensions (393, 128, 128, 64)


In [105]:
# Saving the train and test data as pickle files
with open('X_train_brainLoss.pickle', 'wb') as f:
    pickle.dump(X_train_brainLoss, f)
with open('y_train_brainLoss.pickle', 'wb') as f:
    pickle.dump(y_train_brainLoss, f)
with open('X_test_brainLoss.pickle', 'wb') as f:
    pickle.dump(X_test_brainLoss, f)
with open('y_test_brainLoss.pickle', 'wb') as f:
    pickle.dump(y_test_brainLoss, f)