## Transform/Preprocess Data

In [4]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import pydicom
from pydicom.pixel_data_handlers.util import apply_modality_lut


In [24]:
# Define training and test paths for data input/output.
# NOTE(review): these are absolute, machine-specific paths; point them at your
# own data directory before running. They stay plain strings with a trailing
# slash because later cells build file names with `train_path + ...`.
train_path = '/Users/ryanlussier/ICH Detection and Classification/data/train/'
test_path = '/Users/ryanlussier/ICH Detection and Classification/data/test/'

# Byte-encoded form of the training directory path (kept so the name remains defined).
directory = os.fsencode(train_path)

# Parallel lists: ids[i] is the file stem of the scan stored in images[i].
ids = []
images = []

# Loop through all files in the training data directory.
# os.listdir returns entries in arbitrary order, so row order downstream
# follows whatever order the filesystem reports.
for filename in os.listdir(train_path):
    # Skip anything that is not a DICOM file.
    if not filename.endswith(".dcm"):
        continue

    # Read the dcm file and extract its pixel array.
    scan = pydicom.dcmread(os.path.join(train_path, filename))
    image = scan.pixel_array

    # Transform to Hounsfield scale via the modality LUT / rescale values,
    # stored as float16 to keep the in-memory list small.
    image = apply_modality_lut(image, scan).astype(dtype='float16')

    # Max-normalize each image by its largest absolute value.
    # An all-zero image has mx == 0; dividing would fill it with NaN, so it is
    # left as zeros instead.
    mx = np.max(np.abs(image))
    if mx > 0:
        image_normed = image / mx
    else:
        image_normed = image

    # Append id and pixel array together, only after the scan was processed
    # successfully, so the two lists can never get out of step.
    ids.append(filename.split('.')[0])
    images.append(image_normed)


In [5]:

# Output file names for the pickled image arrays and their IDs
# (relative paths, so they are written to the current working directory).
images_pkl = "transformed_images.pkl"
image_ids = "image_ids.pkl"

# `with` guarantees each file handle is closed even if pickling fails part-way,
# instead of leaking the handle as a manual open()/close() pair would.
with open(images_pkl, "wb") as open_file:
    pickle.dump(images, open_file)

with open(image_ids, "wb") as open_file:
    pickle.dump(ids, open_file)



'\nimages_pkl = "transformed_images.pkl"\nimage_ids = "image_ids.pkl"\n\nopen_file = open(images_pkl, "wb")\npickle.dump(images, open_file)\nopen_file.close()\n\nopen_file = open(image_ids, "wb")\npickle.dump(ids, open_file)\nopen_file.close()\n'

In [6]:
# Read the pickled ID list back from disk into `loaded_list`.
# NOTE: pickle.load can execute arbitrary code while deserializing; only load
# pickle files that you produced yourself.
# `with` closes the handle even if unpickling raises.
with open(image_ids, "rb") as open_file:
    loaded_list = pickle.load(open_file)



'open_file = open(image_ids, "rb")\nloaded_list = pickle.load(open_file)\nopen_file.close()'

In [26]:


# Load the label table that lives next to the training scans.
labels_csv = train_path + 'stage_2_train.csv'
training_df = pd.read_csv(labels_csv)

# The raw ID column combines the image ID and the subtype, separated by the
# last underscore (e.g. "ID_0002081b6_any"). Split once from the right and
# store the two halves as separate ID / Subtype columns.
id_and_subtype = training_df['ID'].str.rsplit(pat='_', n=1, expand=True)
training_df[['ID', 'Subtype']] = id_and_subtype

In [27]:
# Discard rows that are exact repeats (keeping the first occurrence) before
# reshaping; pivot cannot handle repeated (ID, Subtype) pairs.
training_df = training_df.drop_duplicates(keep='first')

# Reshape from long to wide: one row per image ID, one column per subtype,
# with the Label value in each cell.
train_pivot = pd.pivot(training_df, index='ID', columns='Subtype', values='Label')

In [28]:
# Names of the per-subtype label columns. 'Y_vals' is excluded explicitly so
# that re-running this cell does not fold the previously created Y_vals column
# into the new label vector (on a first run the filter changes nothing).
col_names = [col for col in train_pivot.columns.values.tolist() if col != 'Y_vals']

# Pack each row's subtype labels into a single list, in column order.
train_pivot['Y_vals'] = train_pivot[col_names].to_numpy().tolist()

# Preview a slice of rows to sanity-check the label vectors.
train_pivot.iloc[14:25]

Subtype,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural,Y_vals
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ID_0002081b6,1,0,1,0,0,0,"[1, 0, 1, 0, 0, 0]"
ID_0002108bd,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
ID_000229f2a,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
ID_000230ed7,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
ID_000259ccf,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
ID_00025ef4b,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
ID_000270f8b,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
ID_00027c277,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
ID_00027cbb1,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
ID_000280440,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"


In [30]:
# Label vectors for every image ID in the label table.
train_Y = train_pivot['Y_vals']

# Restrict to the scans that were actually loaded, in the same order as `ids`.
train_Y_sub = train_Y.loc[ids]



In [31]:
# Show the first five label vectors (positional slice).
train_Y_sub.iloc[0:5]

ID
ID_e081d0aea    [1, 0, 1, 0, 0, 1]
ID_5d421e995    [0, 0, 0, 0, 0, 0]
ID_aeee86469    [0, 0, 0, 0, 0, 0]
ID_76064057c    [0, 0, 0, 0, 0, 0]
ID_a207005ec    [0, 0, 0, 0, 0, 0]
Name: Y_vals, dtype: object

In [32]:
# Build a dataframe from the images: one (ID, pixel array) record per scan.
id_image_pairs = [(scan_id, pixels) for scan_id, pixels in zip(ids, images)]
train_x = pd.DataFrame(id_image_pairs, columns=['ID', 'image_array'])


In [33]:
# Join X and y data: attach each scan's label vector by matching the ID column
# against the index of train_Y. A left join keeps every row of train_x.
mod_df = train_x.merge(
    train_Y,
    how='left',
    left_on='ID',
    right_index=True,
)



In [15]:
# Show the first five rows of the joined frame.
mod_df.iloc[:5]

Unnamed: 0,ID,image_array,Y_vals
0,ID_e081d0aea,"[[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...","[1, 0, 1, 0, 0, 1]"
1,ID_5d421e995,"[[-0.9596928982725528, -0.9596928982725528, -0...","[0, 0, 0, 0, 0, 0]"
2,ID_aeee86469,"[[-0.6341154090044389, -0.6341154090044389, -0...","[0, 0, 0, 0, 0, 0]"
3,ID_76064057c,"[[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...","[0, 0, 0, 0, 0, 0]"
4,ID_a207005ec,"[[-0.755656108597285, -0.755656108597285, -0.7...","[0, 0, 0, 0, 0, 0]"


In [34]:
# First 5000 rows of the joined frame, pickled into the training data directory.
mod_df1 = mod_df.iloc[:5000]
mod_df1_path = train_path + 'mod_df1.pickle'
pd.to_pickle(mod_df1, mod_df1_path)

In [35]:
# Last 5000 rows of the joined frame, pickled into the training data directory.
mod_df2 = mod_df.iloc[-5000:]
mod_df2_path = train_path + 'mod_df2.pickle'
pd.to_pickle(mod_df2, mod_df2_path)