# ChestX-ray 14 Data Preprocessing
## Madison Moffat-Wild and Rachel Woodside

### https://paperswithcode.com/dataset/chestx-ray14
### https://www.kaggle.com/datasets/nih-chest-xrays/data/data

Resources:
https://machinelearningmastery.com/best-practices-for-preparing-and-augmenting-image-data-for-convolutional-neural-networks/

## Set up

### Import Libraries

In [1]:
import itertools
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import os
import PIL
import PIL.Image
import pathlib

### Initialize Path to Data

In [2]:
# Store path to data folder
#data_folder = "~/ChestXray14Data/"
data_folder = "data/"
data_dir = pathlib.Path(data_folder).with_suffix('')

### Read in Data Frames

In [3]:
# TODO: Read in bbox?
dataEntry = pd.read_csv(data_folder+'Data_Entry_2017.csv')
dataEntry.head()

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],Unnamed: 11
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,0.143,
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143,
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168,
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,0.171,
4,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,0.143,


## Data Cleaning

In [4]:
dataEntry = dataEntry.rename(columns={
    "OriginalImagePixelSpacing[x": "OriginalPixelSpacingX",
    "y]": "OriginalPixelSpacingY",
    "OriginalImage[Width": "OriginalImageWidth",
    "Height]": "OriginalImageHeight"
})
dataEntry.drop("Unnamed: 11", axis=1, inplace=True)
dataEntry.head()
# TODO: Check if there are any other column names to fix

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImageWidth,OriginalImageHeight,OriginalPixelSpacingX,OriginalPixelSpacingY
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,0.143
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,0.171
4,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,0.143


In [5]:
# TODO: fixing nonsensical values (e.g. age > 100)

## Data Reduction

## Data Transformation

### Generate One-Hot Encodings for Output Labels
Reference: https://www.kaggle.com/code/ashishmundu/nih-chest-x-rays-deep-convolutional-network

In [6]:
# TODO: Normalize pixel values?
# TODO: Reshape images

In [7]:
### Generate one-hot encoding for the labels

# Get all the label
uniqueLabels = pd.Series(itertools.chain.from_iterable(dataEntry["Finding Labels"].apply(lambda x : x.split('|')))).unique()

# Initialize an empty matrix 
oneHotEncodings = pd.DataFrame(0.0, index=np.arange(len(dataEntry)), columns=uniqueLabels)

# For each row, we get the associated labels and set a 1 to the new corresponding column label 
for index, row in dataEntry.iterrows():
    labels = row["Finding Labels"].split('|')
    for label in labels:
        oneHotEncodings.iloc[index][label] = 1.0

# Then, we concatenate this new dataframe to our original data
#dataEntry = pd.concat([dataEntry, oneHotEncodings], axis=1)

oneHotEncodings.head()

Unnamed: 0,Cardiomegaly,Emphysema,Effusion,No Finding,Hernia,Infiltration,Mass,Nodule,Atelectasis,Pneumothorax,Pleural_Thickening,Pneumonia,Fibrosis,Edema,Consolidation
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Data Integration

### Load in Image Data
Reference: https://www.tensorflow.org/tutorials/load_data/images

In [8]:
image_count = len(list(data_dir.glob('*/*/*.png')))
print(image_count)

112120


In [9]:
list_ds = tf.data.Dataset.list_files(str(data_dir/'*/*/*'), shuffle=False)
list_ds = list_ds.shuffle(image_count, reshuffle_each_iteration=False)
for f in list_ds.take(5):
    print(f.numpy())


b'data\\images_012\\images\\00030096_000.png'
b'data\\images_007\\images\\00014557_001.png'
b'data\\images_009\\images\\00018464_000.png'
b'data\\images_005\\images\\00010756_000.png'
b'data\\images_008\\images\\00016224_002.png'


In [10]:
# val_size = int(image_count * 0.2)
# train_ds = list_ds.skip(val_size)
# val_ds = list_ds.take(val_size)

In [11]:
# print(tf.data.experimental.cardinality(train_ds).numpy())
# print(tf.data.experimental.cardinality(val_ds).numpy())

In [12]:
import numpy.lib.recfunctions

In [30]:
# TODO: modify to retrieve the labels from the dataEntry dataframe
def get_labels(file_path):
    # Convert the path to a list of path components
    parts = tf.strings.split(file_path, os.path.sep)
    # The last is the file's name?
    file_name = parts[-1].numpy()
    # file_name holds the string as bytes, need to convert to a string?
    file_name = str(file_name, encoding="utf-8")
    #print(file_name)
    # Get index
    imgIndexInDf = dataEntry[dataEntry["Image Index"] == file_name].index.tolist()[0]
    #print(imgIndexInDf)
    # Index into the one-hot encoding df to return the vector associated with the index
    oneHotVector = oneHotEncodings.loc[imgIndexInDf]
    oneHotTensor = tf.convert_to_tensor(oneHotVector, dtype=tf.float32)
    return oneHotTensor

In [31]:
for f in list_ds.take(5):
    print(get_labels(f))

tf.Tensor([0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.], shape=(15,), dtype=float32)
tf.Tensor([0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.], shape=(15,), dtype=float32)
tf.Tensor([0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.], shape=(15,), dtype=float32)
tf.Tensor([0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.], shape=(15,), dtype=float32)
tf.Tensor([0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.], shape=(15,), dtype=float32)


In [15]:
def decode_img(img):
    # channels=1 will give a grayscale image
    img = tf.io.decode_png(img, channels=1)
    # Resize the image to the desired size
    img_height = 256
    img_width = 256
    return tf.image.resize(img, [img_height, img_width])

In [16]:
for f in list_ds.take(5):
    print(decode_img(tf.io.read_file(f)))

tf.Tensor(
[[[19.5 ]
  [67.  ]
  [53.75]
  ...
  [83.25]
  [83.  ]
  [ 2.  ]]

 [[18.5 ]
  [66.25]
  [53.25]
  ...
  [53.75]
  [54.75]
  [ 1.25]]

 [[18.75]
  [64.75]
  [52.75]
  ...
  [37.25]
  [38.5 ]
  [ 1.  ]]

 ...

 [[13.  ]
  [11.5 ]
  [11.5 ]
  ...
  [13.  ]
  [13.75]
  [21.75]]

 [[14.75]
  [13.5 ]
  [13.75]
  ...
  [11.5 ]
  [13.5 ]
  [22.25]]

 [[17.25]
  [16.75]
  [16.5 ]
  ...
  [11.  ]
  [13.75]
  [22.5 ]]], shape=(256, 256, 1), dtype=float32)
tf.Tensor(
[[[ 16.25]
  [  0.  ]
  [  0.  ]
  ...
  [167.5 ]
  [175.75]
  [178.  ]]

 [[ 15.25]
  [  0.  ]
  [  0.  ]
  ...
  [ 39.75]
  [ 38.75]
  [ 42.25]]

 [[ 15.25]
  [  0.  ]
  [  0.  ]
  ...
  [ 15.5 ]
  [ 12.25]
  [  7.25]]

 ...

 [[  7.5 ]
  [  7.5 ]
  [  7.25]
  ...
  [  3.  ]
  [  3.  ]
  [  4.  ]]

 [[ 15.  ]
  [ 14.  ]
  [ 14.75]
  ...
  [  3.  ]
  [  3.  ]
  [  3.75]]

 [[ 29.25]
  [ 28.25]
  [ 28.75]
  ...
  [  3.  ]
  [  3.  ]
  [  4.  ]]], shape=(256, 256, 1), dtype=float32)
tf.Tensor(
[[[ 0.  ]
  [28.75]
  [17.75]

In [35]:
def load_img(file_path):
    # Load the raw data from the file as a string
    img = tf.io.read_file(file_path)
    img = decode_img(img)
    return img

In [18]:
for f in list_ds.take(5):
    print(process_path(f))

tf.Tensor([0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.], shape=(15,), dtype=float32)
(<tf.Tensor: shape=(256, 256, 1), dtype=float32, numpy=
array([[[19.5 ],
        [67.  ],
        [53.75],
        ...,
        [83.25],
        [83.  ],
        [ 2.  ]],

       [[18.5 ],
        [66.25],
        [53.25],
        ...,
        [53.75],
        [54.75],
        [ 1.25]],

       [[18.75],
        [64.75],
        [52.75],
        ...,
        [37.25],
        [38.5 ],
        [ 1.  ]],

       ...,

       [[13.  ],
        [11.5 ],
        [11.5 ],
        ...,
        [13.  ],
        [13.75],
        [21.75]],

       [[14.75],
        [13.5 ],
        [13.75],
        ...,
        [11.5 ],
        [13.5 ],
        [22.25]],

       [[17.25],
        [16.75],
        [16.5 ],
        ...,
        [11.  ],
        [13.75],
        [22.5 ]]], dtype=float32)>, <tf.Tensor: shape=(15,), dtype=float32, numpy=
array([0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype

In [57]:
label_ds = list_ds.map(lambda x: tf.py_function(func=get_labels, inp=[x], Tout=tf.float32), num_parallel_calls=10)

In [58]:
print(label_ds)
for label in label_ds.take(5):
    print("Label: ", label.numpy())

<ParallelMapDataset element_spec=TensorSpec(shape=<unknown>, dtype=tf.float32, name=None)>
Label:  [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
Label:  [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Label:  [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Label:  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
Label:  [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [37]:
image_ds = list_ds.map(lambda x: tf.py_function(func=load_img, inp=[x], Tout=tf.float32), num_parallel_calls=10)

In [51]:
print(image_ds)
for image in image_ds.take(5):
    print("Image shape: ", image.numpy().shape)

<ParallelMapDataset element_spec=TensorSpec(shape=<unknown>, dtype=tf.float32, name=None)>
Image shape:  (256, 256, 1)
Image shape:  (256, 256, 1)
Image shape:  (256, 256, 1)
Image shape:  (256, 256, 1)
Image shape:  (256, 256, 1)


In [59]:
complete_ds = tf.data.Dataset.zip(datasets=(image_ds, label_ds))

In [62]:
print(complete_ds)
for pair in complete_ds.take(5):
    print("Image, label vector pair:", pair)

<ZipDataset element_spec=(TensorSpec(shape=<unknown>, dtype=tf.float32, name=None), TensorSpec(shape=<unknown>, dtype=tf.float32, name=None))>
Image, label vector pair: (<tf.Tensor: shape=(256, 256, 1), dtype=float32, numpy=
array([[[19.5 ],
        [67.  ],
        [53.75],
        ...,
        [83.25],
        [83.  ],
        [ 2.  ]],

       [[18.5 ],
        [66.25],
        [53.25],
        ...,
        [53.75],
        [54.75],
        [ 1.25]],

       [[18.75],
        [64.75],
        [52.75],
        ...,
        [37.25],
        [38.5 ],
        [ 1.  ]],

       ...,

       [[13.  ],
        [11.5 ],
        [11.5 ],
        ...,
        [13.  ],
        [13.75],
        [21.75]],

       [[14.75],
        [13.5 ],
        [13.75],
        ...,
        [11.5 ],
        [13.5 ],
        [22.25]],

       [[17.25],
        [16.75],
        [16.5 ],
        ...,
        [11.  ],
        [13.75],
        [22.5 ]]], dtype=float32)>, <tf.Tensor: shape=(15,), dtype=float32, num