# ChestX-ray 14 Data Preprocessing
## Madison Moffat-Wild and Rachel Woodside

### https://paperswithcode.com/dataset/chestx-ray14
### https://www.kaggle.com/datasets/nih-chest-xrays/data/data

Resources:
https://machinelearningmastery.com/best-practices-for-preparing-and-augmenting-image-data-for-convolutional-neural-networks/

## Set up

### Import Libraries

In [1]:
import itertools
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import os
import PIL
import PIL.Image
import pathlib
import matplotlib.pyplot as plt

### Initialize Path to Data

In [2]:
# Store path to data folder
#data_folder = "~/ChestXray14Data/"
data_folder = "data/"
data_dir = pathlib.Path(data_folder).with_suffix('')

### Read in Data Frames

In [3]:
# TODO: Read in bbox?
dataEntry = pd.read_csv(data_folder+'Data_Entry_2017.csv')
dataEntry.head()

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],Unnamed: 11
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,0.143,
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143,
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168,
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,0.171,
4,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,0.143,


## Data Cleaning

In [4]:
dataEntry = dataEntry.rename(columns={
    "OriginalImagePixelSpacing[x": "OriginalPixelSpacingX",
    "y]": "OriginalPixelSpacingY",
    "OriginalImage[Width": "OriginalImageWidth",
    "Height]": "OriginalImageHeight"
})
dataEntry.drop("Unnamed: 11", axis=1, inplace=True)
dataEntry.head()
# TODO: Check if there are any other column names to fix

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImageWidth,OriginalImageHeight,OriginalPixelSpacingX,OriginalPixelSpacingY
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,0.143
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,0.171
4,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,0.143


In [5]:
# TODO: fixing nonsensical values (e.g. age > 100)
dataEntry['Patient Age'].where(dataEntry['Patient Age'] <= 100, 100, inplace=True)

## Data Reduction

## Data Transformation

### Generate One-Hot Encodings for Output Labels
Reference: https://www.kaggle.com/code/ashishmundu/nih-chest-x-rays-deep-convolutional-network

In [7]:
# Generate one-hot encoding for the labels
# Get all the labels
uniqueLabels = pd.Series(itertools.chain.from_iterable(dataEntry["Finding Labels"].apply(lambda x : x.split('|')))).unique()

# Initialize an empty matrix 
oneHotEncodings = pd.DataFrame(0.0, index=np.arange(len(dataEntry)), columns=uniqueLabels)

# For each row, we get the associated labels and set a 1 to the new corresponding column label 
for index, row in dataEntry.iterrows():
    labels = row["Finding Labels"].split('|')
    for label in labels:
        oneHotEncodings.iloc[index][label] = 1.0

oneHotEncodings.head()

Unnamed: 0,Cardiomegaly,Emphysema,Effusion,No Finding,Hernia,Infiltration,Mass,Nodule,Atelectasis,Pneumothorax,Pleural_Thickening,Pneumonia,Fibrosis,Edema,Consolidation
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Data Integration

### Load in Image Data
Reference: https://www.tensorflow.org/tutorials/load_data/images

In [8]:
image_count = len(list(data_dir.glob('*/*/*.png')))
print(image_count)

112120


In [9]:
# Set data loading parameters
batch_size = 32
img_height = 256
img_width = 256
AUTOTUNE = tf.data.AUTOTUNE

In [10]:
list_ds = tf.data.Dataset.list_files(str(data_dir/'*/*/*'), shuffle=False)
list_ds = list_ds.shuffle(image_count, reshuffle_each_iteration=False)
for f in list_ds.take(5):
    print(f.numpy())

b'data\\images_011\\images\\00026190_001.png'
b'data\\images_010\\images\\00022459_003.png'
b'data\\images_002\\images\\00001427_000.png'
b'data\\images_006\\images\\00013604_000.png'
b'data\\images_010\\images\\00021779_003.png'


In [12]:
# TODO: modify to retrieve the labels from the dataEntry dataframe
def get_labels(file_path):
    # Convert the path to a list of path components
    parts = tf.strings.split(file_path, os.path.sep)
    # The last is the file's name?
    file_name = parts[-1].numpy()
    # file_name holds the string as bytes, need to convert to a string?
    file_name = str(file_name, encoding="utf-8")
    #print(file_name)
    # Get index
    imgIndexInDf = dataEntry[dataEntry["Image Index"] == file_name].index.tolist()[0]
    #print(imgIndexInDf)
    # Index into the one-hot encoding df to return the vector associated with the index
    oneHotVector = oneHotEncodings.loc[imgIndexInDf]
    oneHotTensor = tf.convert_to_tensor(oneHotVector, dtype=tf.float32)
    return oneHotTensor

In [13]:
for f in list_ds.take(5):
    print(get_labels(f))

tf.Tensor([0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.], shape=(15,), dtype=float32)
tf.Tensor([0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.], shape=(15,), dtype=float32)
tf.Tensor([0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.], shape=(15,), dtype=float32)
tf.Tensor([0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.], shape=(15,), dtype=float32)
tf.Tensor([0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.], shape=(15,), dtype=float32)


In [14]:
def decode_img(img):
    # channels=1 will give a grayscale image
    img = tf.io.decode_png(img, channels=1)
    # Resize the image to the desired size
    return tf.image.resize(img, [img_height, img_width])

In [15]:
for f in list_ds.take(5):
    print(decode_img(tf.io.read_file(f)))

tf.Tensor(
[[[ 0.  ]
  [ 0.  ]
  [ 0.  ]
  ...
  [18.75]
  [19.5 ]
  [19.5 ]]

 [[ 0.  ]
  [ 0.  ]
  [ 0.  ]
  ...
  [18.  ]
  [18.25]
  [18.75]]

 [[ 0.  ]
  [ 0.  ]
  [ 0.  ]
  ...
  [18.  ]
  [18.25]
  [18.75]]

 ...

 [[ 0.  ]
  [ 0.  ]
  [ 0.  ]
  ...
  [24.25]
  [25.25]
  [24.5 ]]

 [[ 0.  ]
  [ 0.  ]
  [ 0.  ]
  ...
  [31.  ]
  [31.  ]
  [30.75]]

 [[ 0.  ]
  [ 0.  ]
  [ 0.  ]
  ...
  [ 8.5 ]
  [ 8.75]
  [ 9.  ]]], shape=(256, 256, 1), dtype=float32)
tf.Tensor(
[[[  8.75]
  [ 13.5 ]
  [ 13.5 ]
  ...
  [ 13.5 ]
  [ 13.5 ]
  [ 13.5 ]]

 [[  8.5 ]
  [ 13.  ]
  [ 13.  ]
  ...
  [ 13.  ]
  [ 13.  ]
  [ 13.  ]]

 [[  8.5 ]
  [ 13.  ]
  [ 13.  ]
  ...
  [ 13.  ]
  [ 13.  ]
  [ 13.  ]]

 ...

 [[ 73.25]
  [115.5 ]
  [122.  ]
  ...
  [ 19.  ]
  [ 19.  ]
  [ 19.  ]]

 [[ 74.5 ]
  [117.25]
  [125.75]
  ...
  [ 19.  ]
  [ 19.  ]
  [ 18.75]]

 [[ 72.25]
  [111.  ]
  [116.25]
  ...
  [ 17.5 ]
  [ 17.5 ]
  [ 17.5 ]]], shape=(256, 256, 1), dtype=float32)
tf.Tensor(
[[[ 27.  ]
  [ 26.25]
  [ 24.

In [18]:
def load_img(file_path):
    # Load the raw data from the file as a string
    img = tf.io.read_file(file_path)
    img = decode_img(img)
    return img

In [19]:
label_ds = list_ds.map(lambda x: tf.py_function(func=get_labels, inp=[x], Tout=tf.float32), num_parallel_calls=AUTOTUNE)

In [20]:
print(label_ds)
for label in label_ds.take(5):
    print("Label: ", label.numpy())

<ParallelMapDataset element_spec=TensorSpec(shape=<unknown>, dtype=tf.float32, name=None)>
Label:  [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Label:  [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Label:  [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Label:  [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Label:  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]


In [21]:
image_ds = list_ds.map(lambda x: tf.py_function(func=load_img, inp=[x], Tout=tf.float32), num_parallel_calls=AUTOTUNE)

In [22]:
print(image_ds)
for image in image_ds.take(5):
    print("Image shape: ", image.numpy().shape)

<ParallelMapDataset element_spec=TensorSpec(shape=<unknown>, dtype=tf.float32, name=None)>
Image shape:  (256, 256, 1)
Image shape:  (256, 256, 1)
Image shape:  (256, 256, 1)
Image shape:  (256, 256, 1)
Image shape:  (256, 256, 1)


In [23]:
complete_ds = tf.data.Dataset.zip(datasets=(image_ds, label_ds))

In [24]:
print(complete_ds)
for pair in complete_ds.take(5):
    print("Image, label vector pair:", pair)

<ZipDataset element_spec=(TensorSpec(shape=<unknown>, dtype=tf.float32, name=None), TensorSpec(shape=<unknown>, dtype=tf.float32, name=None))>
Image, label vector pair: (<tf.Tensor: shape=(256, 256, 1), dtype=float32, numpy=
array([[[ 0.  ],
        [ 0.  ],
        [ 0.  ],
        ...,
        [18.75],
        [19.5 ],
        [19.5 ]],

       [[ 0.  ],
        [ 0.  ],
        [ 0.  ],
        ...,
        [18.  ],
        [18.25],
        [18.75]],

       [[ 0.  ],
        [ 0.  ],
        [ 0.  ],
        ...,
        [18.  ],
        [18.25],
        [18.75]],

       ...,

       [[ 0.  ],
        [ 0.  ],
        [ 0.  ],
        ...,
        [24.25],
        [25.25],
        [24.5 ]],

       [[ 0.  ],
        [ 0.  ],
        [ 0.  ],
        ...,
        [31.  ],
        [31.  ],
        [30.75]],

       [[ 0.  ],
        [ 0.  ],
        [ 0.  ],
        ...,
        [ 8.5 ],
        [ 8.75],
        [ 9.  ]]], dtype=float32)>, <tf.Tensor: shape=(15,), dtype=float32, num

In [25]:
# Split into test and training datasets
val_size = int(image_count * 0.2)
train_ds = complete_ds.skip(val_size)
val_ds = complete_ds.take(val_size)

In [26]:
print(tf.data.experimental.cardinality(train_ds).numpy())
print(tf.data.experimental.cardinality(val_ds).numpy())

89696
22424


In [27]:
train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

In [28]:
def configure_for_performance(ds):
    ds = ds.cache()
    ds = ds.shuffle(buffer_size=1000)
    ds = ds.batch(batch_size)
    ds = ds.prefetch(buffer_size=AUTOTUNE)
    return ds

train_ds = configure_for_performance(train_ds)
val_ds = configure_for_performance(val_ds)

In [None]:
image_batch, label_batch = next(iter(train_ds))

plt.figure(figsize=(10, 10))
for i in range(9):
    ax = plt.subplot(3, 3, i + 1)
    plt.imshow(image_batch[i].numpy().astype("uint8"))
    label = label_batch[i]
    #plt.title(class_names[label])
    plt.axis("off")

In [None]:
# TODO: Standardize data? Rescaling can be done as layer in model...