# ChestX-ray 14 Data Preprocessing
## Madison Moffat-Wild and Rachel Woodside

### https://paperswithcode.com/dataset/chestx-ray14
### https://www.kaggle.com/datasets/nih-chest-xrays/data/data

Resources:
https://machinelearningmastery.com/best-practices-for-preparing-and-augmenting-image-data-for-convolutional-neural-networks/

## Set up

### Import Libraries

In [1]:
import itertools
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import os
import PIL
import PIL.Image
import pathlib

### Initialize Path to Data

In [2]:
# Store path to data folder
#data_folder = "~/ChestXray14Data/"
data_folder = "data/"
data_dir = pathlib.Path(data_folder).with_suffix('')

### Read in Data Frames

In [53]:
# TODO: Read in bbox?
dataEntry = pd.read_csv(data_folder+'Data_Entry_2017.csv')
dataEntry.head()

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],Unnamed: 11
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,0.143,
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143,
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168,
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,0.171,
4,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,0.143,


## Data Cleaning

In [54]:
dataEntry = dataEntry.rename(columns={
    "OriginalImagePixelSpacing[x": "OriginalPixelSpacingX",
    "y]": "OriginalPixelSpacingY",
    "OriginalImage[Width": "OriginalImageWidth",
    "Height]": "OriginalImageHeight"
})
# TODO: Check if there are any other column names to fix

In [None]:
# TODO: fixing nonsensical values (e.g. age > 100)

## Data Reduction

## Data Transformation

### Generate One-Hot Encodings for Output Labels
Reference: https://www.kaggle.com/code/ashishmundu/nih-chest-x-rays-deep-convolutional-network

In [None]:
# TODO: Normalize pixel values?
# TODO: Reshape images

In [55]:
### Generate one-hot encoding for the labels

# Get all the label
uniqueLabels = pd.Series(itertools.chain.from_iterable(dataEntry["Finding Labels"].apply(lambda x : x.split('|')))).unique()

# Initialize an empty matrix 
oneHotEncodings = pd.DataFrame(0.0, index=np.arange(len(dataEntry)), columns=uniqueLabels)

# For each row, we get the associated labels and set a 1 to the new corresponding column label 
for index, row in dataEntry.iterrows():
    labels = row["Finding Labels"].split('|')
    for label in labels:
        oneHotEncodings.iloc[index][label] = 1.0

# Then, we concatenate this new dataframe to our original data
#dataEntry = pd.concat([dataEntry, oneHotEncodings], axis=1)

oneHotEncodings.head()

Unnamed: 0,Cardiomegaly,Emphysema,Effusion,No Finding,Hernia,Infiltration,Mass,Nodule,Atelectasis,Pneumothorax,Pleural_Thickening,Pneumonia,Fibrosis,Edema,Consolidation
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Data Integration

### Load in Image Data
Reference: https://www.tensorflow.org/tutorials/load_data/images

In [56]:
image_count = len(list(data_dir.glob('*/*/*.png')))
print(image_count)

112120


In [57]:
list_ds = tf.data.Dataset.list_files(str(data_dir/'*/*/*'), shuffle=False)
list_ds = list_ds.shuffle(image_count, reshuffle_each_iteration=False)
for f in list_ds.take(5):
    print(f.numpy())


b'data\\images_004\\images\\00006713_013.png'
b'data\\images_005\\images\\00011370_000.png'
b'data\\images_004\\images\\00007018_045.png'
b'data\\images_004\\images\\00007075_002.png'
b'data\\images_004\\images\\00008295_045.png'


In [140]:
val_size = int(image_count * 0.2)
train_ds = list_ds.skip(val_size)
val_ds = list_ds.take(val_size)

In [141]:
print(tf.data.experimental.cardinality(train_ds).numpy())
print(tf.data.experimental.cardinality(val_ds).numpy())

89696
22424


In [147]:
# TODO: modify to retrieve the labels from the dataEntry dataframe
def get_labels(file_path):
    # Convert the path to a list of path components
    parts = tf.strings.split(file_path, os.path.sep)
    # The last is the file's name?
    file_name = parts[-1].numpy()
    # file_name holds the string as bytes, need to convert to a string?
    file_name = str(file_name, encoding="utf-8")
    #print(file_name)
    # Get index
    imgIndexInDf = dataEntry[dataEntry["Image Index"] == file_name].index.tolist()[0]
    #print(imgIndexInDf)
    # Index into the one-hot encoding df to return the vector associated with the index
    return oneHotEncodings.loc[imgIndexInDf]

In [159]:
for f in list_ds.take(5):
    print(get_labels(f))

Cardiomegaly          0.0
Emphysema             0.0
Effusion              0.0
No Finding            0.0
Hernia                1.0
Infiltration          0.0
Mass                  0.0
Nodule                0.0
Atelectasis           0.0
Pneumothorax          0.0
Pleural_Thickening    0.0
Pneumonia             0.0
Fibrosis              0.0
Edema                 0.0
Consolidation         0.0
Name: 25532, dtype: float64
Cardiomegaly          0.0
Emphysema             0.0
Effusion              0.0
No Finding            1.0
Hernia                0.0
Infiltration          0.0
Mass                  0.0
Nodule                0.0
Atelectasis           0.0
Pneumothorax          0.0
Pleural_Thickening    0.0
Pneumonia             0.0
Fibrosis              0.0
Edema                 0.0
Consolidation         0.0
Name: 44106, dtype: float64
Cardiomegaly          0.0
Emphysema             0.0
Effusion              0.0
No Finding            0.0
Hernia                0.0
Infiltration          1.0
Mass    

In [148]:
def decode_img(img):
    # channels=1 will give a grayscale image
    img = tf.io.decode_png(img, channels=1)
    # Resize the image to the desired size
    img_height = 256
    img_width = 256
    return tf.image.resize(img, [img_height, img_width]).numpy()

In [163]:
for f in list_ds.take(5):
    print(decode_img(tf.io.read_file(f)).dtype)

float32
float32
float32
float32
float32


In [149]:
def process_path(file_path):
    label = get_labels(file_path)
    # Load the raw data from the file as a string
    img = tf.io.read_file(file_path)
    img = decode_img(img)
    print(img, label)
    return img, label

In [150]:
# Set `num_parallel_calls` so multiple images are loaded/processed in parallel.
#train_ds = train_ds.map_fn(process_path)
#val_ds = val_ds.map_fn(process_path)

In [167]:
# Set `num_parallel_calls` so multiple images are loaded/processed in parallel.
train_ds = train_ds.map(lambda x: tf.py_function(func=process_path, inp=[x], Tout=tf.float32), num_parallel_calls=10)
val_ds = val_ds.map(lambda x: tf.py_function(func=process_path, inp=[x], Tout=tf.float32), num_parallel_calls=10)

In [169]:
for f in train_ds.take(5):
    print(f)

[[[  2.  ]
  [  2.  ]
  [  2.  ]
  ...
  [  2.  ]
  [  1.75]
  [  2.75]]

 [[  2.  ]
  [  2.  ]
  [  2.  ]
  ...
  [  1.75]
  [  2.75]
  [  2.25]]

 [[  2.  ]
  [  2.  ]
  [  2.  ]
  ...
  [  2.5 ]
  [  2.5 ]
  [  2.  ]]

 ...

 [[  1.  ]
  [  1.  ]
  [  1.  ]
  ...
  [122.5 ]
  [115.25]
  [109.25]]

 [[  1.  ]
  [  1.  ]
  [  1.  ]
  ...
  [127.25]
  [117.5 ]
  [110.75]]

 [[  1.  ]
  [  1.25]
  [  1.  ]
  ...
  [131.75]
  [124.25]
  [117.75]]] Cardiomegaly          0.0
Emphysema             0.0
Effusion              0.0
No Finding            1.0
Hernia                0.0
Infiltration          0.0
Mass                  0.0
Nodule                0.0
Atelectasis           0.0
Pneumothorax          0.0
Pleural_Thickening    0.0
Pneumonia             0.0
Fibrosis              0.0
Edema                 0.0
Consolidation         0.0
Name: 70417, dtype: float64
[[[ 3.5 ]
  [ 6.25]
  [ 6.  ]
  ...
  [ 5.  ]
  [ 5.  ]
  [ 5.  ]]

 [[ 4.5 ]
  [ 7.25]
  [ 8.  ]
  ...
  [ 5.  ]
  [ 5.  ]
  [ 5.  

InvalidArgumentError: {{function_node __wrapped__IteratorGetNext_output_types_1_device_/job:localhost/replica:0/task:0/device:CPU:0}} ValueError: input: Tensor conversion requested dtype string for Tensor with dtype float64: <tf.Tensor: shape=(256, 256, 1), dtype=float64, numpy=
array([[[ 3.5 ],
        [ 6.25],
        [ 6.  ],
        ...,
        [ 5.  ],
        [ 5.  ],
        [ 5.  ]],

       [[ 4.5 ],
        [ 7.25],
        [ 8.  ],
        ...,
        [ 5.  ],
        [ 5.  ],
        [ 5.  ]],

       [[ 6.  ],
        [10.25],
        [11.  ],
        ...,
        [ 5.  ],
        [ 5.  ],
        [ 5.  ]],

       ...,

       [[35.5 ],
        [65.25],
        [74.  ],
        ...,
        [ 7.5 ],
        [ 7.5 ],
        [ 7.  ]],

       [[37.  ],
        [65.  ],
        [73.5 ],
        ...,
        [ 7.75],
        [ 7.  ],
        [ 8.  ]],

       [[39.25],
        [66.25],
        [73.5 ],
        ...,
        [ 7.  ],
        [ 6.5 ],
        [ 7.  ]]])>
Traceback (most recent call last):

  File "C:\Users\Dail_\Anaconda3\lib\site-packages\tensorflow\python\ops\script_ops.py", line 269, in __call__
    return func(device, token, args)

  File "C:\Users\Dail_\Anaconda3\lib\site-packages\tensorflow\python\ops\script_ops.py", line 147, in __call__
    outputs = self._call(device, args)

  File "C:\Users\Dail_\Anaconda3\lib\site-packages\tensorflow\python\ops\script_ops.py", line 154, in _call
    ret = self._func(*args)

  File "C:\Users\Dail_\Anaconda3\lib\site-packages\tensorflow\python\autograph\impl\api.py", line 642, in wrapper
    return func(*args, **kwargs)

  File "C:\Users\Dail_\AppData\Local\Temp\ipykernel_22928\3132973860.py", line 2, in process_path
    label = get_labels(file_path)

  File "C:\Users\Dail_\AppData\Local\Temp\ipykernel_22928\2644121961.py", line 4, in get_labels
    parts = tf.strings.split(file_path, os.path.sep)

  File "C:\Users\Dail_\Anaconda3\lib\site-packages\tensorflow\python\util\traceback_utils.py", line 153, in error_handler
    raise e.with_traceback(filtered_tb) from None

  File "C:\Users\Dail_\Anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 1599, in convert_to_tensor
    raise ValueError(

ValueError: input: Tensor conversion requested dtype string for Tensor with dtype float64: <tf.Tensor: shape=(256, 256, 1), dtype=float64, numpy=
array([[[ 3.5 ],
        [ 6.25],
        [ 6.  ],
        ...,
        [ 5.  ],
        [ 5.  ],
        [ 5.  ]],

       [[ 4.5 ],
        [ 7.25],
        [ 8.  ],
        ...,
        [ 5.  ],
        [ 5.  ],
        [ 5.  ]],

       [[ 6.  ],
        [10.25],
        [11.  ],
        ...,
        [ 5.  ],
        [ 5.  ],
        [ 5.  ]],

       ...,

       [[35.5 ],
        [65.25],
        [74.  ],
        ...,
        [ 7.5 ],
        [ 7.5 ],
        [ 7.  ]],

       [[37.  ],
        [65.  ],
        [73.5 ],
        ...,
        [ 7.75],
        [ 7.  ],
        [ 8.  ]],

       [[39.25],
        [66.25],
        [73.5 ],
        ...,
        [ 7.  ],
        [ 6.5 ],
        [ 7.  ]]])>


	 [[{{node EagerPyFunc}}]] [Op:IteratorGetNext]

[[[197.  ]
  [193.25]
  [189.  ]
  ...
  [230.5 ]
  [227.  ]
  [229.25]]

 [[178.75]
  [165.75]
  [157.25]
  ...
  [241.25]
  [241.25]
  [245.  ]]

 [[124.  ]
  [ 83.5 ]
  [ 61.  ]
  ...
  [239.25]
  [238.75]
  [245.25]]

 ...

 [[  6.  ]
  [  6.  ]
  [  6.  ]
  ...
  [  0.  ]
  [  1.  ]
  [  4.5 ]]

 [[  6.  ]
  [  6.  ]
  [  6.  ]
  ...
  [  0.  ]
  [  1.  ]
  [  5.5 ]]

 [[  6.  ]
  [  6.  ]
  [  6.  ]
  ...
  [  0.  ]
  [  1.  ]
  [  5.25]]] Cardiomegaly          0.0
Emphysema             0.0
Effusion              0.0
No Finding            0.0
Hernia                0.0
Infiltration          0.0
Mass                  0.0
Nodule                0.0
Atelectasis           0.0
Pneumothorax          1.0
Pleural_Thickening    0.0
Pneumonia             0.0
Fibrosis              0.0
Edema                 0.0
Consolidation         0.0
Name: 62600, dtype: float64
[[[ 0.  ]
  [ 0.  ]
  [ 0.  ]
  ...
  [ 0.  ]
  [ 0.  ]
  [ 0.  ]]

 [[ 0.  ]
  [ 0.  ]
  [ 0.  ]
  ...
  [ 0.  ]
  [ 0.  ]
  [ 0.  

[[[  0.  ]
  [  0.  ]
  [ 43.75]
  ...
  [ 16.75]
  [ 16.25]
  [ 17.  ]]

 [[  0.  ]
  [  0.  ]
  [ 39.  ]
  ...
  [ 13.75]
  [ 14.  ]
  [ 14.  ]]

 [[  0.  ]
  [  0.  ]
  [ 36.5 ]
  ...
  [ 11.75]
  [ 11.  ]
  [ 11.5 ]]

 ...

 [[  0.  ]
  [  0.  ]
  [ 22.5 ]
  ...
  [183.25]
  [177.75]
  [182.5 ]]

 [[  0.  ]
  [  0.  ]
  [ 23.  ]
  ...
  [182.  ]
  [179.75]
  [181.5 ]]

 [[  0.  ]
  [  0.  ]
  [ 30.5 ]
  ...
  [189.75]
  [185.25]
  [190.  ]]] Cardiomegaly          0.0
Emphysema             0.0
Effusion              0.0
No Finding            0.0
Hernia                0.0
Infiltration          0.0
Mass                  0.0
Nodule                1.0
Atelectasis           0.0
Pneumothorax          0.0
Pleural_Thickening    0.0
Pneumonia             0.0
Fibrosis              0.0
Edema                 0.0
Consolidation         0.0
Name: 43886, dtype: float64
[[[171.  ]
  [167.5 ]
  [169.5 ]
  ...
  [156.  ]
  [160.5 ]
  [165.75]]

 [[158.25]
  [169.5 ]
  [182.25]
  ...
  [146.  ]
  [151.5

[[[121.25]
  [ 84.25]
  [ 62.25]
  ...
  [ 34.  ]
  [ 34.  ]
  [ 35.  ]]

 [[113.25]
  [ 92.25]
  [ 63.25]
  ...
  [ 25.5 ]
  [ 25.75]
  [ 25.5 ]]

 [[120.25]
  [ 84.25]
  [ 65.25]
  ...
  [ 23.75]
  [ 22.  ]
  [ 23.5 ]]

 ...

 [[154.75]
  [161.  ]
  [171.5 ]
  ...
  [ 17.75]
  [ 19.  ]
  [ 19.25]]

 [[161.75]
  [164.25]
  [177.  ]
  ...
  [ 17.5 ]
  [ 17.75]
  [ 18.5 ]]

 [[172.25]
  [177.5 ]
  [186.  ]
  ...
  [ 19.25]
  [ 19.75]
  [ 20.5 ]]] Cardiomegaly          0.0
Emphysema             0.0
Effusion              0.0
No Finding            1.0
Hernia                0.0
Infiltration          0.0
Mass                  0.0
Nodule                0.0
Atelectasis           0.0
Pneumothorax          0.0
Pleural_Thickening    0.0
Pneumonia             0.0
Fibrosis              0.0
Edema                 0.0
Consolidation         0.0
Name: 27708, dtype: float64
[[[  0.  ]
  [  0.  ]
  [  0.  ]
  ...
  [  0.  ]
  [  0.  ]
  [  0.  ]]

 [[  0.  ]
  [  0.  ]
  [  0.  ]
  ...
  [  0.  ]
  [  0. 