# Chest X-Ray Images (Pneumonia)

First, download the dataset from Kaggle: [Download Kaggle Datafile](https://www.kaggle.com/datasets/paultimothymooney/chest-xray-pneumonia)

## Part 1: Consolidating the File Folders

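Then, scroll down to the `main()` function at the bottom of the code below and change its two paths (the folder to search and the folder to move the images into) to match your drive.
Finally, run the code.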

In [None]:
import os

In [None]:
# locate the images in the folders and subfolders, 
# rename the files after the folder's names, 
# then move all the files to a new folder

def locate_images(start_path, new_folder):
    """Move every image under ``start_path`` into ``new_folder``.

    Walks ``start_path`` and all of its subfolders. For each file whose name
    ends in .jpg, .jpeg or .png (in any letter case) the working directory is
    switched to the file's folder and ``move_image`` is called, because
    ``move_image`` derives the new file name from the current working
    directory.

    Args:
        start_path: Folder to search. An OSError is raised (by ``os.chdir``)
            if it cannot be entered.
        new_folder: Destination, passed unchanged to ``move_image``. Use an
            absolute path, since the working directory changes during the
            walk.

    The caller's working directory is restored before returning, even when
    an error occurs.
    """
    image_suffixes = ('.jpg', '.jpeg', '.png')

    def _raise(error):
        # os.walk() skips unreadable folders by default; surface them instead
        raise error

    original_cwd = os.getcwd()
    try:
        # Enter the start path first so a missing folder fails immediately,
        # and so the walk below runs on an absolute path
        os.chdir(start_path)
        root = os.getcwd()
        # os.walk() only reports real files in `files`, so a folder whose name
        # looks like an image is never handed to move_image(); symlinked
        # folders are listed but not descended into
        for folder, _subfolders, files in os.walk(root, onerror=_raise):
            matching = [
                file for file in files
                if file.lower().endswith(image_suffixes)
            ]
            if not matching:
                continue
            # move_image() prefixes the file with the current folder's name
            os.chdir(folder)
            for file in matching:
                move_image(file, new_folder)
    finally:
        # Do not leave the process inside the image folders
        os.chdir(original_cwd)

def move_image(file, new_folder):
    """Move ``file`` from the current working directory into ``new_folder``.

    The file is renamed to ``<current folder name>_<file>`` so the folder it
    came from stays visible in its name. An existing file with that name in
    ``new_folder`` is never overwritten; the source file is left in place.

    Args:
        file: Name of a file inside the current working directory.
        new_folder: Destination folder, with or without a trailing path
            separator. It is created if it does not exist yet.
    """
    print(f"Moving... {file}")
    # the name of the current working directory becomes the file name prefix
    folder_name = os.path.basename(os.getcwd())
    new_name = folder_name + '_' + file
    # join the parts instead of concatenating them, so a destination given
    # without a trailing separator still points inside the folder
    destination = os.path.join(new_folder, new_name)
    if new_folder:
        os.makedirs(new_folder, exist_ok=True)
    # rename the file to move it from the current working directory to the new folder
    if not os.path.exists(destination):
        os.rename(file, destination)

def main():
    """Gather the images from the dataset folders into one folder."""
    # Change these paths to match your drive
    locate_images(
        "F:/Downloads/lungs/chest_xray/",  # folder that is searched for images
        "F:/Downloads/lungs/All/",         # folder the images are moved into
    )
    
# run the main function
if __name__ == "__main__":
    main()

## Part 2: Preprocessing

In [38]:
from PIL import Image
import numpy as np
import os

In [39]:
new_folder = "F:/Downloads/lungs/All/"
images = []
list_of_images = os.listdir(new_folder)
for image in list_of_images:
    # Image.open() is lazy and keeps the file open until the pixels are read.
    # Copy the image inside a context manager so that each file is closed
    # before the next one is opened, instead of holding every file open at once
    with Image.open(os.path.join(new_folder, image)) as opened:
        images.append(opened.copy())

In [53]:
# Most of the images are already grayscale, but a few are full-color,
# so convert every image to grayscale ('L' mode)
grayscale_images = [picture.convert('L') for picture in images]

In [73]:
# Use a list comprehension to resize all images to the same target size,
# with LANCZOS as the resampling filter
target_size = (250, 250)
resized_images = [img.resize(target_size, resample = Image.LANCZOS) for img in grayscale_images]

In [74]:
# Verify the resizing of all images:
# a set keeps only the distinct sizes, so one entry means every image matches
sizes = {picture.size for picture in resized_images}
sizes

{(250, 250)}

In [75]:
# Turn every resized image into a float32 numpy array of pixel values
float_images = []
for picture in resized_images:
    pixels = np.array(picture)
    float_images.append(pixels.astype(np.float32))

# Show the pixel values of the first image
print("Pixel Values:")
print(float_images[0])

Pixel Values:
[[29. 33. 32. ... 34. 31. 29.]
 [30. 33. 31. ... 31. 33. 30.]
 [29. 32. 32. ... 31. 33. 29.]
 ...
 [27. 30. 29. ... 33. 34. 30.]
 [26. 30. 29. ... 33. 35. 31.]
 [26. 31. 29. ... 32. 33. 31.]]


In [76]:
# Pixel values run up to 255, so dividing by 255
# rescales every image to the range between 0 and 1

normalized_images = []
for pixels in float_images:
    normalized_images.append(pixels / 255)

# Show the pixel values of the first image
print("Pixel Values:")
print(normalized_images[0])

Pixel Values:
[[0.11372549 0.12941177 0.1254902  ... 0.13333334 0.12156863 0.11372549]
 [0.11764706 0.12941177 0.12156863 ... 0.12156863 0.12941177 0.11764706]
 [0.11372549 0.1254902  0.1254902  ... 0.12156863 0.12941177 0.11372549]
 ...
 [0.10588235 0.11764706 0.11372549 ... 0.12941177 0.13333334 0.11764706]
 [0.10196079 0.11764706 0.11372549 ... 0.12941177 0.13725491 0.12156863]
 [0.10196079 0.12156863 0.11372549 ... 0.1254902  0.12941177 0.12156863]]


In [77]:
# Map each image file name to a class label, where 0 = Normal and 1 = Pneumonia
def feature_value(file):
    """Return the class label encoded in an image file name.

    A name that starts with ``NORMAL`` is labelled 0 (Normal); every other
    name is labelled 1 (Pneumonia).
    """
    is_normal = file.startswith('NORMAL')
    return 0 if is_normal else 1

# Label every file, then shape the labels into a single-column numpy array y
y = np.array(list(map(feature_value, list_of_images))).reshape((-1, 1))
y

array([[0],
       [0],
       [0],
       ...,
       [1],
       [1],
       [1]])

In [78]:
# Use the list of normalized images as X (it stays a Python list of numpy arrays)
X = normalized_images

In [79]:
# Now we'll split our data into training and testing sets
from sklearn.model_selection import train_test_split

# Seed the split so it is reproducible, and stratify on y so the training and
# testing sets keep the same proportion of Normal and Pneumonia images
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

## Part 3: Augmentation

In [80]:
# Apply augmentation to the whole training dataset
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Create an ImageDataGenerator that applies random transformations to an image
datagen = ImageDataGenerator(
    rotation_range=20,      # Random rotation (degrees)
    width_shift_range=0.1,  # Random horizontal shift
    height_shift_range=0.1, # Random vertical shift
    shear_range=0.2,        # Shear intensity
    zoom_range=0.2,         # Random zoom
    horizontal_flip=True,   # Random horizontal flip
    vertical_flip=False,    # No vertical flip: the chest X-rays are never turned upside down
    fill_mode='nearest'     # Fill mode for handling newly created pixels
)

# Create variables to hold the X and y training data
X_train_aug = []
y_train_aug = []

# Number of augmented images generated for every original image
copies_per_image = 5

# Loop through all the images together with their labels.
for img, label in zip(X_train, y_train):
    # Add a channel dimension (grayscale) and a batch dimension, so a single
    # image is passed to datagen.flow() as a batch of one
    batch = np.expand_dims(np.expand_dims(img, axis=-1), axis=0)

    # Build the augmenting iterator once per image instead of once per copy
    augmenter = datagen.flow(batch, batch_size=1)

    for _ in range(copies_per_image):
        # The built-in next() works whether or not the iterator also offers
        # a .next() method; [0] takes the single image out of the batch
        X_train_aug.append(next(augmenter)[0])
        # Every augmented copy keeps the label of its original image
        y_train_aug.append(label)

# Print the length of each list
print(len(X_train_aug))
print(len(y_train_aug))

# took 10 minutes to run

21960
21960


In [81]:
# Reshape test data for the model: stack the test images into one numpy array
# and append a channel dimension for the grayscale images
X_test_np = np.expand_dims(np.array(X_test), axis=-1)

# Check the shape of the first image
X_test_np[0].shape

(250, 250, 1)

In [82]:
# Save the state of our data to pickle files so we don't have to do all this again
import pickle

# Write each dataset to its own pickle file:
# the training data first, then the testing data
for filename, dataset in (
    ('X_train_aug.pkl', X_train_aug),
    ('y_train_aug.pkl', y_train_aug),
    ('X_test_np.pkl', X_test_np),
    ('y_test.pkl', y_test),
):
    with open(filename, 'wb') as f:
        pickle.dump(dataset, f)

# Took 8 minutes to run

## Part 4: Creating the model

In [83]:
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
import numpy as np

In [91]:
# Convert values to numpy arrays
# (the augmented training images and labels were collected in Python lists)
X_train_aug_np = np.array(X_train_aug)
X_test_np = np.array(X_test_np)
y_train_aug_np = np.array(y_train_aug)
y_test_np = np.array(y_test)

# Split the training dataset into training and validation sets
# (20% is held out for validation, with a fixed seed so the split is repeatable).
# NOTE: this reassigns X_train and y_train, replacing the un-augmented
# training data produced by the first train_test_split
X_train, X_val, y_train, y_val = train_test_split(X_train_aug_np, y_train_aug_np, test_size=0.2, random_state=42)

In [96]:
# print shapes of the training, validation, and test sets
# (the three image arrays first, then the three label arrays)
for dataset in (X_train, X_val, X_test_np, y_train, y_val, y_test_np):
    print(dataset.shape)

(17568, 250, 250, 1)
(4392, 250, 250, 1)
(1464, 250, 250, 1)
(17568, 1)
(4392, 1)
(1464, 1)


In [101]:
# Define a CNN model: three convolution layers (the first two each followed by
# max pooling), then a dense layer and a single-unit output
model = keras.Sequential([
    # Input is one 250x250 image with a single (grayscale) channel
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=(250, 250, 1)),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    # Flatten the feature maps into a vector for the dense layers
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dense(1, activation='sigmoid')  # One sigmoid unit for the two classes: 0 = Normal, 1 = Pneumonia
])

In [102]:
# Compile the model: binary cross-entropy goes with the single sigmoid output
# and the 0/1 labels; accuracy is reported during training and evaluation
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [103]:
# train the model
def train(model, X_train, y_train, X_val, y_val, epochs):
    """Fit ``model`` on the training data and return the result of ``model.fit``.

    Args:
        model: Object with a ``fit`` method (here, the compiled Keras model).
        X_train: Training inputs.
        y_train: Training labels.
        X_val: Validation inputs, evaluated after each epoch.
        y_val: Validation labels.
        epochs: Number of passes over the training data.

    Returns:
        Whatever ``model.fit`` returns.
    """
    validation_pair = (X_val, y_val)
    return model.fit(X_train, y_train, validation_data=validation_pair, epochs=epochs)

train(model, X_train, y_train, X_val, y_val, epochs=10)

# NOTE: This takes a while to run ~ 10 minutes per epoch. 90 minutes in all.

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x260898c8810>

In [104]:
# Finally, evaluate the model with the test data we originally reserved with the first train_test_split
model.evaluate(X_test_np, y_test_np)



[0.4692945182323456, 0.9200819730758667]