In [25]:
import numpy as np # forlinear algebra
import matplotlib.pyplot as plt #for plotting things
import os
from PIL import Image
import glob
from pathlib import Path
import kagglehub
from keras.models import Sequential
from keras.layers import Conv2D
from keras.layers import MaxPooling2D
from keras.layers import Flatten
from keras.layers import Dense
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img
from sklearn.metrics import classification_report, confusion_matrix

In [26]:
def get_dataset(dataset_name):
    """Check if a dataset with the given name exists locally before downloading it."""
    base_cache_path = os.path.expanduser("~/.cache/kagglehub/datasets")

    # Search for any dataset folder containing the dataset name
    for root, dirs, _ in os.walk(base_cache_path):
        for dir_name in dirs:
            if dataset_name.split("/")[-1] in dir_name:  # Check if dataset name is in folder
                dataset_path = os.path.join(root, dir_name)
                print(f"Dataset already exists at: {dataset_path}")
                return dataset_path

    # If not found, download it
    print("Dataset not found. Downloading...")
    path = kagglehub.dataset_download(dataset_name)
    print("Dataset downloaded to:", path)
    return path

# Example usage
dataset_path = get_dataset("paultimothymooney/chest-xray-pneumonia")


Dataset already exists at: C:\Users\Rik/.cache/kagglehub/datasets\paultimothymooney\chest-xray-pneumonia


In [27]:

# Set base folder
base_path = Path.home() / ".cache" / "kagglehub" / "datasets" / "paultimothymooney" / "chest-xray-pneumonia"
chest_xray_folders = list(base_path.glob("**/chest_xray"))

if not chest_xray_folders:
    raise FileNotFoundError(f"No 'chest_xray' folder found in {base_path}")

base_folder = chest_xray_folders[0]
print(f"Using dataset folder: {base_folder}")

train_folder = base_folder / "train"
val_folder = base_folder / "val"
test_folder = base_folder / "test"

# Check if base_folder exists and display its contents
if base_folder.exists():
    print("Contents of base_folder:", os.listdir(base_folder))
else:
    raise FileNotFoundError(f"Base folder does not exist: {base_folder}")

# Check if directories exist
if not train_folder.exists():
    raise FileNotFoundError(f"Train folder not found: {train_folder}")
if not val_folder.exists():
    raise FileNotFoundError(f"Validation folder not found: {val_folder}")
if not test_folder.exists():
    raise FileNotFoundError(f"Test folder not found: {test_folder}")

print("All dataset folders exist.")

# Image augmentation
train_datagen = ImageDataGenerator(rescale=1./255,
                                   shear_range=0.2,
                                   zoom_range=0.2,
                                   horizontal_flip=True)

test_datagen = ImageDataGenerator(rescale=1./255)

# Data loaders
training_set = train_datagen.flow_from_directory(
    directory=str(train_folder),
    target_size=(64, 64),
    batch_size=32,
    class_mode='binary')

validation_generator = test_datagen.flow_from_directory(
    directory=str(val_folder),
    target_size=(64, 64),
    batch_size=32,
    class_mode='binary')

test_set = test_datagen.flow_from_directory(
    directory=str(test_folder),
    target_size=(64, 64),
    batch_size=32,
    class_mode='binary')


Using dataset folder: C:\Users\Rik\.cache\kagglehub\datasets\paultimothymooney\chest-xray-pneumonia\versions\2\chest_xray
Contents of base_folder: ['chest_xray', 'test', 'train', 'val', '__MACOSX']
All dataset folders exist.
Found 5216 images belonging to 2 classes.
Found 16 images belonging to 2 classes.
Found 624 images belonging to 2 classes.


In [28]:

cnn = Sequential()

#Convolution
cnn.add(Conv2D(32, (3, 3), activation="relu", input_shape=(64, 64, 3)))

#Pooling
cnn.add(MaxPooling2D(pool_size = (2, 2)))

# 2nd Convolution
cnn.add(Conv2D(32, (3, 3), activation="relu"))

# 2nd Pooling layer
cnn.add(MaxPooling2D(pool_size = (2, 2)))

# Flatten the layer
cnn.add(Flatten())

# Fully Connected Layers
cnn.add(Dense(activation = 'relu', units = 128))
cnn.add(Dense(activation = 'sigmoid', units = 1))

# Compile the Neural network
cnn.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [29]:
cnn.summary()

In [31]:
cnn_model = cnn.fit(training_set,
                         steps_per_epoch = 163,
                         epochs = 1,
                         validation_data = validation_generator,
                         validation_steps = 624)

  self._warn_if_super_not_called()


[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m122s[0m 743ms/step - accuracy: 0.7835 - loss: 0.4633 - val_accuracy: 0.8125 - val_loss: 0.3852




In [32]:
test_accu = cnn.evaluate(test_set,steps=624)

[1m624/624[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 14ms/step - accuracy: 0.8626 - loss: 0.3234   
