### IMPORTS

In [225]:
import os
import kagglehub
import pandas as pd
import numpy as np
import matplotlib.image as mpimg
from sklearn.model_selection import train_test_split
from PIL import Image
import numpy as np



### HELPER FUNCTIONS

In [226]:
def get_subdirectories(folder_path):
    subdirectory_path_list = []
    subdirectory_names = os.listdir(folder_path)

    for subdirectories in subdirectory_names:
        full_path = os.path.join(folder_path, subdirectories)
        subdirectory_path_list.append(full_path)
    
    return subdirectory_path_list

def get_image(image_file):
    # Load the image
    image = Image.open(image_file)
    # Convert the image to a NumPy array
    image_array = np.array(image)
    return image_array
    

def preprocess_dataset(class_files):
    label_list  = []

    for i in range(len(class_files)):
        subdirectories = get_subdirectories(class_files[i])
        subdirectories.sort()

        class_files[i] = subdirectories
        
        for sub in subdirectories: 
            label_list.append(i)

    #Combining the classes together
    concated_list = []
    for classes in class_files:
        concated_list.extend(classes)

    #Shuffling the classes
    concated_list = np.array(concated_list)
    label_list = np.array(label_list)

    # Generate a random permutation of indices
    indices = np.random.permutation(len(concated_list))

    # Apply the permutation to both lists
    shuffled_class_list = concated_list[indices]
    shuffled_label_list = label_list[indices]
    
    shuffled_class_list = [get_image(image_file) for image_file in shuffled_class_list]

    return shuffled_class_list, shuffled_label_list

In [227]:
path = kagglehub.dataset_download("tawsifurrahman/covid19-radiography-database")
files = os.listdir(path)
print("Path to files", path)
print(files)

Path to files /home/smg0092/.cache/kagglehub/datasets/tawsifurrahman/covid19-radiography-database/versions/5
['COVID-19_Radiography_Dataset']


In [228]:
main_folder = os.path.join(path, files[0])
files_list = get_subdirectories(main_folder)
print(np.array(files_list))

['/home/smg0092/.cache/kagglehub/datasets/tawsifurrahman/covid19-radiography-database/versions/5/COVID-19_Radiography_Dataset/Viral Pneumonia'
 '/home/smg0092/.cache/kagglehub/datasets/tawsifurrahman/covid19-radiography-database/versions/5/COVID-19_Radiography_Dataset/COVID'
 '/home/smg0092/.cache/kagglehub/datasets/tawsifurrahman/covid19-radiography-database/versions/5/COVID-19_Radiography_Dataset/Viral Pneumonia.metadata.xlsx'
 '/home/smg0092/.cache/kagglehub/datasets/tawsifurrahman/covid19-radiography-database/versions/5/COVID-19_Radiography_Dataset/Lung_Opacity.metadata.xlsx'
 '/home/smg0092/.cache/kagglehub/datasets/tawsifurrahman/covid19-radiography-database/versions/5/COVID-19_Radiography_Dataset/COVID.metadata.xlsx'
 '/home/smg0092/.cache/kagglehub/datasets/tawsifurrahman/covid19-radiography-database/versions/5/COVID-19_Radiography_Dataset/Normal'
 '/home/smg0092/.cache/kagglehub/datasets/tawsifurrahman/covid19-radiography-database/versions/5/COVID-19_Radiography_Dataset/Lung_O

In [229]:
#Getting the files from the dataset
normal_file = files_list[5]
normal_subdirectories = get_subdirectories(normal_file)
print(normal_subdirectories)

covid_file = files_list[1]
coivd_subdirectories = get_subdirectories(covid_file)
print(coivd_subdirectories)

covid_images = coivd_subdirectories[1]
covid_mask = coivd_subdirectories[0]

normal_images = normal_subdirectories[1]
normal_mask = normal_subdirectories[0]

['/home/smg0092/.cache/kagglehub/datasets/tawsifurrahman/covid19-radiography-database/versions/5/COVID-19_Radiography_Dataset/Normal/masks', '/home/smg0092/.cache/kagglehub/datasets/tawsifurrahman/covid19-radiography-database/versions/5/COVID-19_Radiography_Dataset/Normal/images']
['/home/smg0092/.cache/kagglehub/datasets/tawsifurrahman/covid19-radiography-database/versions/5/COVID-19_Radiography_Dataset/COVID/masks', '/home/smg0092/.cache/kagglehub/datasets/tawsifurrahman/covid19-radiography-database/versions/5/COVID-19_Radiography_Dataset/COVID/images']


In [235]:
class_files = [covid_images, normal_images]
class_files, label_list = preprocess_dataset(class_files)

In [241]:
X, y = np.array(class_files), np.array(label_list)

In [237]:
print(X.shape)
print(y.shape)

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=2/3, random_state=42)

(13808, 299, 299)
(13808,)


In [239]:
from keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from keras.layers import Dense, SimpleRNN, LSTM
from tensorflow.keras.metrics import Precision, Recall
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

# Add a channel dimension: Shape becomes (100, 299, 299, 1)
X = np.expand_dims(X, axis=-1)
# Normalize pixel values to range [0, 1]
X = X / 255.0

# Build the CNN model
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(299, 299, 1)),  # Input shape includes the channel
    MaxPooling2D(pool_size=(2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(10, activation='softmax')  # Adjust the output layer for your number of classes
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 297, 297, 32)      320       
                                                                 
 max_pooling2d (MaxPooling2D  (None, 148, 148, 32)     0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, 146, 146, 64)      18496     
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 73, 73, 64)       0         
 2D)                                                             
                                                                 
 conv2d_2 (Conv2D)           (None, 71, 71, 128)       73856     
                                                                 
 max_pooling2d_2 (MaxPooling  (None, 35, 35, 128)     

In [240]:
model_hist = model.fit(X_train, y_train, epochs=1, validation_data=(X_test, y_test))

from sklearn.metrics import classification_report

score = model.evaluate(X_test, y_test)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

y_pred = model.predict(X_test)  # Get predictions for the test data
y_pred_classes = np.argmax(y_pred, axis=1)  # Convert predictions to class labels
y_true = y_test  # True labels for the test data
label_dict = {"Covid": 0, "Normal": 1}
report = classification_report(y_true, y_pred_classes, target_names=label_dict)
print(report)

Test loss: 0.43929195404052734
Test accuracy: 0.830919623374939
              precision    recall  f1-score   support

       Covid       0.76      0.55      0.64       751
      Normal       0.85      0.94      0.89      2011

    accuracy                           0.83      2762
   macro avg       0.80      0.74      0.76      2762
weighted avg       0.82      0.83      0.82      2762

