### Import Libraries

In [1]:
import numpy as np
import os
from random import shuffle
import random
from tqdm import tqdm
import pickle
import glob

# OpenCV
# import cv2

# Tensorflow
import tensorflow as tf
from tensorflow.keras.applications.densenet import DenseNet169
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.resnet import ResNet101
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D

from sklearn.model_selection import train_test_split

### Read image file paths and store to a list both for covid and non-covid

In [11]:
# dataset_path = "../../dataset/xray"

# all_files = []

# covid_files = glob.glob(os.path.join(dataset_path, "Covid", "*.png"))
# covid_files.extend(glob.glob(os.path.join(dataset_path, "Covid", "*.jpg")))

# non_covid_files = glob.glob(os.path.join(dataset_path, "Noncovid", "normal", "*.png"))
# non_covid_files.extend(glob.glob(os.path.join(dataset_path, "Noncovid", "normal", "*.jpg")))
# non_covid_files.extend(glob.glob(os.path.join(dataset_path, "Noncovid", "pneumonia", "*.png")))
# non_covid_files.extend(glob.glob(os.path.join(dataset_path, "Noncovid", "pneumonia", "*.jpg")))

# # Get all the files from the directory in a two element list. First element is list of file location to covid images and second element is list of file location to non-covid images.
# all_files = [covid_files, non_covid_files]
# print("Image Files Count\nCovid: {}\nNon-Covid: {}\nTotal: {}".format(len(covid_files), len(non_covid_files), len(all_files[0] + all_files[1])))


Image Files Count
Covid: 2126
Non-Covid: 2357
Total: 4483


### Function to read image from file list and store the corresponding label

In [12]:
# def get_dataset(files, label, count, img_size):
#   dataset = []  # List to hold all the dataset. Each element is a dictionary

#   for j in tqdm(files):  # Loop over each file location
#     data_dict = {}
#     data_dict["id"] = count
#     data_dict["filepath"] = j
#     try:
#       img = cv2.imread(j)
#       img = cv2.resize(img, (img_size, img_size))
#       data_dict["image"] = img
#       data_dict["label"] = label
#       count += 1
#       dataset.append(data_dict)
#     except Exception as e:
#       print("faulty image: {} {}".format(j, e))
#   return dataset, count

### Read image from the file and store the corresponding label in a list

In [15]:
# c_dataset, nc_dataset, t_dataset = [], [], []
# labels = [1, 0] # 1 = Covid, 0 = Noncovid
# count = 0       # Count to record the ids of files. Each file has a unique ID.
# img_size = 224
# # all_files => [C, NC], matching labels = [1, 0]
# for i, data in enumerate(all_files): # only two loops for Covid and Non-Covid
#   dataset, count = get_dataset(data, labels[i], count, img_size)
#   if labels[i] == 1:
#     c_dataset = dataset
#   else:
#     nc_dataset = dataset
# tot_dataset = c_dataset + nc_dataset
# print("Dataset Count\nCovid: {}\nNon-Covid: {}\nTotal: {}".format(len(c_dataset), len(nc_dataset), len(tot_dataset)))

# # save dataset in pickle file
# with open('../../pickle_files/al/x_ray/x_ray_dataset_all.pickle', 'wb') as handle:
#   pickle.dump(tot_dataset, handle, protocol=pickle.HIGHEST_PROTOCOL)

 21%|████████████████████████████████████                                                                                                                                          | 440/2126 [00:17<00:51, 32.90it/s]

faulty image: ../../dataset/xrays/Covid/MIDRC-RICORD-1C-SITE2-000199-31568-0.png OpenCV(4.9.0) /Users/xperience/GHA-OpenCV-Python2/_work/opencv-python/opencv-python/opencv/modules/imgproc/src/resize.cpp:4152: error: (-215:Assertion failed) !ssize.empty() in function 'resize'



100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2126/2126 [01:25<00:00, 24.89it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2357/2357 [01:15<00:00, 31.17it/s]


Dataset Count
Covid: 2125
Non-Covid: 2357
Total: 4482


### Clip the extra data so that both classes are roughly balanced

In [16]:
# print("Initial Total Dataset: {}".format(len(tot_dataset)))
# c_2125 = tot_dataset[:2125] # first 2125 data is Covid data
# nc_2275 = tot_dataset[2125:4400] # next 2275 data is Non-Covid data
# t_dataset = c_2125 + nc_2275

# # confirm the correct dataset labels
# print(len(c_2125), c_2125[0]["label"], c_2125[2124]["label"])
# print(len(nc_2275), nc_2275[0]["label"], nc_2275[2274]["label"])
# print("After cliping, Total Dataset: {}".format(len(t_dataset)))

# # save clipped dataset to pickle file
# with open('../../pickle_files/al/x_ray/x_ray_dataset_cliped.pickle', 'wb') as handle:
#   pickle.dump(t_dataset, handle, protocol=pickle.HIGHEST_PROTOCOL)

Initial Total Dataset: 4482
2125 1 1
2275 0 0
After cliping, Total Dataset: 4400


In [18]:
# Load the cached X-ray dataset from disk.
# NOTE: pickle.load can execute arbitrary code; only open pickle files you created yourself.
# Alternative dataset location:
# dataset_file = '../../pickle_files/rl/x_ray/x_ray_dataset_cliped.pickle'
dataset_file = '../../pickle_files/al/x_ray/x_ray_dataset.pickle'  # this is also clipped dataset
with open(dataset_file, mode='rb') as pickle_file:
  x_ray_dataset = pickle.load(pickle_file)

# Sanity check: number of records and the label stored on the first record.
first_record = x_ray_dataset[0]
print(len(x_ray_dataset), first_record['label'])

4400 1


### Extract the images and labels from the dataset to feed the DNN

In [19]:
# Split every record into its pixel array (x) and its class label (y),
# keeping both collections in the same record order.
x = np.array([record["image"] for record in x_ray_dataset])
y = [record["label"] for record in x_ray_dataset]
print(len(x), len(y))

4400 4400


### Split training and test/validation dataset

In [20]:
# Hold out 30% of the samples for testing; random_state pins the split.
x_train, x_test, y_train, y_test = train_test_split(
  x,
  y,
  test_size=0.3,
  random_state=42,
)
print(len(x_train), len(x_test))

3080 1320


### Define batch size and other constants

In [41]:
# Run-wide configuration.
IMAGE_SIZE = (224, 224)  # IMAGE_SIZE[0] is passed to get_model; the tuple is used as load_img target_size
BATCH_SIZE = 16
EPOCHS = 20
BASE_MODELS = ["vgg16net", "resnet101", "densenet169"]  # indexed with select_model - 1 when naming the saved model

# Fixed seed so that a Restart & Run All reproduces the same shuffling,
# augmentation and weight initialisation. The previous value was drawn with
# random.randint and never applied to any RNG, so runs were not reproducible.
seed_no = 42
# Seeds Python's `random`, NumPy and TensorFlow in one call.
tf.keras.utils.set_random_seed(seed_no)

### Object to generate batches of images to feed into DNN (training)

In [42]:
# Augmentation applied to training batches. Pixel values are multiplied by 1/255;
# shear, zoom and horizontal flips are enabled, rotation and shifts are left disabled.
augmentation_options = dict(
  rescale=1./255,
  # rotation_range=20,
  # width_shift_range=0.2,
  # height_shift_range=0.2,
  shear_range=0.2,
  zoom_range=0.2,
  horizontal_flip=True,
  fill_mode='nearest',
)
train_datagen = ImageDataGenerator(**augmentation_options)

### Object to generate batches of images to feed into DNN (test)

In [43]:
# Test images get no augmentation, only the same 1/255 rescaling used for training.
test_datagen = ImageDataGenerator(rescale=1.0 / 255)

### Generate batches of images to feed into DNN (training)

In [44]:
# Iterator yielding augmented (images, labels) training batches of BATCH_SIZE samples.
train_generator = train_datagen.flow(x=x_train, y=y_train, batch_size=BATCH_SIZE)

### Generate batches of images to feed into DNN (test)

In [45]:
# Iterator yielding rescaled (images, labels) test batches of BATCH_SIZE samples.
# shuffle=False (flow() shuffles by default) keeps the batches in the order of
# x_test / y_test, so evaluation and the later metric computation see a stable,
# repeatable sample order.
test_generator = test_datagen.flow(
  x_test,
  y_test,
  batch_size=BATCH_SIZE,
  shuffle=False)

### Define model getter

In [46]:
def get_model(img_size, model_sel):
  """Build an ImageNet-pretrained convolutional base without its classification top.

  Args:
    img_size: Side length of the square input; the base is built for inputs of
      shape (img_size, img_size, 3).
    model_sel: Backbone selector: 1 = VGG16, 2 = ResNet101, 3 = DenseNet169.

  Returns:
    A tuple (base_model, feature_size). feature_size is the number of values in
    the base model's output feature map for one image (25088 / 100352 / 81536
    for the three backbones at img_size=224), or None when the output shape is
    not fully defined.

  Raises:
    ValueError: If model_sel is not 1, 2 or 3. Previously the function returned
      None in that case, which surfaced later as an unpacking TypeError.
  """
  # Validate first so a bad selector fails loudly before any weights are loaded.
  if model_sel not in (1, 2, 3):
    raise ValueError(
      "model_sel must be 1 (VGG16), 2 (ResNet101) or 3 (DenseNet169), got {!r}".format(model_sel))

  input_shape = (img_size, img_size, 3)
  if model_sel == 1:
    base = VGG16(input_shape=input_shape, include_top=False, weights='imagenet')
  elif model_sel == 2:
    base = ResNet101(input_shape=input_shape, include_top=False, weights='imagenet')
  else:
    base = DenseNet169(input_shape=input_shape, include_top=False, weights='imagenet')

  # Derive the flattened feature size from the actual output shape (batch axis
  # dropped) instead of hard-coding constants that only hold for 224x224 inputs.
  feature_dims = base.output_shape[1:]
  if any(dim is None for dim in feature_dims):
    feature_size = None
  else:
    feature_size = int(np.prod(feature_dims))
  return base, feature_size

### Select model

In [47]:
# Backbone selector passed to get_model: 1 = VGG16, 2 = ResNet101, 3 = DenseNet169.
# The interactive prompt is kept in the trailing comment; the value is fixed to 2 here.
select_model = 2 # int(input("Enter the number for: \n 1) VGGNET16 \n 2) Resnet101  \n 3) Densenet169 "))

### Load pre-trained selected model without classification layers

In [48]:
# Build the selected backbone for square inputs whose side is IMAGE_SIZE[0].
base_model, feature_size = get_model(img_size=IMAGE_SIZE[0], model_sel=select_model)

### Add custom classification layers on top of base model

In [49]:
# Classification head stacked on the backbone's output:
# global average pooling -> Dense(1024, relu) -> Dense(1, sigmoid).
# Note: this rebinds `x`, which earlier held the image array.
x = GlobalAveragePooling2D()(base_model.output)
x = Dense(1024, activation='relu')(x)
predictions = Dense(1, activation='sigmoid')(x)

print(x.shape, predictions.shape)

(None, 1024) (None, 1)


### Combine base model and custom layers into a new model

In [50]:
# Functional model running from the backbone's input tensor to the sigmoid output tensor.
model = Model(inputs=base_model.input, outputs=predictions)

### Freeze the layers of the base model

In [51]:
# Mark every backbone layer as non-trainable. This runs before model.compile,
# so only the newly added head layers are updated during training.
for backbone_layer in base_model.layers:
  backbone_layer.trainable = False

### Compile the model

In [52]:
# Single sigmoid output -> binary cross-entropy loss; accuracy is tracked as the metric.
model.compile(
  optimizer='adam',
  loss='binary_crossentropy',
  metrics=['accuracy'],
)

from tensorflow.keras.callbacks import TensorBoard
# Customize log directory and other options
tensorboard_callback = TensorBoard(
  log_dir="logs",
  histogram_freq=1,
)

### Train the model

In [53]:
# Train for EPOCHS epochs; each epoch makes one full pass over the training
# generator and validates on one full pass over the test generator.
fit_options = dict(
  steps_per_epoch=len(train_generator),
  epochs=EPOCHS,
  validation_data=test_generator,
  validation_steps=len(test_generator),
  callbacks=[tensorboard_callback],
)
model.fit(train_generator, **fit_options)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x327baff10>

### Evaluate the model on test set

In [54]:
# Evaluate over one full pass of the test generator; returns the loss and the accuracy metric.
evaluation_steps = len(test_generator)
test_loss, test_accuracy = model.evaluate(test_generator, steps=evaluation_steps)
print("Test Accuracy:", test_accuracy)

Test Accuracy: 0.9621211886405945


### Save the model for future use

In [55]:
# File name encodes the backbone, batch size, epoch count and the test accuracy
# in percent (rounded to 0 decimals, so it renders like "96.0").
backbone_tag = BASE_MODELS[select_model-1]
accuracy_tag = str(round(test_accuracy * 100, 0))
model_name = f"../models/xray_old_dataset_{backbone_tag}_{BATCH_SIZE}_{EPOCHS}_expert_acc_{accuracy_tag}.h5"
model.save(model_name)

### Calculate evaluation metrics

In [56]:
from sklearn.metrics import confusion_matrix, precision_score, f1_score, roc_auc_score
from tensorflow.keras.models import load_model

In [57]:
# Load the saved model back from the file written by model.save(model_name), so the
# metrics and inference cells below run on the persisted artifact.
loaded_model = load_model(model_name)

In [58]:
# Read the learning rate stored on the reloaded model's optimizer.
optimizer_lr = loaded_model.optimizer.learning_rate
learning_rate = optimizer_lr.numpy()
print(learning_rate)

0.001


In [60]:
# Materialise one full pass of the test generator so images and labels stay paired.
# The built-in next() is used instead of test_generator.next(): the iterator
# protocol is supported across Keras versions, while the .next() method is not
# available on newer Keras iterators.
test_batches = [next(test_generator) for _ in range(len(test_generator))]
test_data = np.concatenate([batch_data for batch_data, _ in test_batches])
test_labels = np.concatenate([batch_labels for _, batch_labels in test_batches])

# Make predictions on the test set
# Note: this rebinds `predictions`, which earlier held the model's output tensor.
predictions = loaded_model.predict(test_data)




In [61]:
# Turn predicted probabilities into hard 0/1 class labels (cut-off at 0.5).
predicted_labels = (predictions > 0.5).astype(int)

# Precision, F1 and the confusion matrix use the hard labels;
# AUC-ROC is computed from the raw probabilities.
precision = precision_score(test_labels, predicted_labels)
f1 = f1_score(test_labels, predicted_labels)
auc_roc = roc_auc_score(test_labels, predictions)
conf_matrix = confusion_matrix(test_labels, predicted_labels)

# Accuracy and loss are the values returned earlier by model.evaluate.
report_rows = [
  ("Accuracy:", test_accuracy),
  ("Loss:", test_loss),
  ("Precision:", precision),
  ("F1 Score:", f1),
  ("AUC-ROC Score:", auc_roc),
]
for caption, value in report_rows:
  print(caption, value)
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.9621211886405945
Loss: 0.11062055826187134
Precision: 0.9649390243902439
F1 Score: 0.9620060790273556
AUC-ROC Score: 0.9928719008264464
Confusion Matrix:
[[637  23]
 [ 27 633]]


In [81]:
## Inferencing with the new data
# NOTE(review): the faulty-image log from the dataset-building cell points at this same
# "../../dataset/xrays" tree, so some of these files may already have been part of the
# train/test data - confirm before treating the results as held-out.
# NOTE(review): the commented-out dataset builder read images with cv2.imread/cv2.resize,
# while load_img is used here; channel order and interpolation may differ - confirm.
from tensorflow.keras.preprocessing import image

dataset_path = "../../dataset/xrays"

non_covid_files = glob.glob(os.path.join(dataset_path, "Noncovid", "pneumonia", "*.jpg"))
covid_files = glob.glob(os.path.join(dataset_path, "Covid", "*.png"))
print(len(non_covid_files), len(covid_files))


def _predict_and_report(file_paths, true_label):
  """Predict a class for each image file and print it next to the expected label.

  Args:
    file_paths: Image file paths to load; may be empty.
    true_label: Expected class printed beside every prediction (1 = Covid, 0 = Noncovid).
  """
  if not file_paths:
    print("no images to predict for label", true_label)
    return
  # Load, resize and scale (1/255, as in the data generators) every image, then
  # predict the whole batch with one call instead of one predict() per image.
  batch = np.stack([
    image.img_to_array(image.load_img(path, target_size=IMAGE_SIZE)) / 255.0  # Normalize pixel values
    for path in file_paths
  ])
  for probability in loaded_model.predict(batch).ravel():
    print("predicted: ", round(probability), "real: ", true_label)


# Slices never raise IndexError when fewer files exist than requested
# (the earlier range(50) / [i+2000] indexing did).
_predict_and_report(non_covid_files[:50], 0)
_predict_and_report(covid_files[2000:2050], 1)


774 2126
predicted:  0 real:  0
predicted:  1 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  1 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
pr

In [76]:
# further inference in new dataset

dataset_path = "../../dataset/COVID-19_Radiography_Dataset/"
# Pick exactly one source folder. Despite its name, covid_files currently holds
# the "Viral Pneumonia" images, for which the expected label printed below is 0.
# covid_files = glob.glob(os.path.join(dataset_path, "COVID/images", "*.png"))
# covid_files = glob.glob(os.path.join(dataset_path, "Lung_Opacity/images", "*.png"))
covid_files = glob.glob(os.path.join(dataset_path, "Viral Pneumonia/images", "*.png"))
# covid_files = glob.glob(os.path.join(dataset_path, "Normal/images", "*.png"))
print(len(covid_files))

# Slicing tolerates folders with fewer than 50 images (range(50) indexing raised IndexError).
sample_files = covid_files[:50]
if sample_files:
  # Load, resize and scale (1/255, as in the data generators) every image, then
  # predict the whole batch with one call instead of one predict() per image.
  new_data = np.stack([
    image.img_to_array(image.load_img(path, target_size=IMAGE_SIZE)) / 255.0  # Normalize pixel values
    for path in sample_files
  ])
  y_pred = loaded_model.predict(new_data)
  for probability in y_pred.ravel():
    print("predicted: ", round(probability), "real: ", 0)
else:
  print("no images found under", dataset_path)

1345
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  1 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  1 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  1 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  1 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predicted:  0 real:  0
predic

In [82]:
# Print the layer-by-layer architecture (output shapes, parameter counts, connections) of the reloaded model.
loaded_model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_2 (InputLayer)        [(None, 224, 224, 3)]        0         []                            
                                                                                                  
 conv1_pad (ZeroPadding2D)   (None, 230, 230, 3)          0         ['input_2[0][0]']             
                                                                                                  
 conv1_conv (Conv2D)         (None, 112, 112, 64)         9472      ['conv1_pad[0][0]']           
                                                                                                  
 conv1_bn (BatchNormalizati  (None, 112, 112, 64)         256       ['conv1_conv[0][0]']          
 on)                                                                                        