### Import Libraries

In [1]:
import numpy as np
import os
from random import shuffle
import random
from tqdm import tqdm
import pickle
import glob

# OpenCV
import cv2

# Pandas
import pandas as pd

# Tensorflow
from tensorflow.keras.applications.densenet import DenseNet169
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.resnet import ResNet101
from tensorflow.keras.preprocessing.image import ImageDataGenerator

### Read image file paths and store to a list both for covid and non-covid

In [18]:
dataset_path = "../../dataset/xray/resized"

all_files = []

covid_files = glob.glob(os.path.join(dataset_path, "Covid", "*.png"))
covid_files.extend(glob.glob(os.path.join(dataset_path, "Covid", "*.jpg")))

non_covid_files = glob.glob(os.path.join(dataset_path, "Noncovid", "*.png"))
non_covid_files.extend(glob.glob(os.path.join(dataset_path, "Noncovid", "*.jpg")))

# Get all the files from the directory in a two element list. First element is list of file location to covid images and second element is list of file location to non-covid images.
all_files = [covid_files, non_covid_files]
print("Image Files Count\nCovid: {}\nNon-Covid: {}\nTotal: {}".format(len(covid_files), len(non_covid_files), len(all_files[0] + all_files[1])))

Image Files Count
Covid: 2124
Non-Covid: 2358
Total: 4482


### Function to read image from file list and store the corresponding label

In [19]:
def get_dataset(files, label, count, img_size):
  dataset = []  # List to hold all the dataset. Each element is a dictionary

  for j in tqdm(files):  # Loop over each file location
    data_dict = {}
    data_dict["id"] = count
    data_dict["filepath"] = j
    try:
      img = cv2.imread(j)
      img = cv2.resize(img, (img_size, img_size))
      data_dict["image"] = img
      data_dict["label"] = label
      count += 1
      dataset.append(data_dict)
    except Exception as e:
      print("faulty image: {} {}".format(j, e))
  return dataset, count

### Read image from the file and store the corresponding label in a list

In [20]:
c_dataset, nc_dataset, t_dataset = [], [], []
labels = [1, 0] # 1 = Covid, 0 = Noncovid
count = 0       # Count to record the ids of files. Each file has a unique ID.
img_size = 224
# all_files => [NC, C]
for i, data in enumerate(all_files): # only two loops for Covid and Non-Covid
  dataset, count = get_dataset(data, labels[i], count, img_size)
  if labels[i] == 1:
    c_dataset = dataset
  else:
    nc_dataset = dataset
tot_dataset = c_dataset + nc_dataset
print("Dataset Count\nCovid: {}\nNon-Covid: {}\nTotal: {}".format(len(c_dataset), len(nc_dataset), len(tot_dataset)))

100%|█████████████████████████████████████████████████████████████████████████████| 2124/2124 [00:06<00:00, 308.21it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 2358/2358 [00:39<00:00, 59.15it/s]

Dataset Count
Covid: 2124
Non-Covid: 2358
Total: 4482





### Extract image only from the dataset to send to DNN

In [21]:
image_only = []
for data in tot_dataset:
  image_only.append(data["image"])
image_only = np.array(image_only)

### Define batch size

In [53]:
batch_size = image_only.shape[0]
print(batch_size)

4482


### Generate batches of images to feed into DNN

In [46]:
img_datagen = ImageDataGenerator()
batch_img = img_datagen.flow(image_only, batch_size=batch_size, shuffle=False)

### Define a function to select a model from three (VGG16, ResNet101 and DenseNet169)

In [47]:
def all_models(img_size, model_sel):
  if model_sel == 1:
    vgg_pre_t = VGG16(input_shape=(img_size, img_size, 3), include_top=False, weights ='imagenet')
    return vgg_pre_t, 25088

  elif model_sel == 2:
    resnet_pre_t= ResNet101(input_shape=(img_size, img_size, 3), include_top=False, weights='imagenet')
    return resnet_pre_t, 100352

  elif model_sel == 3:
    densenet169_pre_t = DenseNet169(input_shape=(img_size, img_size, 3), include_top=False, weights ='imagenet' )
    return densenet169_pre_t, 81536

### Select model among 3

In [48]:
select_model = 1 # int(input("Enter the number for: \n 1) VGGNET16 \n 2) Resnet101  \n 3) Densenet169 "))

### Extract image feature from the selected DNN model

In [49]:
from sklearn.decomposition import PCA

all_features, reduced_features = [], []
pca = PCA(n_components=batch_size)

all_feat = []
model, feature_size = all_models(img_size, select_model)
for data in tqdm(range(len(batch_img))):
  try:
    features = model.predict(batch_img[data]).flatten().reshape(batch_size, feature_size)
    feature_matrix = features.reshape(features.shape[0], -1)
    reduced_features = pca.fit_transform(feature_matrix)
  except:
    img_len = len(batch_img[data])
    features = model.predict(batch_img[data]).flatten().reshape(img_len, feature_size)
    feature_matrix = features.reshape(features.shape[0], -1)
    reduced_features = pca.fit_transform(feature_matrix)
  all_feat.extend(reduced_features)

  0%|                                                                                            | 0/1 [00:00<?, ?it/s]



100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [01:22<00:00, 82.25s/it]


### Replace image value by image features

In [50]:
for i in range(len(tot_dataset)):
  tot_dataset[i]['image'] = all_feat[i]

# print sample dataset covid and non-covid
print(dataset[0])
print(dataset[2125])

{'id': 2124, 'filepath': '../../dataset/xray/resized\\Noncovid\\NORMAL(0).png', 'image': array([-3.5238689e+01,  2.8131027e+02, -9.3958908e+01, ...,
        7.2593986e-11,  1.7949255e-10,  7.5264801e-11], dtype=float32), 'label': 0}
{'id': 4249, 'filepath': '../../dataset/xray/resized\\Noncovid\\PNEUMONIA(587).jpg', 'image': array([ 1.6487152e+02, -2.3274074e+02, -3.2769183e+02, ...,
        7.2500873e-11,  1.7953106e-10,  7.4546008e-11], dtype=float32), 'label': 0}


### Save extracted feature in pickle file for future use

In [51]:
filepath = "../../pickle_files/al/x_ray/"
if select_model == 1:
  filename = "x_ray_pca_vgg16.pickle"
elif select_model == 2:
  filename = "x_ray_resnet101.pickle"
elif select_model == 3:
  filename = "x_ray_densenet169.pickle"

file = filepath + filename
with open(file, 'wb') as handle:
  pickle.dump(tot_dataset, handle, protocol=pickle.HIGHEST_PROTOCOL)