### Import Libraries

In [1]:
import os
import glob
import random
import pickle

from tqdm import tqdm

import numpy as np

# OpenCV
import cv2

# Pandas
import pandas as pd

# Tenforflow
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.resnet import ResNet101
from tensorflow.keras.applications.densenet import DenseNet169
from tensorflow.keras.layers import Flatten

# Scikit-learn
from sklearn.decomposition import PCA

### Read image file paths and store to a list both for covid and non-covid

In [2]:
dataset_path = "../../dataset/xray/resized"

all_files = []

covid_files = glob.glob(os.path.join(dataset_path, "Covid", "*.png"))
covid_files.extend(glob.glob(os.path.join(dataset_path, "Covid", "*.jpg")))

non_covid_files = glob.glob(os.path.join(dataset_path, "Noncovid", "*.png"))
non_covid_files.extend(glob.glob(os.path.join(dataset_path, "Noncovid", "*.jpg")))

# Get all the files from the directory in a two element list.
# First element is list of file location to covid images and second element is list of file location to non-covid images.
all_files = [non_covid_files, covid_files]
print("Image Files Count\nNon-Covid: {}\nCovid: {}\nTotal: {}".format(len(non_covid_files), len(covid_files), len(all_files[0] + all_files[1])))

Image Files Count
Non-Covid: 2358
Covid: 2124
Total: 4482


### Define Data Size

In [3]:
non_covid_count = 2300
covid_count = 2100

### Function to read image from file list and store the corresponding label

In [4]:
def get_dataset(files, label, data_count, img_size, image_id_count):
  dataset = []  # List to hold all the dataset. Each element is a dictionary

  count = 1
  for j in tqdm(files):  # Loop over each file location
    data_dict = {}
    data_dict["id"] = image_id_count
    data_dict["filepath"] = j
    try:
      img = cv2.imread(j)
      img = cv2.resize(img, (img_size, img_size))
      data_dict["image"] = img
      data_dict["label"] = label
      dataset.append(data_dict)
      if count == data_count:
        break
      count += 1
      image_id_count += 1
    except Exception as e:
      print("faulty image: {} {}".format(j, e))
  return dataset, image_id_count

### Read image from the file and store the corresponding label in a list

In [5]:
c_dataset, nc_dataset, t_dataset = [], [], []
image_id_count = 1       # counter to record the ids of files. Each file has a unique ID.
img_size = 224
# all_files => [NC, C]
for i, data in enumerate(all_files): # only two loops for Covid and Non-Covid
  if i == 0:
    nc_dataset, image_id_count = get_dataset(data, i, non_covid_count, img_size, image_id_count)
  else:
    c_dataset, image_id_count = get_dataset(data, i, covid_count, img_size, image_id_count)
tot_dataset = nc_dataset + c_dataset
print("Dataset Count\nNon-Covid: {}\nCovid: {}\nTotal: {}".format(len(nc_dataset), len(c_dataset), len(tot_dataset)))

 97%|███████████████████████████████████████████████████████████████████████████  | 2299/2358 [00:04<00:00, 506.26it/s]
 99%|████████████████████████████████████████████████████████████████████████████ | 2099/2124 [00:07<00:00, 293.70it/s]

Dataset Count
Non-Covid: 2300
Covid: 2100
Total: 4400





### Extract image only from the dataset to send to DNN

In [6]:
image_only = []
for data in tot_dataset:
  image_only.append(data["image"])
image_only = np.array(image_only)

### Define batch size

In [7]:
batch_size = image_only.shape[0]
print(batch_size)

4400


### Generate batches of images to feed into DNN

In [8]:
img_datagen = ImageDataGenerator()
batch_img = img_datagen.flow(image_only, batch_size=batch_size, shuffle=False)

### Define a function to select a model from three (VGG16, ResNet101 and DenseNet169)

In [9]:
def all_models(img_size, model_sel):
  if model_sel == 1:
    vgg_pre_t = VGG16(input_shape=(img_size, img_size, 3), include_top=False, weights ='imagenet')
    return vgg_pre_t, 25088

  elif model_sel == 2:
    resnet_pre_t= ResNet101(input_shape=(img_size, img_size, 3), include_top=False, weights='imagenet')
    return resnet_pre_t, 100352

  elif model_sel == 3:
    densenet169_pre_t = DenseNet169(input_shape=(img_size, img_size, 3), include_top=False, weights ='imagenet' )
    return densenet169_pre_t, 81536

### Select model among 3

In [21]:
select_model = 3 # int(input("Enter the number for: \n 1) VGG16 \n 2) Resnet101  \n 3) Densenet169 "))

### Extract image feature from the selected DNN model

In [22]:
all_features, reduced_features = [], []
pca = PCA(n_components=batch_size)

all_feat = []
model, feature_size = all_models(img_size, select_model)
for data in tqdm(range(len(batch_img))):
  try:
    features = model.predict(batch_img[data]).flatten().reshape(batch_size, feature_size)
    feature_matrix = features.reshape(features.shape[0], -1)
    reduced_features = pca.fit_transform(feature_matrix)
  except:
    img_len = len(batch_img[data])
    features = model.predict(batch_img[data]).flatten().reshape(img_len, feature_size)
    feature_matrix = features.reshape(features.shape[0], -1)
    reduced_features = pca.fit_transform(feature_matrix)
  all_feat.extend(reduced_features)

  0%|                                                                                            | 0/1 [00:00<?, ?it/s]



100%|███████████████████████████████████████████████████████████████████████████████████| 1/1 [03:27<00:00, 207.70s/it]


### Replace image value by image features

In [23]:
for i in range(len(tot_dataset)):
  tot_dataset[i]['image'] = all_feat[i]

# print sample dataset covid and non-covid
print(len(tot_dataset))
print(tot_dataset[0])
print(tot_dataset[2300])

4400
{'id': 1, 'filepath': '../../dataset/xray/resized\\Noncovid\\NORMAL(0).png', 'image': array([-5.6118225e+02,  7.0265884e+01,  4.3234432e+01, ...,
        2.3570881e-11, -6.2149413e-12,  1.7588528e-11], dtype=float32), 'label': 0}
{'id': 2300, 'filepath': '../../dataset/xray/resized\\Covid\\071d06607edf81d70c940e043bce34_jumbo.png', 'image': array([-4.3964252e+01, -1.9030055e+02,  2.5658374e+02, ...,
        2.3236680e-11, -4.1524570e-12,  1.7088119e-11], dtype=float32), 'label': 1}


### Save extracted feature in pickle file for future use

In [24]:
filepath = "../../pickle_files/al/x_ray/"
if select_model == 1:
  filename = "x_ray_pca_vgg16.pickle"
elif select_model == 2:
  filename = "x_ray_pca_resnet101.pickle"
elif select_model == 3:
  filename = "x_ray_pca_densenet169.pickle"

file = filepath + filename
with open(file, 'wb') as handle:
  pickle.dump(tot_dataset, handle, protocol=pickle.HIGHEST_PROTOCOL)

# ----------------------------------------------------------------------------------