### Import Libraries

In [2]:
import numpy as np
import os
from random import shuffle
import random
from tqdm import tqdm
import pickle

# OpenCV
import cv2

# Pandas
import pandas as pd

# Tensorflow
from tensorflow.keras.applications.densenet import DenseNet169
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.resnet import ResNet101
from tensorflow.keras.preprocessing.image import ImageDataGenerator

### Read the text file that lists all image files

In [3]:
# Load the space-separated index file (it has no header row). Each row names an
# image file together with its class label and the corners of its crop box.
column_names = ['filename', 'label', 'xmin', 'ymin', 'xmax', 'ymax']
data = pd.read_csv(
    "../../dataset/ctscan/all_image_paths.txt",
    sep=" ",
    header=None,
)
data.columns = column_names
print(data)

                              filename  label  xmin  ymin  xmax  ymax
0                 NCP_96_1328_0032.png      2     9    94   512   405
1                 NCP_96_1328_0035.png      2    10   106   512   405
2                 NCP_96_1328_0036.png      2    10   105   512   406
3                 NCP_96_1328_0037.png      2    11   104   512   406
4                 NCP_96_1328_0038.png      2    11   103   512   406
...                                ...    ...   ...   ...   ...   ...
425019  COVIDCTMD-normal075-IM0155.png      0    18    56   496   429
425020  COVIDCTMD-normal075-IM0156.png      0    18    56   496   429
425021  COVIDCTMD-normal075-IM0157.png      0    18    56   496   429
425022  COVIDCTMD-normal075-IM0158.png      0    19    56   495   429
425023  COVIDCTMD-normal075-IM0159.png      0    19    56   495   429

[425024 rows x 6 columns]


### Collect the row ids of each class

In [20]:
id_count = 0  # not used by the cells shown here; kept so any later reference still resolves

# Compare the whole label column at once rather than doing one Series lookup per
# row; the index file has hundreds of thousands of rows.
labels = data["label"].to_numpy()

# np.flatnonzero yields 0-based row positions, i.e. the ids in range(len(data)).
nor_id = np.flatnonzero(labels == 0).tolist()  # Normal
pne_id = np.flatnonzero(labels == 1).tolist()  # Pneumonia
# Every label other than 0 or 1 counts as Covid.
cov_id = np.flatnonzero((labels != 0) & (labels != 1)).tolist()  # Covid

### Choose random files and make a balanced dataset

In [21]:
# Fix the seed so a fresh run draws the same subset.
random.seed(42)

# Shuffle each id list in place, in the order Normal, Pneumonia, Covid.
for class_ids in (nor_id, pne_id, cov_id):
  shuffle(class_ids)

# Balanced binary split: 5000 Covid against 2500 + 2500 non-Covid.
nor_select = nor_id[:2500] # Normal
pne_select = pne_id[:2500] # Pneumonia
cov_select = cov_id[:5000] # Covid

### Define Image Size

In [None]:
# Side length in pixels of the square images: get_dataset resizes every crop to
# (img_size, img_size) and all_models receives the same value as the network
# input size.
img_size = 224

### Define a function to compose the dataset

In [22]:
def get_dataset(files, cov_select, nor_select, pne_select):
  """Load, crop and resize the selected images into a list of records.

  The ids are processed in the order cov_select, nor_select, pne_select, and
  each record's "id" is its position in that combined order.

  Args:
    files: table indexed as files[column][row_id], with the columns
      "filename", "label", "xmin", "ymin", "xmax" and "ymax".
    cov_select: list of row ids for the Covid images.
    nor_select: list of row ids for the Normal images.
    pne_select: list of row ids for the Pneumonia images.

  Returns:
    A list of dicts with the keys "id", "filepath", "image" (the crop resized
    to (img_size, img_size), where img_size is the module-level setting) and
    "label" (0 = Non-Covid for source labels 0 and 1, 1 = Covid otherwise).

  Raises:
    FileNotFoundError: if an image cannot be read from disk.
  """
  # NOTE(review): the index file is read from "../../dataset/ctscan/" while the
  # images are read from "../dataset/ct_scan/..." - confirm both paths are right.
  image_dir = "../dataset/ct_scan/3A_images/"
  dataset = []  # One dictionary per image
  for count, i in enumerate(tqdm(cov_select + nor_select + pne_select)):
    filepath = os.path.join(image_dir, files["filename"][i])

    # cv2.imread signals failure by returning None instead of raising, so check
    # here rather than crashing on the crop below.
    img = cv2.imread(filepath)
    if img is None:
      raise FileNotFoundError("Image is missing or unreadable: " + filepath)

    x_min, y_min = files["xmin"][i], files["ymin"][i]
    x_max, y_max = files["xmax"][i], files["ymax"][i]
    cropped_img = img[y_min:y_max, x_min:x_max, :]

    source_label = files["label"][i]
    dataset.append({
      "id": count,
      "filepath": filepath,
      "image": cv2.resize(cropped_img, (img_size, img_size)),
      "label": 0 if source_label in (0, 1) else 1,  # 0 = Non-Covid, 1 = Covid
    })
  return dataset

In [24]:
# Build the list of image records for the selected Covid, Normal and Pneumonia rows.
dataset = get_dataset(data, cov_select, nor_select, pne_select)

### Extract only the images from the dataset to feed to the DNN

In [10]:
# Stack the image of every record into one array, in dataset order.
# The comprehension variable stays local, so the `data` DataFrame loaded earlier
# is not overwritten and the earlier cells remain re-runnable.
image_only = np.array([record["image"] for record in dataset])

### Define batch size

In [None]:
# Number of images per batch produced by the generator that feeds the DNN.
batch_size = 2000

### Generate batches of images to feed into DNN

In [11]:
# The generator is created with no arguments, i.e. no augmentation options are set.
img_datagen = ImageDataGenerator()
# shuffle=False keeps the batches in dataset order, so extracted features can be
# matched back to their records by position.
batch_img = img_datagen.flow(image_only, batch_size=batch_size, shuffle=False)

### Define a function to select one of three models (VGG16, ResNet101 and DenseNet169)

In [12]:
def all_models(img_size, model_sel):
  """Build the selected ImageNet-pretrained backbone without its classifier head.

  Args:
    img_size: side length of the square RGB input; the input shape is
      (img_size, img_size, 3).
    model_sel: 1 for VGG16, 2 for ResNet101, 3 for DenseNet169.

  Returns:
    A (model, feature_size) tuple, where feature_size is the number of values
    in one flattened output of the model.

  Raises:
    ValueError: if model_sel is not 1, 2 or 3.
  """
  # Reject unknown selectors up front instead of silently returning None.
  if model_sel not in (1, 2, 3):
    raise ValueError(
        "model_sel must be 1 (VGG16), 2 (ResNet101) or 3 (DenseNet169), got "
        + repr(model_sel))

  input_shape = (img_size, img_size, 3)
  if model_sel == 1:
    model = VGG16(input_shape=input_shape, include_top=False, weights='imagenet')
  elif model_sel == 2:
    model = ResNet101(input_shape=input_shape, include_top=False, weights='imagenet')
  else:
    model = DenseNet169(input_shape=input_shape, include_top=False, weights='imagenet')

  # Derive the flattened size from the model itself (dropping the batch axis) so
  # it stays correct for any img_size; at 224 this gives 25088 / 100352 / 81536.
  feature_size = int(np.prod(model.output_shape[1:]))
  return model, feature_size

### Select Model

In [None]:
# Backbone used for feature extraction: 1 = VGG16, 2 = ResNet101, 3 = DenseNet169.
select_model = 2 #int(input("Enter the number for: \n 1) VGGNET16 \n 2) Resnet101  \n 3) Densenet169 "))

### Extract image features with the selected DNN model

In [None]:
all_feat = []  # One flattened feature vector per image, in dataset order
model, feature_size = all_models(img_size, select_model)
for batch_idx in tqdm(range(len(batch_img))):
  # Index the generator once per batch instead of once per use.
  batch = batch_img[batch_idx]
  # Reshape by the actual batch length so a shorter final batch needs no special
  # case, and so real prediction errors surface instead of being swallowed.
  features = model.predict(batch).reshape(len(batch), feature_size)
  all_feat.extend(features)

### Replace each image with its extracted features

In [14]:
# Features are matched to records by position, so the counts must agree.
if len(all_feat) != len(dataset):
  raise ValueError(
      "Feature count " + str(len(all_feat))
      + " does not match dataset size " + str(len(dataset)))

# Build the feature dataset as new records so the originals keep their images:
# each copy holds the same keys with "image" replaced by the feature vector.
ft_dataset = [dict(record, image=feat) for record, feat in zip(dataset, all_feat)]

# print sample dataset covid and non-covid
print(ft_dataset[0])
print(ft_dataset[5001])

### Save the extracted features to a pickle file for future use

In [None]:
filepath = "../../pickle_files/al/ct_scan/"

# One output file per backbone, keyed by the model selector.
pickle_names = {
  1: "ct_scan_vggnet16.pickle",
  2: "ct_scan_resnet101.pickle",
  3: "ct_scan_densenet169.pickle",
}
# Fail with a clear message rather than a NameError on an undefined filename.
if select_model not in pickle_names:
  raise ValueError("select_model must be 1, 2 or 3, got " + repr(select_model))
filename = pickle_names[select_model]

# Create the target directory if needed so a long extraction run is not lost
# to a missing folder at the very last step.
os.makedirs(filepath, exist_ok=True)

file = os.path.join(filepath, filename)
with open(file, 'wb') as handle:
  pickle.dump(ft_dataset, handle, protocol=pickle.HIGHEST_PROTOCOL)