### Import Libraries

In [1]:
import os
import numpy as np
from random import shuffle
from tqdm import tqdm
import pickle
import glob

import cv2
# Pandas
import pandas as pd

from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications.densenet import DenseNet169
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.resnet import ResNet101


### For traning speed, define DATASET_SIZE

In [2]:
dataset_path = "../../dataset/ctscan/3A_images_resized/all"

all_files = []

for i in os.listdir(dataset_path):  # Get all the files from the directory in a two element list. First element is list of file location to covid images and second element is list of file location to non-covid images.
  print(i)
  file1 = glob.glob(os.path.join(dataset_path,i, "*.png"))
  file2 = glob.glob(os.path.join(dataset_path,i, "*.jpg")) # .jpg files are also present.
  file1.extend(file2)  # Only extends when there is .jpg file present
  all_files.append(file1)

Covid19
Normal
Pneumonia


In [3]:
count = 0 # Count to record the ids of files. Each file has a unique ID.
img_size = 224
def get_dataset(files, label, count):
  dataset = []  # List to hold all the dataset. Each element is a dictionary

  for j in tqdm(files):  # Loop over each file location
    data_dict = {}
    data_dict["id"] = count
    data_dict["filepath"] = j
    img = cv2.imread(j)
    img = cv2.resize(img,(img_size,img_size))
    data_dict["image"] = img
    data_dict["label"] = label
    count += 1
    dataset.append(data_dict)
  return dataset, count

In [36]:
neg_dataset, pos_dataset, t_dataset = [], [], []

for i, data in enumerate(all_files[:2]): # 0 - covid, 1 - noncovid, 2 - Phnumonia(exclude)
  print(i)
  if i == 0:
    dataset, count = get_dataset(data, 1, count)
    neg_dataset = dataset
  else:
    dataset, count = get_dataset(data, 0 ,count)
    pos_dataset = dataset
t_dataset = neg_dataset[:2200] + pos_dataset[:2200]

0


100%|█████████████████████████████████████████████████████████████████████████████| 6334/6334 [00:43<00:00, 144.54it/s]


1


100%|█████████████████████████████████████████████████████████████████████████████| 6332/6332 [00:25<00:00, 244.92it/s]


In [37]:
batch_size = 2000

In [38]:
image_only, label_only, id_only, img_name = [], [], [], []
for data in t_dataset:
  image_only.append(data["image"])
  label_only.append(data["label"])
  id_only.append(data['id'])
  img_name.append(data["filepath"].split("/")[-1])
image_only = np.array(image_only)

In [39]:
img_datagen = ImageDataGenerator()
batch_img= img_datagen.flow(image_only, batch_size=batch_size, shuffle=False)

In [40]:
def all_models(img_size, model_sel):
  if model_sel == 1:
    vgg_pre_t = VGG16(input_shape = (img_size, img_size, 3),include_top = False, weights ='imagenet')
    return vgg_pre_t, 25088

  elif model_sel==2:
    resnet_pre_t= ResNet101(input_shape = (img_size, img_size, 3),include_top=False, weights='imagenet')
    return resnet_pre_t, 100352

  elif model_sel==3:
    densenet169_pre_t = DenseNet169(input_shape = (img_size, img_size, 3),include_top = False, weights ='imagenet' )
    return densenet169_pre_t, 81536

In [54]:
select_model = 1 # int(input("Enter the number for: \n 1) VGG16 \n 2) Resnet101  \n 3) Densenet161 "))

In [55]:
all_fea = []
model, feature_size = all_models(img_size, select_model)
for data in tqdm(range(len(batch_img))):
  try:
    features = model.predict(batch_img[data]).flatten().reshape(batch_size, feature_size)
  except:
    img_len = len(batch_img[data])
    features = model.predict(batch_img[data]).flatten().reshape(img_len, feature_size)
  all_fea.extend(features)

  0%|                                                                                            | 0/3 [00:00<?, ?it/s]



 33%|████████████████████████████                                                        | 1/3 [00:03<00:07,  3.82s/it]



 67%|████████████████████████████████████████████████████████                            | 2/3 [00:07<00:03,  3.70s/it]



100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:09<00:00,  3.11s/it]


In [56]:
for i in range(len(t_dataset)):
  t_dataset[i]['image']= all_fea[i]

In [57]:
filepath = "../../pickle_files/al/ct_scan/"
if select_model == 1:
  filename = f"ct_scan_resized_{len(t_dataset)}_vgg16.pickle"
elif select_model == 2:
  filename = f"ct_scan_resized_{len(t_dataset)}_resnet101.pickle"
elif select_model == 3:
  filename = f"ct_scan_resized_{len(t_dataset)}_densenet169.pickle"

file = filepath + filename
with open(file, 'wb') as handle:
  pickle.dump(t_dataset, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [58]:
print("Total Dataset: {}".format(len(t_dataset)))
print("Sample feature dataset Covid: {}".format(t_dataset[0]))
print("Sample feature dataset Non-Covid: {}".format(t_dataset[2200]))

Total Dataset: 4400
Sample feature dataset Covid: {'id': 12666, 'filepath': '../../dataset/ctscan/3A_images_resized/all\\Covid19\\NCP_1013_2577_0000.png', 'image': array([0.      , 0.      , 0.      , ..., 0.      , 9.833735, 0.      ],
      dtype=float32), 'label': 1}
Sample feature dataset Non-Covid: {'id': 19000, 'filepath': '../../dataset/ctscan/3A_images_resized/all\\Normal\\Normal_1671_793_0000.png', 'image': array([ 0.     ,  0.     , 22.64142, ...,  0.     ,  0.     ,  0.     ],
      dtype=float32), 'label': 0}
