In [None]:
import glob
import numpy as np
from keras.preprocessing.image import load_img,img_to_array
import os
import matplotlib.pyplot as plt
import cv2
import pandas as pd
import seaborn as sns
import sklearn.metrics as metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from keras.applications.vgg16 import VGG16
from keras.layers import Flatten, Dense, Conv2D, MaxPooling2D, Input, Dropout, merge, UpSampling2D, Input
from keras.models import Model, Sequential
from keras.preprocessing.image import ImageDataGenerator
from keras.optimizers import Adam

In [None]:
# Notebook-wide configuration: the size every image is resized to and the
# dataset locations (all sub-paths are derived from DIR_NAME).
IMAGE_SIZE = (224, 224)
DIR_NAME = '/kaggle/input/covid-chest-xray/'
IMAGE_DIR = f'{DIR_NAME}images/'
ANNOTATIONS_DIR = f'{DIR_NAME}annotations/'

In [None]:
# Print the full path of every file found anywhere under the dataset root.
for root, _subdirs, names in os.walk(DIR_NAME):
    for name in names:
        print(os.path.join(root, name))

In [None]:
df = pd.read_csv(DIR_NAME + 'metadata.csv')
df.head()

In [None]:
df = df.sample(frac=1).reset_index(drop=True)
df.head()

In [None]:
df.shape

In [None]:
len(os.listdir(IMAGE_DIR))

In [None]:
df.columns

In [None]:
def count_na(df, col):
    """Return how many entries of column ``col`` in ``df`` are missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    col : hashable
        Label of the column to check.

    Returns
    -------
    The count of entries for which ``isna()`` is true.
    """
    missing_mask = df[col].isna()
    return missing_mask.sum()

In [None]:
count_na(df,'filename')

In [None]:
files = os.listdir(IMAGE_DIR)
files

In [None]:
check_file_present = []

In [None]:
for f in df['filename']:
    check_file_present.append(f in files)

In [None]:
len(check_file_present)

In [None]:
c = 0
for cfp in check_file_present:
    if cfp:
        c+=1
c, len(check_file_present)-c

In [None]:
df = df[check_file_present]
df.shape

In [None]:
df =df.drop(['patientid', 'offset', 'survival', 'intubated',
       'intubation_present', 'went_icu', 'in_icu', 'needed_supplemental_O2',
       'extubated', 'temperature', 'pO2_saturation', 'leukocyte_count',
       'neutrophil_count', 'lymphocyte_count', 'date', 'doi', 'license', 'other_notes', 'Unnamed: 28'], axis=1)

In [None]:
df.head()

In [None]:
df = df.drop(['url', 'folder'], axis=1)

In [None]:
df.head()

In [None]:
count_na(df,'finding')

In [None]:
df['finding'] = df['finding'].astype('category')

In [None]:
df['label'] = df['finding'].cat.codes

In [None]:
df.head(20)

In [None]:
finding_to_label = {}
label_to_finding = {}

In [None]:
for _,row in df.iterrows():
    finding_to_label[row['finding']] = row['label']
    label_to_finding[row['label']] = row['finding']

In [None]:
finding_to_label

In [None]:
label_to_finding

In [None]:
df = df.drop(['finding'], axis=1)

In [None]:
count_na(df,'sex')

In [None]:
df['view'] = df['view'].astype('category')
df['modality'] = df['modality'].astype('category')

In [None]:
df['view'] = df['view'].cat.codes
df['modality'] = df['modality'].cat.codes

In [None]:
df.head()

In [None]:
df['sex'] = df['sex'].fillna('M')
df['sex']

In [None]:
count_na(df,'sex')

In [None]:
df['sex'] = df['sex'].astype('category').cat.codes
df.head()

In [None]:
df = df.drop(['location'], axis=1)

In [None]:
df.head()

In [None]:
count_na(df,'clinical_notes')

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [None]:
df['clinical_notes'] = df['clinical_notes'].fillna("")

In [None]:
df['clinical_notes'].values

In [None]:
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(np.array(df['clinical_notes'].values))

In [None]:
integer_encoded.shape

In [None]:
# binary encode
onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
onehot_encoded

In [None]:
onehot_encoded.shape

In [None]:
# invert first example
inverted = label_encoder.inverse_transform([np.argmax(onehot_encoded[0, :])])
print(inverted)

In [None]:
X_clinical_notes = onehot_encoded
X_clinical_notes.shape

In [None]:
Y = df['label'].values
Y.shape

In [None]:
df = df.drop(['clinical_notes'], axis = 1)
df.head()

In [None]:
df = df.drop(['label'], axis = 1)
df.head()

In [None]:
X_sex = df['sex'].values
X_age = df['age'].values
X_view = df['view'].values
X_modality = df['modality'].values
X_sex.shape, X_age.shape, X_modality.shape, X_view.shape

In [None]:
df = df.drop(['sex', 'view', 'age', 'modality'], axis = 1)
df.head()

In [None]:
images = df['filename'].values

In [None]:
X_images = []

In [None]:
for i in range(0,len(images)):
    imgName = images[i]
    oriimg = cv2.imread(IMAGE_DIR+imgName)
    img = cv2.resize(oriimg, IMAGE_SIZE)
    print(i,img.shape)
    X_images.append(img)
X_images = np.array(X_images)
X_images.shape

In [None]:
X_sex = np.reshape(X_sex, (X_sex.shape[0],1))
X_age = np.reshape(X_age, (X_age.shape[0],1))
X_view = np.reshape(X_view, (X_view.shape[0],1))
X_modality = np.reshape(X_modality, (X_modality.shape[0],1))

In [None]:
X_images.shape,X_clinical_notes.shape,X_sex.shape,X_age.shape,X_view.shape,X_modality.shape,Y.shape

In [None]:
HEIGHT=224
WIDTH=224
CHANNEL=3

In [None]:
import math
split_num = math.ceil(len(X_images)*0.3)
split_num

In [None]:
X_non_image_data = np.concatenate((X_clinical_notes,X_sex,X_age,X_view,X_modality), axis=1)
X_non_image_data.shape

In [None]:
X_images_train = X_images[split_num:]
X_images_test = X_images[:split_num]
X_images_train.shape, X_images_test.shape

In [None]:
X_non_image_data_train = X_non_image_data[split_num:]
X_non_image_data_test = X_non_image_data[:split_num]
X_non_image_data_train.shape, X_non_image_data_test.shape

In [None]:
y_train = Y[split_num:]
y_test = Y[:split_num:]
y_train.shape, y_test.shape

In [None]:
#encoder
model = Sequential()
model.add(VGG16(weights="imagenet", include_top=False, input_shape=(HEIGHT, WIDTH, CHANNEL)))
model.add(Conv2D(16, (3, 3), activation='relu', padding='same'))
model.add(MaxPooling2D((2, 2), padding='same'))
model.add(Conv2D(8, (3, 3), activation='relu', padding='same'))
model.add(MaxPooling2D((2, 2), padding='same'))
model.add(Conv2D(8, (3, 3), activation='relu', padding='same'))
model.add(MaxPooling2D((2, 2), padding='same'))

In [None]:
#decoder
model.add(Conv2D(8, (3, 3), activation='relu', padding='same'))
model.add(UpSampling2D((2, 2)))
model.add(Conv2D(8, (3, 3), activation='relu', padding='same'))
model.add(UpSampling2D((2, 2)))
model.add(Conv2D(16, (3, 3), activation='relu'))
model.add(UpSampling2D((2, 2)))
model.add(Conv2D(3, (3, 3), activation='relu'))
model.add(UpSampling2D((4, 4)))
model.add(Conv2D(3, (3, 3), activation='relu', padding='same'))
model.add(UpSampling2D((4, 4)))
model.add(Conv2D(3, (3, 3), activation='relu', padding='same'))
model.add(UpSampling2D((7, 7)))
model.add(Conv2D(3, (3, 3), activation='sigmoid', padding='same'))
model.summary()

In [None]:
model.layers[0].trainable = False
model.summary()

In [None]:
model.compile(optimizer='adadelta', loss='mse')

In [None]:
model.fit(X_images_train,X_images_train, epochs=50,
                batch_size=35, validation_data=(X_images_test, X_images_test)) 

In [None]:
f = model.predict(X_images_train)

In [None]:
X_images_train1 = f
X_images_train1.shape

In [None]:
X_images_train.shape

In [None]:
X_images_test1 = model.predict(X_images_test)
X_images_test1.shape

In [None]:
X_images_test.shape

In [None]:
X_images_trainf = np.reshape(X_images_train1, (X_images_train1.shape[0], HEIGHT*WIDTH*CHANNEL))
X_images_trainf.shape

In [None]:
X_images_testf = np.reshape(X_images_test1, (X_images_test1.shape[0], HEIGHT*WIDTH*CHANNEL))
X_images_testf.shape

In [None]:
X_non_image_data_train.shape

In [None]:
X_non_image_data_test.shape

In [None]:
X_train = np.concatenate((X_non_image_data_train,X_images_trainf), axis=1)
X_train.shape

In [None]:
X_test = np.concatenate((X_non_image_data_test,X_images_testf), axis=1)
X_test.shape

In [None]:
y_train.shape

In [None]:
y_test.shape

In [None]:
np.savez_compressed('dataset.npz',X_train, X_test, y_train, y_test)

In [None]:
finding_to_label

In [None]:
label_to_finding