In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import cv2

In [None]:
from keras.models import Sequential
from keras import optimizers, utils, Input, preprocessing
from keras.utils import np_utils
from keras import datasets
from keras.layers import Conv2D, BatchNormalization, Dense, Activation, Dropout, MaxPooling2D, Flatten 
from keras.layers.experimental.preprocessing import Resizing

In [None]:
# Read the per-image metadata table; the first CSV column becomes the row index.
METADATA_CSV = '../input/coronahack-chest-xraydataset/Chest_xray_Corona_Metadata.csv'
df = pd.read_csv(METADATA_CSV, index_col=0)
print(df.shape)
df.head()

Have a look at the data structure.

In [None]:
# Column names first, then the distinct values found in each categorical column.
print(df.columns.tolist())
for _column in ('Label', 'Dataset_type', 'Label_2_Virus_category', 'Label_1_Virus_category'):
    print(df[_column].unique())

The column names, followed by the unique values found in each categorical column of the dataset.

In [None]:
# Report how many entries are missing in the filename column and in each label column.
for _caption, _column in (
    ('Number of NA values in X_ray_image_name = ', 'X_ray_image_name'),
    ('Number of NA values in Label = ', 'Label'),
    ('Number of NA values in Label_2 = ', 'Label_2_Virus_category'),
    ('Number of NA values in Label_1 = ', 'Label_1_Virus_category'),
):
    print(_caption, df[_column].isna().sum())

The number of NaN values in each column. 5841 of the 5910 `Label_2_Virus_category` entries are NaN!

In [None]:
# Class balance: overall, then within the TRAIN and TEST splits.
# Shares are multiplied by 100 so that the number printed in front of '%' really is
# a percentage (previously the raw fraction, e.g. 0.27, was printed next to '%').
label_counts = df['Label'].value_counts()
n_labelled = df['Label'].count()  # non-null labels only
for label_name in ('Normal', 'Pnemonia'):
    n_label = label_counts[label_name]
    print(f'Number of {label_name} values in Label = ', n_label, '-->',
          f'{100 * n_label / n_labelled:.1f}', '%')
print('Number of COVID_19 values in Label_2 = ', df.Label_2_Virus_category.value_counts()['COVID-19'])

for split_key, split_name in (('TRAIN', 'train'), ('TEST', 'test')):
    in_split = df['Dataset_type'] == split_key
    n_split = in_split.sum()  # number of rows belonging to this split
    for label_name in ('Normal', 'Pnemonia'):
        sub_df = df[in_split & (df['Label'] == label_name)]
        print(f'Number of {label_name} label in the {split_name} dataset = ', len(sub_df.index), '-->',
              f'{100 * len(sub_df.index) / n_split:.1f}', '%')
    print()

We can observe that there are many more samples labelled "Pnemonia" than "Normal", i.e. the classes are imbalanced.

In [None]:
def count_plot_percentage(df, col):
    """Draw a seaborn count plot of ``df[col]`` and label every bar with its share.

    Parameters
    ----------
    df : DataFrame holding the data to plot.
    col : name of the column whose values are counted.

    The share written on a bar is its height divided by the total number of
    rows in ``df[col]`` (``len``, so missing values are part of the
    denominator), formatted with one decimal. The figure is shown
    immediately; nothing is returned.
    """
    axis = sns.countplot(x = col, data = df)
    n_rows = len(df[col])
    for bar in axis.patches:
        bar_height = bar.get_height()
        bar_center = bar.get_x() + bar.get_width() / 2
        # Trailing newline in the text lifts the label above the top of the bar.
        text = f'{100 * bar_height / n_rows:.1f}%\n'
        axis.annotate(text, (bar_center, bar_height), ha='center', va='center')
    plt.tight_layout()
    plt.show()

In [None]:
# One annotated count plot per label column.
for _label_column in ('Label', 'Label_2_Virus_category', 'Label_1_Virus_category'):
    count_plot_percentage(df, _label_column)

In [None]:
# load train data
# Every file under the train directory that has a metadata row is read as a grayscale
# image, resized to 256x256 and labelled 0 ('Normal') or 1 (any other label).

train_dir = ('../input/coronahack-chest-xraydataset/Coronahack-Chest-XRay-Dataset/' +
             'Coronahack-Chest-XRay-Dataset/train/')

# Build the filename -> label lookup once instead of scanning the DataFrame for every file.
# Keeping the first row per filename matches the previous `row.iloc[0]` choice.
train_labels = (df.drop_duplicates('X_ray_image_name')
                  .set_index('X_ray_image_name')['Label']
                  .to_dict())

X_train = []
y_train = []
for dirname, _, filenames in os.walk(train_dir):
    for filename in filenames:
        if filename not in train_labels:
            continue  # no metadata row for this file
        # Join with the directory currently being walked so files in sub-folders resolve.
        im_path = os.path.join(dirname, filename)
        im = cv2.imread(im_path, cv2.IMREAD_GRAYSCALE)
        if im is None:
            # cv2.imread returns None (it does not raise) when a file cannot be decoded;
            # passing that on to cv2.resize would abort the whole loading loop.
            print('Skipping unreadable image:', im_path)
            continue
        X_train.append(cv2.resize(im, (256, 256)))
        y_train.append(0 if train_labels[filename] == 'Normal' else 1)

X_train = np.array(X_train)
y_train = np.array(y_train)

In [None]:
# Shuffle images and labels with the same permutation.
# A dedicated, seeded generator is used because the global NumPy seed is only set
# later in the notebook; without it the sample order (and therefore the tail of the
# data that `validation_split` holds out during fit) would differ on every run.
shuffle_rng = np.random.RandomState(123)
indices = shuffle_rng.permutation(X_train.shape[0])
X_train = X_train[indices]
y_train = y_train[indices]

Load the training data and shuffle it.

In [None]:
# load test data
# Every file under the test directory that has a metadata row is read as a grayscale
# image, resized to 256x256 and labelled 0 ('Normal') or 1 (any other label).

test_dir = ('../input/coronahack-chest-xraydataset/Coronahack-Chest-XRay-Dataset/' +
            'Coronahack-Chest-XRay-Dataset/test/')

# Build the filename -> label lookup once instead of scanning the DataFrame for every file.
# Keeping the first row per filename matches the previous `row.iloc[0]` choice.
test_labels = (df.drop_duplicates('X_ray_image_name')
                 .set_index('X_ray_image_name')['Label']
                 .to_dict())

X_test = []
y_test = []
for dirname, _, filenames in os.walk(test_dir):
    for filename in filenames:
        if filename not in test_labels:
            continue  # no metadata row for this file
        # Join with the directory currently being walked so files in sub-folders resolve.
        im_path = os.path.join(dirname, filename)
        im = cv2.imread(im_path, cv2.IMREAD_GRAYSCALE)
        if im is None:
            # cv2.imread returns None (it does not raise) when a file cannot be decoded;
            # passing that on to cv2.resize would abort the whole loading loop.
            print('Skipping unreadable image:', im_path)
            continue
        X_test.append(cv2.resize(im, (256, 256)))
        y_test.append(0 if test_labels[filename] == 'Normal' else 1)

X_test = np.array(X_test)
y_test = np.array(y_test)

Load the test data.

In [None]:
# Sanity check: array shapes right after loading.
for _caption, _array in (
    ('Shape of X_train = ', X_train),
    ('Shape of Y_train = ', y_train),
    ('Shape of X_test = ', X_test),
    ('Shape of Y_test = ', y_test),
):
    print(_caption, _array.shape)

In [None]:
%matplotlib inline
# view some images from the train set
# Top row: the first 8 images labelled 0; bottom row: the first 8 images labelled 1.
f, axes = plt.subplots(2, 8, figsize=(16, 6))
for row_idx, class_id in enumerate((0, 1)):
    class_images = X_train[y_train == class_id]
    # zip stops early if a class has fewer than 8 images instead of raising IndexError.
    for ax, image in zip(axes[row_idx], class_images):
        # The arrays are single-channel, so draw them in gray (as the prediction preview
        # does) rather than with matplotlib's default false-color map.
        ax.imshow(image, cmap='gray')
        ax.set_title(str(class_id))

for ax in axes.ravel():
    ax.set_axis_off()
plt.show()

In [None]:
# Add an explicit single-channel axis to the images and one-hot encode the labels.
# NOTE(review): pixel values are passed on unscaled here; confirm that is intended.
IMAGE_SHAPE = (256, 256, 1)
NUM_CLASSES = 2

X_train = X_train.reshape((-1,) + IMAGE_SHAPE)
X_test = X_test.reshape((-1,) + IMAGE_SHAPE)
input_dim = X_train.shape[1:]  # per-sample shape handed to the model's Input layer

y_train = utils.to_categorical(y_train, NUM_CLASSES)
y_test = utils.to_categorical(y_test, NUM_CLASSES)

In [None]:
# Sanity check: array shapes after adding the channel axis and one-hot encoding.
for _caption, _array in (
    ('Shape of X_train = ', X_train),
    ('Shape of Y_train = ', y_train),
    ('Shape of X_test = ', X_test),
    ('Shape of Y_test = ', y_test),
):
    print(_caption, _array.shape)

In [None]:
# The model
# Six convolutional stages (two 3x3 convolutions followed by 2x2 max-pooling each),
# then a fully connected head ending in a 2-way softmax.
model = Sequential([
    Input(shape=input_dim),
    # Stage 1 - 16 filters, no padding argument
    Conv2D(16, kernel_size=(3, 3), activation="relu"),
    Conv2D(16, kernel_size=(3, 3), activation="relu"),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.4),
    # Stage 2 - 32 filters, no padding argument
    Conv2D(32, kernel_size=(3, 3), activation="relu"),
    Conv2D(32, kernel_size=(3, 3), activation="relu"),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.4),
    # Stage 3 - 64 filters, "same" padding from here on
    Conv2D(64, kernel_size=(3, 3), padding="same", activation="relu"),
    Conv2D(64, kernel_size=(3, 3), padding="same", activation="relu"),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.4),
    # Stage 4 - 64 filters, lighter dropout
    Conv2D(64, kernel_size=(3, 3), padding="same", activation="relu"),
    Conv2D(64, kernel_size=(3, 3), padding="same", activation="relu"),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.2),
    # Stage 5 - 128 then 256 filters, no dropout
    Conv2D(128, kernel_size=(3, 3), padding="same", activation="relu"),
    Conv2D(256, kernel_size=(3, 3), padding="same", activation="relu"),
    MaxPooling2D(pool_size=(2, 2)),
    # Stage 6 - 256 then 512 filters, no dropout
    Conv2D(256, kernel_size=(3, 3), padding="same", activation="relu"),
    Conv2D(512, kernel_size=(3, 3), padding="same", activation="relu"),
    MaxPooling2D(pool_size=(2, 2)),
    # Classifier head
    Flatten(),
    Dense(1024, activation="relu"),
    Dropout(0.3),
    Dense(256, activation="relu"),
    Dropout(0.3),
    Dense(2, activation="softmax"),
])

model.summary()

In [None]:
# Fix NumPy's global seed, then compile with Adam (learning rate 1e-4) on
# categorical cross-entropy, tracking accuracy.
np.random.seed(123)
opt = optimizers.Adam(learning_rate=1e-4)
model.compile(
    optimizer=opt,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Class 0 is given weight 4 and class 1 weight 1 in the training loss.
class_weight = {0: 4., 1: 1.}
# 5% of the training arrays are held out for validation by Keras.
fit_options = dict(
    epochs=200,
    batch_size=32,
    validation_split=0.05,
    class_weight=class_weight,
    verbose=1,
)
history = model.fit(X_train, y_train, **fit_options)

In [None]:
import time

# Save the trained model under a timestamped name in the current working directory.
# The previous hard-coded 'C:/Users/ferie/...' location only exists on one Windows
# machine, so saving failed anywhere else (e.g. on Kaggle).
timestr = time.strftime("%Y%m%d_%H%M%S")
filename = os.path.join(os.getcwd(), 'corona_hack_' + timestr + '_my_model.hdf5')
print(filename)
model.save(filename)

Save the model.

In [None]:
# list all data in history
print(history.history.keys())
# One figure per metric, each with the training curve and the held-out curve.
# The 'val_*' series come from the validation split used during fit, not from the
# test set, so the legend says 'validation' (it previously said 'test').
for metric in ('accuracy', 'loss'):
    plt.plot(history.history[metric])
    plt.plot(history.history['val_' + metric])
    plt.title('model ' + metric)
    plt.ylabel(metric)
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()

In [None]:
# Prediction
# The predicted class of each test image is the index of its largest softmax output.
class_probabilities = model.predict(X_test)
y_pred = class_probabilities.argmax(axis=-1)

In [None]:
# Show some predictions
# Ten randomly chosen test images in a 2x5 grid, each titled with its predicted class.
f, axes = plt.subplots(2, 5, figsize=(10, 4))
indices = np.arange(len(y_pred))
np.random.shuffle(indices)
# axes.ravel() walks the grid row by row, i.e. in the same order as (i // 5, i % 5).
for panel, ind in zip(axes.ravel(), indices[:10]):
    label = int(y_pred[ind])
    img = X_test[ind].reshape((256,256))
    panel.imshow(img, cmap='gray')
    panel.set_title(label)

for panel in axes.ravel():
    panel.set_axis_off()
plt.show()

In [None]:
# Prediction score
# evaluate() returns the loss first, followed by the metrics given at compile time.
score = model.evaluate(X_test, y_test, verbose=0)
for _caption, _value in zip(("Test loss:", "Test accuracy:"), score):
    print(_caption, _value)

In [None]:
# Categorical accuracy
# Turn the one-hot test labels back into class indices and compare them with the predictions.
orig_y_test = y_test.argmax(axis=1)
n_correct = np.count_nonzero(orig_y_test == y_pred)
accuracy = n_correct / len(orig_y_test)
print('Categorical accuracy = ', accuracy)

In [None]:
# Deeper analysis of the results of predictions
# For each class: the share of test images predicted as that class next to the
# share that actually belongs to it.
n_predictions = len(y_pred)
for class_id in (0, 1):
    predicted_share = np.sum(y_pred == class_id) / n_predictions
    actual_share = np.sum(orig_y_test == class_id) / n_predictions
    print(predicted_share, 'should be ', actual_share)