#           CoronaHack - fastai

About this Notebook:

Using the CoronaHack chest X-ray dataset plus some additional COVID-19 X-ray images, we train several models with fastai to predict COVID-19 from X-ray images.

[View Writeup Here](https://drive.google.com/file/d/1kFZkyYo3IgvvJKnicMRSeNoLcODIrBOi/view?usp=sharing)

## Importing necessary libraries

In [None]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
from mlxtend.plotting import plot_confusion_matrix
import matplotlib.pyplot as plt

import pandas as pd
import shutil
import os
import imageio
import sys
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from PIL import Image, ImageOps
import scipy.ndimage as ndi

from fastai.vision import *
from fastai.metrics import error_rate

In [None]:
import os

# Print the contents of the dataset root and of the nested image folder.
for listing_path in (
    "../input/coronahack-chest-xraydataset/",
    "../input/coronahack-chest-xraydataset/Coronahack-Chest-XRay-Dataset/Coronahack-Chest-XRay-Dataset/",
):
    print(os.listdir(listing_path))

In [None]:
# Create the working tree /kaggle/corona_check/{train,test}/{Normal,COVID19}.
# os.makedirs(..., exist_ok=True) creates missing parent folders and does nothing
# when the folder already exists, so re-running this cell no longer raises
# FileExistsError the way repeated os.mkdir calls did.
for split_name in ("train", "test"):
    for label_name in ("Normal", "COVID19"):
        os.makedirs(os.path.join("/kaggle/corona_check", split_name, label_name), exist_ok=True)


In [None]:
# Show what is currently inside the Normal folder of each split.
for normal_dir in ("/kaggle/corona_check/train/Normal", "/kaggle/corona_check/test/Normal"):
    print(os.listdir(normal_dir))

In [None]:
!cd /kaggle/corona_check/train/
!rm /kaggle/corona_check/train/*.jpeg

!cd /kaggle/corona_check/test/
!rm /kaggle/corona_check/test/*.jpeg

In [None]:
def copy_img(src_path, dst_path):
    """Copy one file and report the outcome as a status string instead of raising.

    Args:
        src_path: Path of the file to copy.
        dst_path: Destination file path or directory, passed straight to shutil.copy.

    Returns:
        'File Copied' on success, 'Copy Failed - IO Error' when the copy raises
        OSError (IOError is the same class in Python 3), or
        'Copy Failed - other Error' followed by the exception's repr for any
        other exception.
    """
    try:
        shutil.copy(src_path, dst_path)
    except OSError:
        print('Unable to copy file {} to {}'
              .format(src_path, dst_path))
        return 'Copy Failed - IO Error'
    except Exception as exc:
        # `except Exception` (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
        print('When try copy file {} to {}, unexpected error: {}'
              .format(src_path, dst_path, sys.exc_info()))
        # repr() is required: concatenating the exc_info tuple to a str raised
        # TypeError inside this handler and hid the original error.
        return 'Copy Failed - other Error' + repr(exc)
    return 'File Copied'

In [None]:
# Kaggle input locations: the dataset root, and the nested folder below it.
metadata_path = "../input/coronahack-chest-xraydataset/"
data_dir = metadata_path + "Coronahack-Chest-XRay-Dataset/Coronahack-Chest-XRay-Dataset/"

In [None]:
# Source folders of the two dataset splits, both directly under data_dir.
train_dir, test_dir = (os.path.join(data_dir, split_name) for split_name in ('train', 'test'))

In [None]:
# Read the metadata CSV from the dataset root and preview its first rows.
metadata_csv = os.path.join(metadata_path, 'Chest_xray_Corona_Metadata.csv')
meta_data = pd.read_csv(metadata_csv)
meta_data.head()

In [None]:
# Source folder (with trailing slash) for each row according to its split;
# rows whose Dataset_type is neither TRAIN nor TEST get an empty string.
split_to_folder = {'TRAIN': train_dir + '/', 'TEST': test_dir + '/'}
meta_data['File_path'] = meta_data['Dataset_type'].map(split_to_folder).fillna('')

In [None]:
# Full source path of every image: split folder + image file name.
meta_data['X_ray_img_nm_path']=meta_data['File_path']+meta_data['X_ray_image_name']

**Import Metadata information**

In [None]:
# Preview the metadata again, now including the File_path and X_ray_img_nm_path columns.
meta_data.head()

## Preparing the dataset 

### The dataset contains a variety of labels; for ease of classification we will first categorise the images as Normal / Healthy or COVID

In [None]:
# Binary problem: keep the healthy ("Normal") images plus the pneumonia images whose
# virus category is COVID-19. `&` binds tighter than `|`, so the original one-line
# filter already grouped this way; the named masks make the grouping explicit.
# ('Pnemonia' is the spelling used in the metadata and must stay as is.)
is_normal = meta_data['Label'] == 'Normal'
is_covid = (meta_data['Label'] == 'Pnemonia') & (meta_data['Label_2_Virus_category'] == 'COVID-19')
keep_row = is_normal | is_covid

meta_COVID_19_train = meta_data[(meta_data['Dataset_type'] == 'TRAIN') & keep_row]
meta_COVID_19_test = meta_data[(meta_data['Dataset_type'] == 'TEST') & keep_row]


## Moving 12 randomly chosen COVID-19 images to the test set

# A fixed random_state makes the hold-out selection identical on every run.
meta_data_covid_test = meta_data[meta_data['Label_2_Virus_category'] == 'COVID-19'].sample(12, random_state=42)
# .copy() makes the training frame independent of meta_data, so the column
# assignments in later cells do not act on a slice (avoids SettingWithCopyWarning).
meta_COVID_19_train = meta_COVID_19_train[
    ~meta_COVID_19_train['X_ray_image_name'].isin(meta_data_covid_test['X_ray_image_name'])
].copy()
meta_COVID_19_test_fnl = pd.concat([meta_data_covid_test, meta_COVID_19_test], ignore_index=False)

In [None]:
# After filtering, every remaining 'Pnemonia' row is a COVID-19 case: rename the label in both splits.
for split_frame in (meta_COVID_19_train, meta_COVID_19_test_fnl):
    pneumonia_rows = split_frame['Label'] == 'Pnemonia'
    split_frame.loc[pneumonia_rows, 'Label'] = 'COVID19'

In [None]:
# Row count per label for each split, printed under a banner.
for banner, split_frame in (
    ("===============Train Set==========================\n", meta_COVID_19_train),
    ("\n===============Test Set==========================\n", meta_COVID_19_test_fnl),
):
    print(banner)
    print(split_frame.groupby(['Label']).agg({'Dataset_type': 'count'}))

In [None]:
# Start every row's copy target at the root folder of its split.
for split_frame, target_root in (
    (meta_COVID_19_train, "/kaggle/corona_check/train/"),
    (meta_COVID_19_test_fnl, "/kaggle/corona_check/test/"),
):
    split_frame['Img_tgt_path'] = target_root

In [None]:
# Append the label sub-folder ('Normal/' or 'COVID19/') to each row's target path.
# Rows whose path already ends with their label folder are skipped, so re-running
# this cell does not produce paths such as '.../Normal/Normal/'.
for split_frame in (meta_COVID_19_train, meta_COVID_19_test_fnl):
    for label_name in ('Normal', 'COVID19'):
        label_dir = label_name + '/'
        needs_suffix = (split_frame['Label'] == label_name) & ~split_frame['Img_tgt_path'].str.endswith(label_dir, na=False)
        split_frame.loc[needs_suffix, 'Img_tgt_path'] = split_frame.loc[needs_suffix, 'Img_tgt_path'] + label_dir

> ## Copying the files to separate paths for COVID19 classification

In [None]:
# Copy every selected image to its target folder and record the status string
# returned by copy_img for each row.
# A plain comprehension calls copy_img exactly once per row. np.vectorize without
# `otypes` makes an extra call on the first row to infer the output type, and it
# raises on an empty frame.
for split_frame in (meta_COVID_19_train, meta_COVID_19_test_fnl):
    split_frame['Move_status'] = [
        copy_img(src_file, target_dir)
        for src_file, target_dir in zip(split_frame['X_ray_img_nm_path'], split_frame['Img_tgt_path'])
    ]

## Adding additional COVID XRAYS

In [None]:
# Extra COVID-19 X-rays are read from xray_dir as 1.jpeg ... 72.jpeg.
xray_dir = "../input/covidxray/"
N_EXTRA_XRAYS = 72   # number of extra images (file names 1.jpeg .. 72.jpeg)
N_EXTRA_TRAIN = 50   # the first 50 go to training, the rest to test

xray_names = [str(i) + ".jpeg" for i in range(1, N_EXTRA_XRAYS + 1)]  # name of each picture

data = {'name': xray_names}

xraydf = pd.DataFrame(data)
xraydf['from'] = xray_dir + xraydf['name']  # add the input folder to each picture name to get location of each picture
# Build the whole destination column in one assignment. The previous chained form
# `xraydf['to'].iloc[...] = ...` triggers SettingWithCopyWarning and leaves the
# column unchanged when pandas copy-on-write is active.
xraydf['to'] = [
    ("/kaggle/corona_check/train/COVID19/" if position < N_EXTRA_TRAIN else "/kaggle/corona_check/test/COVID19/") + file_name
    for position, file_name in enumerate(xraydf['name'])
]

# One copy_img call per row (np.vectorize without `otypes` calls it twice for the first row).
xraydf['move_status'] = [copy_img(src_file, dst_file) for src_file, dst_file in zip(xraydf['from'], xraydf['to'])]

# Exploratory Data Analysis

In [None]:
# Folder layout used by the exploratory plots: <dirname>/<split>/<label>/.
dirname = '/kaggle/corona_check/'
train_path = os.path.join(dirname, 'train/')
train_nrml_pth = os.path.join(train_path, 'Normal/')
train_covid19_pth = os.path.join(train_path, 'COVID19/')

test_path = os.path.join(dirname, 'test/')
# The test label folders must be built from test_path; they previously reused
# train_path and therefore pointed at the training images.
test_nrml_pth = os.path.join(test_path, 'Normal/')
test_covid19_pth = os.path.join(test_path, 'COVID19/')

In [None]:
def plot_imgs(item_dir, num_imgs=25):
    """Show up to `num_imgs` images from `item_dir` in a 5-column grid.

    Args:
        item_dir: Folder whose files are displayed, in os.listdir order.
        num_imgs: Maximum number of images to show (default 25).

    The grid always has at least 5 rows, so up to 25 images are laid out exactly
    as a 5x5 grid on a 10x10 figure. Extra rows (and figure height) are added
    when more than 25 images are requested; a fixed 5x5 grid raised ValueError
    at the 26th subplot.
    """
    n_cols = 5
    item_files = [os.path.join(item_dir, file) for file in os.listdir(item_dir)][:num_imgs]
    # Ceiling division without importing math.
    n_rows = max(5, (len(item_files) + n_cols - 1) // n_cols)

    plt.figure(figsize=(10, 2 * n_rows))
    for idx, img_path in enumerate(item_files):
        plt.subplot(n_rows, n_cols, idx + 1)

        img = plt.imread(img_path)
        plt.imshow(img)

    plt.tight_layout()

In [None]:
# Grid of images from the training Normal folder (default limit of plot_imgs).
plot_imgs(train_nrml_pth)

In [None]:
# Grid of images from the training COVID19 folder (default limit of plot_imgs).
plot_imgs(train_covid19_pth)

## Histogram with Matplotlib

In [None]:
def plot_img_hist(item_dir, num_img=6):
    """For up to `num_img` images in `item_dir`, draw the image next to a histogram of its pixel values.

    Args:
        item_dir: Folder whose files are plotted, in os.listdir order.
        num_img: Maximum number of images to plot (default 6).

    One figure is created per image, numbered by the image's position in the
    listing. Each figure uses slots 1 and 2 of a 2x2 grid: the image, and a
    256-bin histogram of all its values flattened.
    """
    all_item_dirs = os.listdir(item_dir)
    item_files = [os.path.join(item_dir, file) for file in all_item_dirs][:num_img]

    for idx, img_path in enumerate(item_files):
        fig1 = plt.figure(idx, figsize=(10, 10))
        fig1.add_subplot(2, 2, 1)
        img = mpimg.imread(img_path)
        plt.imshow(img)
        fig1.add_subplot(2, 2, 2)
        plt.hist(img.ravel(), bins=256, fc='k', ec='k')
        # Tighten every figure. A single call after the loop only affected the
        # last figure, and created an empty figure when the folder had no files.
        fig1.tight_layout()

In [None]:
# Image and pixel-value histogram for up to 3 files from the training Normal folder.
plot_img_hist(train_nrml_pth,3)

In [None]:
# Image and pixel-value histogram for up to 3 files from the training COVID19 folder.
plot_img_hist(train_covid19_pth,3)

In [None]:
def plot_img_hist_ndi(item_dir, num_img=6):
    """For up to `num_img` images in `item_dir`, plot the image, its histogram and its CDF.

    Args:
        item_dir: Folder whose files are plotted, in os.listdir order.
        num_img: Maximum number of images to plot (default 6).

    The histogram is computed with scipy.ndimage.histogram using 256 bins over
    [0, 255]. The CDF is the cumulative sum of that histogram divided by its
    total. One figure is created per image, numbered by its position in the
    listing, using the first three slots of a 2x3 grid.
    """
    all_item_dirs = os.listdir(item_dir)
    item_files = [os.path.join(item_dir, file) for file in all_item_dirs][:num_img]

    for idx, img_path in enumerate(item_files):
        im = imageio.imread(img_path)
        hist = ndi.histogram(im, min=0, max=255, bins=256)
        cdf = hist.cumsum() / hist.sum()

        fig1 = plt.figure(idx, figsize=(10, 10))
        fig1.add_subplot(2, 3, 1)
        img = mpimg.imread(img_path)
        plt.title("No. {}".format(idx))
        plt.imshow(img)
        fig1.add_subplot(2, 3, 2)
        plt.title("Histogram")
        plt.plot(hist)
        fig1.add_subplot(2, 3, 3)
        plt.title("CDF")
        plt.plot(cdf)
        # Tighten every figure. A single call after the loop only affected the
        # last figure, and created an empty figure when the folder had no files.
        fig1.tight_layout()

In [None]:
# Image, histogram and CDF for up to 2 files from the training Normal folder.
plot_img_hist_ndi(train_nrml_pth,2)

In [None]:
# Image, histogram and CDF for up to 2 files from the training COVID19 folder.
plot_img_hist_ndi(train_covid19_pth,2)

# fastai
We use the fastai API to train four models: ResNet-34, ResNet-101, VGG16-BN and VGG19-BN.

In [None]:
# Build the fastai data bunch from /kaggle/corona_check/: images at size 224, batch size 64,
# augmented with get_transforms() defaults and normalised with imagenet_stats.
# NOTE(review): valid='test' names the 'test' folder as the validation set, so the
# metrics shown during training are computed on the test images — confirm this is intended.
fastdata = ImageDataBunch.from_folder('/kaggle/corona_check/', ds_tfms=get_transforms(), size=224, bs=64, valid = 'test').normalize(imagenet_stats)
# Display a 3-row sample of the batch as a visual sanity check.
fastdata.show_batch(rows=3, figsize=(7, 8))

## RESNET

First we initialize the models and make sure they are using the GPU

In [None]:
# One learner per ResNet depth, both on the same data bunch, reporting error_rate.
learn34 = cnn_learner(fastdata, models.resnet34, metrics=error_rate)
learn101 = cnn_learner(fastdata, models.resnet101, metrics=error_rate)


In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU so the
# notebook still runs in a CPU-only session instead of targeting a missing device.
# NOTE(review): the data bunch and the ResNet learners are created in earlier cells;
# confirm whether setting defaults.device at this point has any effect on them.
defaults.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Displayed as the cell output: whether the cuDNN backend is enabled.
torch.backends.cudnn.enabled

By default, only the fully connected layers at the top are unfrozen

In [None]:
# First training stage for ResNet-34: 4 one-cycle epochs with the learner as created (before unfreeze()).
learn34.fit_one_cycle(4)

In [None]:
# First training stage for ResNet-101: 4 one-cycle epochs with the learner as created (before unfreeze()).
learn101.fit_one_cycle(4)

Now we try and find the optimal learning rates for the other layers

In [None]:
# Unfreeze ResNet-34, run the learning-rate finder, and plot its recorded results.
learn34.unfreeze() # must be done before calling lr_find
learn34.lr_find()
learn34.recorder.plot()

In [None]:
# Unfreeze ResNet-101, run the learning-rate finder, and plot its recorded results.
learn101.unfreeze() # must be done before calling lr_find
learn101.lr_find()
learn101.recorder.plot()

In [None]:
# Fine-tune the unfrozen ResNet-34 for 4 more epochs with max_lr given as the range 1e-5..1e-4.
# NOTE(review): presumably chosen from the lr_find plot above — confirm.
learn34.fit_one_cycle(4, max_lr=slice(1e-5,1e-4))

In [None]:
# Fine-tune the unfrozen ResNet-101 for 4 more epochs with max_lr given as the range 1e-5..1e-4.
# NOTE(review): presumably chosen from the lr_find plot above — confirm.
learn101.fit_one_cycle(4, max_lr=slice(1e-5,1e-4))

You can save the learned model in save states

In [None]:
# Save both ResNet learners under the given checkpoint names.
learn34.save('covid-detection-resnet-34')
learn101.save('covid-detection-resnet-101')

You can show the images with the highest loss, and choose whether to delete them or not.

In [None]:
from fastai.widgets import *

In [None]:
# Get the dataset and indices from the top losses of ResNet-34, then open the
# interactive ImageCleaner widget on them.
# NOTE(review): the widget depends on manual interaction; any deletions made here are not reproducible from code.
ds, idxs = DatasetFormatter().from_toplosses(learn34)
ImageCleaner(ds, idxs, '/kaggle/corona_check/')

In [None]:
# Same review step for ResNet-101; note that this overwrites the ds/idxs from the previous cell.
# NOTE(review): the widget depends on manual interaction; any deletions made here are not reproducible from code.
ds, idxs = DatasetFormatter().from_toplosses(learn101)
ImageCleaner(ds, idxs, '/kaggle/corona_check/')

In [None]:
# Build the interpretation object for ResNet-34 and plot its confusion matrix.
interp34 = ClassificationInterpretation.from_learner(learn34)
interp34.plot_confusion_matrix()

In [None]:
# Build the interpretation object for ResNet-101 and plot its confusion matrix.
interp101 = ClassificationInterpretation.from_learner(learn101)
interp101.plot_confusion_matrix()

Show the images with the highest loss (most of these seem to be pictures of babies?)

In [None]:
# The 9 highest-loss images for ResNet-34.
interp34.plot_top_losses(9, figsize=(15,15))

In [None]:
# The 9 highest-loss images for ResNet-101.
interp101.plot_top_losses(9, figsize=(15,15))

## vgg

In [None]:
# One learner per VGG variant (batch-norm versions), both on the same data bunch, reporting error_rate.
learn16 = cnn_learner(fastdata, models.vgg16_bn, metrics=error_rate)
learn19 = cnn_learner(fastdata, models.vgg19_bn, metrics=error_rate)

In [None]:
# First training stage for VGG16-BN: 4 one-cycle epochs with the learner as created (before unfreeze()).
learn16.fit_one_cycle(4)

In [None]:
# First training stage for VGG19-BN: 4 one-cycle epochs with the learner as created (before unfreeze()).
learn19.fit_one_cycle(4)

In [None]:
# Unfreeze VGG16-BN, run the learning-rate finder, and plot its recorded results.
learn16.unfreeze() # must be done before calling lr_find
learn16.lr_find()
learn16.recorder.plot()

In [None]:
# Unfreeze VGG19-BN, run the learning-rate finder, and plot its recorded results.
learn19.unfreeze() # must be done before calling lr_find
learn19.lr_find()
learn19.recorder.plot()

In [None]:
# Fine-tune the unfrozen VGG16-BN for 4 more epochs with max_lr given as the range 1e-4..1e-3.
# NOTE(review): this range is 10x higher than the one used for the ResNets; presumably chosen from the lr_find plot — confirm.
learn16.fit_one_cycle(4, max_lr=slice(1e-4,1e-3))

In [None]:
# Fine-tune the unfrozen VGG19-BN for 4 more epochs with max_lr given as the range 1e-4..1e-3.
# NOTE(review): this range is 10x higher than the one used for the ResNets; presumably chosen from the lr_find plot — confirm.
learn19.fit_one_cycle(4, max_lr=slice(1e-4,1e-3))

In [None]:
# Save the VGG16-BN learner under the given checkpoint name.
learn16.save('covid-detection-vgg-16')

In [None]:
# Save the VGG19-BN learner under the given checkpoint name.
learn19.save('covid-detection-vgg-19')

In [None]:
# Build the interpretation object for VGG16-BN and plot its confusion matrix.
interp16 = ClassificationInterpretation.from_learner(learn16)
interp16.plot_confusion_matrix()

In [None]:
# Build the interpretation object for VGG19-BN and plot its confusion matrix.
interp19 = ClassificationInterpretation.from_learner(learn19)
interp19.plot_confusion_matrix()

In [None]:
# The 9 highest-loss images for VGG16-BN.
interp16.plot_top_losses(9, figsize=(15,15))

In [None]:
# The 9 highest-loss images for VGG19-BN.
interp19.plot_top_losses(9, figsize=(15,15))