In [None]:
!pip install -q efficientnet >> /dev/null

# Data Preprocessing

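Convert the DICOM X-ray images to JPEG, then save the resulting image dataset together with a merged CSV that combines the study-level and image-level IDs. The target image size is 512x512; note that the conversion cell below currently writes 2048x2048 JPEGs, which are resized to 512x512 later when the TFRecords are created.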

In [None]:
!conda install gdcm -c conda-forge -y

In [None]:
import os
from PIL import Image
import pandas as pd
from tqdm.auto import tqdm
import numpy as np
import pydicom
from pydicom.pixel_data_handlers.util import apply_voi_lut
import pandas as pd


In [None]:
import numpy as np 
import pandas as pd 
import os, shutil
from glob import glob
from sklearn.cluster import KMeans
from tqdm.notebook import tqdm
from sklearn.preprocessing import LabelEncoder
import random
tqdm.pandas()

In [None]:
# Load the image-level and study-level label tables of the competition.
image_level = pd.read_csv("../input/siim-covid19-detection/train_image_level.csv")
study_level = pd.read_csv("../input/siim-covid19-detection/train_study_level.csv")

# Strip the "_study" suffix so the study ids can be joined against the
# StudyInstanceUID column of the image-level table; both become "StudyID".
study_level = (
    study_level
    .assign(id=study_level['id'].str.replace('_study', ''))
    .rename(columns={"id": "StudyID"})
)
image_level = image_level.rename(columns={"StudyInstanceUID": "StudyID"})

# One row per image, carrying the study-level columns of its study.
merged_df = image_level.merge(study_level, on=['StudyID'])
merged_df = (
    merged_df
    .assign(id=merged_df['id'].str.replace('_image', ''))
    .rename(columns={"id": "ImageID"})
)

In [None]:
def read_xray(path, voi_lut = True, fix_monochrome = True):
    """Read a DICOM file and return its pixels as an 8-bit array.

    Original from: https://www.kaggle.com/raddar/convert-dicom-to-np-array-the-correct-way

    Args:
        path: Path of the DICOM file to read.
        voi_lut: When True, pass the pixel data through ``apply_voi_lut``
            (the VOI LUT, if the device provided one, gives a
            "human-friendly" view of the raw data).
        fix_monochrome: When True, invert images whose
            PhotometricInterpretation is "MONOCHROME1".

    Returns:
        A ``np.uint8`` array rescaled so the minimum maps to 0 and the
        maximum to 255. A constant image yields an all-zero array.
    """
    # dcmread is the current name of the reader; read_file is a deprecated alias.
    dicom = pydicom.dcmread(path)

    if voi_lut:
        data = apply_voi_lut(dicom.pixel_array, dicom)
    else:
        data = dicom.pixel_array

    # depending on this value, X-ray may look inverted - fix that:
    if fix_monochrome and dicom.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data

    data = data - np.min(data)
    peak = np.max(data)
    # Guard against a constant image: dividing by a zero peak would produce
    # NaNs that turn into garbage when cast to uint8.
    if peak > 0:
        data = data / peak
    data = (data * 255).astype(np.uint8)

    return data

def resize(array, size, keep_ratio=False, resample=Image.LANCZOS):
    """Turn a pixel array into a PIL image of (at most) ``size`` x ``size``.

    Original from: https://www.kaggle.com/xhlulu/vinbigdata-process-and-resize-to-image

    Args:
        array: Pixel array accepted by ``Image.fromarray``.
        size: Edge length, in pixels, of the square target box.
        keep_ratio: When True the image is shrunk with ``thumbnail`` (aspect
            ratio kept); otherwise it is resized to exactly ``size`` x ``size``.
        resample: Resampling filter handed to PIL.

    Returns:
        The resulting PIL image.
    """
    target = (size, size)
    picture = Image.fromarray(array)

    if not keep_ratio:
        return picture.resize(target, resample)

    # thumbnail() works in place and returns None, so hand back the image itself.
    picture.thumbnail(target, resample)
    return picture

In [None]:
# Convert every DICOM of both splits to a JPEG under /kaggle/tmp/<split>/ and
# record, in merged_df, where each image was written and which split it is in.
# NOTE(review): images are written at 2048x2048 here although the notebook text
# talks about 512x512 - confirm which size is intended.
for split in ['test', 'train']:
    save_dir = f'/kaggle/tmp/{split}/'

    os.makedirs(save_dir, exist_ok=True)

    for dirname, _, filenames in tqdm(os.walk(f'../input/siim-covid19-detection/{split}')):
        for file in filenames:
            # merged_df['ImageID'] holds ids without a file extension, so the
            # lookup must use the file stem; comparing against "xxx.dcm" never
            # matches and leaves ImagePath/Split empty.
            image_id = os.path.splitext(file)[0]
            jpg_path = os.path.join(save_dir, image_id + '.jpg')

            # set keep_ratio=True to have original aspect ratio
            xray = read_xray(os.path.join(dirname, file))
            im = resize(xray, size=2048)
            im.save(jpg_path)

            is_this_image = merged_df['ImageID'] == image_id
            merged_df.loc[is_this_image, "ImagePath"] = jpg_path
            merged_df.loc[is_this_image, "Split"] = split

In [None]:
import os
from tqdm import tqdm
# Walk the test DICOM folder and collect, for every file, its split name,
# its image id (file name without ".dcm") and the relative JPEG path.
splitarr, imageid, imagepath = [], [], []
for split in ['test']:
    save_dir = f'/kaggle/tmp/{split}/'
    os.makedirs(save_dir, exist_ok=True)

    dicom_root = f'../input/siim-covid19-detection/{split}'
    for dirname, _, filenames in tqdm(os.walk(dicom_root)):
        splitarr += [split] * len(filenames)
        imageid += [name.replace('.dcm','') for name in filenames]
        imagepath += [os.path.join('./test', name.replace('dcm', 'jpg')) for name in filenames]

In [None]:
import pandas as pd
# Empty table with the three columns that the next cell fills from the
# lists collected while walking the test folder.
testdf = pd.DataFrame(columns=['ImageID','Image','Split'])


In [None]:
# Fill the three columns from the lists gathered while walking the test folder.
testdf = testdf.assign(ImageID=imageid, Image=imagepath, Split=splitarr)



In [None]:
# Preview the first rows of the test-image table.
testdf.head()

In [None]:
# Shape of the rows marked as belonging to the train split.
# `splitdf` is not defined anywhere in this notebook; merged_df is the frame
# that receives the "Split" column in the conversion loop, so query it instead.
merged_df[merged_df.Split=='train'].shape

In [None]:
# Persist the merged label table. The index is written as well, which is why
# the TFRecord section drops an 'Unnamed: 0' column after reading this file back.
merged_df.to_csv("/kaggle/working/merged.csv")

In [None]:
%%time
!tar -zcf dataset.tar.gz -C "/kaggle/tmp/" .

# TF Records creation

In [None]:
# One [height, width] entry per fold; print each entry with its position.
IMG_SIZES = [[512, 512]]*5
for position, (first_dim, second_dim) in enumerate(IMG_SIZES):
    print(first_dim, second_dim, position)

In [None]:
import tarfile
import cv2

# Unpack the JPEG dataset produced by the preprocessing section into the
# working directory. The context manager closes the archive handle, which the
# bare open()/extractall() pair left open.
# NOTE(review): extractall() trusts the member paths inside the archive; that is
# fine for a self-produced dataset, but do not point this at untrusted tarballs.
with tarfile.open('../input/datasetcreation/dataset.tar.gz') as openedfile:
    openedfile.extractall()

In [None]:
import os
import pandas as pd
# List the extracted JPEGs once per split; the image id is the file name
# without its ".jpg" extension.
test_files = os.listdir('./test')
train_files = os.listdir('./train')

imageIdtest = [name.replace('.jpg','') for name in test_files]
imagePathtest = [os.path.join('./test', name) for name in test_files]
imageIdtrain = [name.replace('.jpg','') for name in train_files]
imagePathtrain = [os.path.join('./train', name) for name in train_files]
splittest = ['test'] * len(test_files)
splittrain = ['train'] * len(train_files)

# One row per image: id, path of the JPEG and the split it belongs to.
frame_columns = ['ImageID','ImagePath','Split']
testdf = pd.DataFrame(
    {'ImageID': imageIdtest, 'ImagePath': imagePathtest, 'Split': splittest},
    columns=frame_columns,
)
traindf = pd.DataFrame(
    {'ImageID': imageIdtrain, 'ImagePath': imagePathtrain, 'Split': splittrain},
    columns=frame_columns,
)

In [None]:
# Reload the merged label table and discard the columns that are rebuilt in
# this section (ImagePath, Split) plus the index column saved with the CSV.
mergeddf = (
    pd.read_csv('../input/datasetcreation/merged.csv')
    .drop(columns=['ImagePath', 'Split', 'Unnamed: 0'])
)

Merge the two DataFrames on `ImageID` to build the final training DataFrame, so that every labelled row also carries its `ImagePath`.

In [None]:
# Attach the labels to the extracted train images (inner join on ImageID).
finaltrain = traindf.merge(mergeddf, on=['ImageID'])

In [None]:
# Save the merged training table (paths + labels) next to the notebook.
finaltrain.to_csv('./finaltrain.csv')

In [None]:
# Collapse the four one-hot study-label columns into a single label name.
def normalise_row(row):
    """Return the name of the study-level label that is set to 1 in ``row``.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) holding the columns
            'Negative for Pneumonia', 'Typical Appearance',
            'Atypical Appearance' and 'Indeterminate Appearance'.

    Returns:
        The name of the first of those columns whose value equals 1.

    Raises:
        ValueError: If none of the four columns equals 1. (The previous
            fall-through returned an undefined name and died with NameError.)
    """
    # Same priority order as the original if/elif chain.
    label_columns = (
        'Negative for Pneumonia',
        'Typical Appearance',
        'Atypical Appearance',
        'Indeterminate Appearance',
    )
    for column in label_columns:
        if row[column] == 1:
            return column
    raise ValueError('row has no study-level label set to 1')

# Row-wise: turn the one-hot label columns into one label-name column.
finaltrain['classLabel'] = finaltrain.apply(normalise_row, axis=1)

In [None]:
# Integer class id for each label name; the position in the list is the id.
classmap = {
    label: class_id
    for class_id, label in enumerate([
        'Negative for Pneumonia',
        'Indeterminate Appearance',
        'Atypical Appearance',
        'Typical Appearance',
    ])
}
finaltrain['class'] = finaltrain['classLabel'].map(classmap)

In [None]:
# Label name <-> integer id lookups used when displaying samples.
class_names = [
    'Typical Appearance',
    'Indeterminate Appearance',
    'Atypical Appearance',
    'Negative for Pneumonia',
]
name2label = dict(zip(class_names, (3, 1, 2, 0)))
label2name = dict((label, name) for name, label in name2label.items())

In [None]:
# Preview how many rows land in the train / validation part of each fold.
# The splitter is created here so that this cell runs on a fresh kernel;
# the fold-assignment cell below builds an identical GroupKFold(n_splits=5).
from sklearn.model_selection import GroupKFold
gkf = GroupKFold(n_splits=5)
for fold, (train_idx, val_idx) in enumerate(gkf.split(finaltrain, groups=finaltrain.StudyID.tolist())):
    print(len(train_idx), len(val_idx))

In [None]:
# GroupKFold.split yields (train, validation) positions for each of the 5 folds,
# keeping all images of one StudyID in the same fold. Every row is tagged with the
# fold in which it is a validation sample, so k-fold cross validation can later
# select its validation rows through the 'fold' column.
#https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GroupKFold.html
#https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.loc.html
from sklearn.model_selection import GroupKFold
gkf = GroupKFold(n_splits=5)
finaltrain['fold'] = -1
for fold, (train_idx, val_idx) in enumerate(gkf.split(finaltrain, groups=finaltrain.StudyID.tolist())):
    # split() returns positions, while .loc expects labels: translate through
    # the index so this stays correct even if the index is not 0..n-1.
    finaltrain.loc[finaltrain.index[val_idx],'fold'] = fold
finaltrain.head()

In [None]:
import tensorflow as tf

def _bytes_feature(value):
    """Wrap a string / byte value in a ``tf.train.Feature`` holding a bytes_list."""
    eager_tensor_type = type(tf.constant(0))
    if isinstance(value, eager_tensor_type):
        # BytesList won't unpack a string from an EagerTensor, so take its numpy value.
        value = value.numpy()
    payload = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=payload)

def _float_feature(value):
    """Wrap a float / double in a ``tf.train.Feature`` holding a float_list."""
    payload = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=payload)

def _int64_feature(value):
    """Wrap a bool / enum / int / uint in a ``tf.train.Feature`` holding an int64_list."""
    payload = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=payload)

In [None]:
def train_example(imageid,image,group,target):
    """Build the ``tf.train.Example`` stored for one training image.

    Args:
        imageid: Image id, stored as bytes under 'image_id'.
        image: Encoded image, stored as bytes under 'image'.
        group: Group id, stored as bytes under 'group'.
        target: Integer class, stored as int64 under 'target'.

    Returns:
        The assembled ``tf.train.Example``.
    """
    features = tf.train.Features(feature={
        'image_id': _bytes_feature(imageid),
        'image'   : _bytes_feature(image),
        'group'   : _bytes_feature(group),
        'target'  : _int64_feature(target),
    })
    return tf.train.Example(features=features)

In [None]:
DIM=512
import matplotlib.pyplot as plt
import cv2
def load_image(path, dim=DIM, ch=3):
    """Read an image with OpenCV and make it ``dim`` x ``dim``.

    Args:
        path: Path of the image file.
        dim: Target edge length; ``-1`` keeps the size found on disk.
        ch: ``None`` reads the file as grayscale, any other value as colour.

    Returns:
        The image array as returned by OpenCV, resized with INTER_AREA when
        its height/width differ from ``(dim, dim)``.

    Raises:
        FileNotFoundError: If OpenCV cannot read ``path``. ``cv2.imread``
            returns None in that case, which previously surfaced as an
            AttributeError on ``img.shape``.
    """
    read_flag = cv2.IMREAD_GRAYSCALE if ch is None else cv2.IMREAD_COLOR
    img = cv2.imread(path, read_flag)
    if img is None:
        raise FileNotFoundError(f'Could not read image: {path}')
    if dim != -1 and img.shape[:2] != (dim, dim):
        img = cv2.resize(img, dsize=(dim,dim), interpolation=cv2.INTER_AREA)
    return img

In [None]:
from tqdm import tqdm
import numpy as np

# Write one TFRecord file per fold; the file name carries the fold number and
# the number of images it contains.
jpeg_params = (cv2.IMWRITE_JPEG_QUALITY, 96)
folds = finaltrain.fold.unique().tolist()
for fold in tqdm(folds):
    fold_df = finaltrain[finaltrain.fold==fold]
    record_name = 'train%.2i-%i.tfrec'%(fold,fold_df.shape[0])
    with tf.io.TFRecordWriter(record_name) as writer:
        for position in range(fold_df.shape[0]):
            row = fold_df.iloc[position,:]
            pixels = load_image(row['ImagePath'], dim=DIM)
            # Re-encode the (resized) image as JPEG bytes for storage.
            jpeg_bytes = cv2.imencode('.jpg', pixels, jpeg_params)[1].tobytes()
            example = train_example(
                str.encode(row['ImageID']),
                jpeg_bytes,
                str.encode(row['StudyID']),
                np.array(row['class'], dtype=np.uint8),
            )
            writer.write(example.SerializeToString())

In [None]:
def test_example(imageid,image):
    """Build the ``tf.train.Example`` stored for one (unlabelled) test image.

    Args:
        imageid: Image id, stored as bytes under 'image_id'.
        image: Encoded image, stored as bytes under 'image'.

    Returns:
        The assembled ``tf.train.Example``.
    """
    features = tf.train.Features(feature={
        'image_id':_bytes_feature(imageid),
        'image':_bytes_feature(image),
    })
    return tf.train.Example(features=features)

In [None]:
# Split the test table into 10 consecutive chunks and write one TFRecord file
# per chunk; the file name carries the chunk number and its image count.
folds = 10
chunk_size = int(np.ceil(testdf.shape[0]/folds))
jpeg_params = (cv2.IMWRITE_JPEG_QUALITY, 96)
for fold in tqdm(range(folds)):
    start, stop = chunk_size*fold, chunk_size*(fold+1)
    fold_df = testdf.iloc[start:stop]
    record_name = 'test%.2i-%i.tfrec'%(fold,fold_df.shape[0])
    with tf.io.TFRecordWriter(record_name) as writer:
        for position in range(fold_df.shape[0]):
            row = fold_df.iloc[position,:]
            imageid = row['ImageID']
            pixels = load_image(row['ImagePath'], dim=DIM)
            jpeg_bytes = cv2.imencode('.jpg', pixels, jpeg_params)[1].tobytes()
            example = test_example(str.encode(imageid), jpeg_bytes)
            writer.write(example.SerializeToString())

In [None]:
# Preview the training table. `train` is not defined anywhere in this notebook;
# finaltrain is the frame the train TFRecords were written from.
finaltrain.head()

In [None]:
import pandas as pd, numpy as np, random,os, shutil
from glob import glob
from kaggle_datasets import KaggleDatasets
import tensorflow as tf, re, math
import tensorflow.keras.backend as K
#import efficientnet.tfkeras as efn
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
import tensorflow_addons as tfa
print('tf:',tf.__version__)
import math

In [None]:
# Resolve the GCS location of the TFRecord dataset and list its train / test
# record files in sorted order.
GCS_PATH = KaggleDatasets().get_gcs_path('siimcovid19tfrecords')
files_train, files_test = (
    np.sort(np.array(tf.io.gfile.glob(GCS_PATH + '/' + prefix + '*.tfrec')))
    for prefix in ('train', 'test')
)

In [None]:
import re, math
def decode_image(image_data):
    """Decode the encoded image bytes of one record into a float image tensor.

    The TFRecords in this notebook store JPEG bytes (they are written with
    ``cv2.imencode('.jpg', ...)``), so the JPEG decoder is used here rather
    than the PNG one.

    Args:
        image_data: Scalar string tensor holding the encoded image.

    Returns:
        A float32 tensor of shape ``[*IMAGE_SIZE, 3]`` with values in [0, 1].
    """
    image = tf.image.decode_jpeg(image_data, channels=3)
    image = tf.cast(image, tf.float32) / 255.0  # convert image to floats in [0, 1] range
    image = tf.reshape(image, [*IMAGE_SIZE, 3]) # explicit size needed for TPU
    return image
def prepare_target(target):
    """Cast the stored target to float32 and give it shape ``[1]``."""
    return tf.reshape(tf.cast(target, tf.float32), [1])

def read_labeled_tfrecord(example):
    """Parse one serialized record into an ``(image, target)`` pair.

    Only the 'image' (bytestring) and 'target' (single int64) features are read.

    Args:
        example: Serialized ``tf.train.Example``.

    Returns:
        Tuple of the decoded image reshaped to ``[DIM, DIM, 3]`` and the
        target prepared by ``prepare_target``.
    """
    feature_spec = {
        "image" : tf.io.FixedLenFeature([], tf.string), # tf.string means bytestring
        "target": tf.io.FixedLenFeature([], tf.int64),  # shape [] means single element
    }
    parsed = tf.io.parse_single_example(example, feature_spec)
    image = tf.reshape(decode_image(parsed['image']), [DIM, DIM, 3])
    return image, prepare_target(parsed['target'])

def load_dataset(fileids, labeled=True, ordered=False):
    """Build a dataset of ``(image, target)`` pairs from TFRecord files.

    Args:
        fileids: TFRecord file names to read.
        labeled: Accepted for interface compatibility; the body does not use
            it - every record is parsed with ``read_labeled_tfrecord``.
        ordered: When False, deterministic ordering is switched off so records
            are used as soon as they stream in (faster, order not preserved).

    Returns:
        The mapped ``tf.data`` dataset.
    """
    read_options = tf.data.Options()
    if not ordered:
        read_options.experimental_deterministic = False # disable order, increase speed

    # num_parallel_reads interleaves reads from multiple files.
    return (
        tf.data.TFRecordDataset(fileids, num_parallel_reads=AUTO)
        .with_options(read_options)
        .map(read_labeled_tfrecord)
    )

def get_training_dataset():
    """Return the training pipeline: repeated, shuffled, batched and prefetched.

    Reads ``TRAINING_FILENAMES``; the dataset repeats indefinitely, is shuffled
    with a buffer of 20 (seeded with ``SEED``), batched by ``BATCH_SIZE`` and
    prefetched with an autotuned buffer.
    """
    return (
        load_dataset(TRAINING_FILENAMES, labeled=True)
        .repeat()
        .shuffle(20, seed=SEED)
        .batch(BATCH_SIZE)
        .prefetch(AUTO)
    )

def count_data_items(fileids):
    """Sum the item counts encoded in TFRecord file names.

    The number of data items is written in the name of each .tfrec file,
    i.e. ``train00-230.tfrec`` = 230 data items.

    Only the file name itself is inspected: searching the whole path could
    pick up a "-<digits>." sequence from a bucket or directory name, and the
    count must contain at least one digit.

    Args:
        fileids: Iterable of file names or paths (local or ``gs://``).

    Returns:
        Total number of items as an int (0 for an empty iterable).

    Raises:
        ValueError: If a file name does not contain a "-<count>." part.
    """
    total = 0
    for fileid in fileids:
        basename = str(fileid).replace('\\', '/').rsplit('/', 1)[-1]
        match = re.search(r"-(\d+)\.", basename)
        if match is None:
            raise ValueError(f'no item count found in file name: {fileid}')
        total += int(match.group(1))
    return total

In [None]:
def display_batch(batch, size=2):
    """Show the first ``size`` images of a batch side by side with their class names.

    Args:
        batch: ``(images, targets)`` pair; each target is looked up in
            ``label2name`` through its first element.
        size: Number of images to draw in the single-row figure.
    """
    images, targets = batch
    plt.figure(figsize=(size*5, 5))
    for position in range(size):
        plt.subplot(1, size, position+1)
        class_id = targets[position].numpy()[0]
        plt.title(f'class: {label2name[class_id]}', fontsize=15)
        plt.imshow(images[position,:, :, :])
        # Hide the axis ticks; only the picture and its title matter.
        plt.xticks([])
        plt.yticks([])
    plt.tight_layout()
    plt.show()

In [None]:
# Pipeline configuration read by the dataset helpers.
DIM = 512
IMAGE_SIZE = [DIM, DIM]
BATCH_SIZE = 32
SEED = 42
AUTO = tf.data.experimental.AUTOTUNE

# Record files written into the working directory by the cells above.
TRAINING_FILENAMES = tf.io.gfile.glob('train*.tfrec')
TEST_FILENAMES     = tf.io.gfile.glob('test*.tfrec')
n_train_items = count_data_items(TRAINING_FILENAMES)
n_test_items = count_data_items(TEST_FILENAMES)
print('There are %i train & %i test images'%(n_train_items, n_test_items))

# DISPLAY TRAIN IMAGES: re-batch to 20 and draw the first 5 of one batch.
training_dataset = get_training_dataset().unbatch().batch(20)
train_batch = next(iter(training_dataset))
display_batch(train_batch, 5);

In [None]:
!rm -rf train

In [None]:
# Save the final training table (labels, class ids and fold assignment).
finaltrain.to_csv('./train.csv')

In [None]:
# Read the saved training table back to check what was written.
df = pd.read_csv('./train.csv')

In [None]:
# Preview the table that was re-read from ./train.csv.
df.head()

In [None]:
# Save the test-image table (id, JPEG path, split).
testdf.to_csv('./test.csv')

In [None]:
# Preview the test table that was written to ./test.csv.
testdf.head()

In [None]:
!rm -rf train