# Images for training baseline CNNs
- ## Training data for both Typical (Covid) and Negative classes after dropping other classes
- ## Testing data for covid & negative (80:20)
- ## label encoding classes in dfs

- v1: downsampled Covid to match Negative (Negative, Covid = 1709, 1709)
- v2: kept both classes intact; the slight imbalance is handled using k-fold training and precision-recall metrics (Negative, Covid = 1709, 2957)

In [None]:
#!pip install -q efficientnet >> /dev/null

In [None]:
import pandas as pd, numpy as np, random,os, shutil
from glob import glob
from kaggle_datasets import KaggleDatasets
import tensorflow as tf, re, math
import tensorflow.keras.backend as K
#import efficientnet.tfkeras as efn
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
import tensorflow_addons as tfa
import cv2


print('tf:',tf.__version__)

In [None]:
# Run configuration: random seed and the sampling variant to build.
NP_RANDOM = 2021
SAMPLE_VERSION = 'V2' #change to V1 for downsampling

# Start every run from an empty output directory for this variant.
from shutil import copyfile
import shutil
import os

vfolder = f'/kaggle/working/{SAMPLE_VERSION}/'

try:
    # Remove whatever an earlier run left behind.
    shutil.rmtree(vfolder)
except Exception as err:
    # e.g. the folder does not exist yet; report it and carry on.
    print(str(err))
finally:
    if not os.path.exists(vfolder):
        os.mkdir(vfolder)
        print(f"Created {vfolder}")
        

In [None]:
# The competition's test folder has no labels, so it cannot be used here;
# the train/val/test splits are carved out of the train folder instead.
# Duplicate image ids found in the ljmu-2-preprocessing notebook are removed.
# Load the image-level annotations and preview the first row.
img_df = pd.read_csv('../input/siim-covid19-detection/train_image_level.csv')
img_df.head(1)

In [None]:
# Load the study-level annotations (one column per class, see the label
# encoding cell below) and preview the first row.
stu_df = pd.read_csv('../input/siim-covid19-detection/train_study_level.csv')
stu_df.head(1)

In [None]:
# Attach the study-level class columns to every image row.
# Study ids carry a '_study' suffix; strip it to obtain the merge key.
stu_df['StudyInstanceUID'] = stu_df['id'].map(lambda study_id: study_id.replace('_study', ''))
stu_df.drop(columns='id', inplace=True)
img_df = pd.merge(img_df, stu_df, on='StudyInstanceUID')
img_df.sample(3)

In [None]:
# Label encoding: write a class code (0-3) into 'label' for each class column.
_LABEL_CODES = {
    'Negative for Pneumonia': 0,
    'Typical Appearance': 1,
    'Indeterminate Appearance': 2,
    'Atypical Appearance': 3,
}
for _class_column, _class_code in _LABEL_CODES.items():
    img_df.loc[img_df[_class_column] == 1, 'label'] = _class_code
img_df.head(3)

In [None]:
# The class columns are now encoded in 'label'; drop them together with the
# other columns that are not used afterwards.
cols_to_drop = [
    'Negative for Pneumonia',
    'Typical Appearance',
    'Indeterminate Appearance',
    'Atypical Appearance',
    'StudyInstanceUID',
    'boxes',
]
img_df.drop(columns=cols_to_drop, inplace=True)
img_df.head(3)

In [None]:
# Turn image ids into file names: '<id>_image' -> '<id>.jpg'.
# The result is assigned back instead of calling replace(..., inplace=True) on
# the selected column: that form is chained assignment, which operates on a
# temporary under pandas Copy-on-Write and would leave img_df unchanged.
img_df["id"] = img_df["id"].replace("_image", ".jpg", regex=True)
img_df.head(3)

In [None]:
# Load the list of images to drop and convert the ids in column 'p1' to file
# names ('<id>_image' -> '<id>.jpg') so they can be compared with img_df['id'].
drop_df = pd.read_csv('../input/images-to-drop/images_to_drop.csv')
# Assign back rather than replace(..., inplace=True) on the selected column,
# which is chained assignment and a no-op under pandas Copy-on-Write.
drop_df["p1"] = drop_df["p1"].replace("_image", ".jpg", regex=True)
drop_df.head(3)


In [None]:
# Remove every image whose file name appears in the drop list.
before = len(img_df)
img_df = img_df[~img_df['id'].isin(drop_df['p1'])]
after = len(img_df)
print(f"before , after =  {before} , {after}")

In [None]:
# Binary classification only: keep rows labelled 0 (negative) or 1 (typical).
# Images live in ../input/siimcovid19256jpg/256-jpg
before = len(img_df)
img_df = img_df[img_df['label'].isin([0, 1])]
after = len(img_df)
print(f"before , after =  {before} , {after}")

In [None]:
# Rows per class after filtering.
print("Negative , Covid = ", (img_df['label'] == 0).sum(), (img_df['label'] == 1).sum())

In [None]:
#keep a copy of all rows of each class before any downsampling / splitting
covid_all = img_df[(img_df['label']==1)]
negative_all = img_df[(img_df['label']==0)]
#shuffle before save; seeded with NP_RANDOM so reruns write the same row order
covid_all = covid_all.sample(frac=1, random_state=NP_RANDOM).reset_index(drop=True)
negative_all = negative_all.sample(frac=1, random_state=NP_RANDOM).reset_index(drop=True)
#save
covid_all.to_csv(vfolder + 'covid_all.csv', index=False)
negative_all.to_csv(vfolder + 'negative_all.csv', index=False)

In [None]:
# for v1, use downsampling to address slight imbalanced classes based on RARE class count (negative:1709)
# for v2, don't downsample.. use all available data
RARE_CLASS_COUNT = ( min( len( img_df[(img_df['label']==0)]), len( img_df[(img_df['label']==1)])) )
if SAMPLE_VERSION=='V1':
    np.random.seed(NP_RANDOM)
    # GroupBy.sample draws RARE_CLASS_COUNT rows per label and keeps the
    # 'label' column. The former groupby.apply(lambda x: x.sample(...)) relied
    # on the grouping column being passed to apply, which newer pandas
    # deprecates and then stops doing.
    img_df = img_df.groupby('label').sample(n=RARE_CLASS_COUNT).reset_index(drop=True)

print (len(img_df))
img_df.head(5)


In [None]:
#train:80% (covid*0.8 + negative*0.8) test:20% (covid*0.2 + negative*0.2)
# The split is done per class so both classes keep the same 80:20 ratio.
img_df_0 = img_df[(img_df['label']==0)]
img_df_1 = img_df[(img_df['label']==1)]

train_0 = img_df_0.sample(frac=0.8,random_state=NP_RANDOM)
train_1 = img_df_1.sample(frac=0.8,random_state=NP_RANDOM)
# test = every row of the class whose id was not drawn for training
test_0 = img_df_0.loc[~img_df_0['id'].isin(train_0['id'])]
test_1 = img_df_1.loc[~img_df_1['id'].isin(train_1['id'])]

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
train_df = pd.concat([train_0, train_1])
test_df = pd.concat([test_0, test_1])

train_df['label'] = train_df.label.astype('int')
test_df['label'] = test_df.label.astype('int')

# shuffle the rows; seeded so the saved CSVs are identical across reruns
train_df = train_df.sample(frac=1, random_state=NP_RANDOM).reset_index(drop=True)
test_df = test_df.sample(frac=1, random_state=NP_RANDOM).reset_index(drop=True)

#save to file for later loading
train_df.to_csv(vfolder + 'train_df.csv', index=False)
test_df.to_csv(vfolder + 'test_df.csv', index=False)

print("Train has Negative , Covid = ", len( train_df[(train_df['label']==0)]), len( train_df[(train_df['label']==1)]))
print("Test has Negative , Covid = ", len( test_df[(test_df['label']==0)]), len( test_df[(test_df['label']==1)]))

In [None]:
#copy train & test images to folders

def copyImgsToFolder(folder, tlist):
    """Copy the image files named in ``tlist`` into ``vfolder + folder + '/'``.

    The destination folder is removed (if that succeeds) and recreated before
    copying, so it ends up holding only the files from this call. Source files
    are read from the fixed input folder below.

    Args:
        folder: name of the sub-folder of ``vfolder`` to fill.
        tlist: iterable of image file names.
    """
    source_root = '../input/siimcovid19256jpg/256-jpg/'
    target_root = f"{vfolder}{folder}/"

    try:
        shutil.rmtree(target_root)
    except Exception as err:
        # e.g. nothing to remove yet; report it and continue
        print(str(err))
    finally:
        if not os.path.exists(target_root):
            os.mkdir(target_root)
            print(f"Created {target_root}")

    for file_name in tlist:
        copyfile(f"{source_root}{file_name}", f"{target_root}{file_name}")

    print(f"Copied to {target_root}")

In [None]:
# Per-class views of the train and test splits.
train_covid_df = train_df.loc[train_df['label'] == 1]
test_covid_df = test_df.loc[test_df['label'] == 1]

train_negative_df = train_df.loc[train_df['label'] == 0]
test_negative_df = test_df.loc[test_df['label'] == 0]

# (destination folder, frame listing the images to copy), in copy order
_copy_jobs = [
    # includes both classes (for CNN)
    ('train', train_df),
    ('test', test_df),
    # includes covid class only (for GAN)
    ('train-covid', train_covid_df),
    ('test-covid', test_covid_df),
    # includes negative class only (for GAN)
    ('train-negative', train_negative_df),
    ('test-negative', test_negative_df),
    # for references only
    ('covid-all', covid_all),
    ('negative-all', negative_all),
]
for _dest_folder, _frame in _copy_jobs:
    copyImgsToFolder(_dest_folder, _frame['id'].tolist())

print("Images copied ...")

# TFRecords creation for GAN network training ...
- ## TFRecord for training data (covid, negative)
- ## TFRecord for testing data (covid, negative)

In [None]:
# Image folders written by the copy step
TRPATH = f'{vfolder}train/'
TEPATH = f'{vfolder}test/'

TRPATH_C = f'{vfolder}train-covid/'
TEPATH_C = f'{vfolder}test-covid/'

TRPATH_N = f'{vfolder}train-negative/'
TEPATH_N = f'{vfolder}test-negative/'

C_ALL = f'{vfolder}covid-all/'
N_ALL = f'{vfolder}negative-all/'

# File names found in each of those folders
TRIMGS = os.listdir(TRPATH)
TEIMGS = os.listdir(TEPATH)

TRIMGS_C = os.listdir(TRPATH_C)
TEIMGS_C = os.listdir(TEPATH_C)

TRIMGS_N = os.listdir(TRPATH_N)
TEIMGS_N = os.listdir(TEPATH_N)

C_IMGS_ALL = os.listdir(C_ALL)
N_IMGS_ALL = os.listdir(N_ALL)

# train / test image counts, then one sample file name
print(f"{len(TRIMGS)} {len(TEIMGS)}")
TRIMGS[0]

In [None]:
# Preview the sharding of the test images: SIZE images per TFRecord file.
SIZE = 200
# CT = number of files needed (ceiling division of the image count by SIZE)
CT = -(-len(TEIMGS) // SIZE)
# images per file if they were spread evenly (rounded down)
print(len(TEIMGS) // CT)

In [None]:
# From the TF website
def _bytes_feature(value):
  """Wrap a string / byte value in a bytes_list ``tf.train.Feature``."""
  eager_tensor_type = type(tf.constant(0))
  if isinstance(value, eager_tensor_type):
    # BytesList won't unpack a string from an EagerTensor, so extract it first.
    value = value.numpy()
  wrapped = tf.train.BytesList(value=[value])
  return tf.train.Feature(bytes_list=wrapped)

def _float_feature(value):
  """Wrap a float / double value in a float_list ``tf.train.Feature``."""
  wrapped = tf.train.FloatList(value=[value])
  return tf.train.Feature(float_list=wrapped)

def _int64_feature(value):
  """Wrap a bool / enum / int / uint value in an int64_list ``tf.train.Feature``."""
  wrapped = tf.train.Int64List(value=[value])
  return tf.train.Feature(int64_list=wrapped)




In [None]:
def serialize_example(feature0, feature1, feature2):
  """Serialize one record to a ``tf.train.Example`` byte string.

  Args:
    feature0: image data as bytes, stored under 'image'.
    feature1: image id as bytes, stored under 'id'.
    feature2: integer label, stored under 'label'.

  Returns:
    The serialized example.
  """
  record_features = tf.train.Features(feature={
      'image': _bytes_feature(feature0),
      'id': _bytes_feature(feature1),
      'label': _int64_feature(feature2),
  })
  return tf.train.Example(features=record_features).SerializeToString()



In [None]:
def create_tfrecord(df, IMGS, SIZE, PATH, folder):
    """Write the images in ``IMGS`` as sharded TFRecord files under ``vfolder + folder``.

    Each record stores the JPEG-encoded image, its file name ('id') and its
    integer label. Files are named ``img<shard>-<count>.tfrec``; the count in
    the name is what ``count_data_items`` parses later.

    Args:
        df: frame with an ``id`` column (image file name) and a ``label`` column.
        IMGS: list of image file names found in ``PATH``.
        SIZE: maximum number of images per TFRecord file.
        PATH: folder holding the images; concatenated directly with the file
            name, so it must end with a path separator.
        folder: sub-folder of ``vfolder`` to write to; recreated on each call.

    Raises:
        KeyError: if a file in ``IMGS`` has no row in ``df``.
        OSError: if an image cannot be read by OpenCV.
    """
    dest = vfolder + folder + '/'
    try:
        shutil.rmtree(dest)
    except Exception as e:
        # e.g. nothing to remove on a first run; report and continue
        print(str(e))
    finally:
        if not os.path.exists(dest):
            os.mkdir(dest)
            print("Created "+dest)

    # Build the id -> label lookup once instead of scanning df for every image.
    # drop_duplicates keeps the first row per id, matching the previous
    # `.values[0]` behaviour.
    label_by_id = df.drop_duplicates(subset='id').set_index('id')['label'].to_dict()

    CT = len(IMGS)//SIZE + int(len(IMGS)%SIZE!=0)
    for j in range(CT):
        print(); print('Writing TFRecord %i of %i...'%(j,CT))
        shard = IMGS[SIZE*j:SIZE*(j+1)]
        CT2 = len(shard)
        with tf.io.TFRecordWriter(dest + 'img%.2i-%i.tfrec'%(j,CT2)) as writer:
            for k, name in enumerate(shard):
                if name not in label_by_id:
                    raise KeyError("no row in df for image " + name)
                img = cv2.imread(PATH+name)
                if img is None:
                    # cv2.imread signals failure by returning None
                    raise OSError("could not read image " + PATH + name)
                # Channel swap kept from the original pipeline.
                # NOTE(review): imread yields BGR, which is also what imencode
                # expects, so this swap is only harmless for grey-scale images
                # - confirm before feeding colour images.
                img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
                # tobytes() replaces the deprecated ndarray.tostring() alias
                img = cv2.imencode('.jpg', img, (cv2.IMWRITE_JPEG_QUALITY, 94))[1].tobytes()
                example = serialize_example(
                    img, str.encode(name),
                    int(label_by_id[name]))  # Int64List needs a plain integer
                writer.write(example)
                if k%100==0: print(k,', ',end='')

    print("TFRecords created for "+ folder)

In [None]:
# Build every TFRecord set. Each entry pairs the arguments for a train-side
# call with those for the matching test-side call, in the order
# (frame, file names, images per file, image folder, output folder).
_tfrec_jobs = [
    ((train_df, TRIMGS, 200, TRPATH, 'train-tfrec'),
     (test_df, TEIMGS, 200, TEPATH, 'test-tfrec')),
    ((train_covid_df, TRIMGS_C, 200, TRPATH_C, 'train-covid-tfrec'),
     (test_covid_df, TEIMGS_C, 200, TEPATH_C, 'test-covid-tfrec')),
    ((train_negative_df, TRIMGS_N, 200, TRPATH_N, 'train-negative-tfrec'),
     (test_negative_df, TEIMGS_N, 200, TEPATH_N, 'test-negative-tfrec')),
    ((covid_all, C_IMGS_ALL, 200, C_ALL, 'covid-all-tfrec'),
     (negative_all, N_IMGS_ALL, 200, N_ALL, 'negative-all-tfrec')),
]
for _first_args, _second_args in _tfrec_jobs:
    create_tfrecord(*_first_args)
    print()
    create_tfrecord(*_second_args)

In [None]:
# List the current working directory to check what has been written so far.
! ls -l

### Verify TFRecords

In [None]:
# numpy print defaults: summarise arrays with more than 15 elements, wrap at 80 columns
np.set_printoptions(threshold=15, linewidth=80)
# Class names used for plot titles, indexed by label (title_from_label_and_target)
CLASSES = [0,1]

def batch_to_numpy_images_and_labels(data):
    """Unpack an ``(images, labels)`` batch and convert both parts via ``.numpy()``.

    Returns:
        Tuple ``(numpy_images, numpy_labels)``.
    """
    image_batch, label_batch = data
    return image_batch.numpy(), label_batch.numpy()

def title_from_label_and_target(label, correct_label):
    """Build a plot title for a prediction and say whether it is correct.

    Args:
        label: predicted class index into ``CLASSES``.
        correct_label: true class index, or ``None`` when unknown.

    Returns:
        ``(title, correct)``. Without a true label the title is the predicted
        class and ``correct`` is ``True``; otherwise the title is
        ``"<pred> [OK]"`` or ``"<pred> [NO→<true>]"``.
    """
    if correct_label is None:
        return CLASSES[label], True

    correct = (label == correct_label)
    if correct:
        verdict, arrow, expected = 'OK', '', ''
    else:
        verdict, arrow, expected = 'NO', u"\u2192", CLASSES[correct_label]
    return "{} [{}{}{}]".format(CLASSES[label], verdict, arrow, expected), correct

def display_one_xray(image, title, subplot, red=False, titlesize=16):
    """Draw one image into the given subplot slot and return the next slot.

    Args:
        image: image data passed to ``plt.imshow``.
        title: title shown above the image.
        subplot: ``(rows, cols, index)`` triple for ``plt.subplot``.
        red: if true, the title is red and slightly smaller.
        titlesize: base font size of the title.

    Returns:
        ``(rows, cols, index + 1)``.
    """
    plt.subplot(*subplot)
    plt.axis('off')
    plt.imshow(image)

    if red:
        font_size, font_color = int(titlesize/1.2), 'red'
    else:
        font_size, font_color = int(titlesize), 'black'
    plt.title(title, fontsize=font_size, color=font_color,
              fontdict={'verticalalignment':'center'}, pad=int(titlesize/1.5))

    grid_rows, grid_cols, position = subplot[0], subplot[1], subplot[2]
    return (grid_rows, grid_cols, position + 1)
    
def display_batch_of_images(databatch, predictions=None):
    """Show a batch of images in a square-ish grid, titled with their labels.

    This will work with:
    display_batch_of_images((images, labels))
    display_batch_of_images((images, labels), predictions)

    A bare ``images`` batch is not supported: the batch is unpacked as an
    ``(images, labels)`` pair by ``batch_to_numpy_images_and_labels``.

    Args:
        databatch: ``(images, labels)`` pair of tensors.
        predictions: optional predicted class per image; when given, each title
            compares the prediction with the label and mismatches are drawn in red.

    Raises:
        ValueError: if the batch contains no images.
    """
    # data
    images, labels = batch_to_numpy_images_and_labels(databatch)
    if labels is None:
        labels = [None for _ in enumerate(images)]

    # An empty batch would otherwise fail below with a ZeroDivisionError.
    if len(images) == 0:
        raise ValueError("display_batch_of_images needs at least one image")

    # auto-squaring: this will drop data that does not fit into square or square-ish rectangle
    rows = int(math.sqrt(len(images)))
    cols = len(images)//rows
    shown_images = images[:rows*cols]
    shown_labels = labels[:rows*cols]

    # size and spacing
    FIGSIZE = 13.0
    SPACING = 0.1
    subplot=(rows,cols,1)
    if rows < cols:
        plt.figure(figsize=(FIGSIZE,FIGSIZE/cols*rows))
    else:
        plt.figure(figsize=(FIGSIZE/rows*cols,FIGSIZE))

    # magic formula tested to work from 1x1 to 10x10 images; it does not depend
    # on the image, so it is computed once for the whole grid
    dynamic_titlesize = FIGSIZE*SPACING/max(rows,cols)*40+3

    # display
    for i, (image, label) in enumerate(zip(shown_images, shown_labels)):
        title = label
        correct = True
        if predictions is not None:
            title, correct = title_from_label_and_target(predictions[i], label)
        subplot = display_one_xray(image, title, subplot, not correct, titlesize=dynamic_titlesize)

    # layout: no gaps when there is nothing to print above the images
    plt.tight_layout()
    untitled = predictions is None and all(lbl is None for lbl in shown_labels)
    if untitled:
        plt.subplots_adjust(wspace=0, hspace=0)
    else:
        plt.subplots_adjust(wspace=SPACING, hspace=SPACING)
    plt.show()

In [None]:
def decode_image(image_data):
    """Decode JPEG bytes into a float32 image tensor of shape ``[*IMAGE_SIZE, 3]``."""
    pixels = tf.image.decode_jpeg(image_data, channels=3)
    # convert image to floats in [0, 1] range
    scaled = tf.cast(pixels, tf.float32) / 255.0
    # explicit size needed for TPU
    return tf.reshape(scaled, [*IMAGE_SIZE, 3])

def read_labeled_tfrecord(example):
    """Parse one serialized example into an ``(image, label)`` pair.

    Only the 'image' and 'label' features are read; the image is decoded with
    ``decode_image``.
    """
    feature_spec = {
        "image": tf.io.FixedLenFeature([], tf.string), # tf.string means bytestring
        "label": tf.io.FixedLenFeature([], tf.int64),  # shape [] means single element
    }
    parsed = tf.io.parse_single_example(example, feature_spec)
    return decode_image(parsed['image']), parsed['label']

def load_dataset(filenames, labeled=True, ordered=False):
    """Build a dataset of ``(image, label)`` pairs from TFRecord files.

    Args:
        filenames: TFRecord file names to read.
        labeled: accepted but not used; records are always parsed with
            ``read_labeled_tfrecord``.
        ordered: when false (the default), deterministic ordering is switched
            off so records are used as soon as they stream in, which is faster.

    Returns:
        The parsed ``tf.data`` dataset.
    """
    read_options = tf.data.Options()
    if not ordered:
        # disable order, increase speed
        read_options.experimental_deterministic = False

    # num_parallel_reads interleaves reads from multiple files
    records = tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTO)
    return records.with_options(read_options).map(read_labeled_tfrecord)

def get_training_dataset():
    """Return the training pipeline: repeated, shuffled, batched and prefetched."""
    return (
        load_dataset(TRAINING_FILENAMES, labeled=True)
        .repeat()            # the training dataset must repeat for several epochs
        .shuffle(2048)
        .batch(BATCH_SIZE)
        .prefetch(AUTO)      # prefetch next batch while training
    )

def count_data_items(filenames):
    """Return the total number of records announced by the TFRecord file names.

    The number of data items is written in the name of the .tfrec files,
    i.e. img00-200.tfrec = 200 data items.

    Args:
        filenames: iterable of TFRecord file paths.

    Returns:
        Sum of the per-file counts as a NumPy integer (0 for no files).

    Raises:
        ValueError: if a file name does not contain a ``-<count>.`` part.
    """
    count_pattern = re.compile(r"-([0-9]+)\.")  # compiled once, not per file
    counts = []
    for filename in filenames:
        # Look at the file name only, so a '-<digits>.' fragment in a
        # directory name (e.g. 'run-1.5/') cannot be mistaken for the count.
        found = count_pattern.search(os.path.basename(filename))
        if found is None:
            raise ValueError("no item count in TFRecord file name: %s" % filename)
        counts.append(int(found.group(1)))
    # explicit integer dtype so an empty list sums to 0 rather than 0.0
    return np.sum(counts, dtype=np.int64)

In [None]:
# Settings for the verification stage
IMAGE_SIZE = [256, 256]
BATCH_SIZE = 32
AUTO = tf.data.experimental.AUTOTUNE
TRAINING_FILENAMES = tf.io.gfile.glob(f'{vfolder}train-tfrec/img*.tfrec')
TESTING_FILENAMES = tf.io.gfile.glob(f'{vfolder}test-tfrec/img*.tfrec')
# Totals are taken from the counts embedded in the file names.
print('There are %i train and %i test images' % (
    count_data_items(TRAINING_FILENAMES),
    count_data_items(TESTING_FILENAMES),
))

In [None]:
# Verification: re-batch the training pipeline into groups of three images
# and display the first group.
training_dataset = get_training_dataset().unbatch().batch(3)
train_batch = iter(training_dataset)
display_batch_of_images(next(train_batch))

In [None]:
# Switch to the Kaggle working directory and echo the output folder path.
%cd '/kaggle/working'
vfolder

In [None]:
%%time

# zip the output and download
# The archive is named after the sample version and holds the contents of
# /kaggle/working/<SAMPLE_VERSION>; it is written to the current directory.
tarname = SAMPLE_VERSION+".tar.gz"
tarfolder = '/kaggle/working/'+SAMPLE_VERSION
!tar -zcf {tarname} -C {tarfolder} .

import os
from IPython.display import FileLink

# Show a link to the archive as the cell output.
FileLink('/kaggle/working/'+tarname)


In [None]:
# Disabled: per-folder archive commands kept as a string for reference only.
# NOTE(review): these paths point directly under /kaggle/working/, not under
# the versioned output folder used by the rest of the notebook - adjust before
# re-enabling.
'''
%%time
!tar -zcf train-tfrec.tar.gz -C "/kaggle/working/train-tfrec/" .
!tar -zcf test-tfrec.tar.gz -C "/kaggle/working/test-tfrec/" .

!tar -zcf train-covid-tfrec.tar.gz -C "/kaggle/working/train-covid-tfrec/" .
!tar -zcf test-covid-tfrec.tar.gz -C "/kaggle/working/test-covid-tfrec/" .

!tar -zcf train-negative-tfrec.tar.gz -C "/kaggle/working/train-negative-tfrec/" .
!tar -zcf test-negative-tfrec.tar.gz -C "/kaggle/working/test-negative-tfrec/" .

!tar -zcf covid-train-all-tfrec.tar.gz -C "/kaggle/working/covid-train-all-tfrec/" .
!tar -zcf negative-train-all-tfrec.tar.gz -C "/kaggle/working/negative-train-all-tfrec/" .

!tar -zcf train.tar.gz -C "/kaggle/working/train/" .
!tar -zcf test.tar.gz -C "/kaggle/working/test/" .

!tar -zcf train-covid.tar.gz -C "/kaggle/working/train-covid/" .
!tar -zcf test-covid.tar.gz -C "/kaggle/working/test-covid/" .

!tar -zcf train-negative.tar.gz -C "/kaggle/working/train-negative/" .
!tar -zcf test-negative.tar.gz -C "/kaggle/working/test-negative/" .

!tar -zcf covid-train-all.tar.gz -C "/kaggle/working/covid-train-all/" .
!tar -zcf negative-train-all.tar.gz -C "/kaggle/working/negative-train-all/" .
'''

In [None]:
# Marker printed when the notebook has run to the end.
print("Completed ...")
