# Data Prep
Our dataset already has an appropriate file structure, with the classes separated into sub-folders inside each split folder (train/test).  
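In this document, we first set aside images that appear corrupted and then create a validation set from the training images (some training images are also moved into the test set).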

In [1]:
import os, shutil
import numpy as np
import tensorflow as tf

In [2]:
# Folders for each dataset split, given as paths relative to the working directory.
train_dir = 'DATA/train'
test_dir = 'DATA/test'
# Validation split folder; later cells in this notebook create and fill it.
val_dir = 'DATA/validation'

### Check for Corrupted Images
(adapted from the original code on Keras.io)

In [6]:
def move_files(olddir, newdir, fnames):
    """Move every file named in `fnames` from `olddir` into `newdir`.

    Source and destination paths are built by joining the directory and the
    file name with a forward slash, and each file is relocated with
    `shutil.move`. Nothing is returned.
    """
    path_template = '{}/{}'
    for name in fnames:
        source = path_template.format(olddir, name)
        target = path_template.format(newdir, name)
        shutil.move(source, target)

In [8]:
# Flag every image whose leading bytes lack the "JFIF" marker and move it out of
# the dataset into a mirrored folder under ARCHIVE/ (files are moved, not deleted).
# A missing marker is only a heuristic for "corrupted".
jfif_marker = b"JFIF"  # the same bytes tf.compat.as_bytes("JFIF") produces, computed once

for dirp in (f'{train_dir}/NORMAL', f'{train_dir}/PNEUMONIA', f'{test_dir}/NORMAL', f'{test_dir}/PNEUMONIA'):
    archive_dir = f'ARCHIVE/{dirp}'
    # os.listdir returns a list snapshot, so moving files during the loop is safe.
    for fn in os.listdir(dirp):
        fp = os.path.join(dirp, fn)
        if not os.path.isfile(fp):
            # Sub-folders and other non-file entries cannot be opened as images.
            continue

        # `with` closes the handle even when open() or peek() raises; the previous
        # try/finally referenced `fobj` before it was guaranteed to be bound.
        with open(fp, "rb") as fobj:
            # peek() may return more (or fewer) bytes than requested, so this
            # searches the start of the read buffer, not exactly 10 bytes.
            is_jfif = jfif_marker in fobj.peek(10)

        if not is_jfif:
            print(f'{fp} appears corrupted')
            # Create the archive folder up front instead of retrying after a failure.
            os.makedirs(archive_dir, exist_ok=True)
            move_files(dirp, archive_dir, [fn])

DATA/train/NORMAL\NORMAL-1070073-0002.jpeg appears corrupted
DATA/train/NORMAL\NORMAL-1128157-0001.jpeg appears corrupted
DATA/train/NORMAL\NORMAL-1317636-0001.jpeg appears corrupted
DATA/train/NORMAL\NORMAL-1619233-0001.jpeg appears corrupted
DATA/train/NORMAL\NORMAL-1702319-0002.jpeg appears corrupted
DATA/train/NORMAL\NORMAL-1934076-0001.jpeg appears corrupted
DATA/train/NORMAL\NORMAL-2154341-0002.jpeg appears corrupted
DATA/train/NORMAL\NORMAL-2244273-0001.jpeg appears corrupted
DATA/train/NORMAL\NORMAL-2517971-0001.jpeg appears corrupted
DATA/train/NORMAL\NORMAL-2558319-0002.jpeg appears corrupted
DATA/train/NORMAL\NORMAL-2801235-0001.jpeg appears corrupted
DATA/train/NORMAL\NORMAL-3175613-0001.jpeg appears corrupted
DATA/train/NORMAL\NORMAL-3190806-0001.jpeg appears corrupted
DATA/train/NORMAL\NORMAL-3448549-0001.jpeg appears corrupted
DATA/train/NORMAL\NORMAL-3532468-0001.jpeg appears corrupted
DATA/train/NORMAL\NORMAL-3802540-0002.jpeg appears corrupted
DATA/train/NORMAL\NORMAL

DATA/train/PNEUMONIA\VIRUS-4493675-0001.jpeg appears corrupted
DATA/train/PNEUMONIA\VIRUS-4590061-0006.jpeg appears corrupted
DATA/train/PNEUMONIA\VIRUS-4615614-0010.jpeg appears corrupted
DATA/train/PNEUMONIA\VIRUS-4615614-0011.jpeg appears corrupted
DATA/train/PNEUMONIA\VIRUS-463487-0001.jpeg appears corrupted
DATA/train/PNEUMONIA\VIRUS-4670779-0001.jpeg appears corrupted
DATA/train/PNEUMONIA\VIRUS-4728998-0002.jpeg appears corrupted
DATA/train/PNEUMONIA\VIRUS-4766617-0003.jpeg appears corrupted
DATA/train/PNEUMONIA\VIRUS-5028760-0003.jpeg appears corrupted
DATA/train/PNEUMONIA\VIRUS-5123904-0002.jpeg appears corrupted
DATA/train/PNEUMONIA\VIRUS-5202840-0002.jpeg appears corrupted
DATA/train/PNEUMONIA\VIRUS-5284146-0003.jpeg appears corrupted
DATA/train/PNEUMONIA\VIRUS-5410591-0001.jpeg appears corrupted
DATA/train/PNEUMONIA\VIRUS-5445920-0001.jpeg appears corrupted
DATA/train/PNEUMONIA\VIRUS-5875967-0001.jpeg appears corrupted
DATA/train/PNEUMONIA\VIRUS-5882420-0001.jpeg appears cor

In [12]:
# Count the files under the archive, train and test trees (sub-folders included).
fc, fc_train, fc_test = (
    sum(len(files) for _root, _dirs, files in os.walk(top))
    for top in ('ARCHIVE', train_dir, test_dir)
)

print(fc, ' files removed. ', fc_train, ' files in train set. ', fc_test, ' files in test set.')

217  files removed.  5053  files in train set.  586  files in test set.


### Subsetting Validation Set

In [3]:
# make necessary directories
def makedirect(paths):
    """Create each directory in `paths`, reporting any that already exist.

    Directories are created one at a time with `os.mkdir`. A path that
    already exists is not treated as an error; a message is printed instead.
    """
    for target in paths:
        try:
            os.mkdir(target)
        except FileExistsError:
            print(f'{target} already exists')

# Validation root first, then one sub-folder per class label.
makedirect([val_dir] + [f'{val_dir}/{label}' for label in ('NORMAL', 'PNEUMONIA')])

DATA/validation already exists
DATA/validation/NORMAL already exists
DATA/validation/PNEUMONIA already exists


In [27]:
# Pick the training images that will be moved into the validation and test sets.
# A random 30% hold-out is drawn from each class and split in half: the first
# half becomes validation, the remainder becomes additional test images.
# NOTE: the file lists are read from the training folders at run time, so running
# this cell again after the files have been moved draws from the reduced set.
split_rng = np.random.default_rng(42)  # fixed seed: same picks for the same folder contents

# sorted() removes the dependence on os.listdir's arbitrary ordering.
normal_imgs = sorted(fn for fn in os.listdir(f'{train_dir}/NORMAL') if fn.endswith('.jpeg'))
pneumo_imgs = sorted(fn for fn in os.listdir(f'{train_dir}/PNEUMONIA') if fn.endswith('.jpeg'))

# Sample without replacement so no file is selected twice.
normal_holdout = split_rng.choice(normal_imgs, int(.3 * len(normal_imgs)), replace=False)
pneumo_holdout = split_rng.choice(pneumo_imgs, int(.3 * len(pneumo_imgs)), replace=False)

# Integer halves; when the hold-out size is odd the test half gets the extra file.
normal_half = len(normal_holdout) // 2
pneumo_half = len(pneumo_holdout) // 2

normal_val, normal_test = normal_holdout[:normal_half], normal_holdout[normal_half:]
pneumo_val, pneumo_test = pneumo_holdout[:pneumo_half], pneumo_holdout[pneumo_half:]

In [28]:
# Relocate the selected training images: validation picks first, then test picks,
# handling NORMAL before PNEUMONIA within each destination.
for dest_dir, picks_by_class in (
    (val_dir, (('NORMAL', normal_val), ('PNEUMONIA', pneumo_val))),
    (test_dir, (('NORMAL', normal_test), ('PNEUMONIA', pneumo_test))),
):
    for label, picked in picks_by_class:
        move_files(f'{train_dir}/{label}', f'{dest_dir}/{label}', picked)

In [29]:
# Recount the train tree and count the validation tree (sub-folders included).
fc_train, fc_val = (
    sum(len(files) for _root, _dirs, files in os.walk(top))
    for top in (train_dir, val_dir)
)

print(fc_train, ' files in train set. ', fc_val, ' files in validation set.')

3949  files in train set.  845  files in validation set.
