In [1]:
import os
import random
import shutil
import cv2
import pandas as pd

### Structure the folders and move images to make them ready for preprocessing

In [2]:
def create_dirs(dir_paths):
    """Create every directory in ``dir_paths``, including missing parents.

    Parameters
    ----------
    dir_paths : iterable of str
        Directory paths to create.

    Notes
    -----
    A path that already exists is reported with a printed message and skipped.
    Any other failure (permission denied, a parent that is a regular file, ...)
    is raised to the caller instead of being misreported as "already exists".
    """
    for dir_path in dir_paths:
        try:
            os.makedirs(dir_path)
        except FileExistsError:
            # Only the "already there" case is expected and safe to ignore
            print(f'{dir_path} already exists')

In [3]:
# Location of the original dataset as downloaded
orig_raw_base_path = os.path.join('data', 'raw_dataset')
# Location of the flattened copy, where each category keeps all of its images in one directory
flat_raw_base_path = os.path.join('data', 'flatten_raw_dataset')
# Make sure the flattened dataset root exists
create_dirs([flat_raw_base_path])
# Make sure there is one destination sub-directory per image category
sub_dirs = [os.path.join(flat_raw_base_path, category)
            for category in ('real_with_mask',
                             'real_without_mask',
                             'simulated_with_mask',
                             'simulated_without_mask')]
create_dirs(sub_dirs)

data\flatten_raw_dataset already exists
data\flatten_raw_dataset\real_with_mask already exists
data\flatten_raw_dataset\real_without_mask already exists
data\flatten_raw_dataset\simulated_with_mask already exists
data\flatten_raw_dataset\simulated_without_mask already exists


In [6]:
# Images of the real with/without mask categories are spread over various sub-directories
# in the raw dataset. Copy the images of every category into a single directory per
# category and rename them to a running number (1, 2, 3, ...) keeping the file extension.
sub_dirs = ['real_with_mask', 
            'real_without_mask',
            'simulated_with_mask', 
            'simulated_without_mask']
# Categories whose images sit one directory level deeper than the category directory
nested_sub_dirs = {'real_with_mask', 'real_without_mask'}


def _list_files(dir_path):
    """Return the full paths of the regular files directly under ``dir_path``, sorted by name."""
    candidates = (os.path.join(dir_path, name) for name in sorted(os.listdir(dir_path)))
    return [path for path in candidates if os.path.isfile(path)]


for sub_dir_1 in sub_dirs:
    src_dir_path = os.path.join(orig_raw_base_path, sub_dir_1)
    dest_dir_path = os.path.join(flat_raw_base_path, sub_dir_1)

    if sub_dir_1 in nested_sub_dirs:
        # Real images: collect the files of every next-level sub-directory.
        # Entries that are not directories are skipped instead of crashing os.listdir.
        source_files = []
        for sub_dir_2 in sorted(os.listdir(src_dir_path)):
            sub_dir_2_path = os.path.join(src_dir_path, sub_dir_2)
            if os.path.isdir(sub_dir_2_path):
                source_files.extend(_list_files(sub_dir_2_path))
    else:
        # Simulated images: the files sit directly in the category directory
        source_files = _list_files(src_dir_path)

    # Numbering restarts at 1 for every category; sorted listings keep it reproducible
    for image_num, old_file_name in enumerate(source_files, start=1):
        _, extension = os.path.splitext(old_file_name)
        new_file_name = os.path.join(dest_dir_path, str(image_num) + extension)
        shutil.copy(old_file_name, new_file_name)

### Inspect Image sizes 

In [None]:
# Function for inspecting image sizes and dumping the size details of all images in a dataframe
def get_image_size(image_dir):
    """Collect the pixel dimensions of every readable image in ``image_dir``.

    Parameters
    ----------
    image_dir : str
        Directory whose entries are read with ``cv2.imread``.

    Returns
    -------
    pandas.DataFrame
        One row per readable image with the columns ``name``, ``height`` and
        ``width``. The frame is empty (but keeps its columns) when nothing
        could be read.

    Notes
    -----
    Entries that ``cv2.imread`` cannot decode (it returns ``None`` for them)
    are reported with a printed message and left out of the result.
    """
    records = []
    for img_name in os.listdir(image_dir):
        img_path = os.path.join(image_dir, img_name)
        img = cv2.imread(img_path)
        if img is None:
            print(f'{img_path} could not be read as an image, skipping')
            continue
        img_height, img_width = img.shape[:2]
        records.append((img_name, img_height, img_width))
    # Build the frame once instead of appending row by row
    return pd.DataFrame(records, columns=['name', 'height', 'width'])

In [None]:
# Dump the image sizes of each category to its own CSV report.
# Report names are listed in the same order as sub_dirs:
# real mask, real without mask, simulated mask, simulated without mask.
size_reports = ['RFMD_Mask_Sizes.csv',
                'RFMD_Without_Mask_Sizes.csv',
                'SFMD_Mask_Sizes.csv',
                'SFMD_Without_Mask_Sizes.csv']

for category_idx, report_name in enumerate(size_reports):
    df = get_image_size(os.path.join(flat_raw_base_path, sub_dirs[category_idx]))
    df.to_csv(report_name)

### Pre-process images

In [6]:
# Directory layout for the processed images:
#   data/processed_dataset/{train_val,test}/{mask,nomask}
processed_data_base_path = os.path.join('data', 'processed_dataset')
train_val_path, test_path = (os.path.join(processed_data_base_path, split)
                             for split in ('train_val', 'test'))
train_val_mask_path, train_val_nomask_path = (os.path.join(train_val_path, label)
                                              for label in ('mask', 'nomask'))
test_mask_path, test_nomask_path = (os.path.join(test_path, label)
                                    for label in ('mask', 'nomask'))

# Parents are listed before their children
dirs = [
    processed_data_base_path,
    train_val_path,
    train_val_mask_path,
    train_val_nomask_path,
    test_path,
    test_mask_path,
    test_nomask_path,
]
create_dirs(dirs)

In [7]:
# Function for pre-processing the image
def preprocess_image(src_path, dest_path, image_num, prefix, target_size=(224, 224)):
    """Resize a random selection of images and write them to ``dest_path``.

    Parameters
    ----------
    src_path : str
        Directory the source images are listed and read from.
    dest_path : str
        Directory the resized images are written to.
    image_num : int
        Maximum number of directory entries to pick after shuffling. Values
        below zero are treated as zero.
    prefix : str
        Text prepended to each source file name to form the output file name.
    target_size : tuple of int, optional
        ``(width, height)`` passed to ``cv2.resize``; defaults to 224x224.

    Notes
    -----
    Entries that ``cv2.imread`` cannot decode (it returns ``None`` for them)
    are reported with a printed message and skipped, so fewer than
    ``image_num`` images may be written.
    """
    image_list = os.listdir(src_path)
    random.shuffle(image_list)
    # Clamp: a negative bound would act as "drop the last k entries" in the slice
    image_list = image_list[:max(image_num, 0)]
    for image_name in image_list:
        image_src_path = os.path.join(src_path, image_name)
        image_dest_path = os.path.join(dest_path, prefix + image_name)
        img = cv2.imread(image_src_path)
        if img is None:
            print(f'{image_src_path} could not be read as an image, skipping')
            continue
        # Resize to target_size. The interpolation flag must be passed by keyword:
        # the third positional parameter of cv2.resize is the output array (dst).
        img = cv2.resize(img, target_size, interpolation=cv2.INTER_AREA)
        # Save image in destination folder
        cv2.imwrite(image_dest_path, img)

In [8]:
# All masked images end up in the train_val mask folder
dest_path = train_val_mask_path

# Real masked set (approx ~5,000 images): take every image, output names prefixed with 'A'
src_path = os.path.join('data', 'flatten_raw_dataset', 'real_with_mask')
image_num = len(os.listdir(src_path))
preprocess_image(src_path, dest_path, image_num, 'A')

# Simulated masked set: take every image, output names prefixed with 'B'
src_path = os.path.join('data', 'flatten_raw_dataset', 'simulated_with_mask')
image_num = len(os.listdir(src_path))
preprocess_image(src_path, dest_path, image_num, 'B')

# Number of entries now in the train_val mask folder
mask_image_num = len(os.listdir(dest_path))

In [9]:
# All unmasked images end up in the train_val nomask folder
dest_path = train_val_nomask_path

# Choose all images from simulated unmasked set, output names prefixed with 'C'
src_path = os.path.join('data', 'flatten_raw_dataset', 'simulated_without_mask')
image_num = len(os.listdir(src_path))
preprocess_image(src_path, dest_path, image_num, 'C')

# Out of ~90,000 real unmasked images, randomly choose as many as are needed for the
# unmasked total to equal the masked total. The count is clamped at 0: if the simulated
# set alone already matches or exceeds the masked total, the difference is zero or
# negative and must not be passed on as a (negative) slice bound.
src_path = os.path.join('data', 'flatten_raw_dataset', 'real_without_mask')
real_nomask_num = max(mask_image_num - image_num, 0)
preprocess_image(src_path, dest_path, real_nomask_num, 'D')

### Keep 500 mask & unmask images for testing

In [10]:
#
# Move up to 500 randomly chosen images per class out of the training & validation data.
# These will not be part of the training data.
# Final models will be evaluated on this unseen dataset.
#
# NOTE: every run of this cell moves a further batch out of train_val.
#
image_num = 500

for src_path, dest_path in [(train_val_mask_path, test_mask_path),      # Mask images
                            (train_val_nomask_path, test_nomask_path)]:  # No Mask images
    image_list = os.listdir(src_path)
    random.shuffle(image_list)
    # Slice instead of indexing over range(image_num): a class holding fewer than
    # image_num images is moved completely rather than raising IndexError
    for image_name in image_list[:image_num]:
        src_file_path = os.path.join(src_path, image_name)
        dest_file_path = os.path.join(dest_path, image_name)
        shutil.move(src_file_path, dest_file_path)

In [18]:
# shutil.rmtree(os.path.join('data', 'flatten_raw_dataset_OLD'))
# shutil.rmtree(os.path.join('data', 'processed_dataset'))
#shutil.rmtree(os.path.join('data', 'processed_dataset_128'))