# Data Pre-Processing

In this file, the data preparation phase is covered. Data pre-processing is performed on two datasets, Celeb-A Faces Dataset and Flowers Dataset. Pre-processing is in the form of Image Reshaping and Data Splitting.

For the Celeb-A Faces Dataset, a 70-30 split is performed to obtain the training and validation samples, and each image is center-cropped to 108x108 and then resized to 64x64.

For the Flowers Dataset, an 80-20 split is performed to obtain the training and validation samples, and each image is center-cropped to 108x108 and then resized to 64x64.

It is noted that there is no pre-processing performed at this stage on the third dataset, i.e. the Pokemon Dataset because the images are originally sized at the desired requirements.

In [1]:
# Importing required libraries.
import os
from PIL import Image
import splitfolders as sf

In [2]:
# Celeb-A Faces dataset: source folder and the destinations for the
# processed training / validation images.
data_dir = './data/celeba/main/'
save_train, save_test = (
    './data/celeba/resized_celeba/train/data/',
    './data/celeba/resized_celeba/val/data/',
)

# Side length (pixels) of the center crop, and of the final resized image.
crop_size, image_size = 108, 64

In [3]:
# Split the Celeb-A source folder into train/val sets with a 70-30 ratio.
# The seed is fixed (1234), presumably so the split is reproducible.
sf.ratio(
    data_dir,
    output="./data/celeba/output",
    seed=1234,
    ratio=(.7, .3),
    group_prefix=None,
)

Copying files: 202599 files [05:18, 636.72 files/s]


In [6]:
# Folders under the split output ("./data/celeba/output") that the resize
# steps read the training and validation images from.
celeba_train, celeba_test = (
    './data/celeba/output/train/data/',
    './data/celeba/output/val/data/',
)

In [7]:
# Create the destination folder for the resized Celeb-A training images.
# os.makedirs also creates any missing parent folders (os.mkdir raises
# FileNotFoundError on a nested path whose parents do not exist yet), and
# exist_ok=True keeps re-running this cell harmless.
os.makedirs(save_train, exist_ok=True)

# os.listdir returns entries in arbitrary order; sorting makes the processing
# order (and any capped subset taken from it) reproducible across runs.
img_list = sorted(os.listdir(celeba_train))

In [8]:
# Resize the Celeb-A training images: each image is center-cropped to
# crop_size x crop_size, scaled to image_size x image_size with bilinear
# filtering, and saved as JPEG under save_train with its original file name.
#
# At most 100000 files are processed (the original cap). Slicing the list,
# instead of indexing a fixed range, avoids an IndexError when the folder
# holds fewer files than the cap.
for i, img_name in enumerate(img_list[:100000]):
    # The context manager guarantees the source file is closed even if
    # cropping raises.
    with Image.open(celeba_train + img_name) as src:
        c_x = (src.size[0] - crop_size) // 2
        c_y = (src.size[1] - crop_size) // 2
        img = src.crop([c_x, c_y, c_x + crop_size, c_y + crop_size])
    img = img.resize((image_size, image_size), Image.BILINEAR)
    img.save(save_train + img_name, 'JPEG')

    # Progress report every 1000 images.
    if i % 1000 == 0:
        print('Resizing %d images...' % i)

Resizing 0 images...
Resizing 1000 images...
Resizing 2000 images...
Resizing 3000 images...
Resizing 4000 images...
Resizing 5000 images...
Resizing 6000 images...
Resizing 7000 images...
Resizing 8000 images...
Resizing 9000 images...
Resizing 10000 images...
Resizing 11000 images...
Resizing 12000 images...
Resizing 13000 images...
Resizing 14000 images...
Resizing 15000 images...
Resizing 16000 images...
Resizing 17000 images...
Resizing 18000 images...
Resizing 19000 images...
Resizing 20000 images...
Resizing 21000 images...
Resizing 22000 images...
Resizing 23000 images...
Resizing 24000 images...
Resizing 25000 images...
Resizing 26000 images...
Resizing 27000 images...
Resizing 28000 images...
Resizing 29000 images...
Resizing 30000 images...
Resizing 31000 images...
Resizing 32000 images...
Resizing 33000 images...
Resizing 34000 images...
Resizing 35000 images...
Resizing 36000 images...
Resizing 37000 images...
Resizing 38000 images...
Resizing 39000 images...
Resizing 4000

In [9]:
# Create the destination folder for the resized Celeb-A validation images.
# os.makedirs also creates any missing parent folders (os.mkdir raises
# FileNotFoundError on a nested path whose parents do not exist yet), and
# exist_ok=True keeps re-running this cell harmless.
os.makedirs(save_test, exist_ok=True)

# os.listdir returns entries in arbitrary order; sorting makes the processing
# order (and any capped subset taken from it) reproducible across runs.
img_list = sorted(os.listdir(celeba_test))

In [10]:
# Resize the Celeb-A validation images: each image is center-cropped to
# crop_size x crop_size, scaled to image_size x image_size with bilinear
# filtering, and saved as JPEG under save_test with its original file name.
#
# At most 60000 files are processed (the original cap). Slicing the list,
# instead of indexing a fixed range, avoids an IndexError when the folder
# holds fewer files than the cap.
for i, img_name in enumerate(img_list[:60000]):
    # The context manager guarantees the source file is closed even if
    # cropping raises.
    with Image.open(celeba_test + img_name) as src:
        c_x = (src.size[0] - crop_size) // 2
        c_y = (src.size[1] - crop_size) // 2
        img = src.crop([c_x, c_y, c_x + crop_size, c_y + crop_size])
    img = img.resize((image_size, image_size), Image.BILINEAR)
    img.save(save_test + img_name, 'JPEG')

    # Progress report every 1000 images.
    if i % 1000 == 0:
        print('Resizing %d images...' % i)

Resizing 0 images...
Resizing 1000 images...
Resizing 2000 images...
Resizing 3000 images...
Resizing 4000 images...
Resizing 5000 images...
Resizing 6000 images...
Resizing 7000 images...
Resizing 8000 images...
Resizing 9000 images...
Resizing 10000 images...
Resizing 11000 images...
Resizing 12000 images...
Resizing 13000 images...
Resizing 14000 images...
Resizing 15000 images...
Resizing 16000 images...
Resizing 17000 images...
Resizing 18000 images...
Resizing 19000 images...
Resizing 20000 images...
Resizing 21000 images...
Resizing 22000 images...
Resizing 23000 images...
Resizing 24000 images...
Resizing 25000 images...
Resizing 26000 images...
Resizing 27000 images...
Resizing 28000 images...
Resizing 29000 images...
Resizing 30000 images...
Resizing 31000 images...
Resizing 32000 images...
Resizing 33000 images...
Resizing 34000 images...
Resizing 35000 images...
Resizing 36000 images...
Resizing 37000 images...
Resizing 38000 images...
Resizing 39000 images...
Resizing 4000

In [11]:
# Flowers dataset: source folder and the destinations for the processed
# training / validation images.
flowers_data_dir = './data/flowers/main/'
flowers_save_train, flowers_save_test = (
    './data/flowers/resized_flowers/train/data/',
    './data/flowers/resized_flowers/val/data/',
)

# Side length (pixels) of the center crop, and of the final resized image.
crop_size, image_size = 108, 64

In [12]:
# Split the Flowers source folder into train/val sets with an 80-20 ratio.
# The seed is fixed (1234), presumably so the split is reproducible.
sf.ratio(
    flowers_data_dir,
    output="./data/flowers/output",
    seed=1234,
    ratio=(.8, .2),
    group_prefix=None,
)

Copying files: 8189 files [00:17, 478.96 files/s]


In [13]:
# Folders under the split output ("./data/flowers/output") that the resize
# steps read the training and validation images from.
train_dir, test_dir = (
    './data/flowers/output/train/jpg/',
    './data/flowers/output/val/jpg/',
)

In [14]:
# Create the destination folder for the resized Flowers training images.
# os.makedirs also creates any missing parent folders (os.mkdir raises
# FileNotFoundError on a nested path whose parents do not exist yet), and
# exist_ok=True keeps re-running this cell harmless.
os.makedirs(flowers_save_train, exist_ok=True)

# os.listdir returns entries in arbitrary order; sorting makes the processing
# order (and any capped subset taken from it) reproducible across runs.
img_list = sorted(os.listdir(train_dir))

In [15]:
# Resize the Flowers training images: each image is center-cropped to
# crop_size x crop_size, scaled to image_size x image_size with bilinear
# filtering, and saved as JPEG under flowers_save_train with its original
# file name.
#
# At most 7500 files are processed (the original cap). Slicing the list,
# instead of indexing a fixed range, avoids an IndexError when the folder
# holds fewer files than the cap.
for i, img_name in enumerate(img_list[:7500]):
    # The context manager guarantees the source file is closed even if
    # cropping raises.
    with Image.open(train_dir + img_name) as src:
        c_x = (src.size[0] - crop_size) // 2
        c_y = (src.size[1] - crop_size) // 2
        img = src.crop([c_x, c_y, c_x + crop_size, c_y + crop_size])
    img = img.resize((image_size, image_size), Image.BILINEAR)
    img.save(flowers_save_train + img_name, 'JPEG')

    # Progress report every 1000 images.
    if i % 1000 == 0:
        print('Resizing %d images...' % i)

Resizing 0 images...
Resizing 1000 images...
Resizing 2000 images...
Resizing 3000 images...
Resizing 4000 images...
Resizing 5000 images...
Resizing 6000 images...
Resizing 7000 images...


In [16]:
# Create the destination folder for the resized Flowers validation images.
# os.makedirs also creates any missing parent folders (os.mkdir raises
# FileNotFoundError on a nested path whose parents do not exist yet), and
# exist_ok=True keeps re-running this cell harmless.
os.makedirs(flowers_save_test, exist_ok=True)

# os.listdir returns entries in arbitrary order; sorting makes the processing
# order (and any capped subset taken from it) reproducible across runs.
img_list = sorted(os.listdir(test_dir))

In [17]:
# Resize the Flowers validation images: each image is center-cropped to
# crop_size x crop_size, scaled to image_size x image_size with bilinear
# filtering, and saved as JPEG under flowers_save_test with its original
# file name.
#
# At most 2900 files are processed (the original cap). Slicing the list,
# instead of indexing a fixed range, avoids an IndexError when the folder
# holds fewer files than the cap.
for i, img_name in enumerate(img_list[:2900]):
    # The context manager guarantees the source file is closed even if
    # cropping raises.
    with Image.open(test_dir + img_name) as src:
        c_x = (src.size[0] - crop_size) // 2
        c_y = (src.size[1] - crop_size) // 2
        img = src.crop([c_x, c_y, c_x + crop_size, c_y + crop_size])
    img = img.resize((image_size, image_size), Image.BILINEAR)
    img.save(flowers_save_test + img_name, 'JPEG')

    # Progress report every 1000 images.
    if i % 1000 == 0:
        print('Resizing %d images...' % i)

Resizing 0 images...
Resizing 1000 images...
Resizing 2000 images...
