<a href="https://colab.research.google.com/github/abuwildanm/food-recognition/blob/master/Create_Custom_Dataset_From_Google_Images.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Rename images & categories

In [95]:
import os
# use this to rename the categories of the folder generated by the chrome extension
list_categories = os.listdir('./dataset_no_split')
list_categories_new = [category.replace('foto makanan ', '').replace(' - Google Penelusuran', '').replace(' ', '-') for category in list_categories]
map_categories = dict(zip(list_categories, list_categories_new))
# Rename categories
for before, after in map_categories.items():
     os.rename('./dataset_no_split/{}'.format(before), './dataset_no_split/{}'.format(after))

In [96]:
from glob import glob

print('Number of images')
# before processing the images, locate the dataset folder for each category to dataset_no_split folder, such as dataset_no_split/mie_aceh
categories = os.listdir('./dataset_no_split')
for cat in categories:
    print('{}: {}'.format(cat, len(glob('dataset_no_split/{}/*'.format(cat)))))

Number of images
rujak-aceh: 207
sate-bandeng: 177
sop-konro: 228


In [97]:
# Remove invalid images
all_image = glob('dataset_no_split/*/*')
for img_path in all_image:
    if os.path.getsize(img_path) == 0 and os.path.exists(img_path):
        os.remove(img_path)

In [98]:
print('Number of valid images')
categories = os.listdir('./dataset_no_split')
for cat in categories:
    print('{}: {}'.format(cat, len(glob('dataset_no_split/{}/*'.format(cat)))))

Number of valid images
rujak-aceh: 207
sate-bandeng: 177
sop-konro: 228


In [99]:
# Rename each image from each category (ex: mie_aceh-1.jpg, mie_aceh-2.jpg, etc for category mie_aceh)
for cat in categories:
    for i, path in enumerate(sorted(glob('dataset_no_split/{}/*'.format(cat))), 1):
        dirname = os.path.dirname(path)
        src = path
        dst = os.path.join(dirname, '{}-{}.jpg'.format(cat, i))
        # Rename images
        os.rename(src, dst)

## Resize Images

In [100]:
import tensorflow as tf

# The path of all image in dataset
image_path = glob('dataset_no_split/*/*') # change with desired path because some of the dataset already resized
# Resize process
image_size = (224, 224)
for path in image_path:
    try:
        # Load the image from path
        image = tf.keras.preprocessing.image.load_img(path)
        # Resize the image
        image = image.resize(image_size)
        # Save the resized image
        image.save(path)
    except:
        # Remove the image if it is not valid
        print('Remove {}'.format(path))
        os.remove(path)

In [101]:
# for each category, add more augmented images so that the number of images in each category is 500
from keras.preprocessing.image import ImageDataGenerator
from glob import glob
import os
import tensorflow as tf
import random
import shutil

# The path of all image in dataset
image_path = glob('dataset_no_split/*/*')
# The path of all category in dataset
categories = os.listdir('./dataset_no_split')
# The number of images in each category
num_images = [len(glob('dataset_no_split/{}/*'.format(cat))) for cat in categories]

num_augmented_images = [500 - num if num < 500 else 0 for num in num_images]
for cat, num in zip(categories, num_augmented_images):
    print('{}: {}'.format(cat, num))

for cat, num in zip(categories, num_augmented_images):
    # create {num} augmented images
    all_image = glob('dataset_no_split/{}/*'.format(cat))
    os.makedirs('dataset_no_split/augmented_{}'.format(cat), exist_ok=True)
    for i in range(num):
        image = random.choice(all_image)
        image = tf.keras.preprocessing.image.load_img(image)
        image = tf.keras.preprocessing.image.img_to_array(image)
        image = image.reshape((1,) + image.shape)
        datagen = ImageDataGenerator(
            rotation_range=25,
            width_shift_range=0.1,
            height_shift_range=0.1,
            horizontal_flip=True,
            brightness_range=[0.5, 1.2],
            fill_mode='nearest')
        for batch in datagen.flow(image, batch_size=1, save_to_dir='dataset_no_split/augmented_{}'.format(cat), save_prefix='augmented', save_format='jpg'):
            break

    for i, path in enumerate(sorted(glob('dataset_no_split/augmented_{}/*'.format(cat))), 1):
        dirname = os.path.dirname(path)
        src = path
        dst = os.path.join(dirname, '{}-{}.jpg'.format(cat, 500 - num + i))
        os.rename(src, dst)
    # move the augmented image to the category folder
    for img in glob('dataset_no_split/augmented_{}/*'.format(cat)):
        shutil.move(img, 'dataset_no_split/{}'.format(cat))
    shutil.rmtree('dataset_no_split/augmented_{}'.format(cat))

rujak-aceh: 293
sate-bandeng: 323
sop-konro: 272


## Create dataset

In [102]:
# Make train and test directory
os.makedirs('dataset/train', exist_ok=True)
os.makedirs('dataset/test', exist_ok=True)

# Make category directory in train and test
for cat in categories:
    os.makedirs('dataset/train/{}'.format(cat), exist_ok=True)
    os.makedirs('dataset/test/{}'.format(cat), exist_ok=True)

In [103]:
import numpy as np
import shutil

# Pick 20 percent of each category images as test set
for cat in categories:
    test_size = int(len(glob('dataset_no_split/{}/*'.format(cat))) * 0.2)
    all_cat_image = glob('dataset_no_split/{}/*'.format(cat))
    np.random.shuffle(all_cat_image)
    for img_path in all_cat_image:
        if len(os.listdir('dataset/test/{}'.format(cat))) < test_size:
            shutil.move(img_path, 'dataset/test/{}'.format(cat))
        else:
            shutil.move(img_path, 'dataset/train/{}'.format(cat))

In [104]:
categories_split = os.listdir('./dataset/train/')
print('Train Images')
for cat in categories_split:
    print('{}: {}'.format(cat, len(glob('dataset/train/{}/*'.format(cat)))))

print('='*20)
print('Test Images')
for cat in categories_split:
    print('{}: {}'.format(cat, len(glob('dataset/test/{}/*'.format(cat)))))

Train Images
bubur-manado: 400
gohu-ikan: 396
papeda: 392
rujak-aceh: 394
rujak-bebek: 394
sate-bandeng: 398
sayur-urap: 398
sop-konro: 400
Test Images
bubur-manado: 99
gohu-ikan: 99
papeda: 98
rujak-aceh: 98
rujak-bebek: 98
sate-bandeng: 99
sayur-urap: 99
sop-konro: 99


In [105]:
# Remove parent category directory
for cat in categories:
    shutil.rmtree('dataset_no_split/{}'.format(cat), ignore_errors=True)

In [106]:
from PIL import Image
import numpy as np

all_image = glob('dataset/*/*/*/*')
counter = 0
for img_path in all_image:
    try:
        img = Image.open(img_path)
        img = np.array(img)
        if img.shape != (224, 224, 3):
            print('Remove {}'.format(img_path))
            os.remove(img_path)
        else:
            counter += 1
    except:
        print('Remove {}'.format(img_path))
        os.remove(img_path)

print('Number of valid images: {}'.format(counter))


Number of valid images: 12851
