<a href="https://colab.research.google.com/github/abuwildanm/food-recognition/blob/master/Create_Custom_Dataset_From_Google_Images.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Rename images & categories

In [2]:
import os
def _clean_category_name(raw_name):
    """Return the folder name without the 'foto makanan ' prefix and
    ' - Google Search' suffix, with remaining spaces turned into hyphens."""
    trimmed = raw_name.replace('foto makanan ', '')
    trimmed = trimmed.replace(' - Google Search', '')
    return trimmed.replace(' ', '-')


# Folders generated by the chrome extension carry the search query in their
# name; map every current folder name to its cleaned-up category name.
list_categories = os.listdir('./dataset_no_split')
list_categories_new = [_clean_category_name(name) for name in list_categories]
map_categories = dict(zip(list_categories, list_categories_new))

# Apply the old-name -> new-name mapping on disk
for before in map_categories:
    after = map_categories[before]
    os.rename('./dataset_no_split/{}'.format(before), './dataset_no_split/{}'.format(after))

In [3]:
from glob import glob

# Prerequisite: each category's image folder must already sit inside
# dataset_no_split/ (for example dataset_no_split/mie_aceh).
print('Number of images')
categories = os.listdir('./dataset_no_split')
for cat in categories:
    files_in_category = glob('dataset_no_split/{}/*'.format(cat))
    print('{}: {}'.format(cat, len(files_in_category)))

Number of images
bakso: 440
batagor: 399
bebek-betutu: 324
bubur-manado: 229
gado-gado: 374
gohu-ikan: 102
gudeg: 290
gulai-ikan: 479
kerak-telor: 241
kolak: 461
mie-aceh: 348
nasi-goreng: 518
nasi-uduk: 418
papeda: 114
pempek: 405
pepes-ikan: 186
perkedel-kentang: 473
rawon: 335
rendang: 331
rujak-aceh: 188
rujak-bebek: 124
sate: 509
sate-bandeng: 153
sayur-urap: 342
semur-jengkol: 344
sop-konro: 212
soto: 499
tahu-gejrot: 395
telur-balado: 186
tempe-bacem: 181
tempe-goreng: 368


In [4]:
def remove_empty_files(paths):
    """Delete every zero-byte regular file in ``paths``.

    Args:
        paths: iterable of file-system paths to inspect.

    Returns:
        list of the paths that were deleted, in input order.

    Paths that no longer exist and paths that are not regular files
    (e.g. directories) are skipped instead of raising.
    """
    removed = []
    for path in paths:
        # isfile() must come first: getsize() raises on a missing path, so
        # checking existence after it (as before) could never help. It also
        # keeps directories away from os.remove().
        if os.path.isfile(path) and os.path.getsize(path) == 0:
            os.remove(path)
            removed.append(path)
    return removed


# Remove invalid images (downloads that ended up as empty files)
all_image = glob('dataset_no_split/*/*')
remove_empty_files(all_image)

In [5]:
# Same per-category count as before, taken again after the empty files were removed
print('Number of valid images')
categories = os.listdir('./dataset_no_split')
for cat in categories:
    pattern = 'dataset_no_split/{}/*'.format(cat)
    remaining = len(glob(pattern))
    print('{}: {}'.format(cat, remaining))

Number of valid images
bakso: 440
batagor: 399
bebek-betutu: 324
bubur-manado: 229
gado-gado: 374
gohu-ikan: 102
gudeg: 290
gulai-ikan: 479
kerak-telor: 241
kolak: 461
mie-aceh: 348
nasi-goreng: 518
nasi-uduk: 418
papeda: 114
pempek: 405
pepes-ikan: 186
perkedel-kentang: 473
rawon: 335
rendang: 331
rujak-aceh: 188
rujak-bebek: 124
sate: 509
sate-bandeng: 153
sayur-urap: 342
semur-jengkol: 344
sop-konro: 212
soto: 499
tahu-gejrot: 395
telur-balado: 186
tempe-bacem: 181
tempe-goreng: 368


In [7]:
import uuid

# Rename each image from each category (ex: mie_aceh-1.jpg, mie_aceh-2.jpg, etc for category mie_aceh)
#
# The rename is done in two passes. Renaming straight to '<cat>-<i>.jpg' is
# unsafe when such names already exist (e.g. the cell is run a second time):
# the listing is sorted lexicographically, so '<cat>-10.jpg' would be renamed
# to '<cat>-2.jpg' while the original '<cat>-2.jpg' is still waiting, and on
# POSIX os.rename() silently replaces the destination.
for cat in categories:
    sources = sorted(glob('dataset_no_split/{}/*'.format(cat)))
    # Unique per run so temporary names cannot clash with anything on disk
    token = uuid.uuid4().hex

    # Pass 1: park every file under a temporary name
    staged = []
    for i, src in enumerate(sources, 1):
        dirname = os.path.dirname(src)
        tmp = os.path.join(dirname, 'renaming-{}-{}.tmp'.format(token, i))
        dst = os.path.join(dirname, '{}-{}.jpg'.format(cat, i))
        os.rename(src, tmp)
        staged.append((tmp, dst))

    # Pass 2: no final name is occupied any more, so this cannot overwrite
    for tmp, dst in staged:
        os.rename(tmp, dst)

## Resize Images

In [8]:
import tensorflow as tf

# The path of all image in dataset
image_path = glob('dataset_no_split/*/*')  # narrow this pattern if part of the dataset is already resized
# Resize process: every image is overwritten in place with a 224x224 version
image_size = (224, 224)
for path in image_path:
    try:
        # Load the image from path
        image = tf.keras.preprocessing.image.load_img(path)
        # Resize the image
        image = image.resize(image_size)
        # Save the resized image
        image.save(path)
    except (OSError, ValueError):
        # Only unreadable / unwritable image files are deleted. A bare
        # `except:` would also delete the current image on KeyboardInterrupt,
        # and every image if the imaging backend failed to import.
        print('Remove {}'.format(path))
        os.remove(path)

In [10]:
from keras.preprocessing.image import ImageDataGenerator
from glob import glob
import os
import tensorflow as tf
import random
import shutil

# Every category is topped up to this many images with augmented copies
TARGET_PER_CATEGORY = 500

# The path of all image in dataset
image_path = glob('dataset_no_split/*/*')
# The path of all category in dataset
categories = os.listdir('./dataset_no_split')
# The number of images in each category
num_images = [len(glob('dataset_no_split/{}/*'.format(cat))) for cat in categories]

# Number of augmented images each category still needs (0 when already at or above the target)
num_augmented_images = [max(TARGET_PER_CATEGORY - num, 0) for num in num_images]
for cat, num in zip(categories, num_augmented_images):
    print('{}: {}'.format(cat, num))

# The generator configuration never changes, so build it once
datagen = ImageDataGenerator(
    rotation_range=25,
    width_shift_range=0.1,
    height_shift_range=0.1,
    horizontal_flip=True,
    brightness_range=[0.5, 1.2],
    fill_mode='nearest')

for cat, num in zip(categories, num_augmented_images):
    if num == 0:
        # Nothing to generate; avoids creating and deleting an empty folder
        continue
    all_image = glob('dataset_no_split/{}/*'.format(cat))
    if not all_image:
        # random.choice() would raise on an empty category
        print('Skip {}: no source images to augment'.format(cat))
        continue

    # create {num} augmented images in a temporary folder
    augmented_dir = 'dataset_no_split/augmented_{}'.format(cat)
    os.makedirs(augmented_dir, exist_ok=True)
    for i in range(num):
        image = random.choice(all_image)
        image = tf.keras.preprocessing.image.load_img(image)
        image = tf.keras.preprocessing.image.img_to_array(image)
        # flow() expects a batch, so add a leading batch axis
        image = image.reshape((1,) + image.shape)
        # flow() builds the file name from save_prefix, the sample index (always 0
        # for a one-image batch) and a small random number. With one shared prefix
        # two iterations can produce the same name, and the later file overwrites
        # the earlier one, leaving the category short of the target. A prefix that
        # is unique per iteration makes every file name distinct.
        for batch in datagen.flow(image, batch_size=1, save_to_dir=augmented_dir,
                                  save_prefix='augmented-{}'.format(i), save_format='jpg'):
            break

    # Continue the '<cat>-<n>.jpg' numbering after the images the category already has
    for i, path in enumerate(sorted(glob('{}/*'.format(augmented_dir))), 1):
        dirname = os.path.dirname(path)
        src = path
        dst = os.path.join(dirname, '{}-{}.jpg'.format(cat, TARGET_PER_CATEGORY - num + i))
        os.rename(src, dst)
    # move the augmented image to the category folder
    for img in glob('{}/*'.format(augmented_dir)):
        shutil.move(img, 'dataset_no_split/{}'.format(cat))
    shutil.rmtree(augmented_dir)

bakso: 0
batagor: 1
bebek-betutu: 2
bubur-manado: 2
gado-gado: 0
gohu-ikan: 9
gudeg: 1
gulai-ikan: 0
kerak-telor: 2
kolak: 0
mie-aceh: 1
nasi-goreng: 0
nasi-uduk: 0
papeda: 6
pempek: 0
pepes-ikan: 7
perkedel-kentang: 0
rawon: 3
rendang: 0
rujak-aceh: 4
rujak-bebek: 4
sate: 0
sate-bandeng: 8
sayur-urap: 3
semur-jengkol: 0
sop-konro: 2
soto: 0
tahu-gejrot: 0
telur-balado: 6
tempe-bacem: 4
tempe-goreng: 1


## Create dataset

In [11]:
# Create the two split roots first, so they exist even when there are no categories
for split_root in ('dataset/train', 'dataset/test'):
    os.makedirs(split_root, exist_ok=True)

# Then one sub-folder per category inside each split
category_dir_templates = ('dataset/train/{}', 'dataset/test/{}')
for cat in categories:
    for template in category_dir_templates:
        os.makedirs(template.format(cat), exist_ok=True)

In [12]:
import numpy as np
import shutil

# Pick 20 percent of each category images as test set
TEST_FRACTION = 0.2
# Fixed seed + sorted listing make the split reproducible across runs
SPLIT_SEED = 42
split_rng = np.random.RandomState(SPLIT_SEED)

# NOTE(review): the augmentation cell runs before this split, so an augmented
# copy and the image it was derived from can end up on different sides of the
# train/test boundary. Confirm whether that leakage is acceptable.
for cat in categories:
    test_dir = 'dataset/test/{}'.format(cat)
    train_dir = 'dataset/train/{}'.format(cat)

    # glob() order depends on the file system; sort before shuffling
    all_cat_image = sorted(glob('dataset_no_split/{}/*'.format(cat)))
    test_size = int(len(all_cat_image) * TEST_FRACTION)
    split_rng.shuffle(all_cat_image)

    # Files already in the test folder count towards test_size. Count them
    # once instead of re-listing the folder for every image.
    num_to_test = max(test_size - len(os.listdir(test_dir)), 0)
    for img_path in all_cat_image[:num_to_test]:
        shutil.move(img_path, test_dir)
    for img_path in all_cat_image[num_to_test:]:
        shutil.move(img_path, train_dir)

In [13]:
# Report how many files each category has in the train and in the test split.
# The category list is read from the train folder and reused for the test report.
categories_split = os.listdir('./dataset/train/')

print('Train Images')
for cat in categories_split:
    train_files = glob('dataset/train/{}/*'.format(cat))
    print('{}: {}'.format(cat, len(train_files)))

print('='*20)

print('Test Images')
for cat in categories_split:
    test_files = glob('dataset/test/{}/*'.format(cat))
    print('{}: {}'.format(cat, len(test_files)))

Train Images
bakso: 400
batagor: 400
bebek-betutu: 400
bubur-manado: 400
gado-gado: 400
gohu-ikan: 400
gudeg: 400
gulai-ikan: 400
kerak-telor: 400
kolak: 400
mie-aceh: 400
nasi-goreng: 415
nasi-uduk: 400
papeda: 400
pempek: 400
pepes-ikan: 400
perkedel-kentang: 400
rawon: 400
rendang: 400
rujak-aceh: 400
rujak-bebek: 400
sate: 408
sate-bandeng: 400
sayur-urap: 400
semur-jengkol: 400
sop-konro: 400
soto: 400
tahu-gejrot: 400
telur-balado: 400
tempe-bacem: 400
tempe-goreng: 400
Test Images
bakso: 100
batagor: 100
bebek-betutu: 100
bubur-manado: 100
gado-gado: 100
gohu-ikan: 100
gudeg: 100
gulai-ikan: 100
kerak-telor: 100
kolak: 100
mie-aceh: 100
nasi-goreng: 103
nasi-uduk: 100
papeda: 100
pempek: 100
pepes-ikan: 100
perkedel-kentang: 100
rawon: 100
rendang: 100
rujak-aceh: 100
rujak-bebek: 100
sate: 101
sate-bandeng: 100
sayur-urap: 100
semur-jengkol: 100
sop-konro: 100
soto: 100
tahu-gejrot: 100
telur-balado: 100
tempe-bacem: 100
tempe-goreng: 100


In [13]:
# Delete the per-category source folders under dataset_no_split/.
# ignore_errors=True means a missing or undeletable folder is skipped silently.
leftover_dirs = ['dataset_no_split/{}'.format(cat) for cat in categories]
for leftover_dir in leftover_dirs:
    shutil.rmtree(leftover_dir, ignore_errors=True)

In [15]:
from PIL import Image
import numpy as np

# Final sanity check: keep only files that decode to a 224x224 RGB array.
EXPECTED_SHAPE = (224, 224, 3)

# The cells above create dataset/<split>/<category>/<file>, which is three
# levels below dataset/. The original four-level pattern matches nothing in
# that layout; it is kept as well in case the data sits one level deeper.
# isfile() keeps directories matched by the shallower pattern out of the loop.
candidates = glob('dataset/*/*/*') + glob('dataset/*/*/*/*')
all_image = [path for path in candidates if os.path.isfile(path)]

counter = 0
for img_path in all_image:
    try:
        # The context manager closes the file before a possible os.remove();
        # an open handle can block deletion on some platforms.
        with Image.open(img_path) as img:
            is_valid = np.array(img).shape == EXPECTED_SHAPE
    except (OSError, ValueError):
        # Unreadable or corrupt image. Other exceptions (e.g. KeyboardInterrupt)
        # propagate instead of deleting the file.
        is_valid = False

    if is_valid:
        counter += 1
    else:
        print('Remove {}'.format(img_path))
        os.remove(img_path)

print('Number of valid images: {}'.format(counter))


Number of valid images: 21479
