<a href="https://colab.research.google.com/github/abuwildanm/food-recognition/blob/master/Create_Custom_Dataset_From_Google_Images.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Rename images & categories

In [14]:
import os
# use this to rename the categories of the folder generated by the chrome extension
list_categories = os.listdir('./dataset_no_split')
list_categories_new = [category.replace('foto makanan ', '').replace(' - Google Penelusuran', '').replace(' ', '-') for category in list_categories]
map_categories = dict(zip(list_categories, list_categories_new))
# Rename categories
for before, after in map_categories.items():
     os.rename('./dataset_no_split/{}'.format(before), './dataset_no_split/{}'.format(after))

In [15]:
from glob import glob

print('Number of images')
# before processing the images, locate the dataset folder for each category to dataset_no_split folder, such as dataset_no_split/mie_aceh
categories = os.listdir('./dataset_no_split')
for cat in categories:
    print('{}: {}'.format(cat, len(glob('dataset_no_split/{}/*'.format(cat)))))

Number of images
.gitkeep: 0
perkedel-kentang: 476
tahu-gejrot: 399


In [16]:
# Remove invalid images
all_image = glob('dataset_no_split/*/*')
for img_path in all_image:
    if os.path.getsize(img_path) == 0 and os.path.exists(img_path):
        os.remove(img_path)

In [17]:
print('Number of valid images')
categories = os.listdir('./dataset_no_split')
for cat in categories:
    print('{}: {}'.format(cat, len(glob('dataset_no_split/{}/*'.format(cat)))))

Number of valid images
.gitkeep: 0
perkedel-kentang: 476
tahu-gejrot: 399


In [20]:
# Rename each image from each category (ex: mie_aceh-1.jpg, mie_aceh-2.jpg, etc for category mie_aceh)
for cat in categories:
    for i, path in enumerate(sorted(glob('dataset_no_split/{}/*'.format(cat))), 1):
        dirname = os.path.dirname(path)
        src = path
        dst = os.path.join(dirname, '{}-{}.jpg'.format(cat, i))
        # Rename images
        os.rename(src, dst)

## Resize Images

In [22]:
import tensorflow as tf

# The path of all image in dataset
image_path = glob('dataset_no_split/*/*') # change with desired path because some of the dataset already resized
# Resize process
image_size = (224, 224)
for path in image_path:
    try:
        # Load the image from path
        image = tf.keras.preprocessing.image.load_img(path)
        # Resize the image
        image = image.resize(image_size)
        # Save the resized image
        image.save(path)
    except:
        # Remove the image if it is not valid
        print('Remove {}'.format(path))
        os.remove(path)

Remove dataset_no_split\tahu-gejrot\tahu-gejrot-83.jpg


## Create dataset

In [23]:
# Make train and test directory
os.makedirs('dataset/train', exist_ok=True)
os.makedirs('dataset/test', exist_ok=True)

# Make category directory in train and test
for cat in categories:
    os.makedirs('dataset/train/{}'.format(cat), exist_ok=True)
    os.makedirs('dataset/test/{}'.format(cat), exist_ok=True)

In [24]:
import numpy as np
import shutil

# Pick 20 percent of each category images as test set
for cat in categories:
    test_size = int(len(glob('dataset_no_split/{}/*'.format(cat))) * 0.2)
    all_cat_image = glob('dataset_no_split/{}/*'.format(cat))
    np.random.shuffle(all_cat_image)
    for img_path in sorted(all_cat_image):
        if len(os.listdir('dataset/test/{}'.format(cat))) <= test_size:
            shutil.move(img_path, 'dataset/test/{}'.format(cat))
        else:
            shutil.move(img_path, 'dataset/train/{}'.format(cat))

In [25]:
categories_split = os.listdir('./dataset/train/')
print('Train Images')
for cat in categories_split:
    print('{}: {}'.format(cat, len(glob('dataset/train/{}/*'.format(cat)))))

print('='*20)
print('Test Images')
for cat in categories_split:
    print('{}: {}'.format(cat, len(glob('dataset/test/{}/*'.format(cat)))))

Train Images
.gitkeep: 0
bakso: 315
bala-bala: 272
batagor: 321
bebek-betutu: 238
bika-ambon: 320
dadar-gulung: 151
gado-gado: 263
gehu: 209
gudeg: 213
gulai-ikan: 384
kerak-telor: 192
kolak: 369
kue-cubit: 151
mie-aceh: 285
nasi-goreng: 366
nasi-uduk: 344
otak-otak: 315
pempek: 286
pepes-ikan: 148
perkedel-kentang: 380
pisang-goreng: 316
putu-ayu: 151
rawon: 235
rendang: 237
sate: 355
semur-jengkol: 275
soto: 358
tahu-gejrot: 318
telur-balado: 148
tempe-bacem: 145
tempe-goreng: 303
Test Images
.gitkeep: 0
bakso: 130
bala-bala: 69
batagor: 81
bebek-betutu: 97
bika-ambon: 77
dadar-gulung: 38
gado-gado: 113
gehu: 53
gudeg: 82
gulai-ikan: 97
kerak-telor: 49
kolak: 93
kue-cubit: 38
mie-aceh: 65
nasi-goreng: 155
nasi-uduk: 87
otak-otak: 80
pempek: 123
pepes-ikan: 38
perkedel-kentang: 96
pisang-goreng: 79
putu-ayu: 39
rawon: 104
rendang: 94
sate: 154
semur-jengkol: 69
soto: 141
tahu-gejrot: 80
telur-balado: 38
tempe-bacem: 37
tempe-goreng: 77


In [13]:
# Remove parent category directory
for cat in categories:
    shutil.rmtree('dataset_no_split/{}'.format(cat), ignore_errors=True)