# Preparation of the Lemon Quality Dataset

Download the dataset from [https://github.com/robotduinom/lemon_dataset](https://github.com/robotduinom/lemon_dataset)

In [1]:
from pathlib import Path
import requests
from urllib.request import urlopen
from io import BytesIO
from zipfile import ZipFile
#import rarfile
from unrar import rarfile
import splitfolders

import numpy as np
from tensorflow import keras
import tensorflow as tf

In [10]:
def download_and_unzip(url, extract_to):
    """Download a ZIP archive and extract all of its members.

    The whole archive is read into memory before extraction, so this is only
    suitable for archives that fit into RAM.

    Args:
        url (str): Location of the ZIP archive; any URL scheme that
            ``urllib.request.urlopen`` can open.
        extract_to (str or Path): Directory the archive members are extracted into.

    Returns:
        None
    """
    # Context managers close the connection and the archive handle even when
    # reading or extracting fails part-way through.
    with urlopen(url) as http_response:
        payload = BytesIO(http_response.read())
    with ZipFile(payload) as archive:
        archive.extractall(path=extract_to)

In [2]:
# Source archive of the lemon dataset repository and the local folder it is unpacked into.
url = "https://github.com/robotduinom/lemon_dataset/archive/refs/heads/main.zip"
extract_folder = Path.cwd() / "datasets"

In [23]:
# Fetch the repository archive and unpack it below extract_folder.
download_and_unzip(url=url, extract_to=extract_folder)

In [3]:
# Folder holding the unpacked images; the cell displays whether it exists.
input_folder = extract_folder / "lemon_dataset-main" / "docs" / "data"
input_folder.exists()

True

In [25]:
# RAR archive shipped inside the repository; the cell displays whether it exists.
rar_path = extract_folder / "lemon_dataset-main" / "docs" / "data.rar"
rar_path.exists()

True

In [26]:
# Destination folder for the contents of the RAR archive.
rar_extract_path = extract_folder / "lemon_dataset-main" / "docs"

In [27]:

# Open the RAR archive; the Path is converted to str before it is passed to RarFile.
rar = rarfile.RarFile(str(rar_path))

In [28]:
# Extract the archive contents into rar_extract_path (passed as str).
rar.extractall(str(rar_extract_path))

# Split the dataset into training, validation and test data

In [5]:
# Fixed seed for the train/val/test split.
data_split_seed = 42
# Destination of the split dataset (alternative target used previously: "lemon_dataset").
output_folder = Path.cwd() / "datasets" / "lemon_dataset_binary"
output_folder

WindowsPath('i:/tinyml/tiny_cnn/datasets/lemon_dataset_binary')

In [6]:
# 80% training / 10% validation / 10% test; files are copied, not moved (move=False).
splitfolders.ratio(
    input_folder,
    output=output_folder,
    seed=data_split_seed,
    ratio=(0.8, 0.1, 0.1),
    group_prefix=None,
    move=False,
)

Copying files: 2076 files [00:29, 71.53 files/s]


# Creating TensorFlow Datasets

In [34]:
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential

In [43]:
# Loader settings shared by all dataset splits.
shuffle_seed = 42
batch_size = 32
img_height = img_width = 92

In [44]:
# NOTE(review): these paths point at datasets/lemon_dataset, whereas output_folder
# (the target of the split cell) is datasets/lemon_dataset_binary -- confirm which is intended.
lemon_root = Path.cwd() / "datasets" / "lemon_dataset"
train_dir = lemon_root / "train"
val_dir = lemon_root / "val"
test_dir = lemon_root / "test"

# The three splits are loaded with identical settings.
loader_options = dict(
    subset=None,
    seed=shuffle_seed,
    image_size=(img_height, img_width),
    batch_size=batch_size,
)

train_ds = tf.keras.utils.image_dataset_from_directory(train_dir, **loader_options)

val_ds = tf.keras.utils.image_dataset_from_directory(val_dir, **loader_options)

test_ds = tf.keras.utils.image_dataset_from_directory(test_dir, **loader_options)

Found 2021 files belonging to 3 classes.
Found 252 files belonging to 3 classes.
Found 255 files belonging to 3 classes.


In [76]:
# Module-level settings read by get_lemon_quality_dataset at call time.
shuffle_seed = 42
batch_size = 32
img_height = img_width = 92

def get_lemon_quality_dataset(dataset_path, normalize=True):
    """Load the train, validation and test splits of the lemon quality dataset.

    Every split is read with ``tf.keras.utils.image_dataset_from_directory``
    using the module-level settings ``img_height``, ``img_width``,
    ``batch_size`` and ``shuffle_seed``. Progress messages, the class names,
    the element spec of the training split and the ``normalize`` flag are printed.

    Args:
        dataset_path (Path or str): Root folder of the dataset. The subfolders
            "train", "val" and "test" are expected.
        normalize (bool): If True, the image data of all three splits is
            multiplied by 1/255 (range [0, 1] for 8-bit images). Default: True

    Returns:
        tuple: ``(train_ds, val_ds, test_ds, class_names)`` -- the three
        TensorFlow datasets and the class names reported by the training split.

    Raises:
        FileNotFoundError: If ``dataset_path`` or one of its "train", "val" or
            "test" subfolders is not an existing directory.
    """
    dataset_path = Path(dataset_path)
    train_dir = dataset_path.joinpath("train")
    val_dir = dataset_path.joinpath("val")
    test_dir = dataset_path.joinpath("test")

    # Fail early with a clear message instead of hitting an unbound variable
    # or a loader error further down.
    missing = [
        str(folder)
        for folder in (dataset_path, train_dir, val_dir, test_dir)
        if not folder.is_dir()
    ]
    if missing:
        raise FileNotFoundError(
            f"Please check the folder structure of {dataset_path}. "
            f"Missing folder(s): {', '.join(missing)}"
        )

    def _load_split(split_dir):
        # Same loader settings for every split.
        return tf.keras.utils.image_dataset_from_directory(
            split_dir,
            subset=None,
            seed=shuffle_seed,
            image_size=(img_height, img_width),
            batch_size=batch_size)

    print("Preparing training dataset...")
    train_ds = _load_split(train_dir)

    # Read the class names before mapping: the mapped dataset is a different
    # object than the one returned by the loader.
    class_names = train_ds.class_names

    print("Preparing validation dataset...")
    val_ds = _load_split(val_dir)

    print("Preparing test dataset...")
    test_ds = _load_split(test_dir)

    # Normalize the data to the range [0, 1]
    if normalize:
        normalization_layer = tf.keras.layers.Rescaling(1./255)

        train_ds = train_ds.map(lambda x, y: (normalization_layer(x), y))
        val_ds = val_ds.map(lambda x, y: (normalization_layer(x), y))
        test_ds = test_ds.map(lambda x, y: (normalization_layer(x), y))

    print (f"Class names: {class_names}")
    print(train_ds.element_spec)
    print(f"Normalize: {normalize}")
    return (train_ds, val_ds, test_ds, class_names)

In [77]:
# Load all three splits plus the class names from the split output folder
# (normalize is left at its default).
(train_ds, val_ds, test_ds, class_names) = get_lemon_quality_dataset(
    dataset_path=output_folder
)

Preparing training dataset...
Found 2021 files belonging to 3 classes.
Preparing validation dataset...
Found 252 files belonging to 3 classes.
Preparing test dataset...
Found 255 files belonging to 3 classes.
Class names: ['bad_quality', 'empty_background', 'good_quality']
(TensorSpec(shape=(None, 92, 92, 3), dtype=tf.float32, name=None), TensorSpec(shape=(None,), dtype=tf.int32, name=None))
Normalize: True


In [69]:
# Layer that multiplies its input by 1/255.
normalization_layer = tf.keras.layers.Rescaling(scale=1.0 / 255)

In [71]:
# NOTE(review): when the notebook is run top to bottom, train_ds comes from
# get_lemon_quality_dataset(output_folder), which already rescales by 1/255
# (normalize defaults to True) -- this would rescale a second time; confirm.
def _rescale_batch(images, labels):
    """Apply normalization_layer to the images and pass the labels through."""
    return normalization_layer(images), labels


train_ds_normalized = train_ds.map(_rescale_batch)

# Split the Visual Wake Words dataset

In [8]:
# Visual Wake Words archive and its local extraction folder.
# NOTE(review): the folder is named "vvw" (not "vww"), and this rebinds the url and
# extract_folder names used by the lemon dataset cells -- confirm both are intended.
url = "https://www.silabs.com/public/files/github/machine_learning/benchmarks/datasets/vw_coco2014_96.tar.gz"
extract_folder = Path.cwd() / "datasets" / "vvw"

In [12]:
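# NOTE: download_and_unzip reads the payload with ZipFile, while this URL points to a .tar.gz archive.
#download_and_unzip(url,extract_folder)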

In [16]:
# Folder with the unpacked Visual Wake Words images; the cell displays whether it is a directory.
input_folder_vww = Path.cwd() / "datasets" / "vw_coco2014_96"
input_folder_vww.is_dir()

True

In [17]:
# Destination of the split Visual Wake Words dataset; the cell displays whether it already exists.
output_folder_vww = Path.cwd() / "datasets" / "visual_wake_words"
output_folder_vww.is_dir()

False

In [None]:
#input_folder_lemon_binary = Path.cwd().joinpath("datasets", "lemon_dataset_binary")

In [18]:
# Splitting the data into 80% training data, 10% validation data and 10% test data
splitfolders.ratio(input_folder_vww, output=output_folder_vww,
    seed=data_split_seed, ratio=(.8, .1, .1), group_prefix=None, move=False) 

Copying files: 109619 files [02:07, 858.08 files/s] 
