<a href="https://colab.research.google.com/github/sripraks/CNN/blob/main/tinyImageNet-200/Build_tiny_imagenet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# import the necessary packages
from os import path

# locations of the raw Tiny ImageNet training and validation images
TRAIN_IMAGES = "../datasets/tiny-imagenet-200/tiny-imagenet-200/train"
VAL_IMAGES = "../datasets/tiny-imagenet-200/tiny-imagenet-200/val/images"

# annotation file pairing each validation filename with its class label
VAL_MAPPINGS = "../datasets/tiny-imagenet-200/tiny-imagenet-200/val/val_annotations.txt"

# WordNet hierarchy files from which the class labels are generated
WORDNET_IDS = "../datasets/tiny-imagenet-200/tiny-imagenet-200/wnids.txt"
WORD_LABELS = "../datasets/tiny-imagenet-200/tiny-imagenet-200/words.txt"

# the testing data is not available to us, so a slice of the training
# data (50 images for each of the classes) is held out in its place
NUM_CLASSES = 200
NUM_TEST_IMAGES = NUM_CLASSES * 50

# destinations of the serialized training, validation, and testing
# HDF5 datasets
TRAIN_HDF5 = "../datasets/tiny-imagenet-200/hdf5/train.hdf5"
VAL_HDF5 = "../datasets/tiny-imagenet-200/hdf5/val.hdf5"
TEST_HDF5 = "../datasets/tiny-imagenet-200/hdf5/test.hdf5"

# JSON file holding the dataset mean
DATASET_MEAN = "output/tiny-image-net-200-mean.json"

# directory receiving plots, classification reports, etc., followed
# by the individual artifact paths that live underneath it
OUTPUT_PATH = "output"
MODEL_PATH = path.join(OUTPUT_PATH, "checkpoints", "epoch_70.hdf5")
FIG_PATH = path.join(OUTPUT_PATH, "deepergooglenet_tinyimagenet.png")
JSON_PATH = path.join(OUTPUT_PATH, "deepergooglenet_tinyimagenet.json")

In [None]:
# import the necessary packages
import h5py
import os

class HDF5DatasetWriter:
    """Buffered writer that stores rows and integer labels in an HDF5 file.

    Rows and labels are accumulated in an in-memory buffer and written to
    the underlying datasets in batches. The writer can be used as a
    context manager, in which case `close()` is called on exit so that
    the remaining buffer is flushed and the file is closed:

        with HDF5DatasetWriter(dims, outputPath) as writer:
            writer.add(rows, labels)
    """

    def __init__(self, dims, outputPath, dataKey="images",
                 bufSize=1000):
        """Create the HDF5 file and its datasets.

        Args:
            dims: shape of the data dataset; `dims[0]` is also used as
                the length of the "labels" dataset.
            outputPath: path of the HDF5 file to create.
            dataKey: name of the dataset that holds the rows.
            bufSize: number of buffered rows that triggers a flush.

        Raises:
            ValueError: if `outputPath` already exists.
        """
        # refuse to overwrite an existing file
        if os.path.exists(outputPath):
            raise ValueError("The supplied `outputPath` already "
                "exists and cannot be overwritten. Manually delete "
                "the file before continuing.", outputPath)

        # open the HDF5 database for writing and create two datasets:
        # one to store the images/features and another to store the
        # class labels
        self.db = h5py.File(outputPath, "w")
        self.data = self.db.create_dataset(dataKey, dims,
            dtype="float")
        self.labels = self.db.create_dataset("labels", (dims[0],),
            dtype="int")

        # store the buffer size, then initialize the buffer itself
        # along with the index of the next free row in the datasets
        self.bufSize = bufSize
        self.buffer = {"data": [], "labels": []}
        self.idx = 0

    def __enter__(self):
        """Return the writer itself so it can be bound by `with ... as`."""
        return self

    def __exit__(self, excType, excValue, traceback):
        """Flush and close the file; exceptions are never suppressed."""
        self.close()
        return False

    def add(self, rows, labels):
        """Append `rows` and `labels` to the buffer, flushing when full."""
        self.buffer["data"].extend(rows)
        self.buffer["labels"].extend(labels)

        # write to disk once the buffer has reached its size limit
        if len(self.buffer["data"]) >= self.bufSize:
            self.flush()

    def flush(self):
        """Write the buffered rows/labels to disk and reset the buffer."""
        # nothing buffered: skip the zero-length write entirely
        if not self.buffer["data"]:
            return

        end = self.idx + len(self.buffer["data"])
        self.data[self.idx:end] = self.buffer["data"]
        self.labels[self.idx:end] = self.buffer["labels"]
        self.idx = end
        self.buffer = {"data": [], "labels": []}

    def storeClassLabels(self, classLabels):
        """Store the class label names in a "label_names" dataset."""
        dt = h5py.special_dtype(vlen=str) # `vlen=unicode` for Py2.7
        labelSet = self.db.create_dataset("label_names",
            (len(classLabels),), dtype=dt)
        labelSet[:] = classLabels

    def close(self):
        """Flush any remaining buffered entries, then close the file."""
        try:
            if len(self.buffer["data"]) > 0:
                self.flush()
        finally:
            # close the file even when the final flush fails so the
            # handle is not leaked
            self.db.close()

In [None]:
# import the necessary packages
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from imutils import paths
import numpy as np
import progressbar
import json
import cv2
import os

In [None]:
# collect every training image path; the class name of an image is the
# third path component counted from the end, and those names are
# integer-encoded with a label encoder that is kept for later reuse
trainPaths = list(paths.list_images(TRAIN_IMAGES))
le = LabelEncoder()
trainLabels = le.fit_transform(
	[imagePath.split(os.path.sep)[-3] for imagePath in trainPaths])

In [None]:
# carve a testing set out of the training set using stratified
# sampling with a fixed seed
split = train_test_split(
	trainPaths,
	trainLabels,
	test_size=NUM_TEST_IMAGES,
	stratify=trainLabels,
	random_state=42,
)
trainPaths, testPaths, trainLabels, testLabels = split

In [None]:
trainPaths


['../datasets/tiny-imagenet-200/tiny-imagenet-200/train\\n04146614\\images\\n04146614_173.JPEG',
 '../datasets/tiny-imagenet-200/tiny-imagenet-200/train\\n02233338\\images\\n02233338_413.JPEG',
 '../datasets/tiny-imagenet-200/tiny-imagenet-200/train\\n03854065\\images\\n03854065_340.JPEG',
 '../datasets/tiny-imagenet-200/tiny-imagenet-200/train\\n07734744\\images\\n07734744_261.JPEG',
 '../datasets/tiny-imagenet-200/tiny-imagenet-200/train\\n02281406\\images\\n02281406_439.JPEG',
 '../datasets/tiny-imagenet-200/tiny-imagenet-200/train\\n09256479\\images\\n09256479_210.JPEG',
 '../datasets/tiny-imagenet-200/tiny-imagenet-200/train\\n01698640\\images\\n01698640_39.JPEG',
 '../datasets/tiny-imagenet-200/tiny-imagenet-200/train\\n02815834\\images\\n02815834_438.JPEG',
 '../datasets/tiny-imagenet-200/tiny-imagenet-200/train\\n04179913\\images\\n04179913_202.JPEG',
 '../datasets/tiny-imagenet-200/tiny-imagenet-200/train\\n07720875\\images\\n07720875_389.JPEG',
 '../datasets/tiny-imagenet-200

In [None]:
# load the validation filename => class mappings from file and use them
# to build the validation paths and label lists; each line is
# tab-separated and only its first two columns (filename, class) are kept
with open(VAL_MAPPINGS) as mappingFile:
	# the context manager closes the handle even if reading fails
	M = mappingFile.read().strip().split("\n")
M = [r.split("\t")[:2] for r in M]
valPaths = [os.path.sep.join([VAL_IMAGES, m[0]]) for m in M]
# reuse the encoder fitted on the training labels
valLabels = le.transform([m[1] for m in M])

In [None]:
# one (name, image paths, labels, output HDF5 file) tuple for each of
# the training, validation, and testing splits
datasets = [
	("train", trainPaths, trainLabels, TRAIN_HDF5),
	("val", valPaths, valLabels, VAL_HDF5),
	("test", testPaths, testLabels, TEST_HDF5),
]

In [None]:
# start with three independent, empty lists that will collect the
# per-image averages of the red, green, and blue channels
R, G, B = [], [], []

In [None]:
# loop over the dataset tuples; the loop variables are deliberately NOT
# named `paths`/`path`, which would clobber the `imutils.paths` module
# and the `os.path` import that other cells rely on
for (dType, imagePaths, labels, outputPath) in datasets:
    # create HDF5 writer (raises ValueError if the file already exists)
    # NOTE(review): assumes every image is 64x64 with 3 channels -- confirm
    print("[INFO] building {}...".format(outputPath))
    writer = HDF5DatasetWriter((len(imagePaths), 64, 64, 3), outputPath)

    try:
        # initialize the progress bar
        widgets = ["Building Dataset: ", progressbar.Percentage(), " ",
            progressbar.Bar(), " ", progressbar.ETA()]
        pbar = progressbar.ProgressBar(maxval=len(imagePaths),
            widgets=widgets).start()

        # loop over the image paths
        for (i, (imagePath, label)) in enumerate(zip(imagePaths, labels)):
            # load the image from disk; `cv2.imread` returns None
            # instead of raising when the file cannot be read
            image = cv2.imread(imagePath)
            if image is None:
                raise IOError(
                    "Could not read image: {}".format(imagePath))

            # if we are building the training dataset, then compute the
            # mean of each channel in the image, then update the
            # respective lists
            if dType == "train":
                (b, g, r) = cv2.mean(image)[:3]
                R.append(r)
                G.append(g)
                B.append(b)

            # add the image and label to the HDF5 dataset
            writer.add([image], [label])
            pbar.update(i)

        pbar.finish()
    finally:
        # close the HDF5 writer so the remaining buffered rows are
        # flushed and the file handle is released, even on failure
        writer.close()

Building Dataset:   1% |                                        | ETA:  0:00:16

[INFO] building ../datasets/tiny-imagenet-200/hdf5/train.hdf5...


Building Dataset: 100% |########################################| Time: 0:20:32
Building Dataset:   0% |                                       | ETA:  --:--:--

[INFO] building ../datasets/tiny-imagenet-200/hdf5/val.hdf5...


Building Dataset: 100% |########################################| Time: 0:01:06
Building Dataset:   0% |                                       | ETA:  --:--:--

[INFO] building ../datasets/tiny-imagenet-200/hdf5/test.hdf5...


Building Dataset: 100% |########################################| Time: 0:02:38


In [None]:
# construct a dictionary with the mean of the collected per-image
# channel averages, then serialize it to a JSON file
print("[INFO] serializing means...")
D = {"R": np.mean(R), "G": np.mean(G), "B": np.mean(B)}
# the context manager closes the file even if the write fails
with open(DATASET_MEAN, "w") as f:
	f.write(json.dumps(D))

[INFO] serializing means...


In [None]:
import h5py
# sanity check: print the shape of the "images" dataset stored in each
# of the generated HDF5 files
filenames = ["../datasets/tiny-imagenet-200/hdf5/train.hdf5", "../datasets/tiny-imagenet-200/hdf5/val.hdf5", "../datasets/tiny-imagenet-200/hdf5/test.hdf5"]
for filename in filenames:
    # the context manager closes the file even if the lookup fails
    with h5py.File(filename, "r") as db:
        print(db["images"].shape)

(90000, 64, 64, 3)
(10000, 64, 64, 3)
(10000, 64, 64, 3)
