In [None]:
import os
import shutil
import time
import numpy as np
import tensorflow as tf
from keras.models import Model
from keras.models import Sequential
from keras.layers import Conv2D
from keras.layers import MaxPooling2D
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import BatchNormalization
from keras.optimizers import SGD
from tensorflow.keras.optimizers.schedules import CosineDecayRestarts
from keras.preprocessing.image import ImageDataGenerator

# --- Run configuration -------------------------------------------------------
# Directory scanned by list_dir(); images are read from its sub-directories.
TRAIN_DIR = "ImageNet1000/imagenet-mini/train"
# Destination passed to create_new_dataset() for the extracted subset.
OUTPUT_DIR = "bananas/train"
# Target size every image is resized to, and the model's input size.
IMG_HEIGHT = 224
IMG_WIDTH = 224
# Number of files decoded into memory per chunk (training and prediction).
GREEDY_FILE_LOAD_BATCH_SIZE = 256
# Number of passes over the file list in the training loop.
# NOTE(review): this value is reassigned further down in the script.
SVP_EPOCHS = 50
# Mini-batch size handed to model.fit().
SVP_BATCH_SIZE = 64
# Fraction of all files to extract; must lie in the open range (0, 1).
# 0.02712 is an alternative value the author left behind.
EXTRACTION_PERCENTAGE = 0.65 # range (0, 1)
# Extraction strategy, one of "best" | "even" | "rebalanced":
#   best       - simply take the top-K entries of the entropy ranking
#   even       - split the budget roughly evenly among classes
#   rebalanced - follow the class distribution of the train set
EXTRACTION_METHOD = "even"
# Only referenced from commented-out debugging code in this file.
# TODO: remove together with that code.
DEBUG_BREAK = 0

def calculate_entropy(preds):
    """Return the Shannon entropy (natural logarithm) of every row of `preds`.

    Args:
        preds: 2-D array-like of shape (n_samples, n_classes) holding
            per-class probabilities, e.g. softmax outputs.

    Returns:
        1-D float array of length n_samples with
        ``-sum(p * log(p))`` for each row.

    Entries that are not strictly positive contribute 0 to the sum. Without
    this, a probability of exactly 0 yields ``0 * log(0) = nan``, which turns
    the whole row into nan and corrupts any ranking built on the result.
    """
    preds = np.asarray(preds, dtype=float)
    # Replace non-positive entries by 1 inside the log only: log(1) == 0, so
    # the corresponding term vanishes instead of becoming nan.
    safe_for_log = np.where(preds > 0, preds, 1.0)

    return -np.sum(preds * np.log(safe_for_log), axis=1)

def define_model(height, width, num_classes=1000):
    """Build and compile the convolutional classifier used for scoring.

    The network consists of four stages; each stage is
    Conv2D -> BatchNormalization -> Conv2D -> BatchNormalization ->
    MaxPooling2D -> Dropout, with 32, 64, 128 and 256 filters and dropout
    rates 0.2, 0.3, 0.4 and 0.5. It is followed by a 128-unit dense layer
    named "final_layer", batch normalization, dropout 0.6 and a softmax
    output layer.

    Args:
        height: input image height in pixels.
        width: input image width in pixels.
        num_classes: number of units in the softmax output layer. Defaults
            to 1000, the value that was previously hard-coded.

    Returns:
        A compiled keras Sequential model (SGD with momentum 0.9, cosine
        decay with restarts, categorical cross-entropy loss, accuracy metric).
    """
    # Settings shared by every convolution in the network.
    conv_kwargs = {
        'activation': 'relu',
        'kernel_initializer': 'he_uniform',
        'padding': 'same',
    }
    # (number of filters, dropout rate) for each stage, in order.
    stages = ((32, 0.2), (64, 0.3), (128, 0.4), (256, 0.5))

    model = Sequential()
    for stage_index, (filters, drop_rate) in enumerate(stages):
        if stage_index == 0:
            # Only the very first layer declares the input shape (RGB images).
            model.add(Conv2D(filters, (3, 3), input_shape=(height, width, 3), **conv_kwargs))
        else:
            model.add(Conv2D(filters, (3, 3), **conv_kwargs))
        model.add(BatchNormalization())
        model.add(Conv2D(filters, (3, 3), **conv_kwargs))
        model.add(BatchNormalization())
        model.add(MaxPooling2D((2, 2)))
        model.add(Dropout(drop_rate))

    model.add(Flatten())
    model.add(Dense(128, activation='relu', kernel_initializer='he_uniform', name="final_layer"))
    model.add(BatchNormalization())
    model.add(Dropout(0.6))
    model.add(Dense(num_classes, activation='softmax'))

    # Learning-rate schedule: cosine decay with restarts, starting at
    # initial_lr; first_decay_steps is the second positional argument of
    # CosineDecayRestarts.
    initial_lr = 0.01
    first_decay_steps = 10000
    lr_scheduler = CosineDecayRestarts(initial_lr, first_decay_steps)

    opt = SGD(learning_rate=lr_scheduler, momentum=0.9)
    model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

    return model

def extract_indices(entropy_index, extraction_size, class_count, class_max, truth):
  """Select up to `extraction_size` sample indices from a ranking.

  Three strategies are supported, chosen by which arguments are None:

  * best (``class_count is None and class_max is None``): return the first
    `extraction_size` entries of `entropy_index` unchanged.
  * even (``class_max is None``): walk the ranking and accept a sample only
    while its class holds fewer than ``ceil(extraction_size / n_classes)``
    accepted samples.
  * rebalanced (``class_max`` given): same walk, but the limit for class
    ``c`` is ``class_max[c]``.

  Args:
    entropy_index: sample indices in ranking order, best candidate first.
    extraction_size: maximum number of indices to return.
    class_count: per-class counters of already accepted samples, or None.
      It is updated IN PLACE while samples are accepted.
    class_max: per-class limits for the rebalanced strategy, or None.
    truth: sequence mapping a sample index to its integer class id.

  Returns:
    For "best", a slice of `entropy_index`; otherwise an integer numpy array
    with the accepted indices in ranking order. The result may be shorter
    than `extraction_size` if the per-class limits are exhausted first.
  """
  # method best
  if class_count is None and class_max is None:
    return entropy_index[:extraction_size]

  if class_count is None:
    # Rebalanced call without counters: start every class at zero.
    class_count = np.zeros(len(class_max))

  if class_max is None:
    # method even: one common limit for all classes
    per_class = int(np.ceil(extraction_size / len(class_count)))
    limits = np.full(len(class_count), per_class)
  else:
    # method rebalanced: individual limit per class. The original code
    # compared against an undefined name here and raised NameError.
    limits = class_max

  final_indices = []
  for index in entropy_index:
    if len(final_indices) >= extraction_size:
      break

    cl = truth[index]
    if class_count[cl] < limits[cl]:
      final_indices.append(index)
      class_count[cl] += 1

  # dtype=int keeps an empty selection usable as an index array.
  return np.array(final_indices, dtype=int)

def gen_image_data(path, files, height, width):
  """Load the given image files into a single tensor.

  Every entry of `files` is appended to ``path + "/"``, read, decoded as
  JPEG and resized to ``[height, width]``. Images that come out with a
  single channel are expanded to RGB. The per-image tensors are stacked with
  tf.convert_to_tensor, in the order of `files`.
  """
  prefix = path + "/"
  target_size = [height, width]

  def load_one(relative_name):
    raw_bytes = tf.io.read_file(prefix + relative_name)
    image = tf.image.resize(tf.image.decode_jpeg(raw_bytes), target_size)
    # Single-channel images are converted so all entries share 3 channels.
    if image.shape[2] == 1:
      return tf.image.grayscale_to_rgb(image)
    return image

  return tf.convert_to_tensor([load_one(name) for name in files])

def gen_one_hot_labels(dirs, map, num_classes=1000):
  """Create the one-hot label matrix for all files, grouped by class.

  The matrix has ``map["_stats"]["files_count"]`` rows and `num_classes`
  columns. The i-th entry of `dirs` owns the next ``map[dir]["count"]``
  consecutive rows, and those rows get a 1.0 in column i.
  """
  one_hot = np.zeros((map["_stats"]["files_count"], num_classes))

  first_row = 0
  for class_index, class_dir in enumerate(dirs):
    # Rows [first_row, next_row) belong to this class.
    next_row = first_row + map[class_dir]["count"]
    one_hot[first_row:next_row, class_index] = 1.0
    first_row = next_row

  return one_hot

def list_dir(directory):
  """Index a dataset laid out as one sub-directory per class.

  Args:
    directory: path of the dataset root.

  Returns:
    A tuple ``(dirs, files, classes)``:

    * dirs: class directory names, sorted alphabetically. The position of a
      name in this list is its integer class id.
    * files: dict with two parallel lists, ``"full_path"`` holding
      ``"<class_dir>/<file_name>"`` strings and ``"class"`` holding the
      class id of each file. Files are grouped by class in `dirs` order and
      sorted by name within a class.
    * classes: dict mapping each class directory to ``{"count": n_files}``,
      plus a ``"_stats"`` entry with ``min``, ``max``, ``avg``, ``median``
      (files per class), ``files_count`` and ``class_count``.

  Raises:
    ValueError: if `directory` contains no sub-directories.

  Entries of `directory` that are not directories are ignored. Sorting makes
  the class ids reproducible, because os.listdir returns entries in
  arbitrary order.
  """
  dirs = sorted(
    entry for entry in os.listdir(directory)
    if os.path.isdir(os.path.join(directory, entry))
  )
  if not dirs:
    raise ValueError("no class directories found in " + str(directory))

  classes = {class_dir: {"count": None} for class_dir in dirs}
  classes["_stats"] = {key: None for key in ["min", "max", "avg", "median", "files_count", "class_count"]}
  files = {"full_path": [], "class": []}
  files_count = []

  for class_index, class_dir in enumerate(dirs):
    file_names = sorted(os.listdir(os.path.join(directory, class_dir)))
    # "/" is used on purpose: consumers split the relative path on "/".
    files["full_path"] += [class_dir + "/" + name for name in file_names]
    files["class"] += [class_index] * len(file_names)
    classes[class_dir]["count"] = len(file_names)
    files_count.append(len(file_names))

  stats = classes["_stats"]
  stats["min"] = np.min(files_count)
  stats["max"] = np.max(files_count)
  stats["avg"] = np.mean(files_count)
  stats["median"] = np.median(files_count)
  stats["files_count"] = np.sum(files_count)
  stats["class_count"] = len(dirs)

  return dirs, files, classes

def create_new_dataset(files, indices, dirs, input_path, output_path, force_overwrite=False):
  """Copy the selected files into a new dataset directory.

  Args:
    files: dict whose ``"full_path"`` list holds ``"<class_dir>/<file_name>"``
      paths relative to `input_path`.
    indices: iterable of positions into ``files["full_path"]`` to copy.
    dirs: unused; kept so existing callers keep working.
    input_path: root directory of the source dataset.
    output_path: root directory of the new dataset. Class sub-directories
      are created below it as needed.
    force_overwrite: if True and `output_path` exists, `output_path` itself
      (and nothing above it) is deleted and recreated before copying.

  Raises:
    AssertionError: if `input_path` does not exist, or if `output_path`
      exists, is not empty and `force_overwrite` is False.

  Copying is best effort: a file that cannot be copied is reported on stdout
  and the remaining files are still processed.
  """
  # Raised explicitly (rather than via `assert`) so the checks also run
  # under `python -O`; the exception type is unchanged for callers.
  if not os.path.exists(input_path):
    raise AssertionError("input path non-existing")

  if os.path.exists(output_path):
    if force_overwrite:
      # Remove exactly the output directory. The previous code removed the
      # first component of the path, which could wipe a parent directory.
      shutil.rmtree(output_path)
      os.makedirs(output_path)
    elif len(os.listdir(output_path)) != 0:
      raise AssertionError("output path exists and is not empty")
  else:
    try:
      os.makedirs(output_path)
    except OSError:
      print("Failed creating output dir")
      # Nothing can be copied without the output directory.
      return

  for ind in indices:
    relative_path = None
    # Deliberately broad and per file: one bad entry must not abort the
    # whole export, and the function is not supposed to raise here.
    try:
      relative_path = files["full_path"][ind]
      class_dir, filename = relative_path.split("/")
      target_dir = os.path.join(output_path, class_dir)
      os.makedirs(target_dir, exist_ok=True)
      shutil.copy(os.path.join(input_path, class_dir, filename), target_dir)
    except Exception as e:
      print(e)
      print("Failed to copy " + str(relative_path if relative_path is not None else ind))


# --- Script setup ------------------------------------------------------------
# Index the training directory. `sorted_dirs` holds the class directory names
# in whatever order list_dir() returns them; that order defines the class ids.
sorted_dirs, files, class_map = list_dir(TRAIN_DIR)
# Disabled: loading the whole dataset at once (names refer to older variables).
#train_x = gen_image_data(train_dir, files["full_path"], img_height, img_width)
# One-hot labels, one row per file, in the same order as files["full_path"].
labels = gen_one_hot_labels(sorted_dirs, class_map)

total_files = len(files["full_path"])
total_classes = class_map["_stats"]["class_count"]
# Number of chunks of GREEDY_FILE_LOAD_BATCH_SIZE files needed to cover all
# files (integer division rounded up).
total_file_batches = total_files // GREEDY_FILE_LOAD_BATCH_SIZE if total_files % GREEDY_FILE_LOAD_BATCH_SIZE == 0 else total_files // GREEDY_FILE_LOAD_BATCH_SIZE + 1
# NOTE(review): the next two assignments overwrite the computed chunk count
# and the configured SVP_EPOCHS, so only a single chunk is processed for a
# single epoch. They look like leftover debug overrides - confirm and remove
# before a real run.
total_file_batches = 1
SVP_EPOCHS = 1
model = define_model(IMG_HEIGHT, IMG_WIDTH)

print("Starting... Will run " + str(SVP_EPOCHS) + " SVP epochs and greedily load files in batches of " + str(GREEDY_FILE_LOAD_BATCH_SIZE))
print("Total number of files: " + str(total_files) + " | Found classes: " + str(total_classes))

# --- Training ----------------------------------------------------------------
# start_time is read again after the entropy ranking to report the duration.
start_time = time.time_ns()
for epoch in range(SVP_EPOCHS):
  print("Epoch " + str(epoch + 1)  + " out of " + str(SVP_EPOCHS))
  # Fresh random permutation of all file indices for this epoch.
  index_all = np.random.choice(total_files, size=total_files, replace=False)

  for file_batch in range(total_file_batches):
    print("File batch " + str(file_batch + 1) + " out of " + str(total_file_batches))
    # Slice of the permutation that forms this chunk; only these files are
    # decoded, so at most GREEDY_FILE_LOAD_BATCH_SIZE images are loaded at once.
    indices = index_all[file_batch * GREEDY_FILE_LOAD_BATCH_SIZE:(file_batch + 1) * GREEDY_FILE_LOAD_BATCH_SIZE]
    file_subset = np.array(files["full_path"])[indices]
    train_x = gen_image_data(TRAIN_DIR, file_subset, IMG_HEIGHT, IMG_WIDTH)
    train_y = tf.convert_to_tensor(labels[indices])

    # preprocess - scale pixel values by 1/255
    train_x = train_x / 255.0

    # data augmentation: random shifts of up to 10% and horizontal flips.
    # batch_size equals the chunk size and shuffle is off, so a single next()
    # call is requested to cover the whole chunk in its original order.
    datagen = ImageDataGenerator(width_shift_range=0.1, height_shift_range=0.1, horizontal_flip=True)
    batch = datagen.flow(train_x, train_y, batch_size=train_x.shape[0], shuffle=False)
    train_x, train_y = next(batch)

    # SVP fit: one pass over the chunk in mini-batches of SVP_BATCH_SIZE
    model.fit(train_x, train_y, epochs=1, batch_size=SVP_BATCH_SIZE, verbose=1)

    # Drop the references to the chunk - presumably so its memory can be
    # reclaimed before the next chunk is loaded.
    train_x = None
    train_y = None

# --- Prediction and entropy ranking ------------------------------------------
# Predict every chunk in file order (no shuffling), so that row k of the
# collected predictions belongs to files["full_path"][k].
all_paths = np.array(files["full_path"])  # built once, not once per chunk
preds_all = []
index = 0

for file_batch in range(total_file_batches):
  file_subset = all_paths[index:index + GREEDY_FILE_LOAD_BATCH_SIZE]
  train_x = gen_image_data(TRAIN_DIR, file_subset, IMG_HEIGHT, IMG_WIDTH)
  train_x = train_x / 255.0
  preds = model.predict(train_x)

  index += GREEDY_FILE_LOAD_BATCH_SIZE
  # Keep the rows as numpy arrays; converting every prediction to nested
  # Python lists is far more expensive in memory.
  preds_all.extend(preds)

# Rank over ALL collected predictions. Using only `preds` here would rank
# just the last chunk and misalign the indices with the file list.
entropy = calculate_entropy(np.array(preds_all))
# Highest entropy first.
entropy_index = np.argsort(entropy)[::-1]

print("Max entropy duration", ((time.time_ns() - start_time) / 1000000000), "seconds.")

# --- Extraction --------------------------------------------------------------
# Total number of files to extract.
extraction_size = int(np.floor(total_files * EXTRACTION_PERCENTAGE))

# Per-class counters are needed by the "even" and "rebalanced" strategies;
# "best" is selected by leaving both class_count and class_max as None.
class_count = None
if EXTRACTION_METHOD in ("even", "rebalanced"):
  class_count = np.zeros(total_classes)

class_max = None
if EXTRACTION_METHOD == "rebalanced":
  # Integer per-class limits proportional to the class sizes. Whole numbers
  # are required so the limits can be adjusted to sum to extraction_size.
  class_sizes = np.array([class_map[sorted_dirs[i]]["count"] for i in range(total_classes)])
  class_max = np.floor(EXTRACTION_PERCENTAGE * class_sizes)

  total_max = int(np.sum(class_max))
  print("TOTALS", extraction_size, total_max)

  # Signed integer difference between the budget and the sum of the limits.
  # It must be an int: it is used as a slice bound below.
  diff = extraction_size - total_max

  if diff > 0:
    # Limits sum to less than the budget: raise the smallest limits by one.
    index = np.argsort(class_max)[:diff]
    class_max[index] += 1
  elif diff < 0:
    # Limits sum to more than the budget: lower the largest limits by one.
    index = np.argsort(class_max)[::-1][:-diff]
    class_max[index] -= 1

final_indices = extract_indices(entropy_index, extraction_size, class_count, class_max, files["class"])

# --- Write the extracted subset ----------------------------------------------
# Open author notes, kept as TODOs:
# TODO: implement the remaining SVP steps.
# TODO: check whether any step alters the ordering of the training data in place.
# TODO: preprocessing.

# Toy indices that were used for manual testing:
#entropy_index = [4, 56, 21, 106, 199, 3]

# Copy the selected files from TRAIN_DIR to OUTPUT_DIR.
# NOTE(review): force_overwrite=True deletes existing output first (see
# create_new_dataset for exactly what is removed) - decide whether to keep
# forcing this or to drop the flag.
create_new_dataset(files, final_indices, sorted_dirs, TRAIN_DIR, OUTPUT_DIR, force_overwrite=True)

In [None]:
!python shutdown.py 15