In [1]:
import random
from pathlib import Path

import cv2
import imgaug as ia
import imgaug.augmenters as iaa
import numpy as np
from google.colab.patches import cv2_imshow
from sklearn.model_selection import train_test_split

In [2]:
# Constants
# Input directory with the labeled region-of-interest images, and one output
# directory per split.
LABELED_ROI_IMAGES_PATH = Path('/content/drive/MyDrive/osteoporosis_data/labeled_rois')
TRAIN_IMAGES_PATH = Path('/content/drive/MyDrive/osteoporosis_data/train')
VAL_IMAGES_PATH = Path('/content/drive/MyDrive/osteoporosis_data/val')
TEST_IMAGES_PATH = Path('/content/drive/MyDrive/osteoporosis_data/test')

# Fractions of each class assigned to training and validation; whatever is
# left over (1 - TRAIN_RATIO - VAL_RATIO) becomes the test set.
TRAIN_RATIO = 0.90
VAL_RATIO = 0.05

# Seed every random source once, up front, so that "Restart & Run All"
# reproduces the same split and the same augmentations. Re-running a single
# cell later continues from the current RNG state and will give other results.
SEED = 42
random.seed(SEED)     # drives the shuffle for the split and the augmentation sub-sampling
np.random.seed(SEED)  # numpy's legacy global RNG, in case a library draws from it
ia.seed(SEED)         # imgaug's global RNG, used by augmenters created without their own seed

### Load data

In [3]:
def load_images(path):
  """Load every file under ``path`` (recursively) as a grayscale image.

  Args:
    path: ``pathlib.Path`` of the directory to scan; files in
      sub-directories are included.

  Returns:
    dict mapping each file's stem (file name without extension) to the value
    returned by ``cv2.imread(..., cv2.IMREAD_GRAYSCALE)``. If two files share
    a stem, the later one in sorted path order replaces the earlier one.

  Raises:
    ValueError: if a file cannot be read as an image.
  """
  # Sort so the resulting dict order (and therefore any seeded shuffle applied
  # to it later) does not depend on the file system's listing order.
  file_list = sorted(f for f in path.glob('**/*') if f.is_file())
  images_dict = {}

  for image_path in file_list:
    image_name = image_path.stem

    # TODO: delete if using additional data
    # if image_name[0].islower() and image_name.startswith("os"):
    #   continue

    image = cv2.imread(str(image_path), cv2.IMREAD_GRAYSCALE)
    # cv2.imread reports failure by returning None instead of raising; stop
    # here rather than letting a None "image" reach augmentation or saving.
    if image is None:
      raise ValueError(f"Could not read image {image_path}")
    images_dict[image_name] = image

  return images_dict

In [4]:
# Read all labeled ROI images into memory, keyed by file name without extension
images_dict = load_images(path=LABELED_ROI_IMAGES_PATH)
print("Number of images:", len(images_dict))

Number of images: 732


In [5]:
def extract_label(image_name):
  """Return the upper-cased class label encoded at the start of an image name.

  The label is the part of the name before the first underscore with all
  digit characters removed, e.g. ``"op12_left"`` -> ``"OP"``.
  """
  prefix = image_name.partition('_')[0]
  non_digits = (ch for ch in prefix if not ch.isdigit())
  return ''.join(non_digits).upper()


def divide_images_by_classes(images_dict):
  """Split a name -> image dict into one dict per class label.

  The class of each entry is taken from ``extract_label(image_name)``.

  Returns:
    Tuple ``(norm_images, op_images, os_images)`` holding the entries
    labeled "N", "OP" and "OS" respectively.

  Raises:
    ValueError: if a name yields any other label.
  """
  images_by_label = {"N": {}, "OP": {}, "OS": {}}

  for image_name in images_dict:
    label = extract_label(image_name)
    if label not in images_by_label:
      raise ValueError(f"Illegal label {label}")
    images_by_label[label][image_name] = images_dict[image_name]

  return images_by_label["N"], images_by_label["OP"], images_by_label["OS"]

In [6]:
# Bucket the loaded images by the class label encoded in each file name
norm_images_dict, op_images_dict, os_images_dict = divide_images_by_classes(images_dict=images_dict)

for class_title, class_images in (("N images:", norm_images_dict),
                                  ("OP images:", op_images_dict),
                                  ("OS images:", os_images_dict)):
  print(class_title, len(class_images))

N images: 227
OP images: 250
OS images: 255


### Train-Val-Test split
Before augmenting the data, we extract the images that will be used for validation and testing and set them aside, so that augmented copies are only ever created from (and added to) the training set.

In [7]:
def split_dict_to_train_val_test(data, train_ratio, val_ratio, seed=None):
    """Randomly partition a dict into train, validation and test dicts.

    Args:
        data: dict to split; it is not modified.
        train_ratio: fraction of the items assigned to the training set.
        val_ratio: fraction of the items assigned to the validation set.
            The remaining items form the test set.
        seed: optional seed for a private ``random.Random`` used for the
            shuffle, making the split reproducible. When ``None`` (default)
            the module-level ``random`` state is used.

    Returns:
        Tuple ``(train_data, val_data, test_data)`` of disjoint dicts that
        together contain every item of ``data``.

    Raises:
        ValueError: if a ratio is negative or the two ratios sum to more
            than 1.
    """
    # Without this check a ratio sum above 1 would silently produce an empty
    # test set. The small tolerance absorbs float rounding in the sum.
    if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1 + 1e-9:
        raise ValueError(
            f"Invalid split ratios: train_ratio={train_ratio}, val_ratio={val_ratio}")

    # Shuffle the data
    items = list(data.items())
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(items)

    # Calculate the split indices (sizes are truncated towards zero, so any
    # rounding remainder ends up in the test set)
    train_idx = int(len(items) * train_ratio)
    val_idx = int(len(items) * (train_ratio + val_ratio))

    # Split the data
    train_data = dict(items[:train_idx])
    val_data = dict(items[train_idx:val_idx])
    test_data = dict(items[val_idx:])

    return train_data, val_data, test_data

In [8]:
# Each class is split on its own, with the same ratios, so that all three
# classes are represented in every split.
train_norm_dict, val_norm_dict, test_norm_dict = split_dict_to_train_val_test(
    norm_images_dict, train_ratio=TRAIN_RATIO, val_ratio=VAL_RATIO)
for split_title, split_images in (("train_norm_dict:", train_norm_dict),
                                  ("val_norm_dict:", val_norm_dict),
                                  ("test_norm_dict:", test_norm_dict)):
  print(split_title, len(split_images))
print()

train_op_dict, val_op_dict, test_op_dict = split_dict_to_train_val_test(
    op_images_dict, train_ratio=TRAIN_RATIO, val_ratio=VAL_RATIO)
for split_title, split_images in (("train_op_dict:", train_op_dict),
                                  ("val_op_dict:", val_op_dict),
                                  ("test_op_dict:", test_op_dict)):
  print(split_title, len(split_images))
print()

train_os_dict, val_os_dict, test_os_dict = split_dict_to_train_val_test(
    os_images_dict, train_ratio=TRAIN_RATIO, val_ratio=VAL_RATIO)
for split_title, split_images in (("train_os_dict:", train_os_dict),
                                  ("val_os_dict:", val_os_dict),
                                  ("test_os_dict:", test_os_dict)):
  print(split_title, len(split_images))

train_norm_dict: 204
val_norm_dict: 11
test_norm_dict: 12

train_op_dict: 225
val_op_dict: 12
test_op_dict: 13

train_os_dict: 229
val_os_dict: 13
test_os_dict: 13


### Augment the training images

In [9]:
def extract_dict_sub_sample(images_dict, percentage):
  """Return a random subset of ``images_dict`` as a new dict.

  The subset holds ``int(len(images_dict) * percentage)`` entries drawn
  without replacement using the module-level ``random`` state.
  """
  all_items = list(images_dict.items())
  sample_size = int(len(images_dict) * percentage)
  return dict(random.sample(all_items, sample_size))


def augment_images_dict(images_dict, augmenter, aug_percentage, name_suffix):
  """Augment a random sub-sample of ``images_dict``.

  Args:
    images_dict: dict mapping image name -> image.
    augmenter: callable invoked as ``augmenter(image=...)`` for each sampled
      image; its return value is stored as the augmented image.
    aug_percentage: fraction of ``images_dict`` to sample for augmentation.
    name_suffix: text appended (after an underscore) to each sampled name.

  Returns:
    New dict mapping ``"<image_name>_<name_suffix>"`` to the augmented image.
    The input dict is left untouched.
  """
  sampled_images = extract_dict_sub_sample(images_dict, percentage=aug_percentage)

  augmented_images = {}
  for image_name, image in sampled_images.items():
    augmented_images[f"{image_name}_{name_suffix}"] = augmenter(image=image)
  return augmented_images

#### Rotate

In [10]:
# Rotation by an angle between -30 and 30 degrees
rotator = iaa.Sequential([iaa.Affine(rotate=(-30, 30))])

# Augment a random 10% sample of each class's training images
# (evaluated in the order norm, op, os)
rotated_train_norm_dict, rotated_train_op_dict, rotated_train_os_dict = (
    augment_images_dict(class_train_dict, rotator, aug_percentage=0.1, name_suffix="rotated")
    for class_train_dict in (train_norm_dict, train_op_dict, train_os_dict)
)

for result_title, result_images in (("rotated_train_norm_dict", rotated_train_norm_dict),
                                    ("rotated_train_op_dict", rotated_train_op_dict),
                                    ("rotated_train_os_dict", rotated_train_os_dict)):
  print(result_title, len(result_images))

rotated_train_norm_dict 20
rotated_train_op_dict 22
rotated_train_os_dict 22


#### Flip horizontally

In [11]:
# Horizontal mirror; probability 1.0 means every sampled image is flipped
flipper = iaa.Fliplr(1.0)

# Augment a random 10% sample of each class's training images
# (evaluated in the order norm, op, os)
flipped_train_norm_dict, flipped_train_op_dict, flipped_train_os_dict = (
    augment_images_dict(class_train_dict, flipper, aug_percentage=0.1, name_suffix="flipped")
    for class_train_dict in (train_norm_dict, train_op_dict, train_os_dict)
)

for result_title, result_images in (("flipped_train_norm_dict", flipped_train_norm_dict),
                                    ("flipped_train_op_dict", flipped_train_op_dict),
                                    ("flipped_train_os_dict", flipped_train_os_dict)):
  print(result_title, len(result_images))

flipped_train_norm_dict 20
flipped_train_op_dict 22
flipped_train_os_dict 22


#### Shift

In [None]:
# Translation by between -30 and 30 pixels along each axis
shifter = iaa.Affine(translate_px={"x": (-30, 30), "y": (-30, 30)})

# Augment a random 10% sample of each class's training images
# (evaluated in the order norm, op, os)
shifted_train_norm_dict, shifted_train_op_dict, shifted_train_os_dict = (
    augment_images_dict(class_train_dict, shifter, aug_percentage=0.1, name_suffix="shifted")
    for class_train_dict in (train_norm_dict, train_op_dict, train_os_dict)
)

for result_title, result_images in (("shifted_train_norm_dict", shifted_train_norm_dict),
                                    ("shifted_train_op_dict", shifted_train_op_dict),
                                    ("shifted_train_os_dict", shifted_train_os_dict)):
  print(result_title, len(result_images))

shifted_train_norm_dict 20
shifted_train_op_dict 22
shifted_train_os_dict 22


#### Noise

In [None]:
# NOTE: Gaussian-noise augmentation is currently disabled. Everything in this
# cell is commented out, and no noisy_* dictionary is merged into the training
# set in the "Combine" section; re-enabling it requires adding them there too.
# noise = iaa.AdditiveGaussianNoise(scale=(0, 25))

# noisy_train_norm_dict = augment_images_dict(train_norm_dict, noise,
#                                                aug_percentage=0.2,
#                                                name_suffix="noisy")
# noisy_train_op_dict = augment_images_dict(train_op_dict, noise,
#                                                aug_percentage=0.2,
#                                                name_suffix="noisy")
# noisy_train_os_dict = augment_images_dict(train_os_dict, noise,
#                                                aug_percentage=0.2,
#                                                name_suffix="noisy")

# print("noisy_train_norm_dict", len(noisy_train_norm_dict))
# print("noisy_train_op_dict", len(noisy_train_op_dict))
# print("noisy_train_os_dict", len(noisy_train_os_dict))

#### Zoom

In [None]:
# Zoom in by a scale factor between 1.15 and 1.35
zoom = iaa.Affine(scale=(1.15, 1.35))

# Augment a random 10% sample of each class's training images
# (evaluated in the order norm, op, os)
zoomed_train_norm_dict, zoomed_train_op_dict, zoomed_train_os_dict = (
    augment_images_dict(class_train_dict, zoom, aug_percentage=0.1, name_suffix="zoomed")
    for class_train_dict in (train_norm_dict, train_op_dict, train_os_dict)
)

for result_title, result_images in (("zoomed_train_norm_dict", zoomed_train_norm_dict),
                                    ("zoomed_train_op_dict", zoomed_train_op_dict),
                                    ("zoomed_train_os_dict", zoomed_train_os_dict)):
  print(result_title, len(result_images))

zoomed_train_norm_dict 20
zoomed_train_op_dict 22
zoomed_train_os_dict 22


### Combine all dictionaries to train-val-test sets

In [None]:
# Training set
augmented_train_dict = train_norm_dict.copy()
augmented_train_dict.update(train_op_dict)
augmented_train_dict.update(train_os_dict)

augmented_train_dict.update(rotated_train_norm_dict)
augmented_train_dict.update(flipped_train_norm_dict)
augmented_train_dict.update(shifted_train_norm_dict)
augmented_train_dict.update(zoomed_train_norm_dict)

augmented_train_dict.update(rotated_train_op_dict)
augmented_train_dict.update(flipped_train_op_dict)
augmented_train_dict.update(shifted_train_op_dict)
augmented_train_dict.update(zoomed_train_op_dict)

augmented_train_dict.update(rotated_train_os_dict)
augmented_train_dict.update(flipped_train_os_dict)
augmented_train_dict.update(shifted_train_os_dict)
augmented_train_dict.update(zoomed_train_os_dict)

# Validation set
val_dict = val_norm_dict.copy()
val_dict.update(val_op_dict)
val_dict.update(val_os_dict)

# Test set
test_dict = test_norm_dict.copy()
test_dict.update(test_op_dict)
test_dict.update(test_os_dict)

print("augmented_train_dict", len(augmented_train_dict))
print("val_dict", len(val_dict))
print("test_dict", len(test_dict))

augmented_train_dict 914
val_dict 36
test_dict 38


### Save data

In [None]:
def save_images_dict(images, path):
  """Write every image in ``images`` to ``path`` as ``<image_name>.png``.

  Args:
    images: dict mapping image name -> image passed to ``cv2.imwrite``.
    path: ``pathlib.Path`` of the target directory. It is created, including
      missing parents, if it does not exist yet.

  Raises:
    ValueError: if the target directory already contains anything, so that a
      previous export is never mixed with a new one.
    IOError: if ``cv2.imwrite`` reports that a file could not be written.
  """
  # Previously a missing directory crashed in iterdir() with FileNotFoundError
  path.mkdir(parents=True, exist_ok=True)

  # Check if the directory is empty before adding values
  if any(path.iterdir()):
    raise ValueError("The directory is not empty!")

  for image_name, image in images.items():
    file_path = str(path / f"{image_name}.png")

    # cv2.imwrite returns False on failure instead of raising
    if not cv2.imwrite(file_path, image):
      raise IOError(f"Failed saving {file_path}")

In [None]:
# Persist each split into its own directory (train first, then val, then test)
for split_images, split_dir in (
    (augmented_train_dict, TRAIN_IMAGES_PATH),
    (val_dict, VAL_IMAGES_PATH),
    (test_dict, TEST_IMAGES_PATH),
):
  save_images_dict(split_images, split_dir)