Image Splitting

In [None]:
!pip install imutils



In [None]:
# Mount Google Drive so the source images can be read from /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import numpy as np
import os
import glob
import random
import cv2

from PIL import Image, ImageOps
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split

from imutils.perspective import four_point_transform
from imutils import grab_contours

In [None]:
# Seed NumPy's and Python's global random generators up front so that code
# drawing from them later in the notebook starts from a known state.
RNG_SEED = 0
np.random.seed(RNG_SEED)
random.seed(RNG_SEED)

In [None]:
# CLAHE settings read by ImageSplitter when it equalises the images.
CLAHE_GRID_SIZE = 16  # side of the square tileGridSize given to cv2.createCLAHE
CLAHE_CLIP_LIMIT = 16.0  # clipLimit given to cv2.createCLAHE

In [None]:
def make_dir(output_path):
  """Creates the directory ``output_path`` if it does not exist yet.

  Missing parent directories are created as well, and calling the function
  for a directory that already exists is a no-op.

  Args:
    output_path: Path of the directory to create.

  Raises:
    FileExistsError: If ``output_path`` exists but is not a directory.
  """
  # exist_ok avoids the race between checking for and creating the directory.
  os.makedirs(output_path, exist_ok=True)

In [None]:
class ImageSplitter:
  """Splits paired images and masks into train/val/test sets of overlapping tiles.

  Every source image is cropped to a four-cornered contour, contrast-equalised
  with CLAHE, resized/padded so that a whole number of tiles fits, and then
  written out tile by tile next to the matching mask tiles.
  """

  def __init__(self, input_dir):
    """Lists the files to process.

    Args:
      input_dir: Directory containing an ``image`` and a ``mask`` folder.
        Images and masks are paired by position in the sorted listings.

    Raises:
      ValueError: If the number of images and masks differs, because the
        positional pairing would then attach masks to the wrong images.
    """
    self.img_list = sorted(glob.glob(f'{input_dir}/image/*'))
    self.msk_list = sorted(glob.glob(f'{input_dir}/mask/*'))
    if len(self.img_list) != len(self.msk_list):
      raise ValueError(
          f'Found {len(self.img_list)} images but {len(self.msk_list)} masks '
          f'under {input_dir}; they are paired by sorted position and must match.'
      )

  def transform(self, tile_shape, stride_shape, output_path, val_size=.15, test_size=.10):
    """Writes the tiled train/val/test datasets below ``output_path``.

    Args:
      tile_shape: ``(width, height)`` of every tile in pixels.
      stride_shape: ``(x, y)`` step between neighbouring tiles in pixels.
      output_path: Directory that receives the ``train``, ``val`` and ``test``
        folders, each with an ``image`` and a ``mask`` sub-folder.
      val_size: Fraction of the source images assigned to validation.
      test_size: Fraction of the source images assigned to testing.
    """
    make_dir(output_path)
    train_dataset, val_dataset, test_dataset = self.__split_dataset(test_size, val_size)
    self.__process_part(train_dataset, tile_shape, stride_shape, f'{output_path}/train')
    self.__process_part(val_dataset, tile_shape, stride_shape, f'{output_path}/val')
    self.__process_part(test_dataset, tile_shape, stride_shape, f'{output_path}/test')

  def __split_dataset(self, test_size, val_size):
    """Returns three lists of image indices: train, validation and test."""
    index_list = list(range(len(self.img_list)))
    # First carve off the combined val+test share, then divide that share.
    first_split_ratio = test_size + val_size
    second_split_ratio = test_size / (test_size + val_size)

    # NOTE(review): no random_state is passed, so the shuffles presumably draw
    # from NumPy's global generator (seeded elsewhere in the notebook) — confirm
    # against the installed scikit-learn version.
    train_dataset, test_val_dataset = train_test_split(index_list, test_size=first_split_ratio, shuffle=True)
    val_dataset, test_dataset = train_test_split(test_val_dataset, test_size=second_split_ratio, shuffle=True)
    return train_dataset, val_dataset, test_dataset

  def __process_part(self, index_list, tile_shape, stride_shape, output_path):
    """Creates the output folders of one split and tiles every image in it."""
    make_dir(output_path)
    make_dir(f'{output_path}/image')
    make_dir(f'{output_path}/mask')
    for index in tqdm(index_list):
      self.__process_element(index, tile_shape, stride_shape, output_path)

  def __process_element(self, img_index, tile_shape, stride_shape, output_path):
    """Loads one image/mask pair, preprocesses it and saves all of its tiles."""
    img = Image.open(self.img_list[img_index]).convert('RGB')
    msk = Image.open(self.msk_list[img_index]).convert('L')

    img, msk = self.__precrop_element(img, msk)
    img = self.__equalize_image(img)

    grid_shape = self.__get_grid_shape(img, tile_shape, stride_shape)
    img = self.__pad_image(img, grid_shape, tile_shape, stride_shape)
    # The mask stores class ids, so it must not be resampled with an
    # interpolating filter: blended values would be read as other classes.
    msk = self.__pad_image(msk, grid_shape, tile_shape, stride_shape, method=Image.NEAREST)

    img = np.array(img)
    msk = np.array(msk)
    self.__process_image(img, grid_shape, tile_shape, stride_shape, img_index, f'{output_path}/image')
    self.__process_image(msk, grid_shape, tile_shape, stride_shape, img_index, f'{output_path}/mask')

  def __precrop_element(self, img, msk):
    """Warps image and mask to the quadrilateral detected in the image."""
    img = np.asarray(img)
    msk = np.asarray(msk)
    pts = self.__find_contour(img)
    img = four_point_transform(img, pts)
    # NOTE(review): four_point_transform presumably warps with OpenCV's default
    # (interpolating) filter, which could also blend class ids in the mask —
    # confirm and switch to a nearest-neighbour warp if so.
    msk = four_point_transform(msk, pts)
    img = Image.fromarray(img)
    msk = Image.fromarray(msk)
    return img, msk

  def __find_contour(self, img):
    """Returns the corners of the largest four-cornered contour as a (4, 2) array.

    Only the five contours with the largest area are examined.

    Raises:
      ValueError: If none of the examined contours simplifies to four points.
    """
    # The array comes from a PIL image in RGB order, hence the RGB code.
    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    cnts = cv2.findContours(gray.copy(), cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    cnts = grab_contours(cnts)
    cnts = sorted(cnts, key=cv2.contourArea, reverse=True)[:5]
    for c in cnts:
      peri = cv2.arcLength(c, True)
      # Simplify the contour, tolerating a deviation of 2% of its perimeter.
      approx = cv2.approxPolyDP(c, 0.02 * peri, True)
      if len(approx) == 4:
        return approx.reshape(4, 2)
    raise ValueError('No four-cornered contour found among the largest contours of the image.')

  def __equalize_image(self, img):
    """Applies CLAHE to the lightness channel and returns a new PIL image."""
    img = np.asarray(img)
    # The array comes from a PIL image in RGB order, hence the RGB codes.
    lab = cv2.cvtColor(img, cv2.COLOR_RGB2LAB)
    clahe = cv2.createCLAHE(
        clipLimit=CLAHE_CLIP_LIMIT,
        tileGridSize=(CLAHE_GRID_SIZE,CLAHE_GRID_SIZE)
    )
    lab[...,0] = clahe.apply(lab[...,0])
    img = cv2.cvtColor(lab, cv2.COLOR_LAB2RGB)
    return Image.fromarray(img)

  def __get_grid_shape(self, image, tile_shape, stride_shape):
    """Returns how many tiles are needed along x and y to cover ``image``.

    At least one tile per axis is reported, so an image smaller than the
    overlap between neighbouring tiles still yields a tile instead of none.
    """
    x_rest = tile_shape[0] - stride_shape[0]
    y_rest = tile_shape[1] - stride_shape[1]
    x_count = max(1, (image.size[0] - x_rest - 1) // stride_shape[0] + 1)
    y_count = max(1, (image.size[1] - y_rest - 1) // stride_shape[1] + 1)
    return (x_count, y_count)

  def __pad_image(self, image, grid_shape, tile_shape, stride_shape, method=None):
    """Resizes and pads ``image`` to the exact size covered by the tile grid.

    Args:
      image: PIL image to fit.
      grid_shape: ``(x_count, y_count)`` as returned by ``__get_grid_shape``.
      tile_shape: ``(width, height)`` of a tile.
      stride_shape: ``(x, y)`` step between tiles.
      method: Optional PIL resampling filter; ``None`` keeps the default of
        ``ImageOps.pad``.
    """
    x_rest = tile_shape[0] - stride_shape[0]
    y_rest = tile_shape[1] - stride_shape[1]
    x_size = grid_shape[0] * stride_shape[0] + x_rest
    y_size = grid_shape[1] * stride_shape[1] + y_rest
    target_size = (x_size, y_size)
    if method is None:
      return ImageOps.pad(image, size=target_size)
    return ImageOps.pad(image, size=target_size, method=method)

  def __process_image(
      self,
      image,
      grid_shape, tile_shape, stride_shape,
      img_index,
      output_directory
  ):
    """Saves every tile of the grid for one image array."""
    for x in range(grid_shape[0]):
      for y in range(grid_shape[1]):
        self.__process_square(image, (x, y), tile_shape, stride_shape, img_index, output_directory)

  def __process_square(
      self,
      image,
      grid_coord, tile_shape, stride_shape,
      img_index,
      output_directory
  ):
    """Cuts the tile at ``grid_coord`` out of ``image`` and saves it as a TIFF.

    The file name encodes the image index and the tile's grid column and row.
    """
    x_index = grid_coord[0]
    y_index = grid_coord[1]
    x = x_index * stride_shape[0]
    y = y_index * stride_shape[1]
    # Arrays are indexed row (y) first, then column (x).
    tile = image[y:y+tile_shape[1],x:x+tile_shape[0]]
    tile = Image.fromarray(tile)
    tile.save(f'{output_directory}/{img_index:02d}_{x_index:03d}_{y_index:03d}.tif')

In [None]:
# Dataset variant to process. The value is used as a folder name below the
# input base directory and inside the output and artifact names.
# Alternative variant: IMAGE_TYPE = 'original'
IMAGE_TYPE = 'goodlight-clahe'
TILE_SHAPE = (256,256)  # (width, height) of every saved tile, in pixels
STRIDE_SHAPE = (128, 128)  # (x, y) step between neighbouring tiles, in pixels

In [None]:
# Source data is read from the mounted Google Drive, one folder per IMAGE_TYPE.
SPLITTER_BASE_FOLDER = '/content/gdrive/MyDrive/semanticSegmentation'
SPLITTER_INPUT_DIR = f'{SPLITTER_BASE_FOLDER}/{IMAGE_TYPE}'

# Tiles are written under /tmp. The base folder is created first; transform()
# then creates the per-tile-shape folder inside it.
SPLITTER_OUTPUT_BASE = f'/tmp/semanticSegmentation-{IMAGE_TYPE}'
make_dir(SPLITTER_OUTPUT_BASE)
SPLITTER_OUTPUT_DIR = f'{SPLITTER_OUTPUT_BASE}/{TILE_SHAPE[0]}x{TILE_SHAPE[1]}'

# Build the train/val/test tile folders for the configured tile and stride.
splitter = ImageSplitter(SPLITTER_INPUT_DIR)
splitter.transform(TILE_SHAPE, STRIDE_SHAPE, SPLITTER_OUTPUT_DIR)

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Class Distribution

In [None]:
from collections import Counter
import pandas as pd
# Render floats in displayed DataFrames with two decimal places.
pd.options.display.float_format = '{:.2f}'.format

In [None]:
# Maps the integer value found in mask pixels to a human-readable class name.
# The keys become the index of the class-distribution table.
SEG_CLASSES = {
    0: "Background",
    1: "Thick Ice",
    2: "Thin Ice",
    3: "Shadow",
    4: "Open Water",
    5: "Unknown"
}

In [None]:
class DistributionIlustrator:
  """Summarises how often each segmentation class occurs in a folder of masks."""

  def __init__(self, input_dir):
    """Lists the mask files found in ``{input_dir}/mask``."""
    self.msk_list = glob.glob(f'{input_dir}/mask/*')

  def show(self):
    """Returns a DataFrame with one row per entry of ``SEG_CLASSES``.

    Columns:
      Labels: Class name.
      Count: Number of mask pixels holding the class value.
      Percentage: Share of all counted pixels, in percent.
      Weight: Count of the most frequent value divided by the class count;
        ``inf`` for classes that never occur.

    When no mask files were found, all counts are 0 and the derived columns
    are NaN instead of the call failing.
    """
    counts = self.__calculate_class_distribution()
    return self.__build_pandas_metrics(counts)

  def __calculate_class_distribution(self):
    """Adds up the per-value pixel counts of all masks into one Counter."""
    counts = Counter()
    for msk_path in self.msk_list:
      counts.update(self.__get_mask_count(msk_path))
    return counts

  def __get_mask_count(self, msk_path):
    """Returns ``{pixel value: number of pixels}`` for a single mask file."""
    msk = Image.open(msk_path).convert('L')
    unique, counts = np.unique(msk, return_counts=True)
    # Plain Python ints keep the Counter keys independent of NumPy scalar types.
    return {int(value): int(count) for value, count in zip(unique, counts)}

  def __build_pandas_metrics(self, counts):
    """Builds the per-class summary table from the accumulated counts."""
    total_count = sum(counts.values())
    # default=0 covers the case where no mask contributed any counts.
    max_count = max(counts.values(), default=0)

    df = pd.DataFrame.from_dict(SEG_CLASSES, orient='index', columns=['Labels'])
    # A Counter yields 0 for class values that never occurred.
    df['Count'] = [counts[label] for label in df.index]
    df['Percentage'] = df['Count'] / total_count * 100
    df['Weight'] = max_count / df['Count']

    return df.sort_index()

In [None]:
# Class distribution of the mask tiles in the training split.
selector = DistributionIlustrator(f'{SPLITTER_OUTPUT_DIR}/train')
selector.show()

Unnamed: 0,Labels,Count,Percentage,Weight
0,Background,7264015,0.41,196.31
1,Thick Ice,1425977382,81.03,1.0
2,Thin Ice,204035798,11.59,6.99
3,Shadow,49824908,2.83,28.62
4,Open Water,72670569,4.13,19.62
5,Unknown,0,0.0,inf


In [None]:
# Class distribution of the mask tiles in the validation split.
selector = DistributionIlustrator(f'{SPLITTER_OUTPUT_DIR}/val') 
selector.show()

Unnamed: 0,Labels,Count,Percentage,Weight
0,Background,1184723,0.34,236.56
1,Thick Ice,280258243,80.96,1.0
2,Thin Ice,47537348,13.73,5.9
3,Shadow,14242071,4.11,19.68
4,Open Water,2938767,0.85,95.37
5,Unknown,0,0.0,inf


Examples Selection

In [None]:
class ExamplesSelector:
  """Copies a random selection of the most class-diverse tiles to a folder."""

  def __init__(self, input_dir):
    """Lists the tiles to choose from.

    Args:
      input_dir: Directory containing an ``image`` and a ``mask`` folder.
        Images and masks are paired by position in the sorted listings.

    Raises:
      ValueError: If the number of images and masks differs, because the
        positional pairing would then attach masks to the wrong images.
    """
    self.img_list = sorted(glob.glob(f'{input_dir}/image/*'))
    self.msk_list = sorted(glob.glob(f'{input_dir}/mask/*'))
    if len(self.img_list) != len(self.msk_list):
      raise ValueError(
          f'Found {len(self.img_list)} images but {len(self.msk_list)} masks '
          f'under {input_dir}; they are paired by sorted position and must match.'
      )

  def transform(self, tile_shape, output_path, n_samples=24):
    """Selects example tiles and saves them below ``output_path``.

    Args:
      tile_shape: Accepted for interface compatibility; not used by the
        selection or the copy.
      output_path: Directory receiving the ``image`` and ``mask`` folders.
      n_samples: Maximum number of distinct tiles to select.
    """
    make_dir(output_path)
    samples_list = self.__select_samples(n_samples)
    self.__process_samples(samples_list, tile_shape, output_path)

  def __select_samples(self, n_samples):
    """Returns up to ``n_samples`` distinct indices of the most diverse tiles.

    Candidates are the tiles whose mask holds the highest number of distinct
    values found in the folder. Sampling is done without replacement, so fewer
    than ``n_samples`` indices are returned when there are fewer candidates,
    and an empty list when there are no tiles at all.
    """
    # Each mask is opened once; the metric is reused for ranking and filtering.
    metrics = [self.__get_order_metric(index) for index in range(len(self.img_list))]
    if not metrics:
      return []

    best_value = max(metrics)
    candidates = [index for index, value in enumerate(metrics) if value == best_value]
    return random.sample(candidates, min(n_samples, len(candidates)))

  def __get_order_metric(self, img_index):
    """Returns the number of distinct pixel values in the tile's mask."""
    msk = Image.open(self.msk_list[img_index]).convert('L')
    unique_elements = np.unique(msk)
    return len(unique_elements)

  def __process_samples(self, index_list, tile_shape, output_path):
    """Creates the output folders and copies every selected tile into them."""
    make_dir(output_path)
    make_dir(f'{output_path}/image')
    make_dir(f'{output_path}/mask')
    for index in tqdm(index_list):
      self.__process_element(index, tile_shape, output_path)

  def __process_element(self, img_index, tile_shape, output_path):
    """Re-saves one image/mask pair; both files get the image's base name."""
    img = Image.open(self.img_list[img_index]).convert('RGB')
    msk = Image.open(self.msk_list[img_index]).convert('L')

    basename = os.path.basename(self.img_list[img_index])
    img.save(f'{output_path}/image/{basename}')
    msk.save(f'{output_path}/mask/{basename}')

In [None]:
# Examples are drawn from the validation tiles produced by the splitter.
SELECTOR_BASE_FOLDER = SPLITTER_OUTPUT_DIR
SELECTOR_INPUT_DIR = f'{SELECTOR_BASE_FOLDER}/val'

# The examples go to their own folder under /tmp. The base folder is created
# first; transform() then creates the per-tile-shape folder inside it.
SELECTOR_OUTPUT_BASE = f'/tmp/semanticSegmentation-{IMAGE_TYPE}-examples'
make_dir(SELECTOR_OUTPUT_BASE)
SELECTOR_OUTPUT_DIR = f'{SELECTOR_OUTPUT_BASE}/{TILE_SHAPE[0]}x{TILE_SHAPE[1]}'

# Select the example tiles and copy them to the output folder.
selector = ExamplesSelector(SELECTOR_INPUT_DIR)
selector.transform(TILE_SHAPE, SELECTOR_OUTPUT_DIR)

  0%|          | 0/24 [00:00<?, ?it/s]

Artifacts Upload

In [None]:
!pip install -q wandb

[K     |████████████████████████████████| 1.7 MB 23.0 MB/s 
[K     |████████████████████████████████| 144 kB 43.6 MB/s 
[K     |████████████████████████████████| 181 kB 50.7 MB/s 
[K     |████████████████████████████████| 63 kB 1.7 MB/s 
[?25h  Building wheel for pathtools (setup.py) ... [?25l[?25hdone


In [None]:
import wandb

In [None]:
# Weights & Biases project and entity that receive the dataset artifacts.
WANDB_PROJECT = 'IceClassificationGoodlightCLAHE'
WANDB_ENTITY = 'semanticsegmentation'

# Start a run tagged as a dataset-creation job; the artifacts are logged to it.
run = wandb.init(
    project=WANDB_PROJECT,
    entity=WANDB_ENTITY,
    job_type="dataset-creation"
  )

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize


wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit: ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
# Log the full tiled dataset (train/val/test) as a dataset artifact, stored
# inside the artifact under a folder named after the tile shape.
tiles_dataset_artifact = wandb.Artifact(f'ice-tiles-dataset-{IMAGE_TYPE}', type='dataset')
tiles_dataset_artifact.add_dir(SPLITTER_OUTPUT_DIR, name=f'{TILE_SHAPE[0]}x{TILE_SHAPE[1]}')
run.log_artifact(tiles_dataset_artifact)

[34m[1mwandb[0m: Adding directory to artifact (/tmp/semanticSegmentation-goodlight-clahe/256x256)... Done. 249.0s


<wandb.sdk.wandb_artifacts.Artifact at 0x7fd473048a10>

In [None]:
# Log the selected example tiles as a second dataset artifact, stored inside
# the artifact under a folder named after the tile shape.
tiles_examples_artifact = wandb.Artifact(f'ice-tiles-examples-{IMAGE_TYPE}', type='dataset')
tiles_examples_artifact.add_dir(SELECTOR_OUTPUT_DIR, name=f'{TILE_SHAPE[0]}x{TILE_SHAPE[1]}')
run.log_artifact(tiles_examples_artifact)

[34m[1mwandb[0m: Adding directory to artifact (/tmp/semanticSegmentation-goodlight-clahe-examples/256x256)... Done. 0.2s


<wandb.sdk.wandb_artifacts.Artifact at 0x7fd46e710590>

In [None]:
# Close the Weights & Biases run once both artifacts have been logged.
run.finish()

VBox(children=(Label(value=' 8116.85MB of 8116.85MB uploaded (1.13MB deduped)\r'), FloatProgress(value=1.0, ma…