# Segment TIFF images

In [1]:
import csv
import math
from dataclasses import dataclass
from pathlib import Path

import numpy as np
import sklearn.model_selection as msel
from skimage import io
from tqdm import tqdm

In [2]:
DATA_DIR = Path("..") / "data"
LAYER_DIR = DATA_DIR / "layers"

STRIPE_CSV = DATA_DIR / "stripes.csv"

## Look at image properties

For each layer print the height, width, data type, minimum, and maximum values.

In [3]:
FA = io.imread(LAYER_DIR / "fa.tif")
FA.shape, FA.dtype, FA.min(), FA.max()

((41668, 19981), dtype('float32'), 0.0, 3.4e+38)

In [4]:
SLOPE = io.imread(LAYER_DIR / "slope.tif")
SLOPE.shape, SLOPE.dtype, SLOPE.min(), SLOPE.max()

((41668, 19981), dtype('float32'), -3.4028235e+38, 0.9276394)

In [5]:
WETNESS = io.imread(LAYER_DIR / "wetness.tif")
WETNESS.shape, WETNESS.dtype, WETNESS.min(), WETNESS.max()

((41668, 19981), dtype('float32'), -3.4028235e+38, 53.34468)

In [6]:
DEM = io.imread(LAYER_DIR / "dem.tif")
DEM.shape, DEM.dtype, DEM.min(), DEM.max()

((41668, 19981), dtype('float32'), -3.4028235e+38, 53.149834)

In [7]:
LARV = io.imread(LAYER_DIR / "larv_spot_50m_correct.tif")
LARV.shape, LARV.dtype, LARV.min(), LARV.max()

((41668, 19981), dtype('float32'), 0.0, 3.4e+38)

It appears that "Not a Number" (NaN) values are represented by the largest or smallest float32 value.

Let's compare with the true minimum and maximum values for float32. None of the used map values are anywhere near to these extrema which makes them easy to identify and filter.

In [8]:
np.finfo(np.float32).min, np.finfo(np.float32).max

(-3.4028235e+38, 3.4028235e+38)

I'll give myself some wiggle room for the constants that I'll use to detect NA values.

In [9]:
NA_LO = -3.0e38
NA_HI = 3.0e38

## How many tiles can we actually use?

I only have one large image (with 4 layers) for training, validation, and testing. The strategy is to pretend that I've got several images by slicing the large image into several smaller images.

I'll start with an arbitrary tile size of 512 x 512 pixels high & wide.

In [10]:
TILE_SIZE = 512

In [11]:
ROWS, COLS = FA.shape

## Segment the images

Dataset distribution strategy:
1. Slice the images into 81 rows of tile sized data.
2. Randomly assign the rows to the three datasets.
3. Keep the sets the same between runs by pinning random state.

There is a triangle at the top of the images that has no targets. Should I include that? For now, "Yes."

Using an approximately 60/20/20% (train/val/test) split there will be 16 validation rows, 16 testing rows, and (81 - 32 =) 49 training rows.

#### Put image rows into datasets

In [12]:
# One index per full TILE_SIZE-high row of the image; the floor division
# drops any partial row left over at the bottom.
ALL_ROWS = list(range(ROWS // TILE_SIZE))

# First split: training rows vs. everything else. random_state is pinned so
# the assignment is the same between runs.
# NOTE(review): train_size is 0.61 rather than 0.60; presumably chosen so the
# split lands on the 49/16/16 counts shown below this cell -- TODO confirm.
TRAIN_INDEXES, others = msel.train_test_split(
    ALL_ROWS, train_size=0.61, random_state=4486
)
# Second split: divide the remaining rows evenly between validation and test.
VAL_INDEXES, TEST_INDEXES = msel.train_test_split(
    others, test_size=0.5, random_state=9241
)

# Sort each set so that adjacent row indexes end up next to each other.
TRAIN_INDEXES = sorted(TRAIN_INDEXES)
VAL_INDEXES = sorted(VAL_INDEXES)
TEST_INDEXES = sorted(TEST_INDEXES)

# Notebook display: total row count followed by the size of each dataset.
len(ALL_ROWS), len(TRAIN_INDEXES), len(VAL_INDEXES), len(TEST_INDEXES)

(81, 49, 16, 16)

#### Group adjacent indexes

Now that the row indices are assigned to the datasets I will group rows that are adjacent into one big row. So if rows 24 & 25 are both assigned to the "val" dataset I group them into one bigger row. The groups are given as ranges so the 24 & 25 case is written as (24, 26); remember that Python ranges are half-open, i.e. they exclude the upper bound. Just to be clear, an ungrouped row like 0 is written as (0, 1).

I do this so that I can squeeze out more tiles from each stripe. If we allow tiles to overlap then grouped rows will allow for many more tiles by allowing the tiles to float vertically. I should be careful with unaugmented tiles (val/test), and limit their overlap. I don't have to use every possible tile in a dataset.

In [13]:
def group_rows(indexes):
    """Collapse sorted row indexes into half-open ranges of adjacent rows.

    Consecutive indexes are merged into a single ``(beg, end)`` tuple where
    ``end`` is exclusive, so ``[24, 25]`` becomes ``[(24, 26)]`` and a lone
    index ``0`` becomes ``(0, 1)``.

    Args:
        indexes: Row indexes in ascending order.

    Returns:
        A list of ``(beg, end)`` tuples, one per run of adjacent indexes.
        An empty input yields an empty list.
    """
    # len() rather than truthiness so array-like inputs are accepted too.
    if len(indexes) == 0:
        return []

    grouped = []
    group_beg = indexes[0]
    group_end = group_beg + 1

    for i in indexes[1:]:
        if i != group_end:
            # Gap found: close the current run and start a new one at i.
            grouped.append((group_beg, group_end))
            group_beg = i
        group_end = i + 1

    # Close the last run using group_end, not the loop variable: the loop
    # body never runs for a single-element input, leaving `i` unbound.
    grouped.append((group_beg, group_end))
    return grouped

Print grouped rows.

In [14]:
def print_grouped_rows(indexes, grouped):
    """Print the raw indexes on one line and their grouped ranges on the next.

    Every item is followed by a single space. The first line ends with a
    newline; the second line does not.
    """
    index_line = "".join(str(index) + " " for index in indexes)
    print(index_line)

    group_line = "".join(str(group) + " " for group in grouped)
    print(group_line, end="")

In [15]:
TRAIN_GROUPS = group_rows(TRAIN_INDEXES)
print_grouped_rows(TRAIN_INDEXES, TRAIN_GROUPS)

0 3 4 5 6 9 10 13 14 16 17 19 21 22 23 26 28 30 31 32 36 37 38 41 42 43 44 45 48 49 52 53 54 57 59 60 62 63 65 66 67 68 69 73 74 75 76 78 79 
(0, 1) (3, 7) (9, 11) (13, 15) (16, 18) (19, 20) (21, 24) (26, 27) (28, 29) (30, 33) (36, 39) (41, 46) (48, 50) (52, 55) (57, 58) (59, 61) (62, 64) (65, 70) (73, 77) (78, 80) 

In [16]:
VAL_GROUPS = group_rows(VAL_INDEXES)
print_grouped_rows(VAL_INDEXES, VAL_GROUPS)

1 7 18 24 25 29 33 35 47 50 55 58 61 64 70 80 
(1, 2) (7, 8) (18, 19) (24, 26) (29, 30) (33, 34) (35, 36) (47, 48) (50, 51) (55, 56) (58, 59) (61, 62) (64, 65) (70, 71) (80, 81) 

In [17]:
TEST_GROUPS = group_rows(TEST_INDEXES)
print_grouped_rows(TEST_INDEXES, TEST_GROUPS)

2 8 11 12 15 20 27 34 39 40 46 51 56 71 72 77 
(2, 3) (8, 9) (11, 13) (15, 16) (20, 21) (27, 28) (34, 35) (39, 41) (46, 47) (51, 52) (56, 57) (71, 73) (77, 78) 

#### Tiles with data

A lot of the potential tiles are completely blank and I don't want to train on them so I'll keep a record of where the data is in each row.

In [18]:
@dataclass
class Stripe:
    """One tile-high horizontal band of the image and where its data lies.

    Instances are created by build_stripes and written out as rows of the
    stripes CSV.
    """

    dataset: str  # Dataset name: "train", "val", or "test"
    row: int  # Top pixel row of the stripe
    beg: int  # Left column of the first tile with data (999999 if no tile has data)
    end: int  # Left column of the last tile with data (-999999 if no tile has data)

#### Does the tile contain data?

In [19]:
def has_data(row, col):
    """Report whether the FA tile with top-left pixel (row, col) holds any data.

    A pixel counts as data when its value lies strictly between NA_LO and
    NA_HI. Tiles that run past the image edge are simply clipped by slicing.

    Returns:
        A boolean that is true if at least one pixel in the tile is data.
    """
    rows = slice(row, row + TILE_SIZE)
    cols = slice(col, col + TILE_SIZE)
    window = FA[rows, cols]

    above_lo = window > NA_LO
    below_hi = window < NA_HI
    return (above_lo & below_hi).any()

#### Build stripes

In [20]:
def build_stripes(groups, pixel_stride, dataset):
    """Scan each row group and record the column extent of tiles with data.

    Args:
        groups: Half-open ``(beg, end)`` ranges of tile-row indexes.
        pixel_stride: Step, in pixels, between successive stripe rows and
            between successive tile columns tested.
        dataset: Dataset label stored on every stripe produced.

    Returns:
        A list with one Stripe per scanned pixel row. A stripe whose tiles
        contain no data keeps the sentinels 999999 / -999999 as beg / end.
    """
    no_data_beg = 999999
    no_data_end = -999999

    stripes = []
    for first_row, stop_row in tqdm(groups):
        top = first_row * TILE_SIZE
        bot = stop_row * TILE_SIZE
        # NOTE(review): has_data reads TILE_SIZE rows starting at `row`, so
        # stripes starting less than TILE_SIZE above `bot` extend past the
        # group's bottom edge -- confirm that overlap into the neighbouring
        # group is intended.
        for row in range(top, bot, pixel_stride):
            hits = [
                col for col in range(0, COLS, pixel_stride) if has_data(row, col)
            ]
            # Seeding with the sentinels reproduces the running min/max.
            left = min([no_data_beg, *hits])
            right = max([no_data_end, *hits])
            stripes.append(Stripe(dataset, row, left, right))
    return stripes

In [21]:
TRAIN_STRIPES = build_stripes(TRAIN_GROUPS, pixel_stride=8, dataset="train")
print(len(TRAIN_STRIPES))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [22:43<00:00, 68.16s/it]

3136





In [30]:
VAL_STRIPES = build_stripes(VAL_GROUPS, pixel_stride=8, dataset="val")
print(len(VAL_STRIPES))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [07:18<00:00, 29.23s/it]

1024





In [31]:
TEST_STRIPES = build_stripes(TEST_GROUPS, pixel_stride=8, dataset="test")
print(len(TEST_STRIPES))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [07:24<00:00, 34.16s/it]

1024





In [32]:
ALL_STRIPES = TRAIN_STRIPES + VAL_STRIPES + TEST_STRIPES

I'll write this out so that I don't have to do this calculation over and over again.

In [34]:
# Persist every stripe so later runs can load them instead of rescanning the
# image. The csv module requires files to be opened with newline="";
# without it, platforms that translate "\n" (e.g. Windows) get a blank line
# after every row.
with open(STRIPE_CSV, "w", newline="") as f:
    # Column names come from the fields of the first stripe.
    writer = csv.DictWriter(f, fieldnames=list(vars(ALL_STRIPES[0])))
    writer.writeheader()
    writer.writerows(vars(stripe) for stripe in ALL_STRIPES)