In [11]:
import time
import json 
from hashlib import sha256
import os
import shutil

import pandas as pd
from PIL import Image
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
root = '/home/jovyan/work'

In [5]:
import logging
import glob

logger = logging.getLogger()
json_files_dir = root + "/data/big_earth/BigEarthNet-V1.0"
csv_output_dir = root + "/data/big_earth/metadata"

logger.info('test')

if not os.path.exists(csv_output_dir):
    os.mkdir(csv_output_dir)

In [12]:
def metadata_files_from_json_to_csv(logger, cloud_and_snow_csv_dir, json_dir, csv_files_path):
    if not os.path.exists(csv_files_path):
        os.mkdir(csv_files_path)
    else:
        shutil.rmtree(csv_files_path)

    # From BigEarth team: we used the same labels of the CORINE Land Cover​ program operated by the European Environment
    # Agency. You can check the label names from
    # https://land.copernicus.eu/user-corner/technical-library/corine-land-cover-nomenclature-guidelines/html/.
    replacements = {
        'Bare rocks': 'Bare rock',
        'Natural grasslands': 'Natural grassland',
        'Peat bogs': 'Peatbogs',
        'Transitional woodland-shrub': 'Transitional woodland/shrub'
    }

    def multi_replace(arr):
        return [replacements[el] if replacements.get(el) is not None else el for el in arr]

    def read_and_augment_metadata(mlb, json_metadata_file):
        with open(json_metadata_file) as fileobj:
            obj = json.load(fileobj)
            obj['labels'] = multi_replace(obj['labels'])
            obj['labels_sha256_hexdigest'] = sha256('-'.join(obj['labels']).encode('utf-8')).hexdigest()
            obj['binarized_labels'] = mlb.transform([obj['labels']])
            obj['image_prefix'] = json_metadata_file.rsplit('/')[-2]
            return obj

    def json_metadata_from_files(json_metadata_files, mlb):
        return [read_and_augment_metadata(json_metadata_file, mlb) for json_metadata_file in json_metadata_files]

    start = time.time()
    glob_path = json_dir + '/**/*.json'
    paths = glob.glob(glob_path)
    logger.info(f"Fetched {len(paths)} paths. in {time.time() - start} seconds.")
    start = time.time()

    # 44 level 3 classes:
    # Currently using:
    # https://land.copernicus.eu/user-corner/technical-library/corine-land-cover-nomenclature-guidelines/html/
    classes = ["Continuous urban fabric", "Discontinuous urban fabric", "Industrial or commercial units",
           "Road and rail networks and associated land", "Port areas", "Airports", "Mineral extraction sites",
           "Dump sites",
           "Construction sites", "Green urban areas", "Sport and leisure facilities", "Non-irrigated arable land",
           "Permanently irrigated land", "Rice fields", "Vineyards", "Fruit trees and berry plantations",
           "Olive groves",
           "Pastures", "Annual crops associated with permanent crops", "Complex cultivation patterns",
           "Land principally occupied by agriculture, with significant areas of natural vegetation",
           "Agro-forestry areas",
           "Broad-leaved forest", "Coniferous forest", "Mixed forest", "Natural grassland", "Moors and heathland",
           "Sclerophyllous vegetation", "Transitional woodland/shrub", "Beaches, dunes, sands", "Bare rock",
           "Sparsely vegetated areas", "Burnt areas", "Glaciers and perpetual snow", "Inland marshes", "Peatbogs",
           "Salt marshes", "Salines", "Intertidal flats", "Water courses", "Water bodies", "Coastal lagoons",
           "Estuaries",
           "Sea and ocean"]

    mlb = MultiLabelBinarizer()
    mlb.fit([classes])
    # sanity check the output
    logger.info(f"Sea and ocean: {mlb.transform([['Sea and ocean']])}")

    json_object_lists = parallelize_task(20, json_metadata_from_files, paths)
    df = pd.concat([pd.DataFrame.from_records(json_object_list) for json_object_list in json_object_lists])
    # Check the dimensions
    assert len(df) == len(paths)
    logger.info(f"Read files into dataframe in {time.time() - start} seconds.")

    # Denote if patch has snow and/or cloudsrandom_state
    snow = pd.read_csv(os.path.join(cloud_and_snow_csv_dir, 'patches_with_seasonal_snow.csv'), header=None, names=['image_prefix'])
    snow_col = 'has_snow'
    snow[snow_col] = 1
    snow = snow.set_index('image_prefix')

    clouds = pd.read_csv(os.path.join(cloud_and_snow_csv_dir, 'patches_with_cloud_and_shadow.csv'), header=None, names=['image_prefix'])
    cloud_col = 'has_cloud_and_shadow'
    clouds[cloud_col] = 1
    clouds = clouds.set_index('image_prefix')

    print(snow.head(3))
    len_snow = len(snow)
    print('\n')
    print(clouds.head(3))
    len_clouds = len(clouds)

    for column in [snow_col, cloud_col]:
        df[column] = 0

    df = df.set_index('image_prefix')
    df.update(snow)
    df.update(clouds)
    assert df[snow_col].sum() == len_snow
    assert df[cloud_col].sum() == len_clouds

    df.to_csv(csv_files_path + '/metadata.csv')

    return df
                
metadata_df = metadata_files_from_json_to_csv(logger=logger, csv_files_path=csv_output_dir,
                                              cloud_and_snow_csv_dir=root + "/data/big_earth", json_dir=json_files_dir)                

NameError: name 'parallelize_task' is not defined

In [80]:
import os

npy_files_path = f"{root}/data/big_earth/npy_files"
if os.path.exists(npy_files_path):
    os.rmdir(npy_files_path, recursive=True)
os.mkdir(npy_files_path)

TypeError: 'recursive' is an invalid keyword argument for rmdir()

In [35]:
metadata = dd.read_csv(csv_output_dir + '/*.csv')
dataset_size = len(metadata)
sample_size = 20000
frac = sample_size / dataset_size
sample = metadata.sample(frac=frac, random_state=0)
print(len(sample))

def image_files_to_npy_file(image_prefix):
    bands = [np.asarray(
    Image.open(f"{root}/data/big_earth/BigEarthNet-v1.0/{image_prefix}/{image_prefix}_B{band}.tif"),
    dtype=np.uint16) for band in ["02", "03", "04"]]
    
    stacked_arr = np.stack(bands, axis=-1)
    np.save(f"{npy_files_path}/{image_prefix}", stacked_arr)

In [36]:
start = time.time()
db.from_sequence(sample['image_prefix'].values, npartitions=50).map(image_files_to_npy_file).compute()
print(time.time() - start)

134.17633366584778


In [103]:
import random
import time

import numpy as np
from tensorflow.keras.utils import Sequence


class AugmentedImageSequence(Sequence):
    def __init__(self, x: np.array, y: np.array, batch_size, augmentations):
        self.x = x
        self.y = y
        self.base_index = [idx for idx in range(len(x))]
        self.batch_size = batch_size
        self.augmentations = augmentations

    def __len__(self):
        return int(np.ceil(len(self.x) / self.batch_size))

    def __getitem__(self, batch_num):
        if batch_num == 0:
            print('getting batch_num', batch_num)
            start = time.time()

        batch_x = self.x[batch_num * self.batch_size:(batch_num + 1) * self.batch_size]

        if self.y is not None:
            batch_y = self.y[batch_num * self.batch_size:(batch_num + 1) * self.batch_size]

        start = time.time()
        images = self.batch_loader(batch_x)

        # training
        if self.y is not None:
            batch_x = np.stack([self.augmentations(image=x)["image"] for x in images], axis=0)

            if batch_num == 0:
                print('fetched batch_num', batch_num, 'in', time.time() - start, 'seconds')

            return batch_x, batch_y
        # test (inference only)
        else:
            return np.array(images)

    def batch_loader(self, image_paths) -> np.array:
        raise NotImplementedError()

    def on_epoch_end(self):
        shuffled_index = self.base_index.copy()
        random.shuffle(shuffled_index)
        self.x = self.x[shuffled_index]

        if self.y is not None:
            self.y = self.y[shuffled_index]

class AugmentedImageSequenceFromNpy(AugmentedImageSequence):
    def __init__(self, x: np.array, y: np.array, batch_size, augmentations):
        super().__init__(x=x, y=y, batch_size=batch_size, augmentations=augmentations)

    def batch_loader(self, image_paths) -> np.array:
        return np.array([np.load(image_path) for image_path in image_paths])
    
    
class AugmentedImageSequenceFromTiff(AugmentedImageSequence):
    def __init__(self, x: np.array, y: np.array, batch_size, augmentations):
        super().__init__(x=x, y=y, batch_size=batch_size, augmentations=augmentations)

    def batch_loader(self, image_paths) -> np.array:
        return np.array([self.load_image_bands_from_disk(image_path) for image_path in image_paths])

    def load_image_bands_from_disk(self, base_filename):
        bands = []
        for band in ["02", "03", "04"]:
            bands.append(np.array(Image.open(base_filename.format(band)), dtype=np.uint16))
        return np.stack(bands, axis=-1)

In [39]:
from albumentations import (
    Compose, Flip, VerticalFlip, Resize, Rotate, ToFloat
)
import time

AUGMENTATIONS_TRAIN = Compose([
    Flip(p=0.5),
    Rotate(limit=(0, 360), p=0.5)
])

AUGMENTATIONS_TEST = Compose([])

In [114]:
tiff_files_path = root + "/data/big_earth/BigEarthNet-V1.0"

xtrain_npy = (npy_files_path + "/" + sample.iloc[:5000]['image_prefix'] + ".npy").values
xtrain_tiff = (tiff_files_path + "/" + sample.iloc[:5000]['image_prefix'] + "/" +
               sample.iloc[:5000]['image_prefix'] + "_B{}.tif").values

ytrain = np.array([np.random.randn(1, 44) for _ in range(len(xtrain))])

In [119]:
batch_size = 128
np_sequence = AugmentedImageSequenceFromNpy(x=xtrain_npy, y=ytrain, batch_size=batch_size,
                                  augmentations=AUGMENTATIONS_TRAIN)
tiff_sequence = AugmentedImageSequenceFromTiff(x=xtrain_tiff, y=ytrain, batch_size=batch_size,
                                  augmentations=AUGMENTATIONS_TRAIN)

def benchmark(sequence):
    start = time.time()
    for x, y in sequence:
        # simulate training step
        time.sleep(0.01)
    print("finished epoch in", time.time() - start, "seconds")

print('np_sequence benchmark')
benchmark(np_sequence)

print('\n')
print('tiff sequence benchmark')
benchmark(tiff_sequence)

np_sequence benchmark
getting batch_num 0
fetched batch_num 0 in 0.606560230255127 seconds
finished epoch in 24.310715675354004 seconds


tiff sequence benchmark
getting batch_num 0
fetched batch_num 0 in 2.3324599266052246 seconds
finished epoch in 96.83438086509705 seconds
