In [1]:
!pip install albumentations tensorflow tensorflow-io Pillow pyarrow==0.15.* libtiff

Collecting albumentations
[?25l  Downloading https://files.pythonhosted.org/packages/f6/c4/a1e6ac237b5a27874b01900987d902fe83cc469ebdb09eb72a68c4329e78/albumentations-0.4.3.tar.gz (3.2MB)
[K     |████████████████████████████████| 3.2MB 4.8MB/s eta 0:00:01
[?25hCollecting tensorflow
[?25l  Downloading https://files.pythonhosted.org/packages/1d/56/0dbdae2a3c527a119bec0d5cf441655fe030ce1daa6fa6b9542f7dbd8664/tensorflow-2.1.0-cp37-cp37m-manylinux2010_x86_64.whl (421.8MB)
[K     |████████████████████████████████| 421.8MB 27kB/s s eta 0:00:01     |████████████████▏               | 213.8MB 28.9MB/s eta 0:00:08     |█████████████████████████████▋  | 390.8MB 29.3MB/s eta 0:00:02     |██████████████████████████████▍ | 400.4MB 29.3MB/s eta 0:00:01
[?25hCollecting tensorflow-io
[?25l  Downloading https://files.pythonhosted.org/packages/12/bf/d6845f480310a6bfd070cafda5f87f9c534e8d8293010c18119c739ec00e/tensorflow_io-0.11.0-cp37-cp37m-manylinux2010_x86_64.whl (23.7MB)
[K     |███████████████

Collecting requests-oauthlib>=0.7.0
  Downloading https://files.pythonhosted.org/packages/a3/12/b92740d845ab62ea4edf04d2f4164d82532b5a0b03836d4d4e71c6f3d379/requests_oauthlib-1.3.0-py2.py3-none-any.whl
Collecting pyasn1-modules>=0.2.1
[?25l  Downloading https://files.pythonhosted.org/packages/95/de/214830a981892a3e286c3794f41ae67a4495df1108c3da8a9f62159b9a9d/pyasn1_modules-0.2.8-py2.py3-none-any.whl (155kB)
[K     |████████████████████████████████| 163kB 29.3MB/s eta 0:00:01
[?25hCollecting rsa<4.1,>=3.1.4
  Downloading https://files.pythonhosted.org/packages/02/e5/38518af393f7c214357079ce67a317307936896e961e35450b70fad2a9cf/rsa-4.0-py2.py3-none-any.whl
Collecting cachetools<5.0,>=2.0.0
  Downloading https://files.pythonhosted.org/packages/08/6a/abf83cb951617793fd49c98cb9456860f5df66ff89883c8660aa0672d425/cachetools-4.0.0-py3-none-any.whl
Collecting pyasn1<0.5.0,>=0.4.6
[?25l  Downloading https://files.pythonhosted.org/packages/62/1e/a94a8d635fa3ce4cfc7f506003548d0a2447ae76fd5ca539

In [2]:
import os
import random
import time

import numpy as np
import pyarrow as pa

import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
import tensorflow as tf

root = '/home/jovyan/work'
# Based on `core-site.xml` in https://github.com/GoogleCloudPlatform/bigdata-interop/blob/master/gcs/INSTALL.md doc
spark = SparkSession.builder \
    .master("local") \
    .appName("big_earth") \
    .config("spark.driver.extraClassPath", root + "/spark_dependencies/gcs-connector-hadoop2-latest.jar") \
    .config("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS") \
    .config("fs.gs.project.id", "big-earth-252219") \
    .config("google.cloud.auth.service.account.enable", "true") \
    .config("google.cloud.auth.service.account.json.keyfile", root + "/.gcs/big-earth-252219-fb2e5c109f78.json") \
    .getOrCreate()
sc = spark.sparkContext

In [358]:
x = 'S2A_MSIL2A_20170613T101031_6_59'
compressed_array = io.BytesIO()

img = Image.open(f"{root}/data/{x}/{x}_B02.tif")
# img.save(compressed_array, format="tiff", compression="tiff_deflate")

# b = io.BytesIO(compressed_array.getbuffer())
# l = Image.open(b)
# np.array(l)

# img = imageio.imread(f"{root}/data/{x}/{x}_B02.tif", 'tiff')
# imageio.imsave(uri=compressed_array, im=img, format="tiff", compression="9")

In [13]:
metadata = spark.read.format("csv").option('header', 'true').load(root + '/data/metadata.csv')

In [14]:
dataset_size = metadata.count()
sample_size = 20000
frac = sample_size / dataset_size

sample = metadata.sample(withReplacement=False, fraction=frac, seed=0)
sample.count()

20069

In [15]:
image_prefixes = sample.select("image_prefix").collect()
image_prefixes[:5]

[Row(image_prefix='S2A_MSIL2A_20180527T093041_44_28'),
 Row(image_prefix='S2A_MSIL2A_20180219T094031_28_78'),
 Row(image_prefix='S2A_MSIL2A_20180228T101021_87_40'),
 Row(image_prefix='S2B_MSIL2A_20180224T112109_24_0'),
 Row(image_prefix='S2A_MSIL2A_20170617T113321_84_62')]

In [28]:
output_url = 'file://' + root + '/data/parquet_images'
fields = [StructField("image_name", StringType()), 
          StructField("red_byte_array", BinaryType()),
          StructField("blue_byte_array", BinaryType()),
          StructField("green_byte_array", BinaryType()),          
         ]
spark_image_schema = StructType(fields)

import io
import zlib
import imageio
from PIL import Image

def parquet_generator(spark_row):
    """Creates parquet file from single band image files, with each band compressed and converted to a byte array."""
    image_prefix = spark_row.image_prefix
#     image_prefix = 'S2A_MSIL2A_20180527T093041_44_28'
    def load_and_compress_band(band):
        img = Image.open(f"{root}/data/big_earth/BigEarthNet-v1.0/{image_prefix}/{image_prefix}_B{band}.tif")
        compressed_array = io.BytesIO()
        img.save(compressed_array, format='tiff', compression="tiff_deflate")
        return bytearray(compressed_array.getbuffer())
#     imageio.save(uri=compressed_array, im=img, format="tiff", compression="tiff_deflate")
#     return [x, compressed_array.getvalue()]
    return [image_prefix, load_and_compress_band('02'), load_and_compress_band('03'), load_and_compress_band('04')]



def generate_parquet_dataset_from_image_files(spark, output_url):
    rows_rdd = sc.parallelize(image_prefixes[:5000])\
       .map(parquet_generator)

    spark.createDataFrame(rows_rdd, spark_image_schema) \
       .write \
       .mode('overwrite') \
       .parquet(output_url)
    
start = time.time()  
generate_parquet_dataset_from_image_files(spark, output_url)

print(time.time() - start, 'seconds')
# 0.41523170471191406 seconds

174.52551078796387 seconds


In [31]:
import glob
import pyarrow.parquet as pq

parquet_files = glob.glob(root + "/data/parquet_images/*.parquet")
parquet_file_row_groups = []
total_num_rows = 0
for parquet_file in parquet_files:
    open_parquet_handle = pq.ParquetFile(parquet_file)
    for row_group in range(open_parquet_handle.num_row_groups):
        parquet_file_row_groups.append(
            (open_parquet_handle, row_group)
        )

In [38]:
from tensorflow_io.arrow import ArrowDataset

import sys

def compress_img(parquet_file_row_group):
    parquet_file = parquet_file_row_group[0]
    row_group = parquet_file_row_group[1]
    table = parquet_file.read_row_group(row_group).to_pandas()

    print(sys.getsizeof(table)/1e6)

    def deserialize_to_image(image_bytes):
        return Image.open(io.BytesIO(image_bytes))

    for img_field in ['red_byte_array', 'blue_byte_array', 'green_byte_array']:
        table[img_field] = table[img_field].apply(deserialize_to_image)

    print(sys.getsizeof(table)/1e6)
    return table

import pandas as pd

df = pd.concat([compress_img(parquet_file_row_group) for parquet_file_row_group in parquet_file_row_groups])


134.477395
0.462523
134.493969
0.46483
33.854645
0.116338


In [42]:
sys.getsizeof(df)/1e6

1.083259

In [124]:
def images_from_parquet():
#     queue = tf.RandomShuffleQueue(1000, min_after_dequeue=1, dtypes=tf.float32)
    indexes = [num for num in range(len(parquet_file_row_groups))]
    random.shuffle(indexes)
    for index in indexes:
        parquet_file, row_group = parquet_file_row_groups[index]
        table = parquet_file.read_row_group(row_group)

        def deserialize_to_np(image_bytes):
            return np.frombuffer(image_bytes, dtype=np.uint16).reshape(120, 120, 3)

        df = table.to_pandas()
        df['image_tensor'] = df['image_bytes'].apply(deserialize_to_np)
        
        for index, row in df.iterrows():
            yield row['image_name'], row['image_tensor']
            
def images_from_parquet(index):
    parquet_file, row_group = parquet_file_row_groups[index]
#     queue = tf.RandomShuffleQueue(1000, min_after_dequeue=1, dtypes=tf.float32)
    parquet_file = parquet_file_and_row_group[0]
    row_group = parquet_file_and_row_group[1]
    table = parquet_file.read_row_group(row_group)

    def deserialize_to_np(image_bytes):
        return np.frombuffer(image_bytes, dtype=np.uint16).reshape(120, 120, 3)

    df = table.to_pandas()
    df['image_tensor'] = df['image_bytes'].apply(deserialize_to_np)

    for index, row in df.iterrows():
        yield row['image_name'], row['image_tensor']


indexes = [num for num in range(len(parquet_file_row_groups))]
dataset = tf.data.Dataset.from_tensor_slices(indexes) 
            
def benchmark(dataset, num_epochs=2):
    start_time = time.perf_counter()
    for epoch_num in range(num_epochs):
        for sample in dataset:
            # Performing a training step
            time.sleep(0.01)
    tf.print("Execution time:", time.perf_counter() - start_time)
    
augmented_image_dataset = dataset.interleave(
    images_from_parquet,
    block_length=4,
    num_parallel_calls=4
).prefetch(tf.data.experimental.AUTOTUNE)

benchmark(augmented_image_dataset)



TypeError: Unsupported return value from function passed to Dataset.interleave(): <generator object images_from_parquet at 0x7fac1c1982a0>.

In [42]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

ImageDataGenerator

In [None]:
from petastorm import make_batch_reader
from petastorm.tf_utils import make_petastorm_dataset
from albumentations import (
    Compose, Flip, VerticalFlip, Resize, Rotate, ToFloat
)

augmentations_train = Compose([
    Flip(p=0.5),
    Rotate(limit=(0, 360), p=0.5)
])


def foo(img):
    print(img)
    return img
    
with make_batch_reader(output_url, num_epochs=2) as reader:
    print(dir(reader))
#     dataset = make_petastorm_dataset(reader).map(foo)
#     for data in dataset:
#         break
        
        

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__enter__', '__eq__', '__exit__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__next__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_apply_predicate_to_row_groups', '_apply_row_group_selector', '_create_ventilator', '_filter_row_groups', '_normalize_shuffle_options', '_partition_row_groups', '_results_queue_reader', '_workers_pool', 'batched_output', 'dataset', 'diagnostics', 'is_batched_reader', 'join', 'last_row_consumed', 'next', 'ngram', 'reset', 'schema', 'stop', 'stopped', 'ventilator']


In [None]:
img