In [1]:
!pip install pyarrow==0.15.*



In [190]:
!pip install albumentations tensorflow tensorflow-io imageio Pillow



In [34]:
import os
import random
import time

import numpy as np
import pyarrow as pa

import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
import tensorflow as tf

root = '/home/jovyan/work'
# Based on `core-site.xml` in https://github.com/GoogleCloudPlatform/bigdata-interop/blob/master/gcs/INSTALL.md doc
spark = SparkSession.builder \
    .master("local") \
    .appName("big_earth") \
    .config("spark.driver.extraClassPath", root + "/spark_dependencies/gcs-connector-hadoop2-latest.jar") \
    .config("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS") \
    .config("fs.gs.project.id", "big-earth-252219") \
    .config("google.cloud.auth.service.account.enable", "true") \
    .config("google.cloud.auth.service.account.json.keyfile", root + "/.gcs/big-earth-252219-fb2e5c109f78.json") \
    .getOrCreate()
sc = spark.sparkContext

In [191]:
output_url = 'file://' + root + '/data/hello_world_dataset'
fields = [StructField("image_name", StringType()), StructField("image_bytes", BinaryType())]
spark_image_schema = StructType(fields)

import io
import zlib
import imageio
from PIL import Image

def row_generator(x):
    """Returns a single entry in the generated dataset. Return a bunch of random values as an example."""
    img = Image.open(f"{root}/data/{x}/{x}_B02.tif")
#     arr =  np.random.randint(0, 4000, dtype=np.uint16, size=(120, 120, 3))
#     compressed_array = io.BytesIO()    # np.savez_compressed() requires a file-like object to write to
#     np.savez_compressed(compressed_array, arr)
    return [x, bytearray(img.tobytes())]


def generate_hello_world_dataset(spark, output_url):
#     rows_count = 1000
#     filenames = ["name_{}".format(id) for id in range(rows_count)]
    rowgroup_size_mb = 128

    rows_rdd = sc.parallelize(['S2A_MSIL2A_20170613T101031_6_59'])\
       .map(row_generator)

    spark.createDataFrame(rows_rdd, spark_image_schema) \
       .coalesce(10) \
       .write \
       .mode('overwrite') \
       .parquet(output_url)
    
    
generate_hello_world_dataset(spark, output_url)

In [192]:
os.listdir(root + "/data/hello_world_dataset")

['.part-00000-8a3db4d7-bd25-4b72-a2b1-4e3645ff4e7b-c000.snappy.parquet.crc',
 '._SUCCESS.crc',
 '_SUCCESS',
 'part-00000-8a3db4d7-bd25-4b72-a2b1-4e3645ff4e7b-c000.snappy.parquet']

In [193]:
import pyarrow.parquet as pq

parquet_files = [root + "/data/hello_world_dataset/part-00000-8a3db4d7-bd25-4b72-a2b1-4e3645ff4e7b-c000.snappy.parquet"]
parquet_file_row_groups = []
total_num_rows = 0
for parquet_file in parquet_files:
    open_parquet_handle = pq.ParquetFile(parquet_file)
    for row_group in range(open_parquet_handle.num_row_groups):
        parquet_file_row_groups.append(
            (open_parquet_handle, row_group)
        )

In [202]:
from tensorflow_io.arrow import ArrowDataset

parquet_file, row_group = parquet_file_row_groups[0]
table = parquet_file.read_row_group(row_group).to_pandas()
import sys
print(sys.getsizeof(table)/1e6)

def deserialize_to_image(image_bytes):
    return Image.frombytes(mode='I;16', size=(120, 120), data=image_bytes)

table['image_bytes'] = table['image_bytes'].apply(deserialize_to_image)

print(sys.getsizeof(table)/1e6)

np.array(table.iloc[0]['image_bytes'])


0.029081
0.00028


In [124]:
def images_from_parquet():
#     queue = tf.RandomShuffleQueue(1000, min_after_dequeue=1, dtypes=tf.float32)
    indexes = [num for num in range(len(parquet_file_row_groups))]
    random.shuffle(indexes)
    for index in indexes:
        parquet_file, row_group = parquet_file_row_groups[index]
        table = parquet_file.read_row_group(row_group)

        def deserialize_to_np(image_bytes):
            return np.frombuffer(image_bytes, dtype=np.uint16).reshape(120, 120, 3)

        df = table.to_pandas()
        df['image_tensor'] = df['image_bytes'].apply(deserialize_to_np)
        
        for index, row in df.iterrows():
            yield row['image_name'], row['image_tensor']
            
def images_from_parquet(index):
    parquet_file, row_group = parquet_file_row_groups[index]
#     queue = tf.RandomShuffleQueue(1000, min_after_dequeue=1, dtypes=tf.float32)
    parquet_file = parquet_file_and_row_group[0]
    row_group = parquet_file_and_row_group[1]
    table = parquet_file.read_row_group(row_group)

    def deserialize_to_np(image_bytes):
        return np.frombuffer(image_bytes, dtype=np.uint16).reshape(120, 120, 3)

    df = table.to_pandas()
    df['image_tensor'] = df['image_bytes'].apply(deserialize_to_np)

    for index, row in df.iterrows():
        yield row['image_name'], row['image_tensor']


indexes = [num for num in range(len(parquet_file_row_groups))]
dataset = tf.data.Dataset.from_tensor_slices(indexes) 
            
def benchmark(dataset, num_epochs=2):
    start_time = time.perf_counter()
    for epoch_num in range(num_epochs):
        for sample in dataset:
            # Performing a training step
            time.sleep(0.01)
    tf.print("Execution time:", time.perf_counter() - start_time)
    
augmented_image_dataset = dataset.interleave(
    images_from_parquet,
    block_length=4,
    num_parallel_calls=4
).prefetch(tf.data.experimental.AUTOTUNE)

benchmark(augmented_image_dataset)



TypeError: Unsupported return value from function passed to Dataset.interleave(): <generator object images_from_parquet at 0x7fac1c1982a0>.

In [42]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

ImageDataGenerator

In [None]:
from petastorm import make_batch_reader
from petastorm.tf_utils import make_petastorm_dataset
from albumentations import (
    Compose, Flip, VerticalFlip, Resize, Rotate, ToFloat
)

augmentations_train = Compose([
    Flip(p=0.5),
    Rotate(limit=(0, 360), p=0.5)
])


def foo(img):
    print(img)
    return img
    
with make_batch_reader(output_url, num_epochs=2) as reader:
    print(dir(reader))
#     dataset = make_petastorm_dataset(reader).map(foo)
#     for data in dataset:
#         break
        
        

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__enter__', '__eq__', '__exit__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__next__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_apply_predicate_to_row_groups', '_apply_row_group_selector', '_create_ventilator', '_filter_row_groups', '_normalize_shuffle_options', '_partition_row_groups', '_results_queue_reader', '_workers_pool', 'batched_output', 'dataset', 'diagnostics', 'is_batched_reader', 'join', 'last_row_consumed', 'next', 'ngram', 'reset', 'schema', 'stop', 'stopped', 'ventilator']


In [None]:
img