# PreProcessing Google Open Image Data Set

In [1]:
#Connecting to Spark and HDFS
def get_hdfs(namenode = '10.3.0.2', port = 8020):
    """
    Return a HDFS connection. Note: this API is deprecated in pyarrow
    """
    import pyarrow as pa
    import os

    os.environ['HADOOP_HOME'] = f"/opt/cloudera/parcels/CDH"
    os.environ['JAVA_HOME'] = "/usr/java/jdk1.8.0_181-cloudera"
    os.environ['ARROW_LIBHDFS_DIR'] = "/opt/cloudera/parcels/CDH/lib64"
               
    hdfs = pa.hdfs.HadoopFileSystem(namenode, port)
    hdfs.connect()
    
    return hdfs

In [2]:
def get_httpdfs():
    import os
    from hdfs import InsecureClient
    client = InsecureClient('http://10.3.0.2:9870', user=os.environ['USER'])
    
    return client

In [3]:
def get_fs(namenode = '10.3.0.2', port = 8020):
    """
    Returns generic Pyarrow filesystem object connected to HDFS.
    
    example usage: hdfs.get_target_stats(fs.FileSelector('/data/google_open_image', recursive = True))
    
    This is the hdfs interface going forward as the other is deprecated.
    """
    from pyarrow import fs
    
    hdfs_options = fs.HdfsOptions(endpoint = (namenode, port), driver = 'libhdfs')
    hdfs = fs.HadoopFileSystem(hdfs_options)
    
    return hdfs

In [4]:
import import_ipynb
from data603 import HDFS

httpdfs = get_httpdfs()
httpdfs.list('/data/keras_models')

importing Jupyter notebook from /scratch/data603/has1/data603/HDFS.ipynb


['densenet',
 'efficientnet',
 'inception_resnet_v2',
 'inception_v3',
 'mobilenet',
 'mobilenet_v2',
 'mobilenet_v3',
 'nasnet',
 'resnet',
 'vgg16',
 'vgg19',
 'xception']

In [5]:
httpdfs.list('/data/keras_models/resnet')

['resnet101_weights_tf_dim_ordering_tf_kernels.h5',
 'resnet101_weights_tf_dim_ordering_tf_kernels_notop.h5',
 'resnet101v2_weights_tf_dim_ordering_tf_kernels.h5',
 'resnet101v2_weights_tf_dim_ordering_tf_kernels_notop.h5',
 'resnet152_weights_tf_dim_ordering_tf_kernels.h5',
 'resnet152_weights_tf_dim_ordering_tf_kernels_notop.h5',
 'resnet152v2_weights_tf_dim_ordering_tf_kernels.h5',
 'resnet152v2_weights_tf_dim_ordering_tf_kernels_notop.h5',
 'resnet50_weights_tf_dim_ordering_tf_kernels.h5',
 'resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5',
 'resnet50v2_weights_tf_dim_ordering_tf_kernels.h5',
 'resnet50v2_weights_tf_dim_ordering_tf_kernels_notop.h5',
 'resnext101_weights_tf_dim_ordering_tf_kernels.h5',
 'resnext101_weights_tf_dim_ordering_tf_kernels_notop.h5',
 'resnext50_weights_tf_dim_ordering_tf_kernels.h5',
 'resnext50_weights_tf_dim_ordering_tf_kernels_notop.h5']

In [6]:
from urllib.request import urlopen
# Create a local directory
import os
keras_data = './keras_data'
if(not os.path.exists(keras_data)):
    os.mkdir(keras_data)

#download file from hdfs
 
url = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.2/resnet50_weights_tf_dim_ordering_tf_kernels.h5'

req = urlopen("http://www.google.com/").read()


with open('./keras_data/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5', 'wb') as f:
    f.write(req)

In [7]:
import import_ipynb
from data603 import SparkLauncher

# get a configuration object
conf = SparkLauncher.get_spark_conf()

# add a file to the configuration that will get copied to all the nodes on the cluster
conf.set('spark.yarn.dist.files', './keras_data/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5')
conf.set('spark.yarn.appMasterEnv.ARROW_PRE_0_15_IPC_FORMAT', '1')
conf.set('spark.executorEnv.ARROW_PRE_0_15_IPC_FORMAT', '1')
conf.set('spark.sql.execution.arrow.enabled', 'true')
conf.set('spark.dynamicAllocation.minExecutors', '1')

conf.set('spark.dynamicAllocation.maxExecutors', '7')

conf.set('spark.executor.cores', '16')

conf.set('spark.executor.memory', '50g')
#conf.set('spark.sql.autoBroadcastJoinThreshold', '-1')

# launch the cluster using the configuration
spark = SparkLauncher.get_spark_session(pack_venv = True, conf = conf)

importing Jupyter notebook from /scratch/data603/has1/data603/SparkLauncher.ipynb
Creating Spark Configuration
Creating Spark Configuration
Packing Virtual Environment: has1.tar.gz
Setting Environment Variables
Creating Spark Session: has1_data603_spark_session


In [8]:
import os
import pyspark.sql.functions as F
from pyspark.sql.types import *

In [9]:
#from pyspark.sql.types import *
#Reading image label
labels = spark.read.csv('/data/google_open_image/metadata/class-descriptions-boxable.csv', 
                        header = False,
                        schema = StructType([StructField("LabelName", StringType()), 
                                             StructField("LabelText", StringType())]) )

In [10]:
#Filter out to only big cats within the labels 
labels = labels.filter("LabelText = 'Tiger' OR LabelText = 'Lion' OR LabelText = 'Cheetah' OR LabelText = 'Leopard' OR LabelText = 'Jaguar (Animal)' OR LabelText = 'Lynx' ")

labels.show()

+---------+---------------+
|LabelName|      LabelText|
+---------+---------------+
| /m/0449p|Jaguar (Animal)|
| /m/04g2r|           Lynx|
| /m/07dm6|          Tiger|
| /m/096mb|           Lion|
| /m/0c29q|        Leopard|
| /m/0cd4d|        Cheetah|
+---------+---------------+



In [11]:
# Define a schema for the data so the Confidence is a number, not a string
label_schema = StructType([
    StructField("ImageID", StringType()),
    StructField("Source", StringType()),
    StructField("LabelName", StringType()),
    StructField("Confidence", DoubleType())
])

# Read in the csv files using the schema
image_labels_1 = spark.read\
                    .csv('/data/google_open_image/labels/test-annotations-human-imagelabels-boxable.csv', 
                        header = True,
                        schema = label_schema)
image_labels_2 = spark.read\
                    .csv('/data/google_open_image/labels/train-annotations-human-imagelabels-boxable.csv', 
                        header = True,
                        schema = label_schema)
image_labels_3 = spark.read\
                    .csv('/data/google_open_image/labels/validation-annotations-human-imagelabels-boxable.csv', 
                        header = True,
                        schema = label_schema)

# join the 3 files into one large dataframe
image_labels = image_labels_1.union(image_labels_2).union(image_labels_3)

In [12]:
# join and filter
image_labels = image_labels.join(labels, on = 'LabelName', how = 'right')\
                .filter("Confidence > 0.99")

In [13]:
# distinct image IDs.
image_ids = image_labels.filter("Confidence > 0.99").select('ImageID').distinct()

In [14]:
# Check how many images there are.
image_ids.count()

3939

In [15]:
#Read in the raw image data
images_parquet = spark.read.parquet('/etl/google_open_image/images_coalesced.parquet')

In [16]:
# there's a lot of columns that aren't needed, select just the ones of interest.
images_parquet = images_parquet.select(['ImageID', 'Subset', 'Data'])\
                .withColumn("ImageID", F.lower(F.col('ImageID')))

In [17]:
# Verify the column names in the dataframe
images_parquet

DataFrame[ImageID: string, Subset: string, Data: binary]

In [18]:
#Joining image ids to the parquet images to filter out things we dont want
images_parquet = image_ids.join(images_parquet, on = 'ImageID', how = 'left')

In [19]:
# Verify the dataframe
images_parquet.count()

In [20]:
# Read the 3 bounding box csv files.
bounding_boxes_1 = spark.read.csv('/data/google_open_image/bboxes/test-annotations-bbox.csv', header = True)
bounding_boxes_2 = spark.read.csv('/data/google_open_image/bboxes/train-annotations-bbox.csv', header = True)
bounding_boxes_3 = spark.read.csv('/data/google_open_image/bboxes/validation-annotations-bbox.csv', header = True)

# Join the dataframes into a single dataframe.
bounding_boxes = bounding_boxes_1.union(bounding_boxes_2).union(bounding_boxes_3)

In [21]:
# Join on ImageID to get just the bounding boxes we have image data for.
bbs = image_ids.join(bounding_boxes, on = 'ImageID', how = 'left')

In [22]:
# Join in the labels so there are human-readable labels on the bounding boxes.
bbs = labels.join(bbs, on = 'LabelName', how = 'left')

In [23]:
# Check how many boxes there are.
bbs.count()

In [24]:
image_chips = images_parquet.join(bbs, on = 'ImageID', how = 'right')

In [25]:
# Check how many chips we have to make sure the join was the correct one.
image_chips.count()

In [26]:
def extract_chip(data, xmin, xmax, ymin, ymax):
    from PIL import Image
    import io, math
    
    # Read the image data using Pillow
    img = Image.open(io.BytesIO(data))
    # Get the size of the image 
    (width, height) = img.size
    
    # Calculate the bounding box pixels
    # observe the use of float function here. That's necessary
    # because the bounding box data were read in as strings, not doubles.
    left = math.floor(float(xmin)*width)
    upper = math.floor(float(ymin)*height)
    right = math.floor(float(xmax)*width)
    lower = math.floor(float(ymax)*height)
    
    # Crop the image to the bounding box size
    img = img.crop(box = (left, upper, right, lower))
    
    # Save the image to a byte-buffer
    buff = io.BytesIO()
    img.save(buff, format = "JPEG")
    
    # Get the raw bytes of the jpeg data.
    byte_array = buff.getvalue()
    return byte_array   # return buff.getvalue() doesn't work. This a quirk of pyspark not being able to determine the output type of a function call.

# Wrap the function as a spark udf (user-defined function) with a binary return type
udf_extract_chip = F.udf(extract_chip, returnType = BinaryType())

# Create a new column with the image chip data
image_chips = image_chips.withColumn("chip_data", udf_extract_chip("Data","XMin","XMax","YMin","YMax"))

In [27]:
image_chips

DataFrame[ImageID: string, Subset: string, Data: binary, LabelName: string, LabelText: string, Source: string, Confidence: string, XMin: string, XMax: string, YMin: string, YMax: string, IsOccluded: string, IsTruncated: string, IsGroupOf: string, IsDepiction: string, IsInside: string, chip_data: binary]

In [28]:
#Drop all useless feilds to save space 
image_chips=image_chips.drop('IsOccluded','IsGroupOf','Source','IsTruncated','IsDepiction','IsInside','Data','XMin','XMax','YMin','YMax','Subset')

In [29]:
image_chips

DataFrame[ImageID: string, LabelName: string, LabelText: string, chip_data: binary]

In [30]:
#ran out of memory when running this. its fine
image_chips.show()

+----------------+---------+---------+--------------------+
|         ImageID|LabelName|LabelText|           chip_data|
+----------------+---------+---------+--------------------+
|09708a3c4d623287| /m/096mb|     Lion|[FF D8 FF E0 00 1...|
|12b5606107c34487| /m/096mb|     Lion|[FF D8 FF E0 00 1...|
|1321281d74240f59| /m/07dm6|    Tiger|[FF D8 FF E0 00 1...|
|1321281d74240f59| /m/07dm6|    Tiger|[FF D8 FF E0 00 1...|
|178f377c72af95ba| /m/04g2r|     Lynx|[FF D8 FF E0 00 1...|
|2da1984dacb751c1| /m/07dm6|    Tiger|[FF D8 FF E0 00 1...|
|3a831a9f7c0e5b36| /m/096mb|     Lion|[FF D8 FF E0 00 1...|
|3f11b18d2f35089a| /m/0cd4d|  Cheetah|[FF D8 FF E0 00 1...|
|3f11b18d2f35089a| /m/0cd4d|  Cheetah|[FF D8 FF E0 00 1...|
|4ed7e504af1e8c92| /m/0c29q|  Leopard|[FF D8 FF E0 00 1...|
|4ed7e504af1e8c92| /m/0cd4d|  Cheetah|[FF D8 FF E0 00 1...|
|720d627bc86e7d74| /m/04g2r|     Lynx|[FF D8 FF E0 00 1...|
|78f333185baec190| /m/0c29q|  Leopard|[FF D8 FF E0 00 1...|
|78f333185baec190| /m/0c29q|  Leopard|[F

In [31]:
# write chip to hdfs
def write_chip_hdfs(data, id, label):
    import io
    from random import randint
    
    from hdfs import InsecureClient
    client = InsecureClient('http://10.3.0.2:9870', user='has1')
    
    filename = f"{label}_{id}_{randint(0,1000000)}.jpeg"
    path = "/user/has1/write_chips_final/" + filename
    client.write(path, io.BytesIO(data))
    
    return path

# wrap function in a udf
udf_write_chip_hdfs = F.udf(write_chip_hdfs, StringType())

In [32]:
#Dont really need this
image_chips = image_chips.withColumn("hdfs_path", udf_write_chip_hdfs("chip_data", "ImageID", "LabelText"))

In [33]:
image_chips.write.mode("overwrite").parquet("/user/has1/chips_image.parquet")

In [1]:
spark.stop()

NameError: name 'spark' is not defined