In [13]:
pip install hdfs

Collecting hdfs
  Downloading hdfs-2.5.8.tar.gz (41 kB)
[K     |████████████████████████████████| 41 kB 353 kB/s eta 0:00:011
[?25hCollecting docopt
  Downloading docopt-0.6.2.tar.gz (25 kB)
Installing collected packages: docopt, hdfs
    Running setup.py install for docopt ... [?25ldone
[?25h    Running setup.py install for hdfs ... [?25ldone
[?25hSuccessfully installed docopt-0.6.2 hdfs-2.5.8
You should consider upgrading via the '/scratch/data603_virtualenv/db59735/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [1]:
def get_hdfs(namenode = '10.3.0.2', port = 8020):
    """
    Return a HDFS connection built with pyarrow's legacy ``pa.hdfs`` API.

    Note: this API is deprecated in pyarrow.

    Side effect: the HADOOP_HOME, JAVA_HOME and ARROW_LIBHDFS_DIR environment
    variables of the current process are overwritten with the paths hard-coded
    below.

    Parameters
    ----------
    namenode : str
        Host name or address of the HDFS namenode.
    port : int
        Port of the namenode.

    Returns
    -------
    The ``pa.hdfs.HadoopFileSystem`` object created for ``namenode``/``port``.
    """
    import pyarrow as pa
    import os

    # These are assigned unconditionally, so any value already present in the
    # environment is replaced.
    os.environ['HADOOP_HOME'] = "/opt/cloudera/parcels/CDH"
    os.environ['JAVA_HOME'] = "/usr/java/jdk1.8.0_181-cloudera"
    os.environ['ARROW_LIBHDFS_DIR'] = "/opt/cloudera/parcels/CDH/lib64"

    hdfs = pa.hdfs.HadoopFileSystem(namenode, port)
    # NOTE(review): confirm that the installed pyarrow's legacy HadoopFileSystem
    # really has a connect() method; the call is kept exactly as it was.
    hdfs.connect()

    return hdfs

In [2]:
def get_httpdfs():
    """
    Return an ``hdfs.InsecureClient`` pointed at http://10.3.0.2:9870.

    The client acts as the current user: the value of the USER environment
    variable when it is set and non-empty, otherwise whatever
    ``getpass.getuser()`` reports.
    """
    import getpass
    import os
    from hdfs import InsecureClient

    # USER is not guaranteed to be defined in every environment; fall back to
    # getpass instead of failing with a KeyError.
    user = os.environ.get('USER') or getpass.getuser()
    client = InsecureClient('http://10.3.0.2:9870', user=user)

    return client

In [3]:
def get_fs(namenode = '10.3.0.2', port = 8020):
    """
    Returns generic Pyarrow filesystem object connected to HDFS.

    example usage: hdfs.get_target_stats(fs.FileSelector('/data/google_open_image', recursive = True))
    (``get_target_stats`` was renamed ``get_file_info`` in later pyarrow releases.)

    This is the hdfs interface going forward as the other is deprecated.

    Parameters
    ----------
    namenode : str
        Host name or address of the HDFS namenode.
    port : int
        Port of the namenode.

    Returns
    -------
    A ``pyarrow.fs.HadoopFileSystem`` for ``namenode``/``port``.
    """
    from pyarrow import fs

    if hasattr(fs, 'HdfsOptions'):
        # Early pyarrow.fs API: the connection is described by an options object.
        hdfs_options = fs.HdfsOptions(endpoint = (namenode, port), driver = 'libhdfs')
        hdfs = fs.HadoopFileSystem(hdfs_options)
    else:
        # Later pyarrow releases have no HdfsOptions; the filesystem takes
        # host and port directly.
        hdfs = fs.HadoopFileSystem(namenode, port)

    return hdfs

In [4]:
# import_ipynb makes other notebooks importable as modules (here data603/HDFS.ipynb).
import import_ipynb
from data603 import HDFS

# Build a client with the get_httpdfs() defined earlier in this notebook (the
# call is unqualified, so it does not go through the imported HDFS module),
# then list the entries under /data/keras_models.
httpdfs = get_httpdfs()
httpdfs.list('/data/keras_models')

importing Jupyter notebook from /scratch/data603/group5/data603/HDFS.ipynb


['densenet',
 'efficientnet',
 'inception_resnet_v2',
 'inception_v3',
 'mobilenet',
 'mobilenet_v2',
 'mobilenet_v3',
 'nasnet',
 'resnet',
 'vgg16',
 'vgg19',
 'xception']

In [5]:
# List the weight files stored under /data/keras_models/resnet.
httpdfs.list('/data/keras_models/resnet')

['resnet101_weights_tf_dim_ordering_tf_kernels.h5',
 'resnet101_weights_tf_dim_ordering_tf_kernels_notop.h5',
 'resnet101v2_weights_tf_dim_ordering_tf_kernels.h5',
 'resnet101v2_weights_tf_dim_ordering_tf_kernels_notop.h5',
 'resnet152_weights_tf_dim_ordering_tf_kernels.h5',
 'resnet152_weights_tf_dim_ordering_tf_kernels_notop.h5',
 'resnet152v2_weights_tf_dim_ordering_tf_kernels.h5',
 'resnet152v2_weights_tf_dim_ordering_tf_kernels_notop.h5',
 'resnet50_weights_tf_dim_ordering_tf_kernels.h5',
 'resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5',
 'resnet50v2_weights_tf_dim_ordering_tf_kernels.h5',
 'resnet50v2_weights_tf_dim_ordering_tf_kernels_notop.h5',
 'resnext101_weights_tf_dim_ordering_tf_kernels.h5',
 'resnext101_weights_tf_dim_ordering_tf_kernels_notop.h5',
 'resnext50_weights_tf_dim_ordering_tf_kernels.h5',
 'resnext50_weights_tf_dim_ordering_tf_kernels_notop.h5']

In [6]:
from urllib.request import urlopen
import os
import shutil

# Create a local directory for the weights; no error if it already exists.
keras_data = './keras_data'
os.makedirs(keras_data, exist_ok=True)

# The file is saved under the "_notop" name, which is also the name the Spark
# configuration ships to the cluster, so the URL is built from that same name.
# NOTE(review): the URL previously pointed at the with-top file
# (resnet50_weights_tf_dim_ordering_tf_kernels.h5) -- confirm which variant is wanted.
weights_file = 'resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5'
url = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.2/' + weights_file
weights_path = './keras_data/' + weights_file

# Download `url` itself (the previous code fetched http://www.google.com/ and
# wrote that page into the .h5 file) and stream it to disk instead of holding
# the whole payload in memory.
with urlopen(url) as response, open(weights_path, 'wb') as f:
    shutil.copyfileobj(response, f)

In [7]:
import import_ipynb
from data603 import SparkLauncher

# Start from the launcher's configuration object.
conf = SparkLauncher.get_spark_conf()

# Settings layered on top of that configuration, applied in this order.
# NOTE(review): the saved outputs further down show executors being killed by
# YARN for exceeding their memory limit; the error text suggests boosting
# spark.yarn.executor.memoryOverhead.
spark_settings = [
    # file that will get copied to all the nodes on the cluster
    ('spark.yarn.dist.files', './keras_data/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5'),
    ('spark.yarn.appMasterEnv.ARROW_PRE_0_15_IPC_FORMAT', '1'),
    ('spark.executorEnv.ARROW_PRE_0_15_IPC_FORMAT', '1'),
    ('spark.sql.execution.arrow.enabled', 'true'),
    ('spark.dynamicAllocation.minExecutors', '1'),
    ('spark.dynamicAllocation.maxExecutors', '7'),
    ('spark.executor.cores', '16'),
    ('spark.executor.memory', '30g'),
    # ('spark.sql.autoBroadcastJoinThreshold', '-1'),   # left disabled
]
for setting_name, setting_value in spark_settings:
    conf.set(setting_name, setting_value)

# launch the cluster using the configuration
spark = SparkLauncher.get_spark_session(pack_venv = True, conf = conf)

importing Jupyter notebook from /scratch/data603/group5/data603/SparkLauncher.ipynb
Creating Spark Configuration
Creating Spark Configuration
Packing Virtual Environment: db59735.tar.gz
Setting Environment Variables
Creating Spark Session: db59735_data603_spark_session


In [8]:
import os
import pyspark.sql.functions as F
from pyspark.sql.types import *

In [9]:
# The class-description CSV has no header row, so both column names come from
# this explicit schema.
class_description_schema = StructType([
    StructField("LabelName", StringType()),
    StructField("LabelText", StringType()),
])

labels = spark.read.csv(
    '/data/google_open_image/metadata/class-descriptions-boxable.csv',
    header = False,
    schema = class_description_schema,
)

In [10]:
# Keep only the four classes of interest, matched on their exact label text.
wanted_label_texts = ['Lavender (Plant)', 'Common sunflower', 'Rose', 'Lily']
labels = labels.filter(F.col('LabelText').isin(wanted_label_texts))

labels.show()

+---------+----------------+
|LabelName|       LabelText|
+---------+----------------+
| /m/04gth|Lavender (Plant)|
| /m/06m11|            Rose|
| /m/0ftb8|Common sunflower|
| /m/0jqgx|            Lily|
+---------+----------------+



In [11]:
# Explicit schema for the image-label CSVs so that Confidence is read as a
# double rather than a string.
label_schema = StructType([
    StructField("ImageID", StringType()),
    StructField("Source", StringType()),
    StructField("LabelName", StringType()),
    StructField("Confidence", DoubleType()),
])

# One CSV per split, in the order test, train, validation.
label_csv_paths = [
    '/data/google_open_image/labels/test-annotations-human-imagelabels-boxable.csv',
    '/data/google_open_image/labels/train-annotations-human-imagelabels-boxable.csv',
    '/data/google_open_image/labels/validation-annotations-human-imagelabels-boxable.csv',
]

# Read every split with the same options.
image_labels_1, image_labels_2, image_labels_3 = [
    spark.read.csv(csv_path, header = True, schema = label_schema)
    for csv_path in label_csv_paths
]

# Stack the 3 splits into one large dataframe.
image_labels = image_labels_1.union(image_labels_2).union(image_labels_3)

In [12]:
# Join the image labels with the selected classes on LabelName, then keep rows
# with Confidence > 0.99. Confidence comes from image_labels, so rows that the
# right join keeps only for an unmatched class (null Confidence) are removed
# by the filter.
# NOTE: image_labels is reassigned from itself, so re-running this cell joins
# the already-joined frame again.
image_labels = image_labels.join(labels, on = 'LabelName', how = 'right')\
                .filter("Confidence > 0.99")

In [13]:
# Distinct ImageIDs among the joined, filtered label rows. The Confidence
# filter repeats the one already applied when image_labels was joined.
image_ids = image_labels.filter("Confidence > 0.99").select('ImageID').distinct()

In [14]:
# Check how many images there are. count() is an action, so it executes the
# join built above.
# NOTE(review): the saved output shows this call failing with "Could not
# execute broadcast in 300 secs"; the error text suggests raising
# spark.sql.broadcastTimeout or setting spark.sql.autoBroadcastJoinThreshold to -1.
image_ids.count()

Py4JJavaError: An error occurred while calling o675.count.
: org.apache.spark.SparkException: Could not execute broadcast in 300 secs. You can increase the timeout for broadcasts via spark.sql.broadcastTimeout or disable broadcast join by setting spark.sql.autoBroadcastJoinThreshold to -1
	at org.apache.spark.sql.execution.exchange.BroadcastExchangeExec.doExecuteBroadcast(BroadcastExchangeExec.scala:150)
	at org.apache.spark.sql.execution.InputAdapter.doExecuteBroadcast(WholeStageCodegenExec.scala:375)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeBroadcast$1.apply(SparkPlan.scala:144)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeBroadcast$1.apply(SparkPlan.scala:140)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.executeBroadcast(SparkPlan.scala:140)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.prepareBroadcast(BroadcastHashJoinExec.scala:136)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.codegenInner(BroadcastHashJoinExec.scala:234)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.doConsume(BroadcastHashJoinExec.scala:103)
	at org.apache.spark.sql.execution.CodegenSupport$class.consume(WholeStageCodegenExec.scala:189)
	at org.apache.spark.sql.execution.InputAdapter.consume(WholeStageCodegenExec.scala:362)
	at org.apache.spark.sql.execution.InputAdapter.doProduce(WholeStageCodegenExec.scala:391)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.CodegenSupport$class.produce(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.InputAdapter.produce(WholeStageCodegenExec.scala:362)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.doProduce(BroadcastHashJoinExec.scala:98)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.CodegenSupport$class.produce(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.produce(BroadcastHashJoinExec.scala:40)
	at org.apache.spark.sql.execution.ProjectExec.doProduce(basicPhysicalOperators.scala:45)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.CodegenSupport$class.produce(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.ProjectExec.produce(basicPhysicalOperators.scala:35)
	at org.apache.spark.sql.execution.aggregate.HashAggregateExec.doProduceWithKeys(HashAggregateExec.scala:646)
	at org.apache.spark.sql.execution.aggregate.HashAggregateExec.doProduce(HashAggregateExec.scala:166)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.CodegenSupport$class.produce(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.aggregate.HashAggregateExec.produce(HashAggregateExec.scala:40)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doCodeGen(WholeStageCodegenExec.scala:532)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:586)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.prepareShuffleDependency(ShuffleExchangeExec.scala:92)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec$$anonfun$doExecute$1.apply(ShuffleExchangeExec.scala:128)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec$$anonfun$doExecute$1.apply(ShuffleExchangeExec.scala:119)
	at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:52)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.doExecute(ShuffleExchangeExec.scala:119)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.InputAdapter.inputRDDs(WholeStageCodegenExec.scala:379)
	at org.apache.spark.sql.execution.aggregate.HashAggregateExec.inputRDDs(HashAggregateExec.scala:151)
	at org.apache.spark.sql.execution.aggregate.HashAggregateExec.inputRDDs(HashAggregateExec.scala:151)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:615)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.prepareShuffleDependency(ShuffleExchangeExec.scala:92)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec$$anonfun$doExecute$1.apply(ShuffleExchangeExec.scala:128)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec$$anonfun$doExecute$1.apply(ShuffleExchangeExec.scala:119)
	at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:52)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.doExecute(ShuffleExchangeExec.scala:119)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.InputAdapter.inputRDDs(WholeStageCodegenExec.scala:379)
	at org.apache.spark.sql.execution.aggregate.HashAggregateExec.inputRDDs(HashAggregateExec.scala:151)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:615)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan.getByteArrayRdd(SparkPlan.scala:247)
	at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:296)
	at org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.scala:2830)
	at org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.scala:2829)
	at org.apache.spark.sql.Dataset$$anonfun$53.apply(Dataset.scala:3364)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3363)
	at org.apache.spark.sql.Dataset.count(Dataset.scala:2829)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.util.concurrent.TimeoutException: Futures timed out after [300 seconds]
	at scala.concurrent.impl.Promise$DefaultPromise.ready(Promise.scala:223)
	at scala.concurrent.impl.Promise$DefaultPromise.result(Promise.scala:227)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:220)
	at org.apache.spark.sql.execution.exchange.BroadcastExchangeExec.doExecuteBroadcast(BroadcastExchangeExec.scala:146)
	... 115 more


In [14]:
# Load the image table from the coalesced parquet dataset.
images_parquet = spark.read.parquet('/etl/google_open_image/images_coalesced.parquet')

In [15]:
# Many of the columns aren't needed; keep just ImageID, Subset and Data.
# ImageID is lower-cased -- presumably so it matches the IDs in the label and
# bounding-box CSVs it is joined with later (TODO confirm).
images_parquet = images_parquet.select(['ImageID', 'Subset', 'Data'])\
                .withColumn("ImageID", F.lower(F.col('ImageID')))

In [16]:
# Display the DataFrame summary to verify the column names and types.
images_parquet

DataFrame[ImageID: string, Subset: string, Data: binary]

In [17]:
# Restrict the image table to the selected ImageIDs. how='left' keeps every
# selected ID, so an ID with no matching image row gets null Subset/Data.
images_parquet = image_ids.join(images_parquet, on = 'ImageID', how = 'left')

In [18]:
# Verify the dataframe
#images_parquet.count()

In [19]:
# Bounding-box CSVs, one per split, in the order test, train, validation.
# No schema is supplied, so every column is read as a string.
bbox_csv_paths = [
    '/data/google_open_image/bboxes/test-annotations-bbox.csv',
    '/data/google_open_image/bboxes/train-annotations-bbox.csv',
    '/data/google_open_image/bboxes/validation-annotations-bbox.csv',
]

bounding_boxes_1, bounding_boxes_2, bounding_boxes_3 = [
    spark.read.csv(csv_path, header = True) for csv_path in bbox_csv_paths
]

# Stack the three splits into a single dataframe.
bounding_boxes = bounding_boxes_1.union(bounding_boxes_2).union(bounding_boxes_3)

In [20]:
# Join on ImageID to attach bounding boxes to the selected images. how='left'
# keeps every selected ImageID (null box columns when it has no boxes), and
# boxes of any label are still included at this point.
bbs = image_ids.join(bounding_boxes, on = 'ImageID', how = 'left')

In [21]:
# Join in the labels so there are human-readable labels on the bounding boxes.
# labels is the left side, so only boxes whose LabelName is one of the
# selected classes remain; a class with no boxes is kept as a row of nulls.
bbs = labels.join(bbs, on = 'LabelName', how = 'left')

In [22]:
# Check how many boxes there are.
#bbs.count()

In [23]:
# Attach the image bytes to each bounding-box row. how='right' keeps every
# row of bbs, so Data is null where no image row matched.
image_chips = images_parquet.join(bbs, on = 'ImageID', how = 'right')

In [24]:
# Check how many chips we have to make sure the join was the correct one.
#image_chips.count()

In [25]:
def extract_chip(data, xmin, xmax, ymin, ymax):
    """
    Crop one bounding box out of an encoded image and return it as JPEG bytes.

    Parameters
    ----------
    data : bytes
        Encoded image in any format Pillow can open.
    xmin, xmax, ymin, ymax : str or float
        Box edges as fractions of the image width (x) and height (y). They
        are passed through float() because the bounding-box data were read
        in as strings, not doubles.

    Returns
    -------
    bytes or None
        The JPEG-encoded crop, or None when no chip can be produced: the
        image bytes or a coordinate are missing (the upstream left/right
        joins can yield nulls) or the box covers no pixels.
    """
    from PIL import Image
    import io, math

    # Null or empty inputs cannot be decoded; yield a null chip instead of
    # failing the whole Spark task.
    if not data or any(edge is None for edge in (xmin, xmax, ymin, ymax)):
        return None

    # Read the image data using Pillow
    img = Image.open(io.BytesIO(data))
    # Get the size of the image
    (width, height) = img.size

    # Calculate the bounding box pixels
    left = math.floor(float(xmin)*width)
    upper = math.floor(float(ymin)*height)
    right = math.floor(float(xmax)*width)
    lower = math.floor(float(ymax)*height)

    # A box that rounds down to zero width or height has nothing to encode.
    if right <= left or lower <= upper:
        return None

    # Crop the image to the bounding box size
    img = img.crop(box = (left, upper, right, lower))

    # JPEG cannot store every Pillow mode (e.g. alpha or palette images);
    # convert those to RGB so that save() does not raise.
    if img.mode not in ("RGB", "L", "CMYK"):
        img = img.convert("RGB")

    # Save the image to a byte-buffer
    buff = io.BytesIO()
    img.save(buff, format = "JPEG")

    # Get the raw bytes of the jpeg data.
    byte_array = buff.getvalue()
    return byte_array

# Wrap the function as a spark udf (user-defined function) with a binary return type
udf_extract_chip = F.udf(extract_chip, returnType = BinaryType())

# Create a new column with the image chip data: one crop per row, computed
# from that row's image bytes (Data) and box edges (XMin, XMax, YMin, YMax).
image_chips = image_chips.withColumn("chip_data", udf_extract_chip("Data","XMin","XMax","YMin","YMax"))

In [26]:
# Display the DataFrame summary (column names and types) after adding chip_data.
image_chips

DataFrame[ImageID: string, Subset: string, Data: binary, LabelName: string, LabelText: string, Source: string, Confidence: string, XMin: string, XMax: string, YMin: string, YMax: string, IsOccluded: string, IsTruncated: string, IsGroupOf: string, IsDepiction: string, IsInside: string, chip_data: binary]

In [27]:
# Drop the full-image bytes and the box/annotation columns that are no longer
# needed once chip_data exists; ImageID, LabelName, LabelText and chip_data remain.
image_chips=image_chips.drop('IsOccluded','IsGroupOf','Source','IsTruncated','IsDepiction','IsInside','Data','XMin','XMax','YMin','YMax','Confidence','Subset')

In [28]:
# Display the DataFrame summary again to confirm which columns are left.
image_chips

DataFrame[ImageID: string, LabelName: string, LabelText: string, chip_data: binary]

In [29]:
# Print two rows. show() is an action, so this executes the joins and the
# chip-extraction UDF (the saved run was interrupted by hand).
image_chips.show(2)

KeyboardInterrupt: 

In [29]:
#image_chips = image_chips.drop('data')

# Persist the chips as a parquet dataset under the user's directory.
# NOTE(review): the saved output shows this write aborting because executors
# were killed by YARN for exceeding their memory limit (35.2 GB of 32 GB).
image_chips.write.parquet("/user/db59735/image_chips_final.parquet")

Py4JJavaError: An error occurred while calling o839.parquet.
: org.apache.spark.SparkException: Job aborted.
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:198)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:159)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:104)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:102)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.doExecute(commands.scala:122)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:80)
	at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:80)
	at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:668)
	at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:668)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:668)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:276)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:270)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:228)
	at org.apache.spark.sql.DataFrameWriter.parquet(DataFrameWriter.scala:557)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: 
Aborting TaskSet 6.0 because task 1278 (partition 1278)
cannot run anywhere due to node and executor blacklist.
Most recent failure:
Lost task 1278.1 in stage 6.0 (TID 3311, worker1.hdp-internal, executor 20): ExecutorLostFailure (executor 20 exited caused by one of the running tasks) Reason: Container killed by YARN for exceeding memory limits.  35.2 GB of 32 GB physical memory used. Consider boosting spark.yarn.executor.memoryOverhead or disabling yarn.nodemanager.vmem-check-enabled because of YARN-4714.

Blacklisting behavior can be configured via spark.blacklist.*.

	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1890)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1878)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1877)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:929)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:929)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:929)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2111)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2049)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:740)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2081)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:167)
	... 33 more


In [40]:
# write chip to hdfs
def write_chip_hdfs(data, id, label):
    """
    Upload one image chip to HDFS and return the path it was written to.

    Parameters
    ----------
    data : bytes
        Encoded chip (the JPEG bytes of one crop).
    id : str
        Image identifier, used in the file name.
    label : str
        Label text, used as the file-name prefix.

    Returns
    -------
    str or None
        The path of the written file,
        "/user/db59735/keras_chips_final/<label>_<id>_<random int>.jpeg",
        or None when ``data`` is missing or empty and nothing was written.
    """
    import io
    from random import randint

    from hdfs import InsecureClient

    # A null or empty chip would otherwise be uploaded as an empty .jpeg file.
    if not data:
        return None

    client = InsecureClient('http://10.3.0.2:9870', user='db59735')

    # The random suffix keeps several chips of the same image and label apart.
    filename = f"{label}_{id}_{randint(0,1000000)}.jpeg"
    path = "/user/db59735/keras_chips_final/" + filename
    client.write(path, io.BytesIO(data))

    return path

# Wrap the function in a udf that returns the written path as a string.
# The function uploads a file and picks a random file name, so it is marked
# non-deterministic: Spark treats udfs as deterministic by default and may
# then drop or repeat invocations while optimizing the query.
udf_write_chip_hdfs = F.udf(write_chip_hdfs, StringType()).asNondeterministic()

In [41]:
# Add an hdfs_path column. Nothing is uploaded yet: the udf only runs (and
# writes files) when an action evaluates this column.
image_chips = image_chips.withColumn("hdfs_path", udf_write_chip_hdfs("chip_data", "ImageID", "LabelText"))

In [42]:
# Pull ten rows to the driver as a pandas frame; this action runs the
# pipeline, including the HDFS upload udf.
# NOTE(review): the saved output shows this failing because executors were
# killed by YARN for exceeding their memory limit (30.3 GB of 27 GB).
image_chips.limit(10).toPandas()

Py4JJavaError: An error occurred while calling o932.collectToPython.
: org.apache.spark.SparkException: Job aborted due to stage failure: 
Aborting TaskSet 40.0 because task 529 (partition 529)
cannot run anywhere due to node and executor blacklist.
Most recent failure:
Lost task 529.1 in stage 40.0 (TID 8595, worker7.hdp-internal, executor 71): ExecutorLostFailure (executor 71 exited caused by one of the running tasks) Reason: Container killed by YARN for exceeding memory limits.  30.3 GB of 27 GB physical memory used. Consider boosting spark.yarn.executor.memoryOverhead or disabling yarn.nodemanager.vmem-check-enabled because of YARN-4714.

Blacklisting behavior can be configured via spark.blacklist.*.

	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1890)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1878)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1877)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:929)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:929)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:929)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2111)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2049)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:740)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2081)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2102)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2121)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:365)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
	at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply(Dataset.scala:3257)
	at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply(Dataset.scala:3254)
	at org.apache.spark.sql.Dataset$$anonfun$53.apply(Dataset.scala:3364)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3363)
	at org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:3254)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


In [33]:
import numpy as np
import pandas as pd

# Small scratch frame: the integers 0..11 laid out row by row in 3 rows,
# under the columns A-D.
column_names = list('ABCD')
cell_values = np.arange(12).reshape(3, len(column_names))
df = pd.DataFrame(cell_values, columns=column_names)
df


Unnamed: 0,A,B,C,D
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


In [None]:
# NOTE(review): unexecuted scratch cell. drop('') asks pandas to remove the
# row labelled '' and the frame built above is indexed 0..2, so this would
# raise a KeyError as written -- confirm what was meant to be dropped.
df=df.drop('')