# ResNet50 With Pandas Python UDFs




# Launch Spark

Three configuration items have to be added to the Spark configuration to enable Arrow as it is disabled by default. This can be done without modifying SparkLauncher now, but you can just modify that if you like.

```python
    # Apache Arrow Config
    conf.set('spark.yarn.appMasterEnv.ARROW_PRE_0_15_IPC_FORMAT', '1')
    conf.set('spark.executorEnv.ARROW_PRE_0_15_IPC_FORMAT', '1')
    conf.set('spark.sql.execution.arrow.enabled', 'true')
```

In [0]:
!pip install keras  --no-cache-dir 
!pip install tensorflow --no-cache-dir



In [0]:
import import_ipynb
from data603 import SparkLauncher
from data603 import HDFS
#import extra libraries
import io
from io import StringIO, BytesIO

# get a configuration object
conf = SparkLauncher.get_spark_conf()

# add a file to the configuration that will get copied to all the nodes on the cluster
conf.set('spark.yarn.dist.files', 'keras_data/resnet50_weights_tf_dim_ordering_tf_kernels.h5')

# Apache Arrow Config
conf.set('spark.yarn.appMasterEnv.ARROW_PRE_0_15_IPC_FORMAT', '1')
conf.set('spark.executorEnv.ARROW_PRE_0_15_IPC_FORMAT', '1')
conf.set('spark.sql.execution.arrow.enabled', 'true')

# launch the cluster using the configuration
spark = SparkLauncher.get_spark_session(pack_venv = False, conf = conf)
hdfs = HDFS.get_hdfs()

importing Jupyter notebook from /scratch/data603/has1/data603/SparkLauncher.ipynb
Creating Spark Configuration
importing Jupyter notebook from /scratch/data603/has1/data603/HDFS.ipynb
Creating Spark Configuration
Setting Environment Variables
Creating Spark Session: has1_data603_spark_session


# Read Dataframe

This must be done _BEFORE_ the UDF is defined because the UDF needs the schemas of the dataframes it will be using.For this section I have a parquet file written out with the bounding boxes extracted of several cat types.

In [0]:
image_chips = spark.read.parquet("/user/has1/chips_image.parquet")
image_chips = image_chips.drop('data') # remove the full-image data.

In [0]:
image_chips.count()

6156

In [0]:
len(image_chips.columns)

5

In [0]:
#Checking to see its there
image_chips.show(10)


+----------------+---------+---------------+--------------------+--------------------+
|         ImageID|LabelName|      LabelText|           chip_data|           hdfs_path|
+----------------+---------+---------------+--------------------+--------------------+
|025d25975e4275a2| /m/0c29q|        Leopard|[FF D8 FF E0 00 1...|/user/has1/write_...|
|025d25975e4275a2| /m/0cd4d|        Cheetah|[FF D8 FF E0 00 1...|/user/has1/write_...|
|025d25975e4275a2| /m/0449p|Jaguar (Animal)|[FF D8 FF E0 00 1...|/user/has1/write_...|
|03bacd7be83b721e| /m/096mb|           Lion|[FF D8 FF E0 00 1...|/user/has1/write_...|
|078bfcf1afb210ae| /m/096mb|           Lion|[FF D8 FF E0 00 1...|/user/has1/write_...|
|078bfcf1afb210ae| /m/096mb|           Lion|[FF D8 FF E0 00 1...|/user/has1/write_...|
|078bfcf1afb210ae| /m/096mb|           Lion|[FF D8 FF E0 00 1...|/user/has1/write_...|
|0c9f40ea3014c553| /m/096mb|           Lion|[FF D8 FF E0 00 1...|/user/has1/write_...|
|0e0e38e4ffb1b727| /m/0c29q|        Leopard

In [0]:
image_chips.printSchema()

root
 |-- ImageID: string (nullable = true)
 |-- LabelName: string (nullable = true)
 |-- LabelText: string (nullable = true)
 |-- chip_data: binary (nullable = true)
 |-- hdfs_path: string (nullable = true)



# Add In a Grouping Column



In [0]:
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.functions import col, lit, udf
from pyspark.sql.types import IntegerType

def group_id(n):
    ret = n % 10
    return ret

udf_group_id = udf(group_id, IntegerType())

# create the counter 1 - # of rows
image_chips = image_chips.withColumn("n", monotonically_increasing_id())

# modulo the counter to get a repeating pattern of 0,1,2,3,4,5,6,7,8,9 for the group number
image_chips = image_chips.withColumn("grp", udf_group_id("n"))

# Create Ouput Column(s)



In [0]:
# create two empty columns for result of udf
image_chips = image_chips.withColumn('prediction_label', lit(""))
image_chips = image_chips.withColumn('prediction_confidence', lit(0.0))

# Create the Pandas UDF



In [0]:
import pandas as pd

from pyspark.sql.functions import pandas_udf, PandasUDFType

@pandas_udf(image_chips.schema, PandasUDFType.GROUPED_MAP)
def evaluate_chip(pdf):
    # pdf is a pandas dataframe
    import io
    import os
    from keras.applications.resnet50 import ResNet50
    from keras.applications.resnet50 import preprocess_input
    from keras.applications.resnet50 import decode_predictions
    from keras.preprocessing.image import load_img
    from keras.preprocessing.image import img_to_array

    # Load Model Data
    model = ResNet50(weights = f'{os.getcwd()}/resnet50_weights_tf_dim_ordering_tf_kernels.h5',
                 include_top = True)

    # Create arrays to hold prediction outputs.
    prediction_label = []
    prediction_confidence = []
    for chip_data in pdf['chip_data']:

        # Load the image
        img = load_img(io.BytesIO(chip_data), target_size = (224,224))

        # Prepare Image
        image = img_to_array(img)
        image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
        image = preprocess_input(image)

        # Run prediction
        yhat = model.predict(image)

        # Decode Predictions
        label = decode_predictions(yhat)
        label = label[0][0]  # Get first prediction (most confident)
        
        # Save prediction results in arrays
        prediction_label.append(label[1])
        prediction_confidence.append(label[2])

    # Assign result array data to the correct columns in the pandas dataframe
    pdf['prediction_label'] = prediction_label
    pdf['prediction_confidence'] = prediction_confidence
    
    return pdf

# Group the Image Dataframe, Apply Pandas UDF

Using the group column to separate the data into processing chunks, call `apply` on each chunk to apply the Pandas UDF.

In [0]:
image_chips = image_chips.groupby('grp').apply(evaluate_chip)

# View the Result!

Since Spark does lazy evaluation, this next line will take some time to process, but if we've done everything right, we shouldn't blow up the cluster memory limits.

In [0]:
image_chips.select(['prediction_label', 'prediction_confidence']).show(100)
#image_chips.select(['prediction_label', 'prediction_confidence'])

Py4JJavaError: An error occurred while calling o527.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: 
Aborting TaskSet 19.0 because task 82 (partition 107)
cannot run anywhere due to node and executor blacklist.
Most recent failure:
Lost task 82.0 in stage 19.0 (TID 201, worker8.hdp-internal, executor 2): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/cloudera/parcels/CDH-6.2.0-1.cdh6.2.0.p0.967373/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 372, in main
    process()
  File "/opt/cloudera/parcels/CDH-6.2.0-1.cdh6.2.0.p0.967373/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 367, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/opt/cloudera/parcels/CDH-6.2.0-1.cdh6.2.0.p0.967373/lib/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 283, in dump_stream
    for series in iterator:
  File "<string>", line 1, in <lambda>
  File "/opt/cloudera/parcels/CDH-6.2.0-1.cdh6.2.0.p0.967373/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 108, in wrapped
    result = f(pd.concat(value_series, axis=1))
  File "/opt/cloudera/parcels/CDH-6.2.0-1.cdh6.2.0.p0.967373/lib/spark/python/lib/pyspark.zip/pyspark/util.py", line 99, in wrapper
    return f(*args, **kwargs)
  File "<ipython-input-9-ab7e5a255fc4>", line 26, in evaluate_chip
  File "/data/hadoop/drive-m/yarn/nm/usercache/has1/appcache/application_1587490087127_1328/container_1587490087127_1328_01_000003/has1/lib64/python3.6/site-packages/keras_preprocessing/image/utils.py", line 113, in load_img
    with open(path, 'rb') as f:
TypeError: expected str, bytes or os.PathLike object, not _io.BytesIO

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:452)
	at org.apache.spark.sql.execution.python.ArrowPythonRunner$$anon$1.read(ArrowPythonRunner.scala:172)
	at org.apache.spark.sql.execution.python.ArrowPythonRunner$$anon$1.read(ArrowPythonRunner.scala:122)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:406)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage4.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$11$$anon$1.hasNext(WholeStageCodegenExec.scala:624)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:255)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:836)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:836)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:121)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$11.apply(Executor.scala:407)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1363)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:413)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)


Blacklisting behavior can be configured via spark.blacklist.*.

	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1890)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1878)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1877)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:929)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:929)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:929)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2111)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2049)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:740)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2081)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2102)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2121)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:365)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3383)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2544)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2544)
	at org.apache.spark.sql.Dataset$$anonfun$53.apply(Dataset.scala:3364)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3363)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2544)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2758)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:254)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:291)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


# Analyze the Result!

At this point the `image_chips` dataframe is any other Spark dataframe, so process it accordingly.

In [0]:
ic_summary = image_chips.filter('prediction_confidence > 0.90')\
                        .groupby('prediction_label')\
                        .count()\
                        .filter("count > 100")\
                        .sort(col("count").desc())

In [0]:
ic_summary.show()

Py4JJavaError: An error occurred while calling o584.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: 
Aborting TaskSet 21.0 because task 107 (partition 107)
cannot run anywhere due to node and executor blacklist.
Most recent failure:
Lost task 102.1 in stage 21.0 (TID 503, worker8.hdp-internal, executor 4): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/cloudera/parcels/CDH-6.2.0-1.cdh6.2.0.p0.967373/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 372, in main
    process()
  File "/opt/cloudera/parcels/CDH-6.2.0-1.cdh6.2.0.p0.967373/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 367, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/opt/cloudera/parcels/CDH-6.2.0-1.cdh6.2.0.p0.967373/lib/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 283, in dump_stream
    for series in iterator:
  File "<string>", line 1, in <lambda>
  File "/opt/cloudera/parcels/CDH-6.2.0-1.cdh6.2.0.p0.967373/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 108, in wrapped
    result = f(pd.concat(value_series, axis=1))
  File "/opt/cloudera/parcels/CDH-6.2.0-1.cdh6.2.0.p0.967373/lib/spark/python/lib/pyspark.zip/pyspark/util.py", line 99, in wrapper
    return f(*args, **kwargs)
  File "<ipython-input-9-ab7e5a255fc4>", line 26, in evaluate_chip
  File "/data/hadoop/drive-k/yarn/nm/usercache/has1/appcache/application_1587490087127_1328/container_1587490087127_1328_01_000005/has1/lib64/python3.6/site-packages/keras_preprocessing/image/utils.py", line 113, in load_img
    with open(path, 'rb') as f:
TypeError: expected str, bytes or os.PathLike object, not _io.BytesIO

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:452)
	at org.apache.spark.sql.execution.python.ArrowPythonRunner$$anon$1.read(ArrowPythonRunner.scala:172)
	at org.apache.spark.sql.execution.python.ArrowPythonRunner$$anon$1.read(ArrowPythonRunner.scala:122)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:406)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage4.agg_doAggregateWithKeys_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage4.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$11$$anon$1.hasNext(WholeStageCodegenExec.scala:624)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
	at org.apache.spark.scheduler.Task.run(Task.scala:121)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$11.apply(Executor.scala:407)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1363)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:413)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)


Blacklisting behavior can be configured via spark.blacklist.*.

	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1890)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1878)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1877)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:929)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:929)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:929)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2111)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2049)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:740)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2081)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2178)
	at org.apache.spark.rdd.RDD$$anonfun$reduce$1.apply(RDD.scala:1035)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.reduce(RDD.scala:1017)
	at org.apache.spark.rdd.RDD$$anonfun$takeOrdered$1.apply(RDD.scala:1439)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.takeOrdered(RDD.scala:1426)
	at org.apache.spark.sql.execution.TakeOrderedAndProjectExec.executeCollect(limit.scala:136)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3383)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2544)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2544)
	at org.apache.spark.sql.Dataset$$anonfun$53.apply(Dataset.scala:3364)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3363)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2544)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2758)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:254)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:291)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


# I've only Found One Issue

So far, the issue I found was that `toPandas()` on a dataframe no longer works. We might be able to fix this, but at this point it's a small loss. You can use `.collect()` and `.show` to view data instead.

In [0]:
ic_summary.toPandas()

  not enough values to unpack (expected 3, got 2)


ValueError: not enough values to unpack (expected 3, got 2)

In [0]:
spark.stop()