# Run Keras Models in Parallel with Spark

In [1]:
import import_ipynb
from data603 import SparkLauncher

# Local path of the MobileNet weights file used by the prediction code.
MOBILENET_WEIGHTS = './keras_data/mobilenet_1_0_224_tf.h5'

# Obtain a Spark configuration object from SparkLauncher.
conf = SparkLauncher.get_spark_conf()

# Register the weights file so that it gets copied to all the nodes on the cluster.
conf.set('spark.yarn.dist.files', MOBILENET_WEIGHTS)

# Launch the cluster (and the Spark session) with that configuration.
spark = SparkLauncher.get_spark_session(conf=conf, pack_venv=False)


importing Jupyter notebook from /scratch/data603/klucar/data603/SparkLauncher.ipynb
Creating Spark Configuration
Creating Spark Configuration
Setting Environment Variables
Creating Spark Session: klucar_data603_spark_session


## Read Data

In [2]:
# Load the previously written image chips from their Parquet dataset.
image_chips = spark.read.parquet("/user/klucar/image_chips.parquet")

In [3]:
# Sanity check: number of rows in the image chip DataFrame.
image_chips.count()

10176

In [4]:
# Sanity check: number of columns in the image chip DataFrame.
len(image_chips.columns)

17

# Run Keras on Image Chip Data

Similar to how the image chips were extracted and how they were written to HDFS, run a Keras prediction model on each image chip. The `evaluate_chip` function below encapsulates the Keras model-running code from another notebook; it is wrapped as a UDF in the next section.

What's new about this particular UDF is that it returns a MapType. The map holds a single entry: the predicted label is the key and the predicted score is the value.

In [6]:
class _LazyMobileNet:
    """Holds the MobileNet classifier and builds it on first use only.

    Building the network and reading its weights is far more expensive than a
    single prediction, so the model is kept for later calls made through the
    same copy of this object instead of being rebuilt for every row.
    """

    def __init__(self):
        self._model = None

    def __getstate__(self):
        # Never ship a loaded network when this object is pickled together
        # with the UDF; the receiving side rebuilds it lazily instead.
        return {'_model': None}

    def get(self):
        """Return the MobileNet model, constructing it if it is not loaded yet."""
        if self._model is None:
            import os
            from keras.applications.mobilenet import MobileNet

            # The weights file is looked up in the current working directory
            # of the process that runs this code.
            self._model = MobileNet(weights = f'{os.getcwd()}/mobilenet_1_0_224_tf.h5',
                                    include_top = True,
                                    alpha = 1.0)
        return self._model


_MOBILENET = _LazyMobileNet()


def evaluate_chip(chip_data):
    """Classify one image chip with MobileNet.

    Args:
        chip_data: Encoded image bytes (bytes or bytearray) of a single chip,
            or None / empty when the chip is missing.

    Returns:
        A single-entry dict ``{predicted_label: predicted_score}`` for the
        first (top) decoded prediction, with the score as a Python float.
        Returns None when ``chip_data`` is None or empty, so a missing chip
        yields a null prediction instead of failing the whole job.
    """
    # Guard first: null/empty input needs neither Keras nor the model.
    if not chip_data:
        return None

    # Imported inside the function so that they are resolved wherever the
    # function actually runs.
    import io
    from keras.applications.mobilenet import preprocess_input
    from keras.applications.mobilenet import decode_predictions
    from keras.preprocessing.image import load_img
    from keras.preprocessing.image import img_to_array

    # Load the image, resized to the 224x224 input used for this MobileNet
    img = load_img(io.BytesIO(chip_data), target_size = (224,224))

    # Prepare Image: add a leading batch dimension of size 1, then apply the
    # MobileNet-specific preprocessing
    image = img_to_array(img)
    image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
    image = preprocess_input(image)

    # Run prediction with the (reused) model
    yhat = _MOBILENET.get().predict(image)

    # Decode Predictions: keep the first entry for the only image in the batch
    label = decode_predictions(yhat)
    label = label[0][0]

    # label[1] is the class name, label[2] its score
    return {label[1]: float(label[2])}

## Wrap Keras Evaluation in a UDF

In [15]:
# make a UDF
# Import only the names that are used instead of `import *`, so it stays clear
# where each type comes from and the notebook namespace is not polluted.
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType, MapType, StringType

# evaluate_chip returns {label: score}: a map from a string to a double.
schema = MapType(StringType(), DoubleType())

udf_evaluate_chip = udf(evaluate_chip, schema)


## Apply the UDF to the `chip_data` column

In [None]:
# Discard the original image data ('Data'), then evaluate every image chip and
# store the result in a new "prediction" column.
image_chips = (
    image_chips
    .drop('Data')
    .withColumn("prediction", udf_evaluate_chip("chip_data"))
)

# Extract Predictions from the UDF Output

In [None]:
from pyspark.sql.functions import col, explode, map_keys, map_values

# Exploding the returned map yields one key column and one value column;
# name them after the prediction label and confidence value they hold.
label_and_score = explode(col("prediction"))
predictions = image_chips.select(
    label_and_score.alias("predicted_label", "predicted_score")
)

# View Results

In [None]:
# Count the chips per predicted label, most frequent label first.
preds = (
    predictions
    .groupby('predicted_label')
    .count()
    .sort(col("count").desc())
)

In [13]:
# Display the six most frequently predicted labels and their counts.
preds.show(6)

+--------------------+-----+
|     predicted_label|count|
+--------------------+-----+
|               goose| 2098|
|             ostrich|  587|
|red-breasted_merg...|  583|
|               drake|  462|
|         black_stork|  248|
|        fox_squirrel|  241|
+--------------------+-----+
only showing top 6 rows



In [14]:
# Shut down the Spark session now that the results have been displayed.
spark.stop()