In [None]:
### BOOTSTRAP SPARK WITH THE SPARKDL PACKAGE
# The submit arguments are exported before findspark / pyspark are imported,
# so they are already in the environment when the session below is created.
# NOTE(review): presumably this is what makes Spark fetch the
# spark-deep-learning package at start-up -- confirm against the cluster setup.
import os

SUBMIT_ARGS = "--packages databricks:spark-deep-learning:1.2.0-spark2.3-s_2.11 pyspark-shell"
os.environ["PYSPARK_SUBMIT_ARGS"] = SUBMIT_ARGS

# findspark.init() has to run before the first pyspark import in this cell.
import findspark

findspark.init()

from pyspark.sql import SparkSession

# Reuse the active session if there is one, otherwise start a new one.
spark = (
    SparkSession.builder
    .appName("Identifying pneumonia images using pyspark")
    .getOrCreate()
)

In [None]:
### LOAD THE TRAIN IMAGES
### WE ASSUME YOU HAVE DOWNLOADED OUR WHOLE FOLDER, WHICH CONTAINS THE IMAGES AS WELL
### THIS IS IMPORTANT SINCE WE HAVE MODIFIED THE ORIGINAL.ZIP FOR IT TO TAKE LESS SPACE
from pyspark.ml.image import ImageSchema
from pyspark.sql.functions import lit
# imageIO is not referenced in the cells visible here; the import is kept in
# case other cells rely on it.
from sparkdl.image import imageIO

# Label convention used throughout the notebook: 0 = NORMAL, 1 = PNEUMONIA.
normal_df_train = ImageSchema.readImages("chest_xray/train/NORMAL").withColumn("label", lit(0))
pneumonia_df_train = ImageSchema.readImages("chest_xray/train/PNEUMONIA").withColumn("label", lit(1))

# union() is the non-deprecated spelling of unionAll() (deprecated since
# Spark 2.0); both frames carry the same columns in the same order, so the
# positional union is safe. The result is then spread over 100 partitions.
train_df = normal_df_train.union(pneumonia_df_train).repartition(100)

In [None]:
### LOAD THE TEST IMAGES
### WE ASSUME YOU HAVE DOWNLOADED OUR WHOLE FOLDER, WHICH CONTAINS THE IMAGES AS WELL
### THIS IS IMPORTANT SINCE WE HAVE MODIFIED THE ORIGINAL.ZIP FOR IT TO TAKE LESS SPACE
# Relies on ImageSchema and lit imported by the train-loading cell.
# Label convention: 0 = NORMAL, 1 = PNEUMONIA.
normal_df_test = ImageSchema.readImages("chest_xray/test/NORMAL").withColumn("label", lit(0))
pneumonia_df_test = ImageSchema.readImages("chest_xray/test/PNEUMONIA").withColumn("label", lit(1))

# union() is the non-deprecated spelling of unionAll() (deprecated since
# Spark 2.0); both frames carry the same columns in the same order, so the
# positional union is safe. The result is then spread over 100 partitions.
test_df = normal_df_test.union(pneumonia_df_test).repartition(100)

In [None]:
### FIT THE IMAGE-CLASSIFICATION PIPELINE
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from sparkdl import DeepImageFeaturizer

# Stage 1: turn the "image" column into a "features" column using the
# InceptionV3 model named below.
featurizer = DeepImageFeaturizer(
    inputCol="image",
    outputCol="features",
    modelName="InceptionV3",
)

# Stage 2: elastic-net regularised logistic regression trained against the
# "label" column (features are read from the estimator's default column).
lr = LogisticRegression(
    maxIter=10,
    regParam=0.05,
    elasticNetParam=0.3,
    labelCol="label",
)

# Chain both stages and fit them on the training images.
p = Pipeline(
    stages=[featurizer, lr],
)
p_model = p.fit(train_df)

In [None]:
### MEASURE ACCURACY ON THE TEST SET
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Run the fitted pipeline over the held-out images.
tested_df = p_model.transform(test_df)

# Only the predicted and the true class are handed to the evaluator.
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
accuracy = evaluator.evaluate(
    tested_df.select("prediction", "label")
)
print("Test set accuracy = " + str(accuracy))