# Transfer learning

In [0]:
import tensorflow
import keras
import h5py

## Download the flower dataset

In [0]:
%sh
# Download and unpack the flower photos archive into the current working directory,
# then list the extracted class folders and print where they live.
# Stop at the first failing command so a broken download or extraction is not
# hidden behind the always-successful `ls`/`pwd` at the end.
set -e
# -f: fail on HTTP errors instead of saving an error page as the archive; -L: follow redirects.
curl -fLO http://download.tensorflow.org/example_images/flower_photos.tgz
# Portable output suppression: `&>` is a bashism that a POSIX sh parses as "run in background".
tar xzf flower_photos.tgz >/dev/null 2>&1
cd flower_photos
ls
pwd

In [0]:
# Root of the extracted flower_photos directory, written as a file: URI; the image
# loading cells read from this location and from its per-flower subfolders.
img_dir = 'file:/databricks/driver/flower_photos'

## Load images into Dataframe

In [0]:
# Read img_dir through Spark's 'image' data source, then preview the rows and the schema.
df = (
    spark.read
    .format('image')
    .load(img_dir)
)
df.show()
df.printSchema()

## Create the labeled train and test datasets

In [0]:
from pyspark.sql.functions import lit

In [0]:
# Build one DataFrame per flower class and attach a constant numeric label column:
# tulips -> 0, sunflowers -> 1.
tulips_df = (
    spark.read.format('image')
    .load(img_dir + '/tulips')
    .withColumn('label', lit(0))
)
sunflower_df = (
    spark.read.format('image')
    .load(img_dir + '/sunflowers')
    .withColumn('label', lit(1))
)

In [0]:
# Split each class 80/20 into train/test with a fixed seed.
# (For a quicker run on a small sample, a three-way split such as
# [0.08, 0.08, 0.84] can be used instead, discarding the third part.)
split_weights = [0.8, 0.2]
split_seed = 42

tulips_train, tulips_test = tulips_df.randomSplit(split_weights, seed=split_seed)
sunflower_train, sunflower_test = sunflower_df.randomSplit(split_weights, seed=split_seed)

# Recombine the two flower classes into the full train and test sets, and
# repartition each into 100 partitions so every partition is a small block.
train_df = tulips_train.union(sunflower_train).repartition(100)
test_df = tulips_test.union(sunflower_test).repartition(100)

## Create the custom classifier model, using pipeline

In [0]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from sparkdl import DeepImageFeaturizer

In [0]:
# ResNet50-based featurizer: reads the 'image' column and writes a 'features' column.
featurizer = DeepImageFeaturizer(
    modelName='ResNet50',
    inputCol='image',
    outputCol='features',
)

In [0]:
# Logistic regression is used as the classifier because there are only two
# flower classes to separate; it is trained against the 'label' column.
lr = LogisticRegression(
    labelCol='label',
    maxIter=10,
    regParam=0.05,
    elasticNetParam=0.03,
)

In [0]:
# Chain the stages in order: featurize the images first, then classify the features.
pipeline_stages = [featurizer, lr]
pipeline = Pipeline(stages=pipeline_stages)

## Train the model

In [0]:
# Fit the two-stage pipeline (featurizer + logistic regression) on the training set.
flower_model = pipeline.fit(train_df)

-----------

## Evaluate the Performance

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [0]:
# Run the fitted pipeline over the held-out test set; the result is scored
# below using its 'prediction' and 'label' columns.
pred = flower_model.transform(test_df)

In [0]:
# Score the test-set predictions with the 'accuracy' metric and report it.
evaluator = MulticlassClassificationEvaluator(metricName='accuracy')
scored_columns = pred.select('prediction', 'label')
test_accuracy = evaluator.evaluate(scored_columns)

print('Test dataset Accuracy: {}'.format(test_accuracy))

## Using a test image

In [0]:
# Paths of two individual test images: t -> a tulip photo, s -> a sunflower photo.
# NOTE(review): presumably these were uploaded to the workspace FileStore
# beforehand; confirm they exist before running the cells below.
t = '/FileStore/tables/tulip.jpg'
s = '/FileStore/tables/sunflower.jpg'

In [0]:
# Load the single tulip test image into a DataFrame and preview it.
df = (
    spark.read
    .format('image')
    .load(t)
)
df.show()

In [0]:
# Run the trained model on the image currently held in df and display the output.
tulip_pred = flower_model.transform(df)
tulip_pred.show()

## model predictions result
Our model predicted that the image is a tulip with a probability of 99.94%, which is correct.

In [0]:
# Load the single sunflower test image into a DataFrame and preview it.
df = (
    spark.read
    .format('image')
    .load(s)
)
df.show()

In [0]:
# Run the trained model on the image currently held in df and display the output.
sunflower_pred = flower_model.transform(df)
sunflower_pred.show()

## model predictions result
This time our model predicted that the image is a sunflower with a probability of 0.113%, which is not very confident, but it still classified the flower type correctly.