In [None]:
from google.colab import drive
# Mount Google Drive so that files under /content/drive are reachable from this runtime.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!pip install pyspark
!pip install numpy
!pip install Pillow
!pip install petastorm
!pip install hyperopt

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425345 sha256=2b171970bda6822b29e6c3b43e6ed242ddf5d8424937967f5af07d246c498cfa
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0
Collecting petastorm
  Downloading petastorm-0.12.1-py2.py3-none-any.whl (284 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m284.0/284.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dill>=0.2.1 (from petastorm)
  Downloading dill-0.3.7-

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import pyspark.sql.functions
from pyspark.sql.types import *

In [None]:
from petastorm.spark import SparkDatasetConverter, make_spark_converter

  from pyarrow import LocalFileSystem


In [None]:
import io
import numpy as np
from PIL import Image
from functools import partial
from petastorm import TransformSpec
from hyperopt import fmin, tpe, hp, SparkTrials, STATUS_OK

In [None]:
from tensorflow import keras
from tensorflow.keras.applications.mobilenet_v2 import MobileNetV2, preprocess_input
import tensorflow as tf

In [None]:
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

import mlflow
import mlflow.keras
import mlflow.tensorflow

In [None]:
# Start (or reuse) the Spark session used by every DataFrame in this notebook.
spark = (
    SparkSession.builder
    .appName("Distributed training with Tensorflow")
    .getOrCreate()
)

In [None]:
# Location of the Parquet dataset on the mounted Google Drive.
data_path = "/content/drive/MyDrive/hello_world_dataset"
# Passed to mlflow.set_experiment() as the experiment name.
# NOTE(review): "/" looks like a placeholder — confirm the intended experiment name.
mlflow_model_dir_path = "/"

Enable mlflow tracking

In [None]:
# Enable MLflow tracking: select (or create) the experiment that runs are logged under ...
mlflow.set_experiment(mlflow_model_dir_path)

# ... and switch on automatic logging for TensorFlow/Keras training.
mlflow.tensorflow.autolog()

2024/01/17 08:09:17 INFO mlflow.tracking.fluent: Experiment with name '/' does not exist. Creating a new experiment.


## Parameters

In [None]:
# Input shape handed to MobileNetV2 (224 x 224 pixels, 3 channels).
IMG_SHAPE = (224, 224, 3)
BATCH_SIZE=1

# Number of rows kept from the dataset (applied with .limit()) for this example.
SAMPLE_SIZE = 5
# The number of epochs is a hyperparameter that defines the number of times
# the learning algorithm works through the entire training dataset.
# One epoch means that each sample in the training dataset has had an
# opportunity to update the internal model parameters.
NUM_EPOCHS = 1
# presumably the number of Spark executors used for training — not referenced
# in the cells visible here; verify against the training code.
NUM_EXECUTERS = 1

# Load preprocessed data

In [None]:
# Read the full dataset stored as Parquet. It is trimmed to SAMPLE_SIZE rows
# later, when the training columns are selected.
df_parquet = spark.read.parquet(data_path)

In [None]:
# Inspect the column names and types of the loaded dataset.
df_parquet.printSchema()

root
 |-- id: integer (nullable = true)
 |-- image1: binary (nullable = true)
 |-- array_4d: binary (nullable = true)



In [None]:
# List the distinct values found in the "id" column.
df_parquet.select("id").distinct().show()

+---+
| id|
+---+
|  3|
|  5|
|  9|
|  4|
|  8|
|  0|
|  1|
|  6|
|  7|
|  2|
+---+



In [None]:
# Width of the classifier's output layer (used by get_model); the dataset
# contains 10 distinct "id" values (0-9).
num_classes=10

## Split data into training and validation sets

In [None]:
# Keep only the image bytes and the id (cast to long), capped at SAMPLE_SIZE rows.
df = (
    df_parquet
    .select(col("image1"), col("id").cast(LongType()))
    .limit(SAMPLE_SIZE)
)

In [None]:
# Split roughly 60/40 into training and validation sets with a fixed seed.
# randomSplit assigns rows probabilistically, so the weights are not exact and,
# with only a handful of rows, one side can end up empty.
df_train, df_val = df.randomSplit([0.6, 0.4], seed=12345)

In [None]:
# Confirm the schema of the training split.
df_train.printSchema()

root
 |-- image1: binary (nullable = true)
 |-- id: long (nullable = true)



## Cache the Spark Dataframe using Petastorm Spark Converter

In [None]:
# Directory on the mounted Google Drive used for Petastorm's intermediate data.
tmp_path = "file:/content/drive/MyDrive/petastorm"

# Register that directory as the parent cache dir of the Petastorm Spark converter.
spark.conf.set(SparkDatasetConverter.PARENT_CACHE_DIR_URL_CONF, tmp_path)

In [None]:
# Materialize both splits through the Petastorm converter.
# A low parquet_row_group_size_bytes (3.2 MB) is used; the original note here
# (truncated) says the default of 32 MiB can be too large.
converter_train = make_spark_converter(df_train, parquet_row_group_size_bytes=3200000)
# Validation split.
converter_val = make_spark_converter(df_val, parquet_row_group_size_bytes=3200000)

  self._filesystem = pyarrow.localfs


## The TensorFlow batch size must not exceed the size of the training dataset

In [None]:
# Report how many rows each converter holds.
print(f"train: {len(converter_train)}, test: {len(converter_val)}")

train: 5, test: 0


In [None]:
# Print the layer summary of the ImageNet-pretrained MobileNetV2 without its top (classification) layer.
MobileNetV2(input_shape=IMG_SHAPE, include_top=False, weights='imagenet').summary()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_224_no_top.h5
Model: "mobilenetv2_1.00_224"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 224, 224, 3)]        0         []                            
                                                                                                  
 Conv1 (Conv2D)              (None, 112, 112, 32)         864       ['input_1[0][0]']             
                                                                                                  
 bn_Conv1 (BatchNormalizati  (None, 112, 112, 32)         128       ['Conv1[0][0]']               
 on)                                                                                              
                

In [None]:
def get_model(lr=0.001):
  """Build the transfer-learning classifier on top of a frozen MobileNetV2.

  The network is an ImageNet-pretrained MobileNetV2 without its top layer,
  followed by global average pooling and a Dense layer with `num_classes`
  outputs. The model is returned uncompiled.

  Args:
    lr: Currently unused by this function (presumably the learning rate the
      caller applies when compiling the model — verify against the caller).

  Returns:
    A `keras.Sequential` model.
  """
  # Pre-trained feature extractor; frozen so that only the new head is trainable.
  backbone = MobileNetV2(input_shape=IMG_SHAPE, include_top=False, weights='imagenet')
  backbone.trainable = False

  # New classification head for transfer learning.
  return keras.Sequential([
      backbone,
      keras.layers.GlobalAveragePooling2D(),
      keras.layers.Dense(num_classes),  # no activation: outputs raw scores
  ])

## Develop the pyfunc wrapper for the model

In [None]:
# Custom MLflow pyfunc model that preprocesses the input and runs predictions
# at inference time. It keeps the inference pipeline independent of the model
# framework used in training.

class KerasCNNModelWrapper(mlflow.pyfunc.PythonModel):
  """MLflow pyfunc wrapper that classifies image files with a saved Keras model.

  The wrapper stores only the model URI; the Keras model itself is loaded in
  `load_context` and used by `predict`.
  """

  # Class index -> label used as key in the per-image scores JSON.
  CLASS_DEF = {
      0: '212.teapot',
      1: '234.tweezer',
      2: '196.spaghetti',
      3: '249.yo-yo',
  }

  def __init__(self, model_path):
    """Remember the URI of the MLflow Keras model to load later.

    Args:
      model_path: Model URI handed to `mlflow.keras.load_model`.
    """
    self.model_path = model_path

  def load_context(self, context):
    """Load the keras-native representation of the MLflow model into `self.model`."""
    print(self.model_path)
    self.model = mlflow.keras.load_model(model_uri=self.model_path)

  def predict(self, context, model_input):
    """Classify every image referenced by `model_input['origin']`.

    Args:
      context: Unused.
      model_input: pandas DataFrame with an 'origin' column of image paths.
        A "dbfs:" prefix is rewritten to "/dbfs". The frame itself is not
        modified.

    Returns:
      One `[prediction, probabilities]` pair per input row, in row order.
      `prediction` is the arg-max class index (int); `probabilities` is a JSON
      string mapping each label in `CLASS_DEF` to its model score rounded to
      3 decimals.
    """
    # Imported here so the method carries its own dependencies when the
    # wrapper is used outside this notebook.
    import json

    import numpy as np
    import tensorflow as tf
    from PIL import Image

    # Work on a derived Series so the caller's DataFrame stays untouched.
    image_paths = model_input['origin'].str.replace("dbfs:", "/dbfs", regex=False)

    results = []
    for path in image_paths:
      with Image.open(path) as img_file:
        # Force 3 channels so the reshape below also holds for grayscale/RGBA files.
        resized = img_file.convert("RGB").resize((224, 224))
        pixels = np.asarray(resized, dtype=np.float32)

      batch = tf.reshape(pixels, shape=[-1, 224, 224, 3])
      class_probs = self.model.predict(batch)
      predicted = int(np.argmax(class_probs, axis=1)[0])

      class_prob_dict = {
          label: round(float(class_probs[0][int(idx)]), 3)
          for idx, label in self.CLASS_DEF.items()
      }
      results.append([predicted, json.dumps(class_prob_dict)])

    return results

## Petastorm preprocess

## Generate petastorm dataset

In [5]:
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType

from petastorm.codecs import ScalarCodec, CompressedImageCodec, NdarrayCodec
from petastorm.etl.dataset_metadata import materialize_dataset
from petastorm.unischema import dict_to_spark_row, Unischema, UnischemaField

# Petastorm schema of the generated dataset: a scalar int32 id, a PNG-compressed
# 128x256x3 uint8 image and a 4-D uint8 array whose first and last dimensions
# are variable. None of the fields is nullable.
HelloWorldSchema = Unischema(
    'HelloWorldSchema',
    [
        UnischemaField(
            name='id',
            numpy_dtype=np.int32,
            shape=(),
            codec=ScalarCodec(IntegerType()),
            nullable=False,
        ),
        UnischemaField(
            name='image1',
            numpy_dtype=np.uint8,
            shape=(128, 256, 3),
            codec=CompressedImageCodec('png'),
            nullable=False,
        ),
        UnischemaField(
            name='array_4d',
            numpy_dtype=np.uint8,
            shape=(None, 128, 30, None),
            codec=NdarrayCodec(),
            nullable=False,
        ),
    ],
)

def row_generator(x):
  """Return a single entry of the generated dataset, filled with random values.

  Args:
    x: Value stored in the row's 'id' field.

  Returns:
    A dict with keys 'id' (the given `x`), 'image1' (uint8 array of shape
    (128, 256, 3)) and 'array_4d' (uint8 array of shape (4, 128, 30, 3)).
    Array values are drawn uniformly from [0, 255), i.e. 255 itself never occurs.
  """
  return {
      'id': x,
      'image1': np.random.randint(0, 255, dtype=np.uint8, size=(128, 256, 3)),
      'array_4d': np.random.randint(0, 255, dtype=np.uint8, size=(4, 128, 30, 3))
  }

def generate_petastorm_dataset(output_url="file://hello_world_dataset"):
  """Generate a small random dataset and write it as a Petastorm Parquet store.

  Ten rows produced by `row_generator` are encoded according to
  `HelloWorldSchema` and written to `output_url`, overwriting existing data.

  Args:
    output_url: Destination URL of the dataset.
      NOTE(review): the default has no absolute path after "file://" —
      confirm it resolves to the intended directory.
  """
  rowgroup_size_mb = 256
  rows_count = 10

  spark = SparkSession.builder.config("spark.driver.memory", "2g").master('local[2]').getOrCreate()
  sc = spark.sparkContext

  # Wrap the dataset materialization portion: this takes care of setting up the
  # Spark environment variables as well as saving Petastorm-specific metadata.
  with materialize_dataset(spark, output_url, HelloWorldSchema, rowgroup_size_mb):
    rows_rdd = (
        sc.parallelize(range(rows_count))
        .map(row_generator)
        .map(lambda x: dict_to_spark_row(HelloWorldSchema, x))
    )

    (
        spark.createDataFrame(rows_rdd, HelloWorldSchema.as_spark_schema())
        .coalesce(10)
        .write
        .mode('overwrite')
        .parquet(output_url)
    )


# Top-level call: must not be indented into the function body, where it would recurse.
generate_petastorm_dataset()