In [None]:
# pip install pyspark pillow pandas pyarrow tensorflow

In [None]:
%%sh
# Prepare the dataset: copy a random sample of 100 images from imagenette2.
# Only JPEG files are selected so that any non-image file living under the
# dataset tree is never copied, and paths are passed NUL-delimited so that
# unusual characters in file names survive the pipeline intact.
mkdir -p ./data/images/mixed
find ./imagenette2 -maxdepth 4 -type f \( -iname '*.jpeg' -o -iname '*.jpg' \) -print0 \
    | shuf -z -n 100 \
    | xargs -0 -I{} cp {} ./data/images/mixed

In [1]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session used by every cell below.
# Executors get 1g each; the driver gets 6g.
spark = (
    SparkSession.builder
    .appName("image-ml")
    .config("spark.executor.memory", "1g")
    .config("spark.driver.memory", "6g")
    .getOrCreate()
)

24/12/27 11:37:51 WARN Utils: Your hostname, nilesh-pc resolves to a loopback address: 127.0.1.1; using 192.168.1.101 instead (on interface wlp0s20f3)
24/12/27 11:37:51 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/27 11:38:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
from pathlib import Path
from typing import Iterator

import numpy as np
import pandas as pd
import tensorflow as tf
from PIL import Image, ImageDraw
from pyspark.sql import SparkSession
from pyspark.sql.functions import PandasUDFType, col, pandas_udf
from pyspark.sql.types import (ArrayType, BinaryType, FloatType, IntegerType,
                               StringType, StructField, StructType)
from tensorflow.keras.applications.resnet50 import ResNet50

2024-12-27 11:38:40.092024: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-27 11:38:40.137725: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-27 11:38:40.524034: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-27 11:38:40.526176: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Resolve the dataset directory relative to the notebook's working directory
# (the same ./data/images/mixed that the dataset-preparation cell fills)
# instead of hard-coding one machine's home directory.
images_dir = str(Path("data/images/mixed").resolve())

# format="image" loads decoded image data into an `image` struct column;
# format="binaryFile" would load the files as binary data instead.
# Keep only images with more than 2 channels and a height below 1000.
image_df = (
    spark.read.format("image")
    .load(images_dir)
    .filter("image.nChannels > 2 AND image.height < 1000")
)
image_df.printSchema()
image_df.select("image.origin", "image.height", "image.width", "image.mode", "image.nChannels").show(5, truncate=False)

root
 |-- image: struct (nullable = true)
 |    |-- origin: string (nullable = true)
 |    |-- height: integer (nullable = true)
 |    |-- width: integer (nullable = true)
 |    |-- nChannels: integer (nullable = true)
 |    |-- mode: integer (nullable = true)
 |    |-- data: binary (nullable = true)



[Stage 0:>                                                          (0 + 1) / 1]

+------------------------------------------------------------------+------+-----+----+---------+
|origin                                                            |height|width|mode|nChannels|
+------------------------------------------------------------------+------+-----+----+---------+
|file:///home/nilesh/newdemo/data/images/mixed/n02102040_1910.JPEG |375   |500  |16  |3        |
|file:///home/nilesh/newdemo/data/images/mixed/n02102040_1078.JPEG |500   |377  |16  |3        |
|file:///home/nilesh/newdemo/data/images/mixed/n01440764_11170.JPEG|333   |500  |16  |3        |
|file:///home/nilesh/newdemo/data/images/mixed/n03417042_1840.JPEG |395   |500  |16  |3        |
|file:///home/nilesh/newdemo/data/images/mixed/n03417042_6130.JPEG |389   |500  |16  |3        |
+------------------------------------------------------------------+------+-----+----+---------+
only showing top 5 rows



                                                                                

In [4]:
# Zero-based position of the sample row that the inspection cells below pick out.
image_row = 13

In [5]:
# Fetch only the first `image_row + 1` rows instead of collecting every decoded
# image onto the driver just to look at one of them.
spark_single_img = image_df.select("image").take(image_row + 1)[image_row]
print(spark_single_img.image.origin, spark_single_img.image.mode, spark_single_img.image.nChannels)

# The raw bytes are handed to PIL as RGB(A) without any channel reordering, so
# this preview shows the picture exactly as the bytes are stored (BGR order,
# i.e. red and blue appear swapped until the channels are converted).
mode = 'RGBA' if (spark_single_img.image.nChannels == 4) else 'RGB'
Image.frombytes(
    mode=mode,
    data=bytes(spark_single_img.image.data),
    size=[spark_single_img.image.width, spark_single_img.image.height],
).show()

                                                                                

file:///home/nilesh/newdemo/data/images/mixed/n03417042_750.JPEG 16 3



(eog:104057): Atk-CRITICAL **: 11:42:00.753: atk_object_ref_state_set: assertion 'ATK_IS_OBJECT (accessible)' failed

(eog:104057): Gtk-CRITICAL **: 11:42:00.753: gtk_accessible_get_widget: assertion 'GTK_IS_ACCESSIBLE (accessible)' failed


In [6]:
def convert_bgr_array_to_rgb_array(img_array):
    """Swap the blue and red channels of a channel-last pixel array.

    Args:
        img_array: numpy array whose last axis holds the channels in
            B, G, R order, optionally followed by further channels such as
            alpha (for example shape ``(height, width, 3)`` or
            ``(height, width, 4)``).

    Returns:
        A new array with the same shape and dtype whose first three channels
        are reordered to R, G, B; any additional channels keep their position.

    Raises:
        ValueError: if the last axis holds fewer than three channels.
    """
    if img_array.ndim == 0 or img_array.shape[-1] < 3:
        raise ValueError("expected at least 3 channels on the last axis")
    # Index-based reordering (rather than a three-way unpack) also accepts
    # 4-channel input instead of failing on it.
    channel_order = [2, 1, 0, *range(3, img_array.shape[-1])]
    return img_array[..., channel_order]

# Rebuild the PIL image from the sampled row's raw bytes, then reorder the
# channels from BGR to RGB before displaying it.
img = Image.frombytes(
    mode=mode,
    size=[spark_single_img.image.width, spark_single_img.image.height],
    data=bytes(spark_single_img.image.data),
)

converted_img_array = convert_bgr_array_to_rgb_array(np.asarray(img))
Image.fromarray(converted_img_array).show()


(eog:105073): Atk-CRITICAL **: 11:45:34.062: atk_object_ref_state_set: assertion 'ATK_IS_OBJECT (accessible)' failed

(eog:105073): Gtk-CRITICAL **: 11:45:34.062: gtk_accessible_get_widget: assertion 'GTK_IS_ACCESSIBLE (accessible)' failed


In [7]:
# Output schema of the resize step: every field of the flattened `image`
# struct, followed by two nullable integer-array columns for the pixel data.
schema = StructType([
    *image_df.select("image.*").schema.fields,
    *(
        StructField(column_name, ArrayType(IntegerType()), True)
        for column_name in ("data_as_resized_array", "data_as_array")
    ),
])

def resize_img(img_data, resize=True, target_size=(224, 224)):
    """Decode one image row into a flat RGB pixel array, optionally resized.

    Args:
        img_data: one row (as handed over by ``DataFrame.apply(axis=1)``)
            providing the ``nChannels``, ``width``, ``height`` and ``data``
            entries.
        resize: when true, the image is resized to ``target_size`` with
            bicubic resampling; when false, the original dimensions are kept.
        target_size: ``(width, height)`` used when ``resize`` is true.

    Returns:
        A 1-D numpy array of length ``width * height * 3`` (using the resized
        dimensions when ``resize`` is true) with channels in R, G, B order.
    """
    # Bracket lookups instead of attribute access: the row also carries a
    # `mode` column, and attribute access on a pandas Series resolves names
    # such as `mode` to Series members rather than to the row's values.
    n_channels = img_data["nChannels"]
    width, height = int(img_data["width"]), int(img_data["height"])

    mode = 'RGBA' if n_channels == 4 else 'RGB'
    img = Image.frombytes(mode=mode, data=bytes(img_data["data"]), size=(width, height))
    if mode == 'RGBA':
        # Drop the fourth channel so the channel swap below sees 3 channels.
        img = img.convert('RGB')
    if resize:
        img = img.resize(tuple(target_size), resample=Image.Resampling.BICUBIC)

    arr = convert_bgr_array_to_rgb_array(np.asarray(img))
    # Flatten to 1-D; the length follows from whichever size the image has now.
    return arr.reshape(-1)

def resize_image_udf(dataframe_batch_iterator: Iterator[pd.DataFrame]) -> Iterator[pd.DataFrame]:
    """Add the resized and original-size pixel-array columns to each batch.

    Args:
        dataframe_batch_iterator: iterator of pandas DataFrames, one per batch.

    Yields:
        The same DataFrame objects, each extended in place with the
        ``data_as_resized_array`` and ``data_as_array`` columns.
    """
    # (output column, value passed as resize_img's `resize` argument), in the
    # order the columns are added.
    derived_columns = (
        ("data_as_resized_array", True),
        ("data_as_array", False),
    )
    for batch in dataframe_batch_iterator:
        for column_name, do_resize in derived_columns:
            batch[column_name] = batch.apply(resize_img, args=(do_resize,), axis=1)
        yield batch

# Flatten the `image` struct into top-level columns and run the batch-wise
# resize over them; `schema` describes the resulting rows.
resized_df = (
    image_df
    .select("image.*")
    .mapInPandas(resize_image_udf, schema)
)

In [8]:
# Bring only the first `image_row + 1` rows to the driver; collecting the whole
# DataFrame would materialise every image's pixel arrays just to read one row.
row = resized_df.take(image_row + 1)[image_row]

# Pixel data at the image's original dimensions.
Image.frombytes(mode='RGB', data=bytes(row.data_as_array), size=[row.width, row.height]).show()

# Pixel data after resizing to 224x224.
Image.frombytes(mode='RGB', data=bytes(row.data_as_resized_array), size=[224, 224]).show()

                                                                                


(eog:105985): Atk-CRITICAL **: 11:48:28.703: atk_object_ref_state_set: assertion 'ATK_IS_OBJECT (accessible)' failed

(eog:105985): Gtk-CRITICAL **: 11:48:28.703: gtk_accessible_get_widget: assertion 'GTK_IS_ACCESSIBLE (accessible)' failed

(eog:105985): Atk-CRITICAL **: 11:48:28.703: atk_object_ref_state_set: assertion 'ATK_IS_OBJECT (accessible)' failed

(eog:105985): Gtk-CRITICAL **: 11:48:28.703: gtk_accessible_get_widget: assertion 'GTK_IS_ACCESSIBLE (accessible)' failed


In [9]:
def normalize_array(arr):
    """Apply the ResNet50 input preprocessing to one flattened image.

    Args:
        arr: array holding 224 * 224 * 3 values; it is reshaped to
            ``(224, 224, 3)`` before preprocessing.

    Returns:
        The result of ``tf.keras.applications.resnet50.preprocess_input`` on
        the reshaped array.
    """
    image_tensor = arr.reshape([224, 224, 3])
    return tf.keras.applications.resnet50.preprocess_input(image_tensor)

@pandas_udf(ArrayType(FloatType()))
def predict_batch_udf(iterator: Iterator[pd.Series]) -> Iterator[pd.Series]:
    """Run ResNet50 over batches of flattened image arrays.

    Args:
        iterator: iterator of pandas Series; each element of a Series is one
            flattened image accepted by ``normalize_array``.

    Yields:
        One pandas Series per input batch, holding one prediction vector per
        input image.
    """
    # The model is built once, before the loop, and reused for every batch
    # this invocation receives.
    classifier = ResNet50()
    for pixel_series in iterator:
        batch_tensor = np.stack([normalize_array(pixels) for pixels in pixel_series])
        scores = classifier.predict(batch_tensor)
        yield pd.Series([score_vector for score_vector in scores])

# Attach the model output for each resized image as a `predictions` column.
predicted_df = resized_df.withColumn(
    "predictions",
    predict_batch_udf(col("data_as_resized_array")),
)

In [10]:
# Only the first `image_row + 1` rows are fetched; collecting the full
# DataFrame would ship every row's pixel arrays and predictions to the driver
# just to read a single row.
prediction_row = predicted_df.take(image_row + 1)[image_row]

# decode_predictions is given a (1, 1000) batch and returns the top 5 labels.
tf.keras.applications.resnet50.decode_predictions(
    np.array(prediction_row.predictions).reshape(1, 1000), top=5
)

2024-12-27 11:52:21.530334: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-27 11:52:21.536691: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-27 11:52:21.620306: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-27 11:52:21.621343: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-12-27 11:52:24.548041: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You m

[[('n03417042', 'garbage_truck', 0.9985939860343933),
  ('n03345487', 'fire_engine', 0.000993802328594029),
  ('n04487081', 'trolleybus', 0.00013196909276302904),
  ('n04467665', 'trailer_truck', 8.483155397698283e-05),
  ('n04461696', 'tow_truck', 6.104258500272408e-05)]]