<a href="https://colab.research.google.com/github/rjahin/Prac_Image_Processing/blob/main/imageprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install pyspark opencv-python numpy scikit-learn



In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

In [None]:
import os

# Point PySpark at the JDK; set before the session is created so the JVM
# launcher can see it. The path is presumably where the apt package above
# installs OpenJDK 8 -- confirm if the base image changes.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

from pyspark.sql import SparkSession

# Reuse the active session if there is one, otherwise start a new one.
spark = (
    SparkSession.builder
    .appName("ImageProcessingML")
    .getOrCreate()
)

In [None]:
import cv2
import numpy as np

def extract_features(image_path, size=(64, 64)):
    """Load an image and return its resized pixels as a flat vector.

    Args:
        image_path: Path of the image file, read with ``cv2.imread``.
        size: ``(width, height)`` handed to ``cv2.resize``. Defaults to
            ``(64, 64)``.

    Returns:
        The resized image flattened to one dimension (``img.flatten()``).

    Raises:
        ValueError: If ``cv2.imread`` returns ``None``, which is how OpenCV
            reports a file that is missing or cannot be decoded.
    """
    img = cv2.imread(image_path)
    if img is None:
        # Without this check the failure only shows up as an opaque error
        # inside cv2.resize, with no hint of which file was at fault.
        raise ValueError(f"Could not read image: {image_path!r}")
    img = cv2.resize(img, size)
    return img.flatten()

In [None]:
import os

def load_data(base_path):
    """Build the labelled dataset from a directory of per-class folders.

    Every sub-directory of ``base_path`` is treated as one class: its name is
    the label, and each entry inside it is passed to ``extract_features``.
    Entries that fail to load are skipped; one summary warning reports how
    many were skipped and why the first one failed.

    Args:
        base_path: Directory containing one sub-directory per class.

    Returns:
        A list of ``(features, label)`` tuples, where ``features`` is
        ``extract_features(path).tolist()`` and ``label`` is the
        sub-directory name. Items are ordered by label, then by file name.
    """
    import warnings  # local import so this cell stays self-contained

    data = []
    skipped = []
    # Sort both levels: os.listdir order is arbitrary, and a stable order
    # keeps downstream steps repeatable.
    for label in sorted(os.listdir(base_path)):
        folder_path = os.path.join(base_path, label)
        if not os.path.isdir(folder_path):
            # A stray file beside the class folders would make the inner
            # os.listdir raise NotADirectoryError.
            continue
        for file in sorted(os.listdir(folder_path)):
            file_path = os.path.join(folder_path, file)
            try:
                features = extract_features(file_path).tolist()
            except Exception as exc:
                # Best effort: one bad file must not abort the whole load,
                # but it is recorded instead of being silently dropped.
                skipped.append((file_path, exc))
                continue
            data.append((features, label))

    if skipped:
        first_path, first_exc = skipped[0]
        warnings.warn(
            f"load_data skipped {len(skipped)} unreadable file(s); "
            f"first: {first_path} ({first_exc!r})",
            stacklevel=2,
        )
    return data


In [None]:
# Root folder of the dataset: one sub-folder per class label.
DATASET_DIR = "/content/dataset"

data = load_data(DATASET_DIR)

# load_data skips files it cannot read, so it can return an empty list.
# Fail here with a clear message rather than with a less obvious error in a
# later Spark cell.
if not data:
    raise ValueError(f"No images could be loaded from {DATASET_DIR!r}")

In [None]:
# Hand the in-memory list of (features, label) tuples to Spark as an RDD.
sc = spark.sparkContext
rdd = sc.parallelize(data)  # distributes the records so Spark can process them in parallel



In [None]:
from pyspark.ml.linalg import Vectors
from pyspark.sql import Row

# Turn each (pixel_list, label) pair into a Row holding a dense ML vector
# and the class name, then let Spark infer the DataFrame schema from the Rows.
rows = rdd.map(
    lambda sample: Row(features=Vectors.dense(sample[0]), label=sample[1])
)
df = rows.toDF()


In [None]:
from pyspark.ml.feature import StringIndexer

# Encode the string class names as numeric indices for the classifier.
indexer = StringIndexer(inputCol="label", outputCol="label_index")

# Drop any "label_index" left over from an earlier run of this cell: the
# indexer refuses to write to an output column that already exists, so
# without this the cell could not be re-run. drop() is a no-op when the
# column is absent.
df_unindexed = df.drop("label_index")
df = indexer.fit(df_unindexed).transform(df_unindexed)

In [None]:
# 80/20 train/test split. The seed is fixed so the split, and therefore the
# reported accuracy, does not change from run to run.
train_data, test_data = df.randomSplit([0.8, 0.2], seed=42)

In [None]:
from pyspark.ml.classification import LogisticRegression

# Logistic regression on the flattened pixel vectors, trained against the
# numeric label produced by the StringIndexer.
lr = LogisticRegression(
    labelCol="label_index",
    featuresCol="features",
)
model = lr.fit(train_data)

In [None]:
# Score the held-out rows with the fitted model.
predictions = model.transform(test_data)

# Preview the class name, its numeric index and the predicted index.
preview_columns = ["label", "label_index", "prediction"]
predictions.select(*preview_columns).show(10)

+-----+-----------+----------+
|label|label_index|prediction|
+-----+-----------+----------+
|  dog|        1.0|       1.0|
|  cat|        0.0|       1.0|
+-----+-----------+----------+



In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy on the test split: compares the "prediction" column with the
# true "label_index" column.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label_index",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)

print(f"Accuracy: {accuracy:.2f}")


Accuracy: 0.50
