In [1]:
import requests
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.clustering import KMeans
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DoubleType
from pyspark.ml.linalg import DenseVector
import numpy as np
import os

# Obtain the Spark session that backs every DataFrame in this notebook.
spark = SparkSession.builder.appName("AnomalyDetectionKMeans").getOrCreate()

# Remote location of the Iris dataset and the local scratch file Spark reads it from.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
local_path = "/tmp/iris.data"

# Bound the request so a stalled connection cannot hang the notebook, and fail
# loudly on an HTTP error instead of saving an error page as if it were the CSV.
response = requests.get(url, timeout=30)
response.raise_for_status()
with open(local_path, 'wb') as f:
    f.write(response.content)


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/16 10:45:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Names for the positional columns (_c0 .. _c4) of the header-less CSV, in file order.
column_names = ["sepal_length", "sepal_width", "petal_length", "petal_width", "class"]

# Read the downloaded file, letting Spark infer column types.
data = spark.read.csv(local_path, header=False, inferSchema=True)

# Replace each positional name with its descriptive one.
for position, new_name in enumerate(column_names):
    data = data.withColumnRenamed(f"_c{position}", new_name)

# Drop rows with missing values before building feature vectors.
data = data.dropna()

# Pack the four numeric measurements into a single vector column named "features".
assembler = (
    VectorAssembler()
    .setInputCols(["sepal_length", "sepal_width", "petal_length", "petal_width"])
    .setOutputCol("features")
)
assembled_data = assembler.transform(data)

# Fit the scaler on the assembled vectors, then add the "scaled_features" column.
scaler = StandardScaler().setInputCol("features").setOutputCol("scaled_features")
scaler_model = scaler.fit(assembled_data)
scaled_data = scaler_model.transform(assembled_data)


In [3]:
# Number of clusters to fit.
k = 3

# Cluster on the scaled vectors with a fixed seed; the assigned cluster index
# is written to the "cluster" column.
kmeans = KMeans(
    k=k,
    seed=1,
    featuresCol="scaled_features",
    predictionCol="cluster",
)
model = kmeans.fit(scaled_data)
clustered_data = model.transform(scaled_data)

# Fitted centroids, indexed by cluster id; used later to measure each point's distance.
centers = model.clusterCenters()


24/09/16 10:45:54 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS


In [4]:
def euclidean_distance(point, center):
    """Return the Euclidean (L2) distance between two equal-length vectors.

    Args:
        point: Sequence or array of numeric coordinates.
        center: Sequence or array of numeric coordinates to measure against.

    Returns:
        The distance as a built-in ``float``.
    """
    # Convert both inputs to arrays so the difference is taken element-wise.
    offset = np.subtract(np.array(point), np.array(center))
    return float(np.linalg.norm(offset))

def _distance_to_assigned_center(point, cluster):
    """Distance from a row's vector to the centroid of the cluster it was assigned to."""
    return euclidean_distance(point, centers[cluster])


# Wrap the helper as a Spark UDF that yields a double per row.
distance_udf = udf(_distance_to_assigned_center, DoubleType())

# Add a "distance" column computed from each row's scaled vector and cluster id.
distance_data = clustered_data.withColumn(
    "distance",
    distance_udf(col("scaled_features"), col("cluster")),
)

# Rows lying farther than this from their centroid (measured on the scaled
# features) are reported as anomalies.
threshold = 1.0
anomalies = distance_data.where(distance_data["distance"] > threshold)


In [5]:
from pyspark.sql.functions import avg

# Mean distance to the centroid for each cluster, exposed as "avg_distance".
avg_distance = (
    distance_data
    .groupBy("cluster")
    .agg(avg("distance").alias("avg_distance"))
)


In [6]:
# Result tables paired with their headings, shown in this fixed order.
reports = [
    ("Clustered Data:", clustered_data),
    ("Anomalies Detected:", anomalies),
    ("Average Distance of Points from Cluster Centers:", avg_distance),
]
for heading, frame in reports:
    print(heading)
    frame.show()

# List every fitted centroid alongside its cluster index.
print("Cluster Centers:")
for idx in range(len(centers)):
    print(f"Cluster {idx}: {centers[idx]}")

# Clean up: delete the downloaded file, then shut the Spark session down.
os.remove(local_path)

spark.stop()


Clustered Data:
+------------+-----------+------------+-----------+-----------+-----------------+--------------------+-------+
|sepal_length|sepal_width|petal_length|petal_width|      class|         features|     scaled_features|cluster|
+------------+-----------+------------+-----------+-----------+-----------------+--------------------+-------+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|[6.15892840883878...|      1|
|         4.9|        3.0|         1.4|        0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|[5.9174018045706,...|      1|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|[5.67587520030241...|      1|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|[5.55511189816831...|      1|
|         5.0|        3.6|         1.4|        0.2|Iris-setosa|[5.0,3.6,1.4,0.2]|[6.03816510670469...|      1|
|         5.4|        3.9|         1.7|        0.4|Iris-setosa|[5.4,3.9,1.7,0.4]|[6.521218315241



+------------+-----------+------------+-----------+---------------+-----------------+--------------------+-------+------------------+
|sepal_length|sepal_width|petal_length|petal_width|          class|         features|     scaled_features|cluster|          distance|
+------------+-----------+------------+-----------+---------------+-----------------+--------------------+-------+------------------+
|         4.9|        3.0|         1.4|        0.2|    Iris-setosa|[4.9,3.0,1.4,0.2]|[5.9174018045706,...|      1| 1.028559608477677|
|         5.4|        3.9|         1.7|        0.4|    Iris-setosa|[5.4,3.9,1.7,0.4]|[6.52121831524107...|      1| 1.181517372451085|
|         4.4|        2.9|         1.4|        0.2|    Iris-setosa|[4.4,2.9,1.4,0.2]|[5.31358529390013...|      1|1.4540670850199389|
|         4.8|        3.0|         1.4|        0.1|    Iris-setosa|[4.8,3.0,1.4,0.1]|[5.79663850243650...|      1| 1.066927932824046|
|         4.3|        3.0|         1.1|        0.1|    Iris-se