In [0]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

In [0]:
# Load the crime data CSV: the first row holds column names and column
# types are inferred from the values instead of defaulting to string.
CRIME_CSV_PATH = '/FileStore/tables/crime_data.csv'
data = spark.read.csv(CRIME_CSV_PATH, header=True, inferSchema=True)

In [0]:
# Number of rows loaded from the CSV (quick sanity check on the read).
data.count()

In [0]:
# A DataFrame is not an RDD: it is a schema-aware distributed table built on
# top of RDDs. RDD methods are reached through `data.rdd`, not on the
# DataFrame itself.
# display() renders the DataFrame as a table in the notebook
# (presumably the Databricks helper, given the /FileStore path - confirm).
display(data)

_c0,crime$cluster,Murder,Assault,Robbery,Drugs
Alabama,4,13.2,236,58,21.2
Alaska,4,10.0,263,48,44.5
Arizona,4,8.1,294,80,31.0
Arkansas,3,8.8,190,50,19.5
California,4,9.0,276,91,40.6
Colorado,3,7.9,204,78,38.7
Connecticut,2,3.3,110,77,11.1
Delaware,4,5.9,238,72,15.8
Florida,4,15.4,335,80,31.9
Georgia,3,17.4,211,60,25.8


In [0]:
# Plain-text preview of the same DataFrame (show() prints the first 20 rows
# by default); this duplicates the display(data) table above.
data.show()

In [0]:
# Keep the row label (_c0) plus the four numeric crime-rate columns;
# the pre-existing `crime$cluster` column is dropped by not selecting it.
selected_columns = ["_c0", "Murder", "Assault", "Robbery", "Drugs"]
df = data.select(*selected_columns)

In [0]:
# Pack the four numeric crime columns into a single vector column named
# "features", which is the input column the clustering step reads.
feature_columns = ["Murder", "Assault", "Robbery", "Drugs"]
assembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol="features",
)
# VectorAssembler is a pure transformer (no fit step); `fitted` is simply
# `df` with the extra "features" column appended.
fitted = assembler.transform(df)

In [0]:
# Fit k-means with 4 clusters; the fixed seed makes the centroid
# initialisation repeatable between runs.
# NOTE(review): the features are not scaled. In the rows shown, Assault is in
# the hundreds while Murder is in the single/low double digits, so Euclidean
# distance is dominated by Assault - confirm this is intended.
kmeans = KMeans(k=4, seed=1)
model = kmeans.fit(fitted)
# Appends a "prediction" column holding each row's assigned cluster index.
predictions = model.transform(fitted)

In [0]:
# Show each row with its assembled feature vector and assigned cluster
# (the "prediction" column added by model.transform).
display(predictions)

_c0,Murder,Assault,Robbery,Drugs,features,prediction
Alabama,13.2,236,58,21.2,"Map(vectorType -> dense, length -> 4, values -> List(13.2, 236.0, 58.0, 21.2))",1
Alaska,10.0,263,48,44.5,"Map(vectorType -> dense, length -> 4, values -> List(10.0, 263.0, 48.0, 44.5))",3
Arizona,8.1,294,80,31.0,"Map(vectorType -> dense, length -> 4, values -> List(8.1, 294.0, 80.0, 31.0))",3
Arkansas,8.8,190,50,19.5,"Map(vectorType -> dense, length -> 4, values -> List(8.8, 190.0, 50.0, 19.5))",1
California,9.0,276,91,40.6,"Map(vectorType -> dense, length -> 4, values -> List(9.0, 276.0, 91.0, 40.6))",3
Colorado,7.9,204,78,38.7,"Map(vectorType -> dense, length -> 4, values -> List(7.9, 204.0, 78.0, 38.7))",1
Connecticut,3.3,110,77,11.1,"Map(vectorType -> dense, length -> 4, values -> List(3.3, 110.0, 77.0, 11.1))",2
Delaware,5.9,238,72,15.8,"Map(vectorType -> dense, length -> 4, values -> List(5.9, 238.0, 72.0, 15.8))",1
Florida,15.4,335,80,31.9,"Map(vectorType -> dense, length -> 4, values -> List(15.4, 335.0, 80.0, 31.9))",3
Georgia,17.4,211,60,25.8,"Map(vectorType -> dense, length -> 4, values -> List(17.4, 211.0, 60.0, 25.8))",1


In [0]:
# Silhouette analysis: score the clustering with ClusteringEvaluator's
# defaults, which read the "features" and "prediction" columns.
# The score lies in [-1, 1]; values closer to 1 mean tighter, better
# separated clusters.
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
print(f"Silhouette with squared euclidean distance = {silhouette}")

In [0]:
# Report the fitted centroids, one per line, in the same order as the
# feature vector (Murder, Assault, Robbery, Drugs).
centers = model.clusterCenters()
# Header and centroids go through a single print call; sep="\n" puts each
# centroid on its own line, matching a per-centroid print loop.
print("Cluster Centers: ", *centers, sep="\n")