In [1]:
from pyspark.sql import SparkSession
# Build (or reuse) the Spark session under the application name 'hacker_cluster'.
session_builder = SparkSession.builder.appName('hacker_cluster')
spark = session_builder.getOrCreate()

In [2]:
# Load the CSV: the first row supplies the column names and Spark infers each column's type.
csv_path = '/FileStore/tables/hack_data.csv'
data = spark.read.csv(csv_path, header=True, inferSchema=True)

In [3]:
# Preview the first rows of the freshly loaded data (explicit defaults: 20 rows, truncated cells).
data.show(n=20, truncate=True)

In [4]:
# Discard the 'location' column so it is not part of the columns assembled into features later.
unwanted_column = 'location'
data = data.drop(unwanted_column)

In [5]:
# Preview the data again after the column drop (explicit defaults: 20 rows, truncated cells).
data.show(n=20, truncate=True)

In [6]:
from pyspark.ml.feature import VectorAssembler

In [7]:
# Pack every remaining column of `data` into a single vector column named 'features'.
feature_columns = data.columns
assembler = VectorAssembler(outputCol='features', inputCols=feature_columns)
final_data = assembler.transform(data)

In [8]:
from pyspark.ml.feature import StandardScaler

In [9]:
# Scaler that reads the 'features' vector column and writes its result to 'scaledFeatures'.
scaler = StandardScaler(outputCol='scaledFeatures', inputCol='features')

In [10]:
# Fit the scaler on the assembled data to obtain a reusable scaling model.
scaler_model = scaler.fit(dataset=final_data)

In [11]:
# Add the 'scaledFeatures' column produced by the fitted scaler.
# `final_data` is overwritten in place, so a second run of this cell would fail
# because the output column already exists. Dropping it first makes the cell
# safe to re-run; on the first run the drop is a no-op since the column is absent.
final_data = scaler_model.transform(final_data.drop('scaledFeatures'))

In [12]:
from pyspark.ml.clustering import KMeans

In [13]:
# Since we are not sure whether there were 2 or 3 hackers, we'll build 2 models.
# K-means starts from randomly chosen centroids; fixing the seed keeps the
# cluster centers and cluster sizes examined in later cells repeatable between runs.
KMEANS_SEED = 42
kmeans_2 = KMeans(featuresCol='scaledFeatures', k=2, seed=KMEANS_SEED)
kmeans_3 = KMeans(featuresCol='scaledFeatures', k=3, seed=KMEANS_SEED)

In [14]:
# Train both candidate models (k=2 first, then k=3) on the same scaled dataset.
model_2, model_3 = (estimator.fit(final_data) for estimator in (kmeans_2, kmeans_3))

In [15]:
# Within-set sum of squared errors for each model, k=2 first, then k=3.
# KMeansModel.computeCost() was deprecated in Spark 2.4 and removed in 3.0;
# the training summary reports the same cost for the dataset the model was fit on
# (here `final_data`). The summary is only available on freshly trained models,
# which is the case for both models in this notebook.
print('WSSSE')
print(model_2.summary.trainingCost)
print(model_3.summary.trainingCost)

In [16]:
centers_2 = model_2.clusterCenters()
centers_2

# The two centroids are clearly distinct, which is consistent with there being 2 hackers.

In [17]:
centers_3 = model_3.clusterCenters()
centers_3

# Here the 2nd and 3rd centroids are very close to one another, which supports the
# conclusion that there were only 2 hackers.

In [18]:
# Count the rows assigned to each cluster by the k=2 model.
# The split between the two clusters is even, which points to 2 hackers.
predictions_2 = model_2.transform(final_data)
predictions_2.groupBy('prediction').count().show()

In [19]:
# Count the rows assigned to each cluster by the k=3 model.
# The split across the three clusters is uneven, which argues against there being 3 hackers.
predictions_3 = model_3.transform(final_data)
predictions_3.groupBy('prediction').count().show()