In [1]:
# Goal: build a model that analyses data about a hacking attack, and use insights from that analysis to estimate the number of hacking groups involved.
# The dataset contains 334 attack instances, with the following information for each one:

#Session_Connection_Time (How long the session lasted in minutes)
#Bytes Transferred (Megabytes transferred during session)
#Kali_Trace_Used (Whether the hacker was using Kali Linux)
#Servers_Corrupted (Number of servers corrupted during the attack)
#Pages_Corrupted (Number of pages illegally accessed)
#Location (Location attack came from)
#WPM_Typing_Speed (Estimated typing speed based on session logs)

# Ref: https://medium.com/tensorist/using-k-means-to-analyse-hacking-attacks-81957c492c93

In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StandardScaler
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans

In [3]:
# Create (or reuse) the Spark session for this analysis.
spark = (
    SparkSession.builder
    .appName('hack-clustering')
    .getOrCreate()
)

# Read the attack log: the first row holds the column names and types are inferred.
data = spark.read.csv(
    '/FileStore/tables/hack_data.csv',
    header=True,
    inferSchema=True,
)

In [4]:
# Show the column names and the types Spark inferred from the CSV.
data.printSchema()


In [5]:
# Columns used as clustering features; 'Location' is intentionally not included.
cols = [
    'Session_Connection_Time',
    'Bytes Transferred',
    'Kali_Trace_Used',
    'Servers_Corrupted',
    'Pages_Corrupted',
    'WPM_Typing_Speed',
]

In [6]:
# Pack the selected input columns into a single vector column named 'features'.
assembler = VectorAssembler(
    inputCols=cols,
    outputCol='features',
)


In [7]:
# Apply the assembler so each row gains the 'features' vector column.
assembled_data = assembler.transform(data)


In [8]:
# Standardise the feature vectors so that no single attribute dominates the
# distances used by k-means. withStd / withMean are spelled out at their default
# values: scale to unit standard deviation, without centring.
scaler = StandardScaler(
    inputCol='features',
    outputCol='scaledFeatures',
    withStd=True,
    withMean=False,
)

In [9]:
# Fit the scaler on the assembled vectors, then add the 'scaledFeatures' column.
scaler_model = scaler.fit(assembled_data)
scaled_data = scaler_model.transform(assembled_data)
# Print the schema to confirm the new scaled vector column is present.
scaled_data.printSchema()

In [10]:
# Build cluster models
# To tackle the question of whether there were two hackers or three, create two
# k-means estimators on the standardised features: one with two clusters (k = 2)
# and one with three clusters (k = 3).
#
# A fixed seed is passed to both estimators: k-means initialisation is randomised,
# and without an explicit seed the cluster assignments (and the per-cluster counts
# the conclusion relies on) may differ between kernel restarts.
KMEANS_SEED = 42

k_means_2 = KMeans(featuresCol='scaledFeatures', k=2, seed=KMEANS_SEED)
k_means_3 = KMeans(featuresCol='scaledFeatures', k=3, seed=KMEANS_SEED)

In [11]:
# Fit both k-means models on the scaled data

In [12]:
# Train each estimator on the same standardised data so the k = 2 and k = 3
# clusterings can be compared directly.
model_k2 = k_means_2.fit(scaled_data)
model_k3 = k_means_3.fit(scaled_data)

In [13]:
# Assign every session to one of the three clusters, then tally sessions per cluster.
model_k3_data = model_k3.transform(scaled_data)
k3_cluster_sizes = model_k3_data.groupBy('prediction').count()
k3_cluster_sizes.show()

In [14]:
# Assign every session to one of the two clusters, then tally sessions per cluster.
model_k2_data = model_k2.transform(scaled_data)
k2_cluster_sizes = model_k2_data.groupBy('prediction').count()
k2_cluster_sizes.show()

In [15]:
# With k = 2, both clusters have exactly the same number of instances assigned to them, which aligns perfectly with the idea of the hackers trading off attacks.

# Therefore, it is highly likely that only two hackers were involved in the attacks.