In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('find_hacker')
    .getOrCreate()
)

In [3]:
# Load the session log CSV: the first row holds the column names and
# column types are inferred from the data.
df = spark.read.csv(
    'hack_data.csv',
    header=True,
    inferSchema=True,
)

In [4]:
from pyspark.ml.clustering import KMeans

In [5]:
# Preview the data using show()'s defaults spelled out explicitly:
# the first 20 rows, with long cell values truncated.
df.show(n=20, truncate=True)

+-----------------------+-----------------+---------------+-----------------+---------------+--------------------+----------------+
|Session_Connection_Time|Bytes Transferred|Kali_Trace_Used|Servers_Corrupted|Pages_Corrupted|            Location|WPM_Typing_Speed|
+-----------------------+-----------------+---------------+-----------------+---------------+--------------------+----------------+
|                    8.0|           391.09|              1|             2.96|            7.0|            Slovenia|           72.37|
|                   20.0|           720.99|              0|             3.04|            9.0|British Virgin Is...|           69.08|
|                   31.0|           356.32|              1|             3.71|            8.0|             Tokelau|           70.58|
|                    2.0|           228.08|              1|             2.48|            8.0|             Bolivia|            70.8|
|                   20.0|            408.5|              0|             3.57

In [6]:
# Summary statistics (count, mean, stddev, min, max) for every column.
summary_stats = df.describe()
summary_stats.show()

+-------+-----------------------+------------------+------------------+-----------------+------------------+-----------+------------------+
|summary|Session_Connection_Time| Bytes Transferred|   Kali_Trace_Used|Servers_Corrupted|   Pages_Corrupted|   Location|  WPM_Typing_Speed|
+-------+-----------------------+------------------+------------------+-----------------+------------------+-----------+------------------+
|  count|                    334|               334|               334|              334|               334|        334|               334|
|   mean|     30.008982035928145| 607.2452694610777|0.5119760479041916|5.258502994011977|10.838323353293413|       null|57.342395209580864|
| stddev|     14.088200614636158|286.33593163576757|0.5006065264451406| 2.30190693339697|  3.06352633036022|       null| 13.41106336843464|
|    min|                    1.0|              10.0|                 0|              1.0|               6.0|Afghanistan|              40.0|
|    max|           

In [7]:
# List the column names so the feature columns can be picked for the assembler.
df.columns

['Session_Connection_Time',
 'Bytes Transferred',
 'Kali_Trace_Used',
 'Servers_Corrupted',
 'Pages_Corrupted',
 'Location',
 'WPM_Typing_Speed']

In [8]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [9]:
# Columns combined into the single 'features' vector used for clustering.
# Every column of df is included except 'Location'.
feature_columns = [
    'Session_Connection_Time',
    'Bytes Transferred',
    'Kali_Trace_Used',
    'Servers_Corrupted',
    'Pages_Corrupted',
    'WPM_Typing_Speed',
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

In [10]:
# Run the assembler over df; its configured output column is 'features'.
final_data = assembler.transform(df)

In [11]:
from pyspark.ml.feature import StandardScaler

In [19]:
# Scaler that reads 'features' and writes 'scaledFeatures'.
# withStd=True scales each feature to unit standard deviation;
# withMean=False leaves the values uncentered.
scaler = StandardScaler(
    inputCol='features',
    outputCol='scaledFeatures',
    withStd=True,
    withMean=False,
)

In [20]:
# Fit the StandardScaler on the assembled data; fitting computes the summary
# statistics the scaler needs before it can transform anything.
scalerModel = scaler.fit(final_data)

In [21]:
# Apply the fitted scaler so each feature is normalized to unit standard
# deviation; the result is the dataset used for clustering.
cluster_final_data = scalerModel.transform(final_data)

In [22]:
# The open question is whether 2 or 3 hackers were involved, so build one
# K-means estimator per candidate cluster count.
# K-means starts from randomly chosen centroids; fixing the seed keeps the
# cluster assignments and costs repeatable across kernel restarts.
KMEANS_SEED = 1
kmeans2 = KMeans(featuresCol='scaledFeatures', k=2, seed=KMEANS_SEED)
kmeans3 = KMeans(featuresCol='scaledFeatures', k=3, seed=KMEANS_SEED)

In [23]:
# Train one model per candidate cluster count on the scaled data
# (k=2 first, then k=3).
fitted_models = [
    estimator.fit(cluster_final_data)
    for estimator in (kmeans2, kmeans3)
]
model_k2, model_k3 = fitted_models

In [24]:
def _within_set_sse(model, dataset):
    """Return the within-set sum of squared errors for a fitted K-means model.

    ``KMeansModel.computeCost`` was deprecated in Spark 2.4 and removed in
    Spark 3.0. When the method still exists it is used, so results on older
    Spark versions are unchanged. Otherwise the cost recorded in the model's
    training summary is returned.

    Args:
        model: A freshly fitted KMeansModel (it must still carry its training
            summary for the fallback path).
        dataset: The DataFrame to evaluate. The fallback path ignores it and
            reports the cost on the training data instead, so pass the same
            DataFrame the model was fitted on.

    Returns:
        The sum of squared distances of points to their nearest cluster center.
    """
    if hasattr(model, 'computeCost'):
        return model.computeCost(dataset)
    # NOTE(review): trainingCost is computed during training, so it may differ
    # marginally from a fresh computeCost pass if the fit stopped before full
    # convergence -- confirm if exact parity matters.
    return model.summary.trainingCost


# Both models were fitted on cluster_final_data, so the training cost and the
# cost evaluated on this dataset describe the same points.
wssse_k2 = _within_set_sse(model_k2, cluster_final_data)
wssse_k3 = _within_set_sse(model_k3, cluster_final_data)

In [25]:
# Report the within-set sum of squared errors for both candidate values of k,
# separated by a 60-character dashed rule.
report_lines = [
    'with k=2',
    'within set sum of squred errors = ' + str(wssse_k2),
    '--'*30,
    'with k=3',
    'within set sum of squred errors = ' + str(wssse_k3),
]
print('\n'.join(report_lines))

with k=2
within set sum of squred errors = 601.7707512676716
------------------------------------------------------------
with k=3
within set sum of squred errors = 434.75507308487647


In [26]:
# Is it 2 or 3 hackers? Count how many sessions fall into each cluster when k=2.
predictions_k2 = model_k2.transform(cluster_final_data)
predictions_k2.groupBy('prediction').count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  167|
|         0|  167|
+----------+-----+



In [27]:
# Count how many sessions fall into each cluster when k=3, for comparison with k=2.
predictions_k3 = model_k3.transform(cluster_final_data)
predictions_k3.groupBy('prediction').count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1|   79|
|         2|   88|
|         0|  167|
+----------+-----+

