In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName('HACKING INVESTIGATION').getOrCreate()

In [3]:
data = spark.read.csv('hack_data.csv',inferSchema=True,header=True)

In [16]:
cols= data.columns

In [5]:
data.take(1)[0].asDict()

{'Session_Connection_Time': 8.0,
 'Bytes Transferred': 391.09,
 'Kali_Trace_Used': 1,
 'Servers_Corrupted': 2.96,
 'Pages_Corrupted': 7.0,
 'Location': 'Slovenia',
 'WPM_Typing_Speed': 72.37}

In [6]:
data.printSchema()

root
 |-- Session_Connection_Time: double (nullable = true)
 |-- Bytes Transferred: double (nullable = true)
 |-- Kali_Trace_Used: integer (nullable = true)
 |-- Servers_Corrupted: double (nullable = true)
 |-- Pages_Corrupted: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- WPM_Typing_Speed: double (nullable = true)



In [9]:
from pyspark.sql.functions import countDistinct


In [11]:
data.select(countDistinct('Location')).show()

+------------------------+
|count(DISTINCT Location)|
+------------------------+
|                     181|
+------------------------+



In [13]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler,StandardScaler
from pyspark.ml import  Pipeline

In [21]:
cols

['Session_Connection_Time',
 'Bytes Transferred',
 'Kali_Trace_Used',
 'Servers_Corrupted',
 'Pages_Corrupted',
 'WPM_Typing_Speed']

In [23]:
stages=[]
assembler = VectorAssembler(inputCols=cols,outputCol='features')
scaler = StandardScaler(inputCol=assembler.getOutputCol(),outputCol='scaled data')
stages=[assembler,scaler]

In [24]:
pipeline = Pipeline(stages=stages)
pipeline_model =pipeline.fit(data)
data= pipeline_model.transform(data)


In [18]:
from pyspark.ml.clustering import KMeans

In [25]:
final_data = data.select('scaled data')

In [38]:
kmeans3 =KMeans(featuresCol='scaled data',k=3)
kmeans2 =KMeans(featuresCol='scaled data',k=2)

In [40]:
kmeans_model3 = kmeans3.fit(final_data)
kmeans_model2= kmeans2.fit(final_data)

In [46]:
print('wsse= ',kmeans_model3.computeCost(final_data),'for k = 3')
print('wsse= ',kmeans_model2.computeCost(final_data),'for k= 2')

wsse=  434.75507308487647 for k = 3
wsse=  601.7707512676716 for k= 2


In [49]:
for i in range(2,9):
    kmeans_ =KMeans(featuresCol='scaled data',k=i)
    kmeans_model = kmeans_.fit(final_data)
    wsse = kmeans_model.computeCost(final_data)
    print('\n k={}'.format(i),wsse)
    


 k=2 601.7707512676716

 k=3 434.75507308487647

 k=4 414.2454928518

 k=5 250.99598608541095

 k=6 231.567466961589

 k=7 221.17669596570335

 k=8 217.04273469475203


In [50]:
prediction3 = kmeans_model3.transform(final_data)
prediction2 = kmeans_model2.transform(final_data)

In [32]:
prediction3.groupBy('prediction').count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  167|
|         2|   88|
|         0|   79|
+----------+-----+



In [51]:
prediction2.groupBy('prediction').count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  167|
|         0|  167|
+----------+-----+



<bound method KMeansModel.clusterCenters of KMeans_4fb99bf9a5d027002676>
