In [1]:
import os

import findspark

# Point findspark at the Spark installation. Prefer the SPARK_HOME environment
# variable so the notebook is not tied to one user's machine; fall back to the
# original local install path when SPARK_HOME is not set.
findspark.init(os.environ.get('SPARK_HOME', '/home/shashank/spark-2.3.2-bin-hadoop2.7'))
import pyspark

In [2]:
from pyspark.sql import SparkSession

In [3]:
# Obtain the Spark session for this notebook under the app name 'c_clust'.
spark = (
    SparkSession.builder
    .appName('c_clust')
    .getOrCreate()
)

In [4]:
# Load the CSV: the first line supplies the column names and Spark infers
# each column's type from the values.
data = spark.read.csv(
    'hack_data.csv',
    header=True,
    inferSchema=True,
)

In [5]:
# Register the DataFrame as the temp view 'data' so it can be queried with spark.sql.
data.createOrReplaceTempView(name='data')

In [7]:
# Print the column names and the types that inferSchema assigned to them.
data.printSchema()

root
 |-- Session_Connection_Time: double (nullable = true)
 |-- Bytes Transferred: double (nullable = true)
 |-- Kali_Trace_Used: integer (nullable = true)
 |-- Servers_Corrupted: double (nullable = true)
 |-- Pages_Corrupted: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- WPM_Typing_Speed: double (nullable = true)



In [8]:
# Peek at the first row to sanity-check the parsed values.
data.head()

Row(Session_Connection_Time=8.0, Bytes Transferred=391.09, Kali_Trace_Used=1, Servers_Corrupted=2.96, Pages_Corrupted=7.0, Location='Slovenia', WPM_Typing_Speed=72.37)

In [11]:
# Count the rows that have a Kali_Trace_Used value (COUNT(col) skips NULLs).
(
    spark
    .sql("FROM data SELECT COUNT(Kali_Trace_Used)")
    .show()
)

+----------------------+
|count(Kali_Trace_Used)|
+----------------------+
|                   334|
+----------------------+



In [15]:
# Keep only the numeric columns for clustering; the string column Location is
# left out. `Bytes Transferred` needs backticks because its name contains a space.
ref_data = spark.sql(
    "FROM data SELECT "
    "Session_Connection_Time, `Bytes Transferred`, Kali_Trace_Used, "
    "Servers_Corrupted, Pages_Corrupted, WPM_Typing_Speed"
)

In [17]:
from pyspark.ml.feature import VectorAssembler

In [18]:
# List the columns kept in ref_data; these are the inputs handed to the VectorAssembler.
ref_data.columns

['Session_Connection_Time',
 'Bytes Transferred',
 'Kali_Trace_Used',
 'Servers_Corrupted',
 'Pages_Corrupted',
 'WPM_Typing_Speed']

In [19]:
# Combine every column of ref_data into a single vector column called 'features'.
assembler = VectorAssembler(
    outputCol='features',
    inputCols=ref_data.columns,
)

In [20]:
# Apply the assembler: the result is ref_data plus the assembled 'features' column.
final_data = assembler.transform(dataset=ref_data)

In [21]:
# Inspect the first row to confirm the assembled 'features' vector was appended.
final_data.head()

Row(Session_Connection_Time=8.0, Bytes Transferred=391.09, Kali_Trace_Used=1, Servers_Corrupted=2.96, Pages_Corrupted=7.0, WPM_Typing_Speed=72.37, features=DenseVector([8.0, 391.09, 1.0, 2.96, 7.0, 72.37]))

In [22]:
from pyspark.ml.feature import StandardScaler

In [23]:
# Scaler that reads the 'features' vector and writes the result to 'scaledFeatures'.
scaler = StandardScaler(
    outputCol='scaledFeatures',
    inputCol='features',
)

In [24]:
# Fit the scaler on the assembled features and append the scaled vector column.
#
# final_data is reassigned from itself here, so the cell must stay re-runnable:
# on a second run final_data already carries the scaler's output column and
# fit/transform would fail because that column already exists. Dropping the
# column first avoids this; drop() is a no-op when the column is absent, so
# the first run behaves exactly as before.
unscaled_data = final_data.drop(scaler.getOutputCol())
final_data = scaler.fit(unscaled_data).transform(unscaled_data)
final_data.head()

Row(Session_Connection_Time=8.0, Bytes Transferred=391.09, Kali_Trace_Used=1, Servers_Corrupted=2.96, Pages_Corrupted=7.0, WPM_Typing_Speed=72.37, features=DenseVector([8.0, 391.09, 1.0, 2.96, 7.0, 72.37]), scaledFeatures=DenseVector([0.5679, 1.3658, 1.9976, 1.2859, 2.2849, 5.3963]))

In [25]:
from pyspark.ml.clustering import KMeans

In [30]:
# Build one KMeans estimator for k = 2 and one for k = 3. Both cluster on the
# scaled features with the same iteration cap and seed, so only k differs.
kmeans2, kmeans3 = (
    KMeans(featuresCol='scaledFeatures', k=n_clusters, maxIter=100, seed=101)
    for n_clusters in (2, 3)
)

# Fit each estimator on the same scaled data.
model2 = kmeans2.fit(final_data)
model3 = kmeans3.fit(final_data)

In [31]:
# Within Set Sum of Squared Errors: print each model's cost on the data it was
# fitted on, first for the 2-cluster model and then for the 3-cluster model.
for wsse_label, fitted_model in (
    ("WSSE - 2 Clusters", model2),
    ("WSSE - 3 Clusters", model3),
):
    print(wsse_label)
    print(fitted_model.computeCost(final_data))
    # A blank line separated the two reports in the original cell's source only;
    # the printed output has no separator, so none is emitted here.

WSSE - 2 Clusters
601.7707512676716
WSSE - 3 Clusters
434.1492898715845


In [32]:
# Assign every row to a cluster with each fitted model; the cluster id lands in
# the 'prediction' column that the queries below group by.
clusters_2, clusters_3 = (
    fitted_model.transform(final_data) for fitted_model in (model2, model3)
)

In [33]:
# Expose both clustered DataFrames to spark.sql under their own names.
for view_name, clustered_df in (
    ('clusters_2', clusters_2),
    ('clusters_3', clusters_3),
):
    clustered_df.createOrReplaceTempView(view_name)

In [35]:
# Average typing speed per cluster for the 2-cluster model.
spark.sql(
    "FROM clusters_2 "
    "SELECT prediction AS Clusters_2, AVG(WPM_Typing_Speed) "
    "GROUP BY prediction"
).show()

+----------+---------------------+
|Clusters_2|avg(WPM_Typing_Speed)|
+----------+---------------------+
|         1|     70.6329341317365|
|         0|    44.05185628742513|
+----------+---------------------+



In [36]:
# Average typing speed per cluster for the 3-cluster model.
spark.sql(
    "FROM clusters_3 "
    "SELECT prediction AS Clusters_3, AVG(WPM_Typing_Speed) "
    "GROUP BY prediction"
).show()

+----------+---------------------+
|Clusters_3|avg(WPM_Typing_Speed)|
+----------+---------------------+
|         1|    70.96493975903616|
|         2|    70.30488095238096|
|         0|    44.05185628742513|
+----------+---------------------+



There does not seem to be much difference in average typing speed between clusters 1 and 2 in the three-cluster model (about 70.96 vs. 70.30 WPM), so it is likely that there are only two sources, not three.