In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create the Spark session named 'clustering', or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('clustering')
    .getOrCreate()
)

In [3]:
import os

In [4]:
# Path of the seeds CSV, relative to the current directory.
input_file = os.path.join(os.curdir, 'data', 'seeds_dataset.csv')

In [5]:
# Load the seeds CSV: the first line is the header, column types are inferred.
seeds_data = (
    spark.read
    .options(inferSchema=True, header=True)
    .csv(input_file)
)

In [7]:
# Print the column names and the types Spark inferred from the CSV.
seeds_data.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)



In [8]:
# Peek at the first row as a plain {column: value} dict.
seeds_data.head().asDict()

{'area': 15.26,
 'perimeter': 14.84,
 'compactness': 0.871,
 'length_of_kernel': 5.763,
 'width_of_kernel': 3.312,
 'asymmetry_coefficient': 2.221,
 'length_of_groove': 5.22}

In [11]:
# List the column names; these are the inputs handed to the VectorAssembler below.
seeds_data.columns

['area',
 'perimeter',
 'compactness',
 'length_of_kernel',
 'width_of_kernel',
 'asymmetry_coefficient',
 'length_of_groove']

In [9]:
from pyspark.ml.clustering import KMeans

In [10]:
from pyspark.ml.feature import VectorAssembler

In [12]:
# Pack all seven numeric seed measurements into one 'features' vector column.
assembler = VectorAssembler(
    outputCol='features',
    inputCols=[
        'area',
        'perimeter',
        'compactness',
        'length_of_kernel',
        'width_of_kernel',
        'asymmetry_coefficient',
        'length_of_groove',
    ],
)

In [13]:
# Apply the assembler: the result keeps the original columns and adds 'features'.
final_data = assembler.transform(seeds_data)

In [14]:
# Check the first row to confirm the 'features' vector was added.
final_data.head().asDict()

{'area': 15.26,
 'perimeter': 14.84,
 'compactness': 0.871,
 'length_of_kernel': 5.763,
 'width_of_kernel': 3.312,
 'asymmetry_coefficient': 2.221,
 'length_of_groove': 5.22,
 'features': DenseVector([15.26, 14.84, 0.871, 5.763, 3.312, 2.221, 5.22])}

In [15]:
from pyspark.ml.feature import StandardScaler

In [16]:
# Scaler (an estimator, fitted later) that reads 'features' and writes
# 'scaledFeatures'; all other settings are left at their defaults.
scaler = StandardScaler(outputCol='scaledFeatures', inputCol='features')

In [17]:
# Fit the scaler on the assembled seeds data, then apply it to the same data.
final_scaled_data = (
    scaler
    .fit(final_data)
    .transform(final_data)
)

In [18]:
# Check the first row: it should now carry both 'features' and 'scaledFeatures'.
final_scaled_data.head().asDict()

{'area': 15.26,
 'perimeter': 14.84,
 'compactness': 0.871,
 'length_of_kernel': 5.763,
 'width_of_kernel': 3.312,
 'asymmetry_coefficient': 2.221,
 'length_of_groove': 5.22,
 'features': DenseVector([15.26, 14.84, 0.871, 5.763, 3.312, 2.221, 5.22]),
 'scaledFeatures': DenseVector([5.2445, 11.3633, 36.8608, 13.0072, 8.7685, 1.4772, 10.621])}

In [19]:
# K-means estimator for the seeds data: 3 clusters over the scaled feature vector.
# The seed is pinned because k-means initialisation is random; without an
# explicit seed the centers, training cost, and cluster ids can differ between
# kernel sessions, so the notebook would not reproduce on Restart & Run All.
k_mean = KMeans(featuresCol='scaledFeatures',
                k=3,
                seed=42)

In [32]:
# Fit the k=3 k-means estimator on the scaled seeds data.
model = k_mean.fit(final_scaled_data)

In [33]:
# Heading, a blank line, then the fitted cluster centers (in scaled feature space).
print('Cluster Centers', model.clusterCenters(), sep='\n\n')

Cluster Centers

[array([ 4.88329439, 10.89351922, 37.27768719, 12.35125928,  8.56438491,
        1.80870857, 10.32847336]), array([ 6.31670546, 12.37109759, 37.39491396, 13.91155062,  9.748067  ,
        2.39849968, 12.2661748 ]), array([ 4.06133795, 10.13721767, 35.82681204, 11.81771972,  7.5087187 ,
        3.25852121, 10.4215732 ])]


In [38]:
# Heading, a blank line, then the training cost reported by the model summary.
print('Within Set Sum Squared Error', model.summary.trainingCost, sep='\n\n')

Within Set Sum Squared Error

429.0286883592525


In [39]:
# Label every row with its cluster (adds a 'prediction' column) and show the first rows.
model.transform(final_scaled_data).show()

+-----+---------+-----------+------------------+------------------+---------------------+------------------+--------------------+--------------------+----------+
| area|perimeter|compactness|  length_of_kernel|   width_of_kernel|asymmetry_coefficient|  length_of_groove|            features|      scaledFeatures|prediction|
+-----+---------+-----------+------------------+------------------+---------------------+------------------+--------------------+--------------------+----------+
|15.26|    14.84|      0.871|             5.763|             3.312|                2.221|              5.22|[15.26,14.84,0.87...|[5.24452795332028...|         0|
|14.88|    14.57|     0.8811| 5.553999999999999|             3.333|                1.018|             4.956|[14.88,14.57,0.88...|[5.11393027165175...|         0|
|14.29|    14.09|      0.905|             5.291|3.3369999999999997|                2.699|             4.825|[14.29,14.09,0.90...|[4.91116018695588...|         0|
|13.84|    13.94|     0.8955

In [40]:
# Same labelling as above, showing only the assigned cluster id per row.
model.transform(final_scaled_data).select('prediction').show()

+----------+
|prediction|
+----------+
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         1|
|         1|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         2|
+----------+
only showing top 20 rows



In [41]:
# Path of the hack-session CSV, relative to the current directory.
hack_data_file = os.path.join(os.curdir, 'data', 'hack_data.csv')

In [43]:
# Load the hack-session CSV: the first line is the header, column types are inferred.
hack_data = (
    spark.read
    .options(inferSchema=True, header=True)
    .csv(hack_data_file)
)

In [44]:
# Print the column names and the types Spark inferred from the CSV.
hack_data.printSchema()

root
 |-- Session_Connection_Time: double (nullable = true)
 |-- Bytes Transferred: double (nullable = true)
 |-- Kali_Trace_Used: integer (nullable = true)
 |-- Servers_Corrupted: double (nullable = true)
 |-- Pages_Corrupted: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- WPM_Typing_Speed: double (nullable = true)



In [46]:
# Peek at the first row as a plain {column: value} dict.
hack_data.head().asDict()

{'Session_Connection_Time': 8.0,
 'Bytes Transferred': 391.09,
 'Kali_Trace_Used': 1,
 'Servers_Corrupted': 2.96,
 'Pages_Corrupted': 7.0,
 'Location': 'Slovenia',
 'WPM_Typing_Speed': 72.37}

In [47]:
# List the column names before choosing which ones feed the feature vector.
hack_data.columns

['Session_Connection_Time',
 'Bytes Transferred',
 'Kali_Trace_Used',
 'Servers_Corrupted',
 'Pages_Corrupted',
 'Location',
 'WPM_Typing_Speed']

In [50]:
# Pack the numeric hack-session columns into one 'features' vector.
# 'Location' (the only string column) is not included.
hack_assembler = VectorAssembler(
    outputCol='features',
    inputCols=[
        'Session_Connection_Time',
        'Bytes Transferred',
        'Kali_Trace_Used',
        'Servers_Corrupted',
        'Pages_Corrupted',
        'WPM_Typing_Speed',
    ],
)

In [51]:
# Apply the hack-data assembler: keeps the original columns and adds 'features'.
final_assembled_data = hack_assembler.transform(hack_data)

In [52]:
# Re-fit the same StandardScaler estimator on the hack data and apply it.
# NOTE(review): this rebinds final_scaled_data, which earlier held the scaled
# seeds data. Re-running the earlier seeds cells (e.g. k_mean.fit) after this
# cell would silently operate on the hack data instead.
final_scaled_data = (
    scaler
    .fit(final_assembled_data)
    .transform(final_assembled_data)
)

In [53]:
# Check the first hack-data row: it should carry 'features' and 'scaledFeatures'.
final_scaled_data.head().asDict()

{'Session_Connection_Time': 8.0,
 'Bytes Transferred': 391.09,
 'Kali_Trace_Used': 1,
 'Servers_Corrupted': 2.96,
 'Pages_Corrupted': 7.0,
 'Location': 'Slovenia',
 'WPM_Typing_Speed': 72.37,
 'features': DenseVector([8.0, 391.09, 1.0, 2.96, 7.0, 72.37]),
 'scaledFeatures': DenseVector([0.5679, 1.3658, 1.9976, 1.2859, 2.2849, 5.3963])}

In [55]:
# Candidate model for the hack data: 3 clusters over the scaled feature vector.
# The seed is pinned because k-means initialisation is random; without it the
# training cost and cluster sizes compared below can change between kernel
# sessions.
k_mean_one = KMeans(featuresCol='scaledFeatures',
                    k=3,
                    seed=42)

In [56]:
# Candidate model for the hack data: 2 clusters over the scaled feature vector.
# The seed is pinned because k-means initialisation is random; without it the
# training cost and cluster sizes compared below can change between kernel
# sessions.
k_mean_two = KMeans(featuresCol='scaledFeatures',
                    k=2,
                    seed=42)

In [57]:
# Fit the k=3 candidate on the scaled hack data.
model_one = k_mean_one.fit(final_scaled_data)

In [58]:
# Fit the k=2 candidate on the same scaled hack data.
model_two = k_mean_two.fit(final_scaled_data)

In [59]:
# Training cost of the k=3 model (the same summary metric printed for the seeds model).
print(model_one.summary.trainingCost)

434.1492898715821


In [60]:
# Training cost of the k=2 model, for comparison with the k=3 value above.
print(model_two.summary.trainingCost)

601.7707512676691


In [63]:
# Cluster sizes under the k=3 model: label every row, then count rows per cluster id.
(
    model_one
    .transform(final_scaled_data)
    .groupBy('prediction')
    .count()
    .show()
)

+----------+-----+
|prediction|count|
+----------+-----+
|         1|   83|
|         2|   84|
|         0|  167|
+----------+-----+



In [64]:
# Cluster sizes under the k=2 model: label every row, then count rows per cluster id.
(
    model_two
    .transform(final_scaled_data)
    .groupBy('prediction')
    .count()
    .show()
)

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  167|
|         0|  167|
+----------+-----+

