In [46]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.clustering import KMeans

In [2]:
spark = SparkSession.builder.appName('RF_trainer').getOrCreate()

In [3]:
df_num = spark.read \
    .options(header = "true", sep=',', inferschema = "true") \
    .csv('../../data/df_num_imputed_2/*.csv').limit(10000)

In [4]:
df_num.show(3)

+------+----------------+----------------+-------------------------+-------------------------+-------------------+-----------------+------+-----------------+--------------+----------------------+-----------------+---------------------------+-------+-------+-----------+---------------+-----+---------------+--------+------------+------------------------+-------------------------+-------------------------+--------------------------------------+-------------------------------+-------------------------------+--------------------------------+--------------------------+-----------------------+-------------------------------------------------+-------------------------------------------------+-----------------------------------------------+-------------------------------------+--------------------+----------------------+----------------------------------+---------------------------+--------------------------------+--------------------------+------------------------+---------------------+-------

In [57]:
cols_sel = df_num.columns
cols_sel.remove('HasDetections')
cols_sel.remove('MachineIdentifier')

In [64]:
assembler_features = VectorAssembler(inputCols=cols_sel, outputCol='features')
train_data = assembler_features.transform(df_num)
train_data_final = train_data.select('features', 'MachineIdentifier')

In [65]:
train_data_final.persist()
train_data_final.count()

10000

In [66]:
train_data_final.show(3)

+--------------------+--------------------+
|            features|   MachineIdentifier|
+--------------------+--------------------+
|[0.0,7.0,0.0,0.0,...|99c804d47a37fee99...|
|[0.0,7.0,0.0,0.0,...|99c805687aec04861...|
|[0.0,7.0,0.0,0.0,...|99c806052bcc7295a...|
+--------------------+--------------------+
only showing top 3 rows



In [82]:
df_kmeans = train_data_final.select('MachineIdentifier')

In [67]:
k = 2

In [85]:
ks = [2, 4, 8, 16, 32]

for k in ks:
    kmeans = KMeans(predictionCol='prediction_{}'.format(k),                   
                featuresCol='features').setK(k).setSeed(1)
    
    model = kmeans.fit(train_data_final)
    df_tra = model.transform(train_data_final)
    df_kmeans = df_kmeans.join(df_tra.select('MachineIdentifier',
                                         'prediction_{}'.format(k) ),
                           ['MachineIdentifier'])

In [86]:
df_kmeans.show(6)

+--------------------+------------+------------+------------+------------+-------------+-------------+
|   MachineIdentifier|prediction_2|prediction_2|prediction_4|prediction_8|prediction_16|prediction_32|
+--------------------+------------+------------+------------+------------+-------------+-------------+
|99c804d47a37fee99...|           0|           0|           0|           6|            3|            0|
|99c805687aec04861...|           0|           0|           0|           5|            9|           16|
|99c806052bcc7295a...|           0|           0|           0|           2|           13|           13|
|99c807cca358a4da9...|           0|           0|           2|           1|            1|            4|
|99c807d05fc14d514...|           0|           0|           0|           2|           13|           13|
|99c808bf1c35d8eac...|           1|           1|           1|           3|           14|           27|
+--------------------+------------+------------+------------+------------

In [68]:
kmeans = KMeans(predictionCol='prediction_{}'.format(k),                   
                featuresCol='features').setK(k).setSeed(1)

In [69]:
model = kmeans.fit(train_data_final)

In [72]:
df_tra = model.transform(train_data_final)

In [73]:
df_tra.show(10)

+--------------------+--------------------+------------+
|            features|   MachineIdentifier|prediction_2|
+--------------------+--------------------+------------+
|[0.0,7.0,0.0,0.0,...|99c804d47a37fee99...|           0|
|[0.0,7.0,0.0,0.0,...|99c805687aec04861...|           0|
|[0.0,7.0,0.0,0.0,...|99c806052bcc7295a...|           0|
|[0.0,7.0,0.0,0.0,...|99c807cca358a4da9...|           0|
|[0.0,7.0,0.0,0.0,...|99c807d05fc14d514...|           0|
|[0.0,7.0,0.0,0.0,...|99c808bf1c35d8eac...|           1|
|[0.0,7.0,0.0,0.0,...|99c80c378c6e2a870...|           0|
|[0.0,7.0,0.0,0.0,...|99c80d365290a4967...|           1|
|[0.0,7.0,0.0,0.0,...|99c80e5410eb29910...|           0|
|[0.0,7.0,0.0,0.0,...|99c80f0c6c56a5a93...|           0|
+--------------------+--------------------+------------+
only showing top 10 rows



In [83]:
df_kmeans = df_kmeans.join(df_tra.select('MachineIdentifier',
                                         'prediction_{}'.format(k) ),
                           ['MachineIdentifier'])

In [84]:
df_kmeans.show()

+--------------------+------------+
|   MachineIdentifier|prediction_2|
+--------------------+------------+
|99c804d47a37fee99...|           0|
|99c805687aec04861...|           0|
|99c806052bcc7295a...|           0|
|99c807cca358a4da9...|           0|
|99c807d05fc14d514...|           0|
|99c808bf1c35d8eac...|           1|
|99c80c378c6e2a870...|           0|
|99c80d365290a4967...|           1|
|99c80e5410eb29910...|           0|
|99c80f0c6c56a5a93...|           0|
|99c810ef02ac5226b...|           1|
|99c812248891e0a6a...|           0|
|99c8148cc0f6bc1e0...|           0|
|99c815858aeaced1b...|           1|
|99c815b7d5c26432b...|           0|
|99c8178493698dfb0...|           1|
|99c817bbc956505bf...|           0|
|99c81ab0b3f6fb2a8...|           0|
|99c81bc07b472d7e4...|           0|
|99c81c10c1173bf60...|           0|
+--------------------+------------+
only showing top 20 rows



In [74]:
k = 4

In [78]:
kmeans4 = KMeans(predictionCol='prediction_{}'.format(k),
                featuresCol='features').setK(k).setSeed(1)

In [79]:
model4 = kmeans4.fit(train_data_final)

In [80]:
df_tra4 = model4.transform(train_data_final)

In [81]:
df_tra4.show(10)

+--------------------+--------------------+------------+
|            features|   MachineIdentifier|prediction_4|
+--------------------+--------------------+------------+
|[0.0,7.0,0.0,0.0,...|99c804d47a37fee99...|           0|
|[0.0,7.0,0.0,0.0,...|99c805687aec04861...|           0|
|[0.0,7.0,0.0,0.0,...|99c806052bcc7295a...|           0|
|[0.0,7.0,0.0,0.0,...|99c807cca358a4da9...|           2|
|[0.0,7.0,0.0,0.0,...|99c807d05fc14d514...|           0|
|[0.0,7.0,0.0,0.0,...|99c808bf1c35d8eac...|           1|
|[0.0,7.0,0.0,0.0,...|99c80c378c6e2a870...|           0|
|[0.0,7.0,0.0,0.0,...|99c80d365290a4967...|           1|
|[0.0,7.0,0.0,0.0,...|99c80e5410eb29910...|           0|
|[0.0,7.0,0.0,0.0,...|99c80f0c6c56a5a93...|           0|
+--------------------+--------------------+------------+
only showing top 10 rows



In [44]:
df_tra4.join(df_tra, ['MachineIdentifier']).show()

+--------------------+------------+------------+
|            features|prediction_4|prediction_2|
+--------------------+------------+------------+
|[0.0,7.0,0.0,0.0,...|           0|           0|
|[0.0,7.0,0.0,0.0,...|           0|           0|
|[0.0,7.0,0.0,0.0,...|           0|           0|
|[0.0,7.0,0.0,0.0,...|           2|           0|
|[0.0,7.0,0.0,0.0,...|           0|           0|
|[0.0,7.0,0.0,0.0,...|           1|           1|
|[0.0,7.0,0.0,0.0,...|           0|           0|
|[0.0,7.0,0.0,0.0,...|           1|           1|
|[0.0,7.0,0.0,0.0,...|           0|           0|
|[0.0,7.0,0.0,0.0,...|           0|           0|
|[0.0,8.0,0.0,0.0,...|           1|           1|
|[0.0,7.0,0.0,0.0,...|           0|           0|
|[0.0,7.0,0.0,0.0,...|           0|           0|
|[0.0,7.0,0.0,0.0,...|           1|           1|
|[0.0,7.0,0.0,0.0,...|           0|           0|
|[0.0,7.0,0.0,0.0,...|           1|           1|
|[0.0,7.0,0.0,0.0,...|           0|           0|
|[0.0,7.0,0.0,0.0,..