In [115]:
import pandas as pd
import pickle
from pyspark import ml
from pyspark.ml.feature import VectorAssembler

pd.set_option('display.max_colwidth', None)

In [116]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('ml-anom').getOrCreate()
processed_input = "assets/log_data.csv"

input_df = spark.read.csv(processed_input,header='true')
input_df.printSchema()

root
 |-- timestamp: string (nullable = true)
 |-- process_id: string (nullable = true)
 |-- username: string (nullable = true)
 |-- ip: string (nullable = true)
 |-- is_private: string (nullable = true)
 |-- is_root: string (nullable = true)
 |-- is_failure: string (nullable = true)
 |-- time_since_last_failure: string (nullable = true)
 |-- time_since_last_failure_of_same_type: string (nullable = true)
 |-- failure_count_in_last_15_mins: string (nullable = true)
 |-- failure_count_in_last_30_mins: string (nullable = true)
 |-- failure_count_in_last_60_mins: string (nullable = true)
 |-- label_auth_failure: string (nullable = true)
 |-- label_break_in_attempt: string (nullable = true)
 |-- label_connection_closed: string (nullable = true)
 |-- label_disconnect: string (nullable = true)
 |-- label_failed_password: string (nullable = true)
 |-- label_invalid_user: string (nullable = true)
 |-- label_no_label: string (nullable = true)
 |-- label_no_identification: string (nullable = true

In [117]:
input_df.show()

+------------+----------+---------+---------------+----------+-------+----------+-----------------------+------------------------------------+-----------------------------+-----------------------------+-----------------------------+------------------+----------------------+-----------------------+----------------+---------------------+------------------+--------------+-----------------------+-----+
|   timestamp|process_id| username|             ip|is_private|is_root|is_failure|time_since_last_failure|time_since_last_failure_of_same_type|failure_count_in_last_15_mins|failure_count_in_last_30_mins|failure_count_in_last_60_mins|label_auth_failure|label_break_in_attempt|label_connection_closed|label_disconnect|label_failed_password|label_invalid_user|label_no_label|label_no_identification|class|
+------------+----------+---------+---------------+----------+-------+----------+-----------------------+------------------------------------+-----------------------------+------------------------

In [118]:
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import col

df2 = input_df.withColumn("is_private",col("is_private").cast(DoubleType())) \
.withColumn("is_root",col("is_root").cast(DoubleType())) \
.withColumn("is_failure",col("is_failure").cast(DoubleType())) \
.withColumn("time_since_last_failure_of_same_type",col("time_since_last_failure_of_same_type").cast(DoubleType())) \
.withColumn("failure_count_in_last_15_mins",col("failure_count_in_last_15_mins").cast(DoubleType())) \
.withColumn("failure_count_in_last_30_mins",col("failure_count_in_last_30_mins").cast(DoubleType())) \
.withColumn("failure_count_in_last_60_mins",col("failure_count_in_last_60_mins").cast(DoubleType())) \
.withColumn("label_auth_failure",col("label_auth_failure").cast(DoubleType())) \
.withColumn("label_break_in_attempt",col("label_break_in_attempt").cast(DoubleType())) \
.withColumn("label_connection_closed",col("label_connection_closed").cast(DoubleType())) \
.withColumn("label_disconnect",col("label_disconnect").cast(DoubleType())) \
.withColumn("label_failed_password",col("label_failed_password").cast(DoubleType())) \
.withColumn("label_invalid_user",col("label_invalid_user").cast(DoubleType())) \
.withColumn("label_no_label",col("label_no_label").cast(DoubleType())) \
.withColumn("label_no_identification",col("label_no_identification").cast(DoubleType())) \
.withColumn("class",col("class").cast(DoubleType()))
df2.show()


+------------+----------+---------+---------------+----------+-------+----------+-----------------------+------------------------------------+-----------------------------+-----------------------------+-----------------------------+------------------+----------------------+-----------------------+----------------+---------------------+------------------+--------------+-----------------------+-----+
|   timestamp|process_id| username|             ip|is_private|is_root|is_failure|time_since_last_failure|time_since_last_failure_of_same_type|failure_count_in_last_15_mins|failure_count_in_last_30_mins|failure_count_in_last_60_mins|label_auth_failure|label_break_in_attempt|label_connection_closed|label_disconnect|label_failed_password|label_invalid_user|label_no_label|label_no_identification|class|
+------------+----------+---------+---------------+----------+-------+----------+-----------------------+------------------------------------+-----------------------------+------------------------

In [119]:
df2 = df2.drop("timestamp", "process_id", "username", "ip", "time_since_last_failure")
df2.show()

+----------+-------+----------+------------------------------------+-----------------------------+-----------------------------+-----------------------------+------------------+----------------------+-----------------------+----------------+---------------------+------------------+--------------+-----------------------+-----+
|is_private|is_root|is_failure|time_since_last_failure_of_same_type|failure_count_in_last_15_mins|failure_count_in_last_30_mins|failure_count_in_last_60_mins|label_auth_failure|label_break_in_attempt|label_connection_closed|label_disconnect|label_failed_password|label_invalid_user|label_no_label|label_no_identification|class|
+----------+-------+----------+------------------------------------+-----------------------------+-----------------------------+-----------------------------+------------------+----------------------+-----------------------+----------------+---------------------+------------------+--------------+-----------------------+-----+
|       0.0|    

In [120]:
from pyspark.ml.feature import VectorAssembler

vecCols = ['is_private', 'is_root', 'is_failure', 'time_since_last_failure_of_same_type', 'failure_count_in_last_15_mins',
       'failure_count_in_last_30_mins', 'failure_count_in_last_60_mins','label_auth_failure', 'label_break_in_attempt',
       'label_connection_closed', 'label_disconnect', 'label_failed_password',
       'label_invalid_user', 'label_no_label', 'label_no_identification']
assembler = VectorAssembler(inputCols=feature_list, outputCol="vectors")
df2 = assembler.transform(df2)

In [121]:
train, test = df2.randomSplit([0.7, 0.3], seed = 2018)

In [122]:
from pyspark.ml.classification import RandomForestClassifier

model = RandomForestClassifier(featuresCol = 'vectors', labelCol = 'class')
ranF = model.fit(train)

In [124]:
pickle.dump(ranF, open('ml_py_model.pkl', 'wb'))

TypeError: cannot pickle '_thread.RLock' object

In [125]:
pred = ranF.transform(test)

In [126]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

eval = MulticlassClassificationEvaluator(labelCol="class", predictionCol="prediction")
accuracy = eval.evaluate(pred)
print("Accuracy = %s" % (accuracy))

Accuracy = 1.0
