In [None]:
import pandas as pd
import pickle
from pyspark import ml
from pyspark.ml.feature import VectorAssembler

# pd.set_option('display.max_colwidth', None)

In [None]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session that backs every DataFrame in this notebook.
session_builder = SparkSession.builder.appName('ml-anom')
spark = session_builder.getOrCreate()

# Path of the pre-processed log feature file (CSV with a header row).
processed_input = "/data/log_data.csv"

# No schema is supplied here, so printSchema() is used to show how the columns
# were typed on load before the explicit numeric casts applied further down.
input_df = spark.read.csv(processed_input, header='true')
input_df.printSchema()

In [None]:
# input_df.show()

In [None]:
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import col

# Columns that must be numeric for modelling: the feature columns plus the
# "class" target. Each one is re-cast to double, in this order.
numeric_columns = [
    "is_private",
    "is_root",
    "is_failure",
    "time_since_last_failure_of_same_type",
    "failure_count_in_last_15_mins",
    "failure_count_in_last_30_mins",
    "failure_count_in_last_60_mins",
    "label_auth_failure",
    "label_break_in_attempt",
    "label_connection_closed",
    "label_disconnect",
    "label_failed_password",
    "label_invalid_user",
    "label_no_label",
    "label_no_identification",
    "class",
]

# Always start from input_df so re-running this cell rebuilds df2 from scratch.
df2 = input_df
for column_name in numeric_columns:
    # withColumn with an existing name replaces that column in place.
    df2 = df2.withColumn(column_name, col(column_name).cast(DoubleType()))
# df2.show()


In [None]:
# Columns that are not used as model features or as the target.
unused_columns = ["timestamp", "process_id", "username", "ip", "time_since_last_failure"]
df2 = df2.drop(*unused_columns)
# df2.show()

In [None]:
from pyspark.ml.feature import VectorAssembler

# Feature columns fed to the classifier, in the order they are packed into the
# assembled vector.
vecCols = ['is_private', 'is_root', 'is_failure', 'time_since_last_failure_of_same_type', 'failure_count_in_last_15_mins',
       'failure_count_in_last_30_mins', 'failure_count_in_last_60_mins','label_auth_failure', 'label_break_in_attempt',
       'label_connection_closed', 'label_disconnect', 'label_failed_password',
       'label_invalid_user', 'label_no_label', 'label_no_identification']
assembler = VectorAssembler(inputCols=vecCols, outputCol="vectors")

# Drop any "vectors" column left behind by a previous run of this cell before
# transforming: transform() fails when the output column already exists, while
# drop() is a no-op when the column is absent. This keeps the cell re-runnable
# without changing the result of a first run.
df2 = assembler.transform(df2.drop("vectors"))

In [None]:
# 70 % of the rows go to training, 30 % to evaluation; the fixed seed pins the split.
split_weights = [0.7, 0.3]
train, test = df2.randomSplit(split_weights, seed=2018)

In [None]:
from pyspark.ml.classification import RandomForestClassifier

# Random forest over the assembled "vectors" column, predicting "class".
# The seed is set explicitly (same value as the train/test split) so that the
# forest's random sampling, and therefore the fitted model and its metrics,
# can be reproduced after a kernel restart instead of depending on the
# library's default seed.
model = RandomForestClassifier(featuresCol='vectors', labelCol='class', seed=2018)

# `model` is the unfitted estimator; `ranF` is the fitted model.
ranF = model.fit(train)

In [None]:
# Persist the fitted forest with Spark ML's own writer (not pickle).
# overwrite() replaces whatever a previous run saved at this path.
# NOTE(review): despite the ".pkl" suffix this is not a pickle file — confirm
# whether the name should be kept.
model_writer = ranF.write().overwrite()
model_writer.save('ml_py_model.pkl')

In [None]:
from pyspark.ml.classification import RandomForestClassificationModel

# Reload the persisted forest from the path it was saved to above.
# Note: from here on `model` refers to the loaded, fitted model rather than the
# unfitted RandomForestClassifier estimator.
saved_model_path = 'ml_py_model.pkl'
model = RandomForestClassificationModel.load(saved_model_path)

# Score the held-out rows with the reloaded model.
pred = model.transform(test)

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# metricName is set explicitly: the evaluator's default metric is F1, so
# without it the value printed below as "Accuracy" would be the F1 score.
# NOTE(review): `eval` shadows the Python builtin of the same name; the name is
# kept so that any other cell referring to it keeps working.
eval = MulticlassClassificationEvaluator(
    labelCol="class",
    predictionCol="prediction",
    metricName="accuracy",
)

# Fraction of held-out rows whose predicted class equals the true "class".
accuracy = eval.evaluate(pred)
print("Accuracy = %s" % (accuracy))