In [1]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session for this notebook. The Kafka connector
# is requested through spark.jars.packages, and executors are capped at
# 512 MB of memory.
spark = (
    SparkSession.builder
    .appName("pyspark-kafka-streaming")
    .master("spark://spark-master:7077")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.0")
    .config("spark.executor.memory", "512m")
    .getOrCreate()
)

Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
:: loading settings :: url = jar:file:/usr/local/lib/python3.9/dist-packages/pyspark/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-4393facb-ccdd-479f-9b9c-0a00d1d6e1cf;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.0.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.0.0 in central
	found org.apache.kafka#kafka-clients;2.4.1 in central
	found com.github.luben#zstd-jni;1.4.4-3 in central
	found org.lz4#lz4-java;1.7.1 in central
	found org.xerial.snappy#snappy-java;1.1.7.5 in central
	found org.slf4j#slf4j-api;1.7.30 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found org.apache.commons#commons-pool2;2.6.2 in central
:: resolution report :: resolve 631ms :: artifacts dl 25

In [2]:
# Streaming DataFrame over the Kafka topic "topic", read from the broker at
# kafka:9093.
df_streamed_raw = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9093")
    .option("subscribe", "topic")
    .load()
)

In [3]:
from pyspark.sql.types import StringType, ArrayType, FloatType, IntegerType
from pyspark.sql.functions import *

# Cast the Kafka record key and value to strings so the payload can be
# handled as text; all other columns pass through unchanged.
df_streamed_kv = (
    df_streamed_raw
    .withColumn("key", col("key").cast(StringType()))
    .withColumn("value", col("value").cast(StringType()))
)

# Debug query: collect the decoded stream in the in-memory table
# "test_query_table" so it can be inspected with spark.sql().
# The memory sink is documented for the "append" and "complete" output modes;
# this query has no aggregation, so "append" emits exactly the new rows of
# each micro-batch.
test_query = (
    df_streamed_kv.writeStream
    .format("memory")
    .queryName("test_query_table")
    .outputMode("append")
    .start()
)

23/05/01 16:54:24 WARN StreamingQueryManager: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-dd0cd504-6e31-4cc6-b0dc-b8da74a02caf. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
[Stage 0:>                                                          (0 + 1) / 1]

In [5]:
# Print up to 100 rows collected so far in the in-memory table.
(spark
    .sql("Select * from test_query_table")
    .show(n=100))

+---+--------------------+-----+---------+------+--------------------+-------------+
|key|               value|topic|partition|offset|           timestamp|timestampType|
+---+--------------------+-----+---------+------+--------------------+-------------+
|key|{"timestamp": 167...|topic|        0| 24202|2023-05-01 16:54:...|            0|
|key|                null|topic|        0| 24203|2023-05-01 16:54:...|            0|
|key|                null|topic|        0| 24204|2023-05-01 16:54:...|            0|
|key|{"timestamp": 167...|topic|        0| 24205|2023-05-01 16:54:...|            0|
|key|{"timestamp": 167...|topic|        0| 24206|2023-05-01 16:54:...|            0|
|key|{"timestamp": 167...|topic|        0| 24207|2023-05-01 16:54:...|            0|
|key|{"timestamp": 167...|topic|        0| 24208|2023-05-01 16:54:...|            0|
|key|                null|topic|        0| 24209|2023-05-01 16:54:...|            0|
|key|                null|topic|        0| 24210|2023-05-01 16:54

# Stop the in-memory debug query now that its table has been inspected.
test_query.stop()

In [6]:
# Drop Kafka records whose value is null before any JSON parsing is attempted.
df_anti_null = df_streamed_kv.where("value IS NOT NULL")

In [7]:
from pyspark.sql.functions import from_json
from pyspark.sql.types import StructType, StructField, BooleanType, LongType, IntegerType


# Schema of the JSON event payload. Every field is declared as a string here;
# the field order below is the order of the resulting struct.
event_schema = StructType([
    StructField(field_name, StringType())
    for field_name in (
        "timestamp",
        "process_id",
        "username",
        "ip",
        "is_private",
        "is_root",
        "is_failure",
        "time_since_last_failure",
        "time_since_last_failure_of_same_type",
        "failure_count_in_last_15_mins",
        "failure_count_in_last_30_mins",
        "failure_count_in_last_60_mins",
        "label_auth_failure",
        "label_break_in_attempt",
        "label_connection_closed",
        "label_disconnect",
        "label_failed_password",
        "label_invalid_user",
        "label_no_label",
        "label_no_identification",
        "class",
    )
])

# Replace the JSON text in "value" with a struct parsed according to
# event_schema.
df_parsed = df_anti_null.withColumn(
    "value",
    from_json(col("value"), event_schema),
)

In [8]:
# Flatten the parsed event into top-level float columns, one per selected
# field. The Kafka key/topic columns and the event's timestamp, process_id,
# username, ip and time_since_last_failure fields are not selected.
df_formatted = df_parsed.select(*[
    col("value." + field_name).cast(FloatType())
    for field_name in (
        "is_private",
        "is_root",
        "is_failure",
        "time_since_last_failure_of_same_type",
        "failure_count_in_last_15_mins",
        "failure_count_in_last_30_mins",
        "failure_count_in_last_60_mins",
        "label_auth_failure",
        "label_break_in_attempt",
        "label_connection_closed",
        "label_disconnect",
        "label_failed_password",
        "label_invalid_user",
        "label_no_label",
        "label_no_identification",
        "class",
    )
])

In [9]:
# Display the DataFrame repr (column names and types) as the cell output.
df_formatted

DataFrame[is_private: float, is_root: float, is_failure: float, time_since_last_failure_of_same_type: float, failure_count_in_last_15_mins: float, failure_count_in_last_30_mins: float, failure_count_in_last_60_mins: float, label_auth_failure: float, label_break_in_attempt: float, label_connection_closed: float, label_disconnect: float, label_failed_password: float, label_invalid_user: float, label_no_label: float, label_no_identification: float, class: float]

In [10]:
# Keep only rows whose is_private column is non-null after the float cast.
df_format_null = df_formatted.where("is_private IS NOT NULL")

In [11]:

# Print the filtered feature rows to the console, triggering a micro-batch
# every 2 seconds.
query = (
    df_format_null.writeStream
    .format("console")
    .trigger(processingTime="2 seconds")
    .start()
)

23/05/01 16:57:19 WARN StreamingQueryManager: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-56600c33-5487-452c-8d8c-18f020f660df. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.


-------------------------------------------
Batch: 0
-------------------------------------------
+----------+-------+----------+------------------------------------+-----------------------------+-----------------------------+-----------------------------+------------------+----------------------+-----------------------+----------------+---------------------+------------------+--------------+-----------------------+-----+
|is_private|is_root|is_failure|time_since_last_failure_of_same_type|failure_count_in_last_15_mins|failure_count_in_last_30_mins|failure_count_in_last_60_mins|label_auth_failure|label_break_in_attempt|label_connection_closed|label_disconnect|label_failed_password|label_invalid_user|label_no_label|label_no_identification|class|
+----------+-------+----------+------------------------------------+-----------------------------+-----------------------------+-----------------------------+------------------+----------------------+-----------------------+----------------+------

In [12]:
# Stop the console preview query.
query.stop()

In [13]:
from pyspark.ml.feature import VectorAssembler

# Feature columns, in the order they are packed into the feature vector.
# "class" is not part of the feature list.
vecCols = [
    'is_private',
    'is_root',
    'is_failure',
    'time_since_last_failure_of_same_type',
    'failure_count_in_last_15_mins',
    'failure_count_in_last_30_mins',
    'failure_count_in_last_60_mins',
    'label_auth_failure',
    'label_break_in_attempt',
    'label_connection_closed',
    'label_disconnect',
    'label_failed_password',
    'label_invalid_user',
    'label_no_label',
    'label_no_identification',
]

# Only is_private is null-filtered before this point, so any other feature
# can still be null (for example a field missing from the JSON, or a value
# that does not cast to float). With the default handleInvalid="error" such a
# row raises inside the assembler and terminates the streaming query;
# "skip" drops the offending row instead.
assembler = VectorAssembler(
    inputCols=vecCols,
    outputCol="vectors",
    handleInvalid="skip",
)
df_assembler = assembler.transform(df_format_null)

In [14]:
from pyspark.ml.classification import RandomForestClassificationModel

# Load a previously saved random-forest classification model from a
# hard-coded path.
# NOTE(review): despite the ".pkl" suffix this is read with the Spark ML
# loader, not pickle -- confirm the path holds a Spark ML saved model.
model = RandomForestClassificationModel.load('/data/ml_py_model.pkl')

                                                                                

In [15]:
# Apply the loaded model to the assembled rows.
# NOTE(review): the assembler writes its vector to "vectors", so the saved
# model presumably has featuresCol="vectors" -- confirm against training.
pred = model.transform(df_assembler)

In [16]:
# Keep only the "prediction" column of the model output.
df_out = pred.select("prediction")

In [None]:
# Stream every prediction to the console, one micro-batch every 5 seconds.
df_out_query = (
    df_out.writeStream
    .format("console")
    .trigger(processingTime="5 seconds")
    .start()
)

In [None]:
# Stop the console query that prints all predictions.
df_out_query.stop()

In [17]:
# Keep only the rows the model labelled 1.0.
df_filtered = df_out.where("prediction == 1.0")

In [18]:
# Stream only the 1.0 predictions to the console, one micro-batch every
# 5 seconds.
df_fil_run = (
    df_filtered.writeStream
    .format("console")
    .trigger(processingTime="5 seconds")
    .start()
)

23/05/01 16:59:37 WARN StreamingQueryManager: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-1eff53c3-8a4a-460a-9298-17b9cc902079. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
23/05/01 16:59:37 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


-------------------------------------------
Batch: 0
-------------------------------------------
+----------+
|prediction|
+----------+
+----------+

-------------------------------------------
Batch: 1
-------------------------------------------
+----------+
|prediction|
+----------+
+----------+

-------------------------------------------
Batch: 2
-------------------------------------------
+----------+
|prediction|
+----------+
+----------+

-------------------------------------------
Batch: 3
-------------------------------------------
+----------+
|prediction|
+----------+
+----------+

-------------------------------------------
Batch: 4
-------------------------------------------
+----------+
|prediction|
+----------+
|       1.0|
+----------+

-------------------------------------------
Batch: 5
-------------------------------------------
+----------+
|prediction|
+----------+
|       1.0|
+----------+

-------------------------------------------
Batch: 6
---------------------

In [19]:
# Stop the console query that prints the filtered predictions.
df_fil_run.stop()