In [None]:
pip install matplotlib

In [2]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session shared by all streaming cells.
spark = (
    SparkSession.builder
    .appName("pyspark-kafka-streaming")
    .master("spark://spark-master:7077")
    # Kafka source connector package for Structured Streaming.
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.0")
    .config("spark.executor.memory", "512m")
    .getOrCreate()
)

Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
:: loading settings :: url = jar:file:/usr/local/lib/python3.9/dist-packages/pyspark/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-be014968-2deb-4a59-9200-74f40ce602aa;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.0.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.0.0 in central
	found org.apache.kafka#kafka-clients;2.4.1 in central
	found com.github.luben#zstd-jni;1.4.4-3 in central
	found org.lz4#lz4-java;1.7.1 in central
	found org.xerial.snappy#snappy-java;1.1.7.5 in central
	found org.slf4j#slf4j-api;1.7.30 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found org.apache.commons#commons-pool2;2.6.2 in central
:: resolution report :: resolve 699ms :: artifacts dl 13

In [3]:
# Running tally shared across batches: {block_id: {log_type: occurrences}},
# e.g. {"block_1": {"log1": 3, "log": 4}}.
data_dict = {}


def update_dict(batch_df, epoch_id):
    """Fold one batch of events into the module-level ``data_dict``.

    Parameters
    ----------
    batch_df
        Object whose ``collect()`` returns records that can be indexed with
        ``"id"`` and ``"type"``.
        NOTE(review): the signature looks like a ``foreachBatch`` callback
        receiving a Spark DataFrame; confirm against the caller.
    epoch_id
        Not used by this function.

    Returns
    -------
    None
        ``data_dict`` is updated in place.
    """
    # Rows returned by collect() are indexable by column name, so there is no
    # need to go through the RDD API and convert each row to a dict first.
    for record in batch_df.collect():
        per_block_counts = data_dict.setdefault(record["id"], {})
        log_type = record["type"]
        per_block_counts[log_type] = per_block_counts.get(log_type, 0) + 1

In [4]:
# Known log-event templates. A template's position in this tuple is the index
# returned for it by get_event_type_index().
_EVENT_TEMPLATES = (
    'PacketResponder * from block * terminating',
    'Received block * from *',
    'Verification succeeded for *',
    'Receiving block * src: * des: *',
    'Served block * to *',
    'blockMap updated: * is added to *',
    'block * allocated',
    '* is added to invalidSet of *',
    'ask * to replicate * to datanode(s) *',
    'Transmitted block * to *',
    'Starting thread to transfer block * to *',
    'Deleting block * file *',
    'Unexpected error trying to delete block *',
    'Got exception while serving * to *',
)


def get_event_type_index(event_type):
    """Map an event template string to its slot in the event-count vector.

    Returns 0-13 for the known templates (exact string match) and 14 for
    anything else.
    """
    for position, template in enumerate(_EVENT_TEMPLATES):
        if event_type == template:
            return position

    # Catch-all slot, one past the last known template (14).
    return len(_EVENT_TEMPLATES)

In [5]:
# {block_id: event-count vector}; filled in by create_event_vector().
event_count_dict = {}


def create_event_vector(data_dict):
    """Turn per-block event counts into fixed-length count vectors.

    Parameters
    ----------
    data_dict
        Mapping ``{block_id: {event_type: frequency}}``.

    Returns
    -------
    None
        The module-level ``event_count_dict`` is updated in place; the entry
        for every block id present in ``data_dict`` is replaced.
    """
    for block_id, event_dict in data_dict.items():
        # One slot per value get_event_type_index() can return (0-14).
        event_count_vector = [0] * 15

        for event_type, frequency in event_dict.items():
            index = get_event_type_index(event_type)
            # Accumulate rather than assign: every unrecognised event type is
            # mapped to the same catch-all index, so assignment would keep
            # only the count of the last one iterated.
            event_count_vector[index] += frequency

        event_count_dict[block_id] = event_count_vector

In [6]:
# Streaming source: subscribe to the Kafka topic that carries the events.
_kafka_reader = spark.readStream.format("kafka")
_kafka_reader = _kafka_reader.option("kafka.bootstrap.servers", "kafka:9093")
_kafka_reader = _kafka_reader.option("subscribe", "topic_test")
df_streamed_raw = _kafka_reader.load()

In [7]:
from pyspark.sql.types import StringType
from pyspark.sql.functions import col

# Cast the byte-valued "key" and "value" columns to strings; every other
# column of the raw stream is carried over unchanged.
df_streamed_kv = (
    df_streamed_raw
    .withColumn("key", col("key").cast(StringType()))
    .withColumn("value", col("value").cast(StringType()))
)

In [8]:
from pyspark.sql.functions import from_json
from pyspark.sql.types import StructType, StructField, StringType

# Schema of the JSON payload: two string fields, "id" and "type".
event_schema = StructType(
    [StructField(field_name, StringType()) for field_name in ("id", "type")]
)

# Replace the JSON text in "value" with a struct parsed against event_schema.
df_parsed = df_streamed_kv.withColumn("value", from_json("value", event_schema))

In [9]:
# Flatten the parsed struct: keep only value.id and value.type as the
# top-level columns "id" and "type".
df_formatted = df_parsed.select(
    *[col(f"value.{field_name}").alias(field_name) for field_name in ("id", "type")]
)

In [10]:
# Start the streaming query. Inside parentheses no backslash continuations
# are needed, so each step of the chain sits on its own line.
test_query = (
    df_formatted.writeStream
    .format("memory")                # output to memory
    .outputMode("append")
    .queryName("test_query_table")   # name of the in-memory table
    .start()
)

23/05/01 18:12:41 WARN StreamingQueryManager: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-4981b264-5082-42fd-aad7-b3f4e84eb867. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.


In [11]:
import csv

# {block id with every "blk_" removed: 1 for "Anomaly", 0 for "Normal"}
anomaly_map = {}


def fill_anomaly_map(file):
    """Load ground-truth labels from a CSV file into ``anomaly_map``.

    The file must have a header row with ``BlockId`` and ``Label`` columns.
    Rows labelled ``"Anomaly"`` are stored as 1 and ``"Normal"`` as 0; rows
    with any other label are ignored. Entries already present in
    ``anomaly_map`` for the same block id are overwritten.

    Parameters
    ----------
    file
        Path of the CSV file to read.

    Raises
    ------
    OSError
        If the file cannot be opened.
    KeyError
        If a row lacks the ``BlockId`` or ``Label`` column.
    """
    # The csv module requires files to be opened with newline='' so that it
    # can do its own newline handling (matters for quoted multi-line fields).
    with open(file, mode='r', newline='') as csv_file:
        for row in csv.DictReader(csv_file):
            block_id = row["BlockId"].replace("blk_", "")
            label = row["Label"]
            if label == "Anomaly":
                anomaly_map[block_id] = 1
            elif label == "Normal":
                anomaly_map[block_id] = 0


# Load the ground-truth labels from both label files, in order.
for _labels_csv in ('input_1.csv', 'input_2.csv'):
    fill_anomaly_map(_labels_csv)


[Stage 0:>                                                          (0 + 1) / 1]

In [None]:
from collections import defaultdict
import time

def dicter(result):
    """Collect grouped counts into a nested ``{id: {type: count}}`` mapping.

    ``result`` must expose ``collect()`` returning rows indexable by ``"id"``,
    ``"type"`` and ``"count(1)"``. The return value is a ``defaultdict(dict)``.
    """
    counts_by_block = defaultdict(dict)

    # collect() materialises every row of the result locally.
    for record in result.collect():
        block_id, event_type, occurrences = record["id"], record["type"], record["count(1)"]
        counts_by_block[block_id][event_type] = occurrences

    return counts_by_block


# Poll the in-memory table forever and re-score all labelled blocks each pass.
# NOTE: algo1 is defined in a later cell of this notebook; that cell (and the
# scikit-learn import cell) must be executed before this one.
i = 0
accuracy = []
while True:
    print(i)
    i = i + 1

    # Count how often each event type occurred per block so far.
    result = spark.sql("select id, type, count(*) from test_query_table group by id, type")
    result_dict = dicter(result)
    create_event_vector(result_dict)

    inner_list1 = []  # block ids that have a ground-truth label
    inner_list2 = []  # their event-count vectors
    inner_list3 = []  # their labels (1 = anomaly, 0 = normal)
    for key, value in event_count_dict.items():
        if key in anomaly_map:
            inner_list1.append(key)
            inner_list2.append(value)
            inner_list3.append(int(anomaly_map[key]))

    if not inner_list3:
        # No labelled block has arrived yet: dividing by len() below would
        # raise ZeroDivisionError. Wait briefly instead of hammering Spark.
        time.sleep(1)
        continue

    # Fraction of labelled blocks that are anomalies; algo1 uses it to place
    # its distance threshold.
    per_anomalies = sum(inner_list3) / len(inner_list3)
    algo1(inner_list1, inner_list2, inner_list3, per_anomalies)

0


                                                                                

Inside kmeans clustering with input length:  1862




F1 score till now: 0.6885751968804955
Precision score till now: 0.8038365583546306
Recall score till now: 0.6401024130190797
Confusion Matrix:
True Negatives: 1769
False Positives: 13
False Negatives: 57
True Positives: 23
1




Inside kmeans clustering with input length:  1911
F1 score till now: 0.6925850224266167
Precision score till now: 0.8550102951269732
Recall score till now: 0.63636603306003
Confusion Matrix:
True Negatives: 1820
False Positives: 8
False Negatives: 60
True Positives: 23
2




Inside kmeans clustering with input length:  1929
F1 score till now: 0.6965477461673405
Precision score till now: 0.8807622504537205
Recall score till now: 0.6369290814395175
Confusion Matrix:
True Negatives: 1840
False Positives: 6
False Negatives: 60
True Positives: 23
3




Inside kmeans clustering with input length:  1956


In [24]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score, recall_score
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN


In [22]:
def algo1(blk_num,input_data,true_labels, per_anomalies):
    """Score blocks with a KMeans distance-to-centroid anomaly detector.

    Features are standardised and clustered with KMeans; each sample is scored
    by the distance to its closest centroid. Samples whose score exceeds the
    ``100 - 100 * per_anomalies`` percentile are predicted as anomalies.
    Macro F1/precision/recall and the confusion matrix are printed.

    Parameters
    ----------
    blk_num
        Block ids; only its length is used (for the log line).
    input_data
        Sequence of equal-length numeric feature vectors, one per block.
    true_labels
        Ground-truth labels, 1 for anomaly and 0 for normal, aligned with
        ``input_data``.
    per_anomalies
        Fraction (0-1) of samples expected to be anomalous.

    Returns
    -------
    None
        Results are printed only.
    """
    print('Inside kmeans clustering with input length: ', len(blk_num))

    # Convert the input data to a NumPy array and standardise each feature.
    data = np.array(input_data)
    data_scaled = StandardScaler().fit_transform(data)

    # KMeans refuses to fit more clusters than there are samples, which would
    # crash on the first few polls of a slowly filling stream.
    num_clusters = min(3, len(data_scaled))

    # n_init is given explicitly so the result does not depend on which default
    # the installed scikit-learn release uses (and to avoid its FutureWarning).
    kmeans = KMeans(n_clusters=num_clusters, n_init=10)
    kmeans.fit(data_scaled)

    # Distance from each point to its closest cluster centroid.
    distances = kmeans.transform(data_scaled)
    min_distances = np.min(distances, axis=1)

    # Threshold such that roughly a `per_anomalies` share of points lies above it.
    threshold = np.percentile(min_distances, 100-100*per_anomalies)

    # Label points as anomalies (1) if their distance is greater than the threshold.
    predicted_labels = (min_distances > threshold).astype(int)

    # zero_division=0 yields the same scores as the default, without the
    # warning that is emitted when a class is never predicted.
    f1 = f1_score(true_labels, predicted_labels, average='macro', zero_division=0)
    precision = precision_score(true_labels, predicted_labels, average='macro', zero_division=0)
    recall = recall_score(true_labels, predicted_labels, average='macro', zero_division=0)
    print('F1 score till now:',f1)
    print('Precision score till now:',precision)
    print('Recall score till now:',recall)

    # labels=[0, 1] forces a 2x2 matrix; without it a batch containing a single
    # class produces a 1x1 matrix and the four-way unpacking raises ValueError.
    tn, fp, fn, tp = confusion_matrix(true_labels, predicted_labels, labels=[0, 1]).ravel()

    # print confusion matrix
    print("Confusion Matrix:")
    print("True Negatives:", tn)
    print("False Positives:", fp)
    print("False Negatives:", fn)
    print("True Positives:", tp)

In [20]:
def algo2(blk_num,input_data,true_labels, per_anomalies):
    """Score blocks with a k-nearest-neighbour distance anomaly detector.

    Features are standardised; each sample is scored by its mean distance to
    its nearest neighbours (up to 5, the sample itself excluded). Samples whose
    score exceeds the ``100 - 100 * per_anomalies`` percentile are predicted as
    anomalies. Macro F1/precision/recall and the confusion matrix are printed.

    Parameters
    ----------
    blk_num
        Block ids; only its length is used (for the log line).
    input_data
        Sequence of equal-length numeric feature vectors, one per block.
    true_labels
        Ground-truth labels, 1 for anomaly and 0 for normal, aligned with
        ``input_data``.
    per_anomalies
        Fraction (0-1) of samples expected to be anomalous.

    Returns
    -------
    None
        Results are printed only.
    """
    print('Inside kNN algo with input length: ', len(blk_num))

    # Convert the input data to a NumPy array and standardise each feature.
    data = np.array(input_data)
    data_scaled = StandardScaler().fit_transform(data)

    # Number of nearest neighbours to consider.
    k = 5

    # +1 because each point is returned as its own nearest neighbour. Capped at
    # the sample count, since kneighbors() rejects n_neighbors > n_samples.
    n_neighbors = min(k + 1, len(data_scaled))
    nearest_neighbors = NearestNeighbors(n_neighbors=n_neighbors)
    nearest_neighbors.fit(data_scaled)
    distances, _ = nearest_neighbors.kneighbors(data_scaled)

    if n_neighbors > 1:
        # Average distance to the neighbours, skipping column 0 (the point itself).
        avg_distances = np.mean(distances[:, 1:], axis=1)
    else:
        # A single sample has no neighbours to measure against.
        avg_distances = np.zeros(len(data_scaled))

    # Threshold such that roughly a `per_anomalies` share of points lies above it.
    threshold = np.percentile(avg_distances, 100-100*per_anomalies)

    # Label points as anomalies (1) if their average distance exceeds the threshold.
    predicted_labels = (avg_distances > threshold).astype(int)

    # zero_division=0 yields the same scores as the default, without the
    # warning that is emitted when a class is never predicted.
    f1 = f1_score(true_labels, predicted_labels, average = 'macro', zero_division=0)
    precision = precision_score(true_labels, predicted_labels,average = 'macro', zero_division=0)
    recall = recall_score(true_labels, predicted_labels,average = 'macro', zero_division=0)
    print('F1 score till now:',f1)
    print('Precision score till now:',precision)
    print('Recall score till now:',recall)

    # labels=[0, 1] forces a 2x2 matrix; without it a batch containing a single
    # class produces a 1x1 matrix and the four-way unpacking raises ValueError.
    tn, fp, fn, tp = confusion_matrix(true_labels, predicted_labels, labels=[0, 1]).ravel()

    # print confusion matrix
    print("Confusion Matrix:")
    print("True Negatives:", tn)
    print("False Positives:", fp)
    print("False Negatives:", fn)
    print("True Positives:", tp)

In [21]:
def algo3(blk_num,input_data,true_labels, per_anomalies):
    """Score blocks by the distance to their single nearest neighbour.

    Features are standardised; each sample is scored by the distance to its
    closest other sample. Samples whose score exceeds the
    ``100 - 100 * per_anomalies`` percentile are predicted as anomalies.
    Macro F1/precision/recall and the confusion matrix are printed.

    NOTE(review): the original body also fitted ``DBSCAN(eps=0.5,
    min_samples=2)`` but never used its cluster labels, so the predictions
    never depended on DBSCAN. That dead fit has been removed; outputs are
    unchanged. If DBSCAN noise points were meant to drive the prediction,
    this function needs a redesign.

    Parameters
    ----------
    blk_num
        Block ids; only its length is used (for the log line).
    input_data
        Sequence of equal-length numeric feature vectors, one per block.
    true_labels
        Ground-truth labels, 1 for anomaly and 0 for normal, aligned with
        ``input_data``.
    per_anomalies
        Fraction (0-1) of samples expected to be anomalous.

    Returns
    -------
    None
        Results are printed only.
    """
    print('Inside DBSCAN clustering with input length: ', len(blk_num))

    # Convert the input data to a NumPy array and standardise each feature.
    data = np.array(input_data)
    data_scaled = StandardScaler().fit_transform(data)

    # Two neighbours because each point is returned as its own nearest
    # neighbour. Capped at the sample count, since kneighbors() rejects
    # n_neighbors > n_samples.
    n_neighbors = min(2, len(data_scaled))
    nearest_neighbors = NearestNeighbors(n_neighbors=n_neighbors)
    nearest_neighbors.fit(data_scaled)
    distances, _ = nearest_neighbors.kneighbors(data_scaled)

    if n_neighbors > 1:
        # Column 1 is the closest other point (column 0 is the point itself).
        nearest_distances = distances[:, 1]
    else:
        # A single sample has no neighbour to measure against.
        nearest_distances = np.zeros(len(data_scaled))

    # Threshold such that roughly a `per_anomalies` share of points lies above it.
    threshold = np.percentile(nearest_distances, 100-(100*per_anomalies))

    # Label points as anomalies (1) if their distance is greater than the threshold.
    predicted_labels = (nearest_distances > threshold).astype(int)

    # zero_division=0 yields the same scores as the default, without the
    # warning that is emitted when a class is never predicted.
    f1 = f1_score(true_labels, predicted_labels, average = 'macro', zero_division=0)
    precision = precision_score(true_labels, predicted_labels,average = 'macro', zero_division=0)
    recall = recall_score(true_labels, predicted_labels,average = 'macro', zero_division=0)
    print('F1 score till now:',f1)
    print('Precision score till now:',precision)
    print('Recall score till now:',recall)

    # labels=[0, 1] forces a 2x2 matrix; without it a batch containing a single
    # class produces a 1x1 matrix and the four-way unpacking raises ValueError.
    tn, fp, fn, tp = confusion_matrix(true_labels, predicted_labels, labels=[0, 1]).ravel()

    # print confusion matrix
    print("Confusion Matrix:")
    print("True Negatives:", tn)
    print("False Positives:", fp)
    print("False Negatives:", fn)
    print("True Positives:", tp)

