In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import StandardScaler, VectorAssembler
from pyspark.ml.feature import Imputer
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType
import numpy as np

# Start (or reuse) the Spark session used by the rest of this cell.
spark = SparkSession.builder.appName("DataPreprocessing").getOrCreate()

# Load the raw CSV. The file has no header row, so column names are assigned
# below; column types are inferred from the values.
data = spark.read.csv("kddcup.data_10_percent_corrected", header=False, inferSchema=True)

# Name the columns col1..col41 followed by "label" -- 42 names, the same naming
# as the explicit listing later in this notebook. toDF() requires exactly one
# name per column, so the range must stop at 41 (range(1, 42)), not 42.
column_names = [f"col{i}" for i in range(1, 42)] + ["label"]
data = data.toDF(*column_names)

# Feature columns: everything after the first four columns and before the
# trailing label, i.e. col5..col41.
feature_cols = data.columns[4:-1]

# Handle missing values: replace them with the column mean, writing the result
# back to the same column names.
imputer = Imputer(inputCols=feature_cols, outputCols=feature_cols).setStrategy("mean")
data = imputer.fit(data).transform(data)

# Pack the feature columns into a single vector column ...
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
data = assembler.transform(data)

# ... and standardise it (centre on the mean, scale to unit standard deviation).
scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withStd=True, withMean=True)
data = scaler.fit(data).transform(data)

data.show(3)

# K-means clustering on the scaled features; the assigned cluster index is
# written to the "cluster" column.
kmeans = KMeans(k=80, seed=1, featuresCol="scaled_features", predictionCol="cluster")
model = kmeans.fit(data)
clusters = model.transform(data)

# Cluster centers as one NumPy array (one row per center).
centers = np.array(model.clusterCenters())

# Calculate distances from points to cluster centers
def distance_to_center(features, center):
    """Return the Euclidean distance between ``features`` and ``center``.

    Both arguments are converted with ``np.array`` before being subtracted,
    and the result is returned as a built-in Python ``float``.
    """
    offset = np.array(features) - np.array(center)
    squared_total = np.sum(np.square(offset))
    return float(np.sqrt(squared_total))

# UDF returning the distance from a scaled feature vector to its nearest
# cluster center. The lambda reads `centers` and `distance_to_center` from the
# enclosing notebook scope.
distance_udf = udf(
    lambda features: min(distance_to_center(features, center) for center in centers),
    FloatType(),
)
clusters = clusters.withColumn("distance_to_center", distance_udf(col("scaled_features")))

# Define anomaly if distance is greater than a threshold
threshold = 1.0  # Set a suitable threshold
clusters = clusters.withColumn("is_anomaly", col("distance_to_center") > threshold)

# Several actions follow (show, silhouette, three counts). Without caching,
# each of them would recompute the whole lineage, including the per-row Python
# UDF that loops over every cluster center.
clusters = clusters.cache()

clusters.show(3)

# Evaluating K-Means Clustering for anomalies
evaluator = ClusteringEvaluator(featuresCol="scaled_features", predictionCol="cluster")
silhouette = evaluator.evaluate(clusters)
print(f"Silhouette with squared Euclidean distance = {silhouette}")

# Print the number of anomalies and normal points
num_anomalies = clusters.filter(col("is_anomaly")).count()
num_normal = clusters.filter(~col("is_anomaly")).count()
total_count = clusters.count()

print(f"Number of anomalies: {num_anomalies}")
print(f"Number of normal points: {num_normal}")
print(f"Total number of points: {total_count}")


24/11/02 22:00:57 WARN Utils: Your hostname, sathvik-HP-EliteBook-x360-1030-G2 resolves to a loopback address: 127.0.1.1; using 192.168.0.108 instead (on interface wlp58s0)
24/11/02 22:00:57 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/02 22:00:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/11/02 22:01:00 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


AnalysisException: [PATH_NOT_FOUND] Path does not exist: file:/home/sathvik/Documents/pyspark/kddcup.data_10_percent_corrected.

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import StandardScaler, VectorAssembler
from pyspark.ml.feature import Imputer
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType
import numpy as np

# Start (or reuse) the Spark session used by the rest of this cell.
spark = SparkSession.builder.appName("DataPreprocessing").getOrCreate()

# Load the raw CSV. The file has no header row, so column names are assigned
# below; column types are inferred from the values.
data = spark.read.csv("kddcup.data_10_percent_corrected", header=False, inferSchema=True)

# Define the schema based on the dataset: col1..col41 followed by "label"
# (42 names in total).
column_names = [f"col{i}" for i in range(1, 42)] + ["label"]
data = data.toDF(*column_names)

# Feature columns col5..col41, defined once and shared by the imputer and the
# vector assembler so the two can never get out of sync.
feature_cols = [f"col{i}" for i in range(5, 42)]

# Handle missing values: replace them with the column mean, writing the result
# back to the same column names.
imputer = Imputer(inputCols=feature_cols, outputCols=feature_cols).setStrategy("mean")
data = imputer.fit(data).transform(data)

# Scale numerical features: pack them into one vector column, then standardise
# it (centre on the mean, scale to unit standard deviation).
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
data = assembler.transform(data)

scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withStd=True, withMean=True)
data = scaler.fit(data).transform(data)

# K-means clustering on the scaled features; the assigned cluster index is
# written to the "cluster" column.
kmeans = KMeans(k=80, seed=1, featuresCol="scaled_features", predictionCol="cluster")
model = kmeans.fit(data)
clusters = model.transform(data)

# Cluster centers as one NumPy array (one row per center).
centers = np.array(model.clusterCenters())

# Calculate distances from points to cluster centers
def distance_to_center(features, center):
    """Compute the straight-line (Euclidean) distance from ``features`` to ``center``.

    Each argument is passed through ``np.array`` first; the distance is
    returned as a built-in Python ``float``.
    """
    difference = np.array(features) - np.array(center)
    sum_of_squares = np.sum(np.square(difference))
    return float(np.sqrt(sum_of_squares))

# UDF returning the distance from a scaled feature vector to its nearest
# cluster center. The lambda reads `centers` and `distance_to_center` from the
# enclosing notebook scope.
distance_udf = udf(
    lambda features: min(distance_to_center(features, center) for center in centers),
    FloatType(),
)

clusters = clusters.withColumn("distance_to_center", distance_udf(col("scaled_features")))

# Define anomaly if distance is greater than a threshold
threshold = 1.0  # Set a suitable threshold
clusters = clusters.withColumn("is_anomaly", col("distance_to_center") > threshold)

# Several actions follow (silhouette, three counts, show). Without caching,
# each of them would recompute the whole lineage, including the per-row Python
# UDF that loops over every cluster center.
clusters = clusters.cache()

# Evaluating K-Means Clustering for anomalies
evaluator = ClusteringEvaluator(featuresCol="scaled_features", predictionCol="cluster")
silhouette = evaluator.evaluate(clusters)
print(f"Silhouette with squared Euclidean distance = {silhouette}")

# Print the number of anomalies and normal points
num_anomalies = clusters.filter(col("is_anomaly")).count()
num_normal = clusters.filter(~col("is_anomaly")).count()
total_count = clusters.count()

print(f"Number of anomalies: {num_anomalies}")
print(f"Number of normal points: {num_normal}")
print(f"Total number of points: {total_count}")

clusters.show(3)
