In [None]:
# K-Means clustering of PCA-reduced network-flow features (CSE-CIC-IDS2018, DoS-related traffic).
# This cell previously held a stray bare name, which raised NameError on "Restart & Run All".

In [None]:
# %%
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# %%
# Load the dataset
# NOTE: this is an absolute, machine-specific location; edit DATA_PATH so it
# points at your own copy of the CSV before running the notebook.
DATA_PATH = "/Users/rooj/Documents/RP3-Main/RP3-Data/CSE-CIC-IDS2018/DoS-Related/02-15-2018.csv"
data = pd.read_csv(DATA_PATH)

# Preview the data
print(data.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
data.info()
print(data['Label'].value_counts())

In [None]:
# %%
# Encode the Labels
label_mapping = {'Benign': 0, 'DoS attacks-GoldenEye': 1, 'DoS attacks-Slowloris': 2}

# Translate class names to integer codes. Values that are not keys of the
# mapping pass through unchanged, so re-running this cell on already-encoded
# labels keeps them instead of turning every row into NaN.
encoded_labels = data['Label'].map(lambda label: label_mapping.get(label, label))

# Filter for relevant classes
is_relevant = encoded_labels.isin(list(label_mapping.values()))
# .copy() yields an independent frame, so adding columns to `data` later does
# not trigger SettingWithCopyWarning.
data = data.loc[is_relevant].copy()
# Cast after filtering: every remaining value is a valid code, so the column
# stays integer-typed (mapping unknown labels to NaN first would make it float).
data['Label'] = encoded_labels.loc[is_relevant].astype(int)
print("Encoded class distribution:\n", data['Label'].value_counts())

# %%
# Feature Selection
# Columns of `data` that are passed on to the scaling, PCA and clustering steps.
selected_features = [
    'Flow Duration',
    'Fwd Pkt Len Mean',
    'Bwd Pkt Len Std',
    'Pkt Size Avg',
    'Flow IAT Mean',
]
# Sub-frame holding only those columns, in the order listed above.
X = data[selected_features]

In [None]:
# %%
# Handle missing values
# Infinite entries are not removed by dropna() on their own, yet StandardScaler
# raises on them, so they are converted to NaN first and dropped in the same
# pass. Surviving rows keep their original index labels, which record which
# rows of `data` were retained.
X = X.replace([np.inf, -np.inf], np.nan).dropna()

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# %%
# Apply PCA
# random_state pins the projection when scikit-learn's automatic solver choice
# selects a randomized SVD; the deterministic solvers ignore it.
pca = PCA(n_components=2, random_state=42)  # Reduce to 2 components for visualization
X_pca = pca.fit_transform(X_scaled)
print(f'Explained variance ratio: {pca.explained_variance_ratio_}')

In [None]:
# %%
# Determine the optimal number of clusters using the Elbow Method
k_values = range(1, 11)  # candidate cluster counts, shared by the loop and the plot
wcss = []  # KMeans.inertia_ (within-cluster sum of squares) for each candidate k
for k in k_values:
    # The trial model gets its own name: binding it to `kmeans` would overwrite
    # the final fitted model whenever this cell is re-run, and the save cell
    # would then persist a 10-cluster model instead of the chosen one.
    # n_init is explicit because scikit-learn 1.4 changed its default from 10
    # to 'auto', which would make the curve depend on the installed version.
    elbow_model = KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=42)
    elbow_model.fit(X_pca)
    wcss.append(elbow_model.inertia_)

# Plot the Elbow graph
plt.figure(figsize=(8, 5))
plt.plot(k_values, wcss, marker='o', linestyle='--')
plt.title('Elbow Method for Optimal K')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()


In [None]:
# %%
# From the Elbow plot, choose the optimal number of clusters, e.g., 3
optimal_clusters = 3
# n_init is explicit because scikit-learn 1.4 changed its default from 10 to 'auto'.
kmeans = KMeans(n_clusters=optimal_clusters, init='k-means++', n_init=10, random_state=42)
cluster_labels = kmeans.fit_predict(X_pca)

# Add cluster labels to the original data
# cluster_labels has one entry per row of X, and X can hold fewer rows than
# `data` because rows with missing values were dropped. A positional assignment
# then fails with a length mismatch, so the labels are attached by index label.
assert len(cluster_labels) == len(X), "cluster_labels is out of sync with X; re-run the preprocessing cells"
# Nullable Int64 keeps cluster ids integer-typed; rows of `data` that were not
# clustered receive <NA> and are skipped by groupby('Cluster').
row_clusters = pd.Series(cluster_labels, index=X.index, dtype='Int64')
# assign() returns a new frame, so no SettingWithCopyWarning is raised even
# though `data` was produced by filtering.
data = data.assign(Cluster=row_clusters)

In [None]:
# %%
# Evaluate clustering performance using Silhouette Score
# The exact score needs every pairwise distance, which grows quadratically with
# the number of rows. Above the cap it is estimated on a seeded random subset;
# inputs at or below the cap are scored exactly, as before.
SILHOUETTE_MAX_SAMPLES = 10_000
silhouette_sample_size = SILHOUETTE_MAX_SAMPLES if len(X_pca) > SILHOUETTE_MAX_SAMPLES else None
sil_score = silhouette_score(
    X_pca,
    cluster_labels,
    sample_size=silhouette_sample_size,
    random_state=42,  # only consulted when sample_size is set
)
print(f'Silhouette Score: {sil_score}')

# %%
# Visualize the clusters
# Scatter of the two principal components, coloured by assigned cluster id.
fig, ax = plt.subplots(figsize=(10, 7))
sns.scatterplot(
    x=X_pca[:, 0],
    y=X_pca[:, 1],
    hue=cluster_labels,
    palette='viridis',
    s=50,
    ax=ax,
)
ax.set_title('Clusters after K-Means with PCA')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.legend(title='Cluster')
plt.show()

# %%
# Analyze the composition of each cluster
# One row per cluster, one column per encoded label; each cell is the share of
# that cluster's rows carrying the label (0 where the pair never occurs).
cluster_composition = (
    data.groupby('Cluster')['Label']
    .value_counts(normalize=True)
    .unstack()
    .fillna(0)
)
print(cluster_composition)

# %%
# Save the PCA and KMeans models for future use
import joblib

# Fitted objects paired with their output files, written in this order into
# the current working directory.
for fitted_object, target_file in (
    (pca, 'pca_model.pkl'),
    (kmeans, 'kmeans_model.pkl'),
    (scaler, 'scaler.pkl'),
):
    joblib.dump(fitted_object, target_file)
print("Models saved successfully.")