In [None]:
import os
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, label_binarize
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score
)




In [35]:
# ---------------------------
# 0. Parameters & Setup
# ---------------------------
# The dataset location can be overridden with the RP3_DATASET_PATH environment
# variable so the notebook is not tied to a single machine; the original path
# remains the default.
dataset_path    = os.environ.get(
    "RP3_DATASET_PATH",
    r"/Users/rooj/Documents/RP3-Main/RP3-Imp/clean-datasets/2018.csv",
)
sample_fraction = 1               # fraction of rows to keep (1 = 100% of the data)
test_size       = 0.2             # fraction of rows held out for testing (80/20 split)
run_name        = "kmeans_100pct_80_20_trial1_k10"  # names the output sub-folder
output_folder   = os.path.join("Outputs", "2018", "k-means", run_name)
# Create the run folder up front; every later cell writes its artefacts into it.
os.makedirs(output_folder, exist_ok=True)
print("Outputs will be saved to:", output_folder)


Outputs will be saved to: Outputs/2018/k-means/kmeans_100pct_80_20_trial1_k10


In [36]:
# ---------------------------
# 1. Load & Sample Data
# ---------------------------
# Read the CSV, discard rows whose 'label' is the literal header text 'Label'
# (header lines that were read as data), then draw a seeded random sample of
# `sample_fraction` of the rows and renumber the index from 0.
data = (
    pd.read_csv(dataset_path, low_memory=False)
    .loc[lambda frame: frame['label'] != 'Label']
    .sample(frac=sample_fraction, random_state=42)
    .reset_index(drop=True)
)
print("Sampled data shape:", data.shape)

Sampled data shape: (4339650, 26)


In [37]:
# ---------------------------
# 2. Encode Labels
# ---------------------------
# NOTE: this cell rewrites `data` in place (label encoding, timestamp drop), so
# re-run the load cell before running this cell a second time.
#
# pd.factorize encodes a missing label as -1. Such a row would pass the dropna
# below (-1 is not NaN) and then break code that calls np.bincount on the labels
# or looks the code up in `label_names`, so unlabeled rows are removed first.
data = data.dropna(subset=['label']).reset_index(drop=True)
label_codes, uniques = pd.factorize(data['label'])
data['label'] = label_codes
# Integer code -> original label string, in order of first appearance.
label_names = dict(enumerate(uniques))
print("Encoded labels mapping:", label_names)

# ---------------------------
# 3. Select Numeric Features & Clean (drop timestamp)
# ---------------------------
# The timestamp is not used as a feature: drop it whenever the column is present.
if 'timestamp' in data.columns:
    data = data.drop(columns=['timestamp'])

# Every remaining numeric column except the encoded label is a feature.
numeric_cols = [
    col for col in data.select_dtypes(include=[np.number]).columns
    if col != 'label'
]
selected_features = numeric_cols
print("Selected features:", selected_features)
print("Count:", len(selected_features))  # should be 24

# Drop any rows with missing values in the selected features
# (missing labels were already removed before encoding).
df_clean = data[selected_features + ['label']].dropna().reset_index(drop=True)
X = df_clean[selected_features].values
y = df_clean['label'].values
print("Data after dropna:", X.shape)  # should be (n_samples, 24)


Encoded labels mapping: {0: 'Benign', 1: 'DoS attacks-Hulk', 2: 'DDoS attacks-LOIC-HTTP', 3: 'DDOS attack-HOIC', 4: 'DoS attacks-SlowHTTPTest', 5: 'DoS attacks-GoldenEye', 6: 'DoS attacks-Slowloris', 7: 'DDOS attack-LOIC-UDP'}
Selected features: ['Dst Port', 'protocol', 'Flow Duration', 'Flow Duration_rolling_mean', 'Flow Duration_rolling_std', 'Tot Fwd Pkts', 'Tot Bwd Pkts', 'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max', 'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std', 'Fwd IAT Tot', 'Fwd IAT Mean', 'Fwd IAT Max', 'Fwd IAT Min', 'SYN Flag Cnt', 'pkts_ratio', 'byte_per_duration', 'entropy_pkt_len', 'Subflow Fwd Byts', 'Bwd Pkt Len Max', 'Bwd Pkt Len Min']
Count: 24
Data after dropna: (4339650, 24)


In [38]:
# ---------------------------
# 4. Standardize & PCA
# ---------------------------
# NOTE(review): both transformers are fitted on the full dataset, before the
# train/test split in the following step, so test rows influence the scaling
# and the projection. Consider splitting first and fitting on training rows only.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
joblib.dump(scaler, os.path.join(output_folder, "scaler.pkl"))

# random_state pins the randomized SVD solver (which PCA may select
# automatically), making the projection reproducible like the other seeded
# steps of this notebook (sampling, split, KMeans).
pca = PCA(n_components=8, random_state=42).fit(X_scaled)
X_pca = pca.transform(X_scaled)
joblib.dump(pca, os.path.join(output_folder, "pca_model.pkl"))

['Outputs/2018/k-means/kmeans_100pct_80_20_trial1_k10/pca_model.pkl']

In [39]:
# ---------------------------
# 5. Train/Test Split
# ---------------------------
# Seeded hold-out split of the PCA features, stratified on the encoded labels;
# `test_size` is the fraction of rows that goes to the test set.
split_kwargs = dict(test_size=test_size, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, **split_kwargs)

n_train, n_test = X_train.shape[0], X_test.shape[0]
print(f"Train size: {n_train}, Test size: {n_test}")

Train size: 3471720, Test size: 867930


In [40]:
# ---------------------------
# 6. Fit KMeans
# ---------------------------
fixed_k = 10  # number of clusters
print(f"Fitting KMeans with k={fixed_k}...")

# A single seeded k-means++ initialisation, fitted on the training split only.
km = KMeans(n_clusters=fixed_k, init='k-means++', n_init=1, random_state=42)
km.fit(X_train)

# Persist the fitted model next to the other run artefacts.
kmeans_path = os.path.join(output_folder, f"kmeans_k{fixed_k}.pkl")
joblib.dump(km, kmeans_path)

Fitting KMeans with k=10...


['Outputs/2018/k-means/kmeans_100pct_80_20_trial1_k10/kmeans_k10.pkl']

In [41]:

# ---------------------------
# 7. Cluster → Majority‐Label Mapping & Train Composition
# ---------------------------
# Each cluster is assigned the most frequent true label among its training points.
train_clusters = km.labels_
cluster_map = {
    c: int(np.bincount(y_train[train_clusters == c]).argmax())
    for c in np.unique(train_clusters)
}

# Train cluster composition: for every cluster, the fraction (0-1) of its
# training points that carries each true label.
df_train_comp = pd.DataFrame({'true': y_train, 'cluster': train_clusters})
train_comp = (
    df_train_comp
    .groupby('cluster')['true']
    .value_counts(normalize=True)
    .unstack(fill_value=0)
    .rename(columns=lambda i: label_names[i])
)
print("\nTrain cluster composition (% of true labels per cluster):\n", train_comp)

# ---------------------------
# 8. Predict on Test
# ---------------------------
test_clusters = km.predict(X_test)

# Translate cluster ids to labels with one vectorised table lookup instead of a
# Python-level loop over every test row. A cluster id that is absent from
# cluster_map (no training points assigned to it) falls back to the overall
# majority training label rather than raising a KeyError.
fallback_label = int(np.bincount(y_train).argmax())
cluster_to_label = np.full(km.n_clusters, fallback_label)
for cluster_id, majority_label in cluster_map.items():
    cluster_to_label[cluster_id] = majority_label
y_pred = cluster_to_label[test_clusters]



Train cluster composition (% of true labels per cluster):
 true       Benign  DoS attacks-Hulk  DDoS attacks-LOIC-HTTP  DDOS attack-HOIC  \
cluster                                                                         
0        0.388184          0.000009                0.586079          0.000000   
1        0.700859          0.033345                0.000000          0.224972   
2        0.999996          0.000000                0.000000          0.000000   
3        0.999737          0.000000                0.000103          0.000000   
4        0.002120          0.000000                0.279809          0.000000   
5        0.013238          0.000000                0.940327          0.000000   
6        1.000000          0.000000                0.000000          0.000000   
7        0.022134          0.388344                0.000000          0.465034   
8        0.992555          0.000000                0.000000          0.000000   
9        0.967558          0.005800              

In [42]:
# ---------------------------
# 9. Accuracy, Report & Test Composition
# ---------------------------
# Accuracy
acc = accuracy_score(y_test, y_pred)
print(f"\nTest Accuracy: {acc:.4f}")

# Class ids and their display names, in encoded order. The report and the
# confusion matrix must be indexed by class id (one entry per class).
# cluster_map.values() is NOT suitable for that: it has one entry per cluster,
# so it repeats classes, can omit others, and does not line up with class_names.
class_names = [label_names[i] for i in range(len(label_names))]
class_ids = list(range(len(class_names)))

# Classification Report
report = classification_report(
    y_test, y_pred,
    labels=class_ids,
    target_names=class_names,
    zero_division=0,  # classes that are never predicted get 0 instead of a warning
    digits=4
)
print("\n=== Classification Report ===\n", report)

# Confusion Matrix (rows: true class, columns: predicted class, in class_ids order)
cm = confusion_matrix(
    y_test, y_pred,
    labels=class_ids
)
print("=== Confusion Matrix ===\n", cm)

# Compute and print test cluster composition: for every cluster, the fraction
# (0-1) of its test points that carries each true label.
df_test_comp = pd.DataFrame({'true': y_test, 'cluster': test_clusters})
test_comp = (
    df_test_comp
    .groupby('cluster')['true']
    .value_counts(normalize=True)
    .unstack(fill_value=0)
    .rename(columns=lambda i: label_names[i])
)
print("\nTest cluster composition (% of true labels per cluster):\n", test_comp)


NameError: name 'accuracy_score' is not defined

In [None]:
# ---------------------------
# 10. Plot & Save Confusion Matrix
# ---------------------------
fig, ax = plt.subplots(figsize=(8,8))
im = ax.imshow(cm, cmap=plt.cm.Blues, interpolation='nearest')

# One tick per class on both axes, labelled with the class names.
tick_positions = np.arange(len(class_names))
ax.set_xticks(tick_positions)
ax.set_yticks(tick_positions)
ax.set_xticklabels(class_names, rotation=45, ha="right")
ax.set_yticklabels(class_names)

# Write each count into its cell (x is the column index, y is the row index).
for (row, col), count in np.ndenumerate(cm):
    ax.text(col, row, count, ha="center", va="center")

ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
ax.set_title("Confusion Matrix")
fig.tight_layout()

# Save the figure into the run folder, then display it.
cm_path = os.path.join(output_folder, "confusion_matrix.png")
plt.savefig(cm_path, bbox_inches="tight")
plt.show()


In [None]:
# ---------------------------
# 11. AUC-ROC
# ---------------------------
# KMeans yields hard assignments only, so the "scores" are a one-hot encoding of
# the predicted label (1 for the predicted class, 0 elsewhere).
n_classes = len(class_names)
y_test_bin = label_binarize(y_test, classes=list(range(n_classes)))
y_score = np.zeros((len(y_pred), n_classes))
# Vectorised one-hot fill: row i gets a 1 in column y_pred[i] (replaces a
# Python loop over every test row).
y_score[np.arange(len(y_pred)), y_pred] = 1
auc = roc_auc_score(y_test_bin, y_score, average="weighted", multi_class="ovr")
print(f"AUC-ROC: {auc:.4f}")


# ---------------------------
# 12. Save All Metrics to TXT
# ---------------------------
metrics_txt = os.path.join(output_folder, "test_evaluation_metrics.txt")
# Explicit encoding so the file is written identically on every platform.
with open(metrics_txt, "w", encoding="utf-8") as f:
    f.write(f"Test Accuracy: {acc:.4f}\n\n")
    f.write("=== Classification Report ===\n")
    f.write(report + "\n")
    f.write("=== Confusion Matrix ===\n")
    f.write(str(cm) + "\n\n")
    f.write(f"AUC-ROC: {auc:.4f}\n")

# Save cluster compositions
train_comp.to_csv(os.path.join(output_folder, "train_cluster_composition.csv"))
test_comp.to_csv(os.path.join(output_folder, "test_cluster_composition.csv"))

print("All evaluation metrics and compositions saved to:", output_folder)