In [None]:
# %% [markdown]
# # K–Means Seeded Model Testing
# This notebook loads a pre-trained seeded K–Means model along with the corresponding scaler and PCA clustering model.
# It then processes a specified fraction of a new dataset, applies the same feature transformations,
# predicts cluster labels, computes evaluation metrics (e.g., silhouette score, confusion matrix, classification report),
# and saves the evaluation outputs.

# %% [code]
import os
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (confusion_matrix, classification_report, 
                             silhouette_score)
from sklearn.model_selection import train_test_split

In [None]:
# %% [markdown]
# ## Parameter Definition
# Tunable settings for this test run; edit them before executing the notebook.

# Input data: the CSV to evaluate on and the share of its rows to sample.
dataset_path = r"C:\Users\mrroo\Desktop\RP3\datasets\2018.csv"  # machine-specific; point at your copy
sample_fraction = 0.03  # fraction of rows to keep (0.03 == 3%)

# Identifiers that are joined, in this order, into the output location.
year = "2018"
model_name = "KMeans_Seeded"
run_name = "kmeans_seeded_test_run_full"  # change per run to avoid overwriting results

# Create the run's output directory; an already existing directory is not an error.
output_parts = ("outputs", year, model_name, run_name)
output_folder = os.path.join(*output_parts)
os.makedirs(output_folder, exist_ok=True)
print("Output folder created:", output_folder)

In [None]:
# %% [markdown]
# ## Load Saved Model Artifacts
# Paths of the three pickled artifacts; edit them to match your machine.
# As written, all three are read from the same run directory.
seeded_model_path = r"C:\Users\mrroo\Desktop\RP3\Imp\Outputs\2018\k-means\run1-20%\seeded_kmeans_model.pkl"  # seeded K-Means model
pca_model_path = r"C:\Users\mrroo\Desktop\RP3\Imp\Outputs\2018\k-means\run1-20%\pca_clustering_model.pkl"  # PCA applied before clustering
scaler_path = r"C:\Users\mrroo\Desktop\RP3\Imp\Outputs\2018\k-means\run1-20%\scaler.pkl"  # feature scaler

# Deserialize in a fixed order: K-Means model, PCA, scaler.
seeded_kmeans_model, pca_clustering, scaler_loaded = (
    joblib.load(artifact_path)
    for artifact_path in (seeded_model_path, pca_model_path, scaler_path)
)

In [None]:
# %% [markdown]
# ## Data Loading & Preprocessing
data = pd.read_csv(dataset_path)
print("Original dataset shape:", data.shape)
print("First few rows:")
print(data.head())
print("\nDataset info:")
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
data.info()
if 'label' in data.columns:
    print("\nClass distribution:")
    print(data['label'].value_counts())

# Sample a fraction of the dataset for testing (fixed seed so the sample is reproducible)
data = data.sample(frac=sample_fraction, random_state=42)
print(f"Dataset shape after sampling {sample_fraction*100:.0f}%:", data.shape)

# Remove misread header rows (if any). The check is guarded so that a dataset
# without a 'label' column does not raise KeyError here.
if 'label' in data.columns:
    data = data[data['label'] != 'label']

# Renumber rows 0..n-1 after sampling and filtering so later steps see a
# contiguous index.
data = data.reset_index(drop=True)

In [None]:
# %% [markdown]
# ### Encode Labels (if available)
if 'label' not in data.columns:
    print("No 'label' column found; skipping label encoding.")
else:
    # factorize() returns integer codes plus the distinct original values;
    # the codes replace the label column in place.
    label_codes, uniques = pd.factorize(data['label'])
    data['label'] = label_codes
    # Lookup table: integer code -> original label value.
    label_names = dict(enumerate(uniques))
    print("\nEncoded labels mapping:")
    print(label_names)

# %% [markdown]
# ### Feature Selection
# Option 1: manually define the features below.
# Option 2: set `manual_features = None` (or an empty list) to fall back to every
# column except the timestamp and label columns.
manual_features = ['pkts_ratio', 'Tot Bwd Pkts', 'Dst Port', 'Fwd Pkt Len Max',
                   'SYN Flag Cnt', 'byte_per_duration', 'Bwd Pkt Len Min', 'protocol',
                   'Fwd Pkt Len Mean', 'TotLen Bwd Pkts', 'Fwd Pkt Len Min', 'Flow Duration',
                   'Flow Duration_rolling_std', 'Flow Duration_rolling_mean', 'Bwd Pkt Len Max',
                   'Fwd IAT Tot', 'TotLen Fwd Pkts', 'Subflow Fwd Byts', 'Fwd IAT Max',
                   'Tot Fwd Pkts', 'Fwd IAT Min', 'Fwd IAT Mean', 'Fwd Pkt Len Std',
                   'entropy_pkt_len']

# `manual_features` is always bound at this point, so the fallback is chosen on
# its value; catching NameError here could never trigger.
if manual_features:
    selected_features = list(manual_features)
else:
    selected_features = [col for col in data.columns if col.lower() not in ['timestamp', 'label']]

print("\nSelected features:")
print(selected_features)

X = data[selected_features]

# Drop rows with missing feature values
num_rows_before = X.shape[0]
X = X.dropna()
num_rows_after = X.shape[0]
print("Number of rows dropped due to missing values:", num_rows_before - num_rows_after)

# Take the labels only for the rows that survived dropna(). Selecting them
# before the drop would leave y longer than X (and than any predictions made
# from X) whenever at least one row was removed.
if 'label' in data.columns:
    y = data.loc[X.index, 'label']

In [None]:
# %% [markdown]
# ### Standardize Features
# IMPORTANT: the column order must match the one used during training.
if not hasattr(scaler_loaded, "feature_names_in_"):
    # The scaler carries no recorded column names; keep the notebook's own selection order.
    X = X[selected_features]
else:
    # Reorder the columns to the names stored on the scaler.
    correct_feature_order = list(scaler_loaded.feature_names_in_)
    print("Feature order from training:", correct_feature_order)
    X = X[correct_feature_order]

X_scaled = scaler_loaded.transform(X)
print("Scaled features shape:", X_scaled.shape)

In [None]:
# %% [markdown]
# ### Apply PCA Transformation
# Transform the test data using the saved PCA clustering model.
X_pca_test = pca_clustering.transform(X_scaled)
print("Transformed test features shape:", X_pca_test.shape)

# %% [markdown]
# ## Model Evaluation
# Predict cluster labels using the loaded seeded K–Means model.
predicted_clusters = seeded_kmeans_model.predict(X_pca_test)
print("Predicted cluster labels shape:", predicted_clusters.shape)

# Compute silhouette score on PCA-transformed test data.
# silhouette_score() raises ValueError unless the number of distinct labels is
# between 2 and n_samples - 1 (for example when every sampled row is assigned to
# a single cluster). In that case record NaN so the rest of the evaluation and
# the saved report still run.
n_predicted_clusters = len(np.unique(predicted_clusters))
if 2 <= n_predicted_clusters <= len(predicted_clusters) - 1:
    sil_score = silhouette_score(X_pca_test, predicted_clusters)
else:
    sil_score = float("nan")
    print("Silhouette score is undefined for", n_predicted_clusters,
          "distinct cluster(s) over", len(predicted_clusters), "samples; recording NaN.")
print("Silhouette Score:", sil_score)

# With ground truth labels: confusion matrix and classification report; otherwise skip both.
if 'label' not in data.columns:
    print("No ground truth labels available; skipping confusion matrix and classification report.")
else:
    y_true = y  # integer-encoded ground truth for the test rows
    # NOTE(review): the label codes come from pd.factorize on this test sample
    # (numbered by order of first appearance), while the predictions are K-Means
    # cluster indices. Confirm both numberings refer to the same classes before
    # relying on these metrics.
    cm = confusion_matrix(y_true, predicted_clusters)
    print("\nConfusion Matrix:")
    print(cm)

    # Sorted set of every id seen as a true label or as a predicted cluster.
    true_ids = np.unique(y_true)
    cluster_ids = np.unique(predicted_clusters)
    unique_classes = np.union1d(true_ids, cluster_ids)
    target_names_test = list(map(str, unique_classes))

    report = classification_report(
        y_true,
        predicted_clusters,
        labels=unique_classes,
        target_names=target_names_test,
        zero_division=0,
    )
    print("\nClassification Report:")
    print(report)

In [None]:
# %% [markdown]
# ### Plot Confusion Matrix (if ground truth is available)
if 'label' not in data.columns:
    print("Skipping confusion matrix plot as no ground truth labels are available.")
else:
    def plot_confusion_matrix(cm, classes, title="Confusion Matrix"):
        """Draw a confusion matrix as an annotated heatmap.

        Parameters
        ----------
        cm : 2-D array of integer counts, rows drawn on the "True label" axis
            and columns on the "Predicted label" axis.
        classes : sequence of tick labels, used for both axes.
        title : str, figure title.

        Returns
        -------
        The matplotlib figure that holds the heatmap.
        """
        fig, ax = plt.subplots(figsize=(10, 10))
        heatmap = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
        fig.colorbar(heatmap, ax=ax)

        # One tick per class on both axes.
        positions = np.arange(len(classes))
        ax.set_xticks(positions)
        ax.set_xticklabels(classes, rotation=45, fontsize=12)
        ax.set_yticks(positions)
        ax.set_yticklabels(classes, fontsize=12)

        ax.set_title(title, fontsize=16)
        ax.set_xlabel("Predicted label", fontsize=14)
        ax.set_ylabel("True label", fontsize=14)

        # Write each count into its cell; cells above half the maximum get
        # white text, the rest black.
        half_max = cm.max() / 2.0
        for row, col in np.ndindex(cm.shape[0], cm.shape[1]):
            count = cm[row, col]
            text_color = "black"
            if count > half_max:
                text_color = "white"
            ax.text(col, row, format(count, "d"),
                    ha="center", va="center",
                    color=text_color,
                    fontsize=12)

        fig.tight_layout()
        return fig

    fig_cm = plot_confusion_matrix(cm, target_names_test)
    plt.show()

In [None]:
# %% [markdown]
# ## Save Evaluation Outputs
has_labels = 'label' in data.columns

# 1) Plain-text report: silhouette score, plus the supervised metrics when labels exist.
report_sections = [
    "Evaluation Metrics\n",
    "==================\n\n",
    "Silhouette Score: {:.4f}\n\n".format(sil_score),
]
if has_labels:
    report_sections.extend([
        "Classification Report:\n",
        report,
        "\nConfusion Matrix:\n",
        np.array2string(cm, separator=", "),
    ])

results_txt_path = os.path.join(output_folder, "evaluation_report.txt")
with open(results_txt_path, "w") as f:
    f.writelines(report_sections)
print("Evaluation report saved to:", results_txt_path)

# 2) Confusion matrix as CSV, with the class ids as both row and column headers.
if has_labels:
    cm_csv_path = os.path.join(output_folder, "confusion_matrix.csv")
    cm_df = pd.DataFrame(cm, index=target_names_test, columns=target_names_test)
    cm_df.to_csv(cm_csv_path, index=True)
    print("Confusion matrix saved as CSV:", cm_csv_path)

# 3) Silhouette score on its own, in a separate one-line file.
sil_score_path = os.path.join(output_folder, "silhouette_score.txt")
with open(sil_score_path, "w") as f:
    f.write("Silhouette Score: {:.4f}\n".format(sil_score))
print("Silhouette score saved to:", sil_score_path)