In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

def hfc_pipeline(X, y=None, clustering_method=None, contrast_threshold=0.5):
    scaler = StandardScaler()
    X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

    # Default to KMeans if no clustering method is passed
    if clustering_method is None:
        clustering_method = KMeans(n_clusters=2, random_state=42)

    # Fit clustering algorithm
    clusters = clustering_method.fit_predict(X_scaled)

    global_mean = X_scaled.mean()
    global_std = X_scaled.std()
    motif_list = []

    for k in sorted(pd.Series(clusters).unique()):
        cluster_data = X_scaled[pd.Series(clusters) == k]
        cluster_mean = cluster_data.mean()
        contrast_score = ((cluster_mean - global_mean).abs() / global_std).fillna(0)
        selected_features = contrast_score[contrast_score > contrast_threshold].index.tolist()

        if selected_features:
            motif_feature = pd.Series(1, index=X_scaled.index)
            for feature in selected_features:
                motif_feature &= (X_scaled[feature] > cluster_mean[feature]).astype(int)
            motif_feature.name = f'hfc_chord_c{k}'
            motif_list.append(motif_feature)

    if motif_list:
        motifs_df = pd.concat(motif_list, axis=1)
        final_df = pd.concat([X_scaled, motifs_df], axis=1)
    else:
        final_df = X_scaled.copy()

    if y is not None:
        final_df['label'] = y.values

    return final_df

In [16]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from scipy.stats import variation, zscore


def compute_feature_stats(X_scaled: pd.DataFrame) -> pd.DataFrame:
    """
    Computes per-feature stats: std dev, mean, coefficient of variation, and z-score mean.
    """
    return pd.DataFrame({
        "std_dev": X_scaled.std(),
        "mean": X_scaled.mean(),
        "coefficient_of_variation": variation(X_scaled, axis=0),
        "zscore_mean": zscore(X_scaled, axis=0).mean(axis=0)
    }).sort_values(by='coefficient_of_variation', ascending=False)


def suggest_hfc_parameters(stats_df: pd.DataFrame, cov_threshold: float = 1.0):
    """
    Auto-selects Top-N high-variance features and Min-Votes threshold.
    """
    top_n = (stats_df['coefficient_of_variation'] > cov_threshold).sum()
    min_votes = max(2, int(np.ceil(0.5 * top_n)))
    return top_n, min_votes


def hfc_pipeline_with_voting(X, y=None, cluster_labels=None, contrast_threshold: float = 0.5, min_votes: int = None):
    """
    Generates motifs (HFC chords) using voting-based rule activation and contrast scoring.
    """
    if cluster_labels is None:
        raise ValueError("You must provide precomputed cluster_labels.")

    scaler = StandardScaler()
    X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

    global_mean = X_scaled.mean()
    global_std = X_scaled.std()
    motif_list = []
    chord_feature_map = {}
    chord_rule_map = {}
    chord_coverage = {}

    clusters = pd.Series(cluster_labels, index=X_scaled.index)

    for k in sorted(clusters.unique()):
        cluster_data = X_scaled[clusters == k]
        cluster_mean = cluster_data.mean()
        contrast_score = ((cluster_mean - global_mean).abs() / global_std).fillna(0)

        selected_features = contrast_score[contrast_score > contrast_threshold].index.tolist()
        if len(selected_features) == 0:
            continue

        # Determine min_votes if not explicitly given
        actual_min_votes = min_votes if min_votes is not None else max(2, int(np.ceil(0.5 * len(selected_features))))

        # Binary matrix: 1 if instance > cluster mean for selected feature
        condition_matches = pd.DataFrame({
            f: (X_scaled[f] > cluster_mean[f]).astype(int)
            for f in selected_features
        })

        votes = condition_matches.sum(axis=1)
        motif_series = (votes >= actual_min_votes).astype(int)
        chord_name = f"hfc_chord_c{k}"

        motif_series.name = chord_name
        motif_list.append(motif_series)

        # Save feature list, coverage, and rule
        chord_feature_map[chord_name] = selected_features
        chord_rule_map[chord_name] = f"Match if ≥ {actual_min_votes} of:\n" + "\n".join(
            [f"{f} > {cluster_mean[f]:.3f}" for f in selected_features])
        chord_coverage[chord_name] = int(motif_series.sum())

    # Build motifs DataFrame
    if motif_list:
        motifs_df = pd.concat(motif_list, axis=1)
    else:
        motifs_df = pd.DataFrame(index=X.index)

    if y is not None:
        motifs_df['label'] = y.values

    return motifs_df, chord_feature_map, chord_rule_map, chord_coverage


In [18]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from src.utils.unsw_loader import load_unsw_datasets


train_path = "data/UNSW_NB15_training-set.csv"
test_path = "data/UNSW_NB15_testing-set.csv"
    
X_train, y_train, X_test, y_test = load_unsw_datasets(train_path, test_path)


# 1. Scale first
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)

# 2. Get top-N and min-votes
feature_stats = compute_feature_stats(X_scaled)
top_n, min_votes = suggest_hfc_parameters(feature_stats, cov_threshold=1.0)

# 3. Cluster and generate motifs
clusters = KMeans(n_clusters=9, random_state=42).fit_predict(X_train)
motifs_df, chord_map, rule_map, coverage_map = hfc_pipeline_with_voting(
    X_train, y_train, cluster_labels=clusters,
    contrast_threshold=0.5, min_votes=min_votes
)

# 4. Display
print(f"Auto-tuned Top-N: {top_n}, Min-Votes: {min_votes}")
for chord in chord_map:
    print(f"\n{chord} — Features: {chord_map[chord]}")
    print(f"Rule:\n{rule_map[chord]}")
    print(f"Coverage: {coverage_map[chord]} samples ({coverage_map[chord]/len(X_train)*100:.2f}%)")

Auto-tuned Top-N: 21, Min-Votes: 11

hfc_chord_c0 — Features: ['rate', 'dttl', 'swin', 'stcpb', 'dtcpb', 'dwin', 'tcprtt', 'synack', 'ackdat', 'dmean', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'ct_srv_dst']
Rule:
Match if ≥ 11 of:
rate > -0.572
dttl > 0.855
swin > 1.092
stcpb > 0.501
dtcpb > 2.054
dwin > 1.103
tcprtt > 0.647
synack > 0.599
ackdat > 0.626
dmean > 0.518
ct_srv_src > -0.511
ct_state_ttl > -0.657
ct_dst_sport_ltm > -0.544
ct_dst_src_ltm > -0.554
ct_srv_dst > -0.524
Coverage: 1258 samples (0.72%)

hfc_chord_c1 — Features: ['dttl', 'swin', 'stcpb', 'dtcpb', 'dwin', 'ct_state_ttl']
Rule:
Match if ≥ 11 of:
dttl > -0.642
swin > -0.830
stcpb > -0.707
dtcpb > -0.708
dwin > -0.838
ct_state_ttl > 0.502
Coverage: 0 samples (0.00%)

hfc_chord_c2 — Features: ['rate', 'dttl', 'swin', 'stcpb', 'dwin', 'tcprtt', 'synack', 'ackdat', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'ct_srv_dst']
Rule:
Match if ≥ 11 of:
rate > -0.573
dttl > 0.94

In [22]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Simulate example structures from user's HFC pipeline
# These should be replaced with real values from your actual execution context

# Example rule map
rule_map = {
    "hfc_chord_c0": "Match if ≥ 11 of:\nrate > -0.572\n...",
    "hfc_chord_c5": "Match if ≥ 11 of:\nrate > -0.571\n...",
    "hfc_chord_c7": "Match if ≥ 11 of:\nrate > -0.571\n...",
    "hfc_chord_c8": "Match if ≥ 11 of:\nrate > -0.572\n..."
}

# Example coverage map
coverage_map = {
    "hfc_chord_c0": 1258,
    "hfc_chord_c5": 3236,
    "hfc_chord_c7": 5760,
    "hfc_chord_c8": 3105
}

# Total samples (replace with actual value)
total_samples = 175000  # Example

# Build exportable DataFrame
chord_df = pd.DataFrame({
    "Chord": list(rule_map.keys()),
    "Rule": [rule_map[ch] for ch in rule_map],
    "Coverage Count": [coverage_map.get(ch, 0) for ch in rule_map],
    "Coverage (%)": [coverage_map.get(ch, 0) / total_samples * 100 for ch in rule_map]
})

# Save to CSV
csv_path = "data/hfc_chord_summary.csv"
chord_df.to_csv(csv_path, index=False)

# Build and display heatmap matrix for visualization (synthetic here)
# Rows: Chords, Columns: Features (simplified dummy binary matrix)
feature_matrix = pd.DataFrame({
    "rate": [1, 1, 1, 1],
    "dttl": [1, 1, 1, 1],
    "swin": [1, 1, 1, 1],
    "dtcpb": [1, 1, 1, 1],
    "ct_srv_dst": [1, 1, 1, 1]
}, index=["hfc_chord_c0", "hfc_chord_c5", "hfc_chord_c7", "hfc_chord_c8"])

# Plot heatmap
plt.figure(figsize=(10, 4))
sns.heatmap(feature_matrix, annot=True, cmap="YlGnBu", cbar=False)
plt.title("Chord vs. Feature Presence")
plt.ylabel("HFC Chord")
plt.xlabel("Feature")
heatmap_path = "data/hfc_chord_feature_heatmap.png"
plt.tight_layout()
plt.savefig(heatmap_path)
plt.close()

(csv_path, heatmap_path)


('data/hfc_chord_summary.csv', 'data/hfc_chord_feature_heatmap.png')