In [54]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import pairwise_distances
import umap
import hdbscan

In [63]:
import warnings
# Install a process-wide filter that hides every FutureWarning from here on,
# so deprecation notices do not clutter cell output.
# NOTE(review): presumably aimed at warnings raised by the clustering /
# embedding libraries imported above — confirm which call triggers them.
warnings.filterwarnings("ignore", category=FutureWarning)


In [55]:
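# One-time setup, intentionally left commented out: uncomment the next line to
# install this notebook's third-party dependencies.
# !python -m pip install --user scikit-learn hdbscan umap-learn matplotlib pandas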

In [56]:
# Location of the group -> technique JSON file read by the cells below.
file_path = "mitre_techniques_map.json"

In [57]:
def summarize_group_data(json_path):
    """Summarize the number of groups and the unique techniques they use.

    Parameters
    ----------
    json_path : str
        Path to a JSON file holding an object keyed by group ID. Each value
        is an object whose optional 'items' entry lists technique IDs.

    Returns
    -------
    dict
        'total_groups' : int
            Number of top-level keys (groups) in the file.
        'unique_attack_techniques' : int
            Count of distinct technique IDs across all groups.
        'unique_attack_techniques_list' : list
            The distinct technique IDs in sorted order.
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    all_techniques = set()
    for group in data.values():
        # `or []` covers both a missing key and an explicit `"items": null`;
        # the latter would otherwise make set.update() raise TypeError.
        all_techniques.update(group.get('items') or [])

    return {
        'total_groups': len(data),
        'unique_attack_techniques': len(all_techniques),
        'unique_attack_techniques_list': sorted(all_techniques),
    }

# Example usage
# The captured output below shows this guard is true when the cell runs in the
# notebook kernel, so the summary is printed on every execution of the cell.
if __name__ == '__main__':
    # `file_path` comes from an earlier cell; that cell must have been run first.
    path = file_path
    stats = summarize_group_data(path)
    
    # Only the two counts are printed; the full sorted technique list stays
    # available in stats['unique_attack_techniques_list'].
    print(f"Total groups: {stats['total_groups']}")
    print(f"Unique attack techniques: {stats['unique_attack_techniques']}")

Total groups: 148
Unique attack techniques: 451


In [86]:
def load_group_technique_data(json_path):
    """
    Loads the per-group technique lists from a JSON file.

    Parameters
    ----------
    json_path : str
        Path to a JSON file holding an object keyed by group ID. Each value
        is an object whose optional 'items' entry lists technique IDs.

    Returns
    -------
    dict
        group_id -> list of technique IDs. Groups with a missing or null
        'items' entry map to an empty list, so every value is iterable.
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # `or []` normalises an explicit `"items": null` to an empty list; a bare
    # .get('items', []) would pass None through to downstream encoders.
    return {group_id: group.get('items') or [] for group_id, group in data.items()}

def one_hot_encode_techniques(group_techniques):
    """
    Build a binary group-by-technique indicator matrix.

    Parameters
    ----------
    group_techniques : dict
        Mapping of group_id -> list of technique IDs

    Returns
    -------
    DataFrame
        One row per group (in the mapping's iteration order) and one column
        per technique class reported by the fitted MultiLabelBinarizer.
    """
    # Materialise keys and values once so rows and index labels stay aligned.
    group_ids = list(group_techniques.keys())
    technique_lists = list(group_techniques.values())

    binarizer = MultiLabelBinarizer()
    indicator = binarizer.fit_transform(technique_lists)

    return pd.DataFrame(indicator, index=group_ids, columns=binarizer.classes_)

def cluster_groups_with_hdbscan(feature_matrix, min_cluster_size=2, sweep_sizes=(2, 3, 5)):
    """
    Clusters group vectors using HDBSCAN (Jaccard metric) once per minimum
    cluster size and prints a short summary of every run.

    Parameters
    ----------
    feature_matrix : DataFrame
        One-hot encoded group x technique matrix; its index supplies the
        group IDs of the returned label Series.
    min_cluster_size : int
        Minimum cluster size that is always included in the sweep.
        (This argument used to be accepted but silently ignored.)
    sweep_sizes : iterable of int
        Further minimum cluster sizes to try. The default reproduces the
        original hard-coded sweep; pass ``()`` to run ``min_cluster_size`` only.

    Returns
    -------
    dict
        min_cluster_size -> Series of cluster labels indexed by group ID,
        with sizes in ascending order. The label -1 marks noise points.
    """
    # Merge and de-duplicate so the default call still yields keys 2, 3 and 5.
    sizes = sorted({min_cluster_size, *sweep_sizes})
    cluster_results = {}

    for size in sizes:
        clusterer = hdbscan.HDBSCAN(min_cluster_size=size, metric='jaccard')
        labels = clusterer.fit_predict(feature_matrix)
        labels_series = pd.Series(labels, index=feature_matrix.index)

        # Store for later analysis
        cluster_results[size] = labels_series

        # -1 is the noise label, so it is not counted as a cluster.
        n_clusters = len(set(labels) - {-1})
        n_noise = (labels == -1).sum()

        print(f"min_cluster_size = {size}")
        print(f"Number of clusters (excluding noise): {n_clusters}")
        print(f"Groups marked as noise: {n_noise}")
        print("Cluster label counts:")
        for label, count in labels_series.value_counts().items():
            print(f"  Cluster {label}: {count}")
        print("-" * 30)

    return cluster_results





In [87]:
# === Run end-to-end ===
# Pipeline: load the group -> technique lists, one-hot encode them, then run the
# HDBSCAN sweep. cluster_groups_with_hdbscan prints a summary per size and
# returns a dict of label Series keyed by min_cluster_size.
json_path = "mitre_techniques_map.json"
group_techniques = load_group_technique_data(json_path)
feature_matrix = one_hot_encode_techniques(group_techniques)
label_dict = cluster_groups_with_hdbscan(feature_matrix)
# Keep the labels from the min_cluster_size=5 run for the inspection cell below.
labels_for_5 = label_dict[5]


# NOTE: the commented-out summary below is stale. It expects a single `labels`
# Series, which this cell does not define; the clustering function now prints
# the same counts for each size it tries.
# # Print cluster summary
# print(f"\nTotal groups: {len(labels)}")
# print(f"Number of clusters (excluding noise): {len(set(labels) - {-1})}")
# print(f"Groups marked as noise: {(labels == -1).sum()}")
# print("\nCluster label counts:\n", labels.value_counts())

min_cluster_size = 2
Number of clusters (excluding noise): 2
Groups marked as noise: 43
Cluster label counts:
  Cluster 0: 102
  Cluster -1: 43
  Cluster 1: 3
------------------------------
min_cluster_size = 3
Number of clusters (excluding noise): 2
Groups marked as noise: 118
Cluster label counts:
  Cluster -1: 118
  Cluster 0: 27
  Cluster 1: 3
------------------------------
min_cluster_size = 5
Number of clusters (excluding noise): 2
Groups marked as noise: 124
Cluster label counts:
  Cluster -1: 124
  Cluster 1: 19
  Cluster 0: 5
------------------------------


In [88]:
# Re-read the raw group records so each cluster member can be shown with its
# name and technique list (the feature matrix only keeps group IDs).
with open("mitre_techniques_map.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Get cluster labels for min_cluster_size=5
# `label_dict` is the dict returned by cluster_groups_with_hdbscan in the
# previous cell, keyed by min_cluster_size.
labels_for_5 = label_dict[5]

# Walk every real cluster in ascending label order; -1 is the noise label and is skipped.
for cluster_id in sorted(set(labels_for_5) - {-1}):
    # Boolean mask on the label Series -> index of the group IDs in this cluster.
    group_ids = labels_for_5[labels_for_5 == cluster_id].index

    print(f"\n=== Cluster {cluster_id} ({len(group_ids)} groups) ===")
    for group_id in group_ids:
        # Fall back to placeholders when a group ID or field is absent from the JSON.
        group_info = data.get(group_id, {})
        name = group_info.get("name", "Unknown")
        techniques = group_info.get("items", [])
        
        # Prints the full sorted technique list per group, so output is long.
        print(f"\nGroup ID: {group_id}")
        print(f"Name: {name}")
        print(f"Techniques ({len(techniques)}): {sorted(techniques)}")
        print("-" * 50)



=== Cluster 0 (5 groups) ===

Group ID: G0045
Name: menuPass
Techniques (47): ['T1003.002', 'T1003.003', 'T1003.004', 'T1005', 'T1016', 'T1018', 'T1021.001', 'T1021.004', 'T1027.013', 'T1036', 'T1036.003', 'T1036.005', 'T1039', 'T1046', 'T1047', 'T1049', 'T1053.005', 'T1055.012', 'T1056.001', 'T1059.001', 'T1059.003', 'T1070.003', 'T1070.004', 'T1074.001', 'T1074.002', 'T1078', 'T1083', 'T1087.002', 'T1090.002', 'T1105', 'T1106', 'T1119', 'T1140', 'T1190', 'T1199', 'T1204.002', 'T1210', 'T1218.004', 'T1553.002', 'T1560', 'T1560.001', 'T1566.001', 'T1568.001', 'T1574.001', 'T1574.002', 'T1583.001', 'T1588.002']
--------------------------------------------------

Group ID: G0050
Name: APT32
Techniques (78): ['T1003', 'T1003.001', 'T1012', 'T1016', 'T1018', 'T1021.002', 'T1027.001', 'T1027.010', 'T1027.011', 'T1027.013', 'T1033', 'T1036', 'T1036.003', 'T1036.004', 'T1036.005', 'T1041', 'T1046', 'T1047', 'T1048.003', 'T1049', 'T1053.005', 'T1055', 'T1056.001', 'T1059', 'T1059.001', 'T1059