In [25]:
import pandas as pd
import json
import os
from statistics import mode
from collections import Counter
import requests
import re

In [26]:
def load_config():
    # Get the absolute path of the project root (one directory up)
    project_root = os.path.abspath(os.path.join(os.getcwd(), '../..'))

    # Normalize the project_root to ensure it's correctly formatted
    project_root = os.path.normpath(project_root)
    
    config_path = os.path.join(project_root, 'config.json')

    if not os.path.exists(config_path):
        raise FileNotFoundError(f"Config file not found at expected location: {config_path}")

    with open(config_path, 'r') as f:
        config = json.load(f)

    return config, project_root


In [27]:
config, project_root = load_config()

group_path_enterprise = os.path.normpath(os.path.join(project_root, config["data_directory"], config["file_paths_groups_v15"]["enterprise"]))
group_path_mobile = os.path.normpath(os.path.join(project_root, config["data_directory"], config["file_paths_groups_v15"]["mobile"]))
group_path_ics = os.path.normpath(os.path.join(project_root, config["data_directory"], config["file_paths_groups_v15"]["ics"]))

In [28]:
# Collect file paths into a list
group_file_paths = [group_path_enterprise, group_path_mobile, group_path_ics]

In [29]:
def apt_group_mapping_from_config(groups_file_paths, mapping_type='software'):
    """
    Reads multiple Excel files for APT groups and their software or technique mappings.
    Merges all mappings and provides usage statistics.

    Parameters
    ----------
    groups_file_paths : dict
        Dictionary of attack type -> Excel file path.
    mapping_type : str
        Either 'software' or 'techniques' to select the appropriate sheet.

    Returns
    -------
    dict
        Mapping of APT group IDs to names and software/techniques used.
    int
        Number of unique APT groups.
    int
        Number of unique software or techniques used.
    dict
        Statistical summary: mean, median, mode, max/min counts and groups.
    """
    # Select sheet based on mapping_type
    if mapping_type == 'software':
        sheet_name = 'associated software'
    elif mapping_type == 'techniques':
        sheet_name = 'techniques used'
    else:
        raise ValueError("Invalid mapping_type. Choose either 'software' or 'techniques'.")

    all_rows = []

    # Iterate over files and load data
    for file_path in groups_file_paths:
        xls = pd.ExcelFile(file_path)
        try:
            df = pd.read_excel(xls, sheet_name=sheet_name)
        except ValueError:
            raise ValueError(f"Sheet '{sheet_name}' not found in {file_path}")

        required_columns = {'source name', 'source ID', 'target ID'}
        if not required_columns.issubset(df.columns):
            raise ValueError(f"Missing required columns in {file_path}: {df.columns.tolist()}")

        # For techniques, also require the target name
        if mapping_type == 'techniques' and 'target name' not in df.columns:
            raise ValueError(f"Missing 'target name' column in {file_path} for techniques mapping")

        # Add the target name column if mapping_type is 'techniques'
        all_rows.append(df[['source name', 'source ID', 'target ID', 'target name', 'target ref']] 
                         if mapping_type == 'software' else df[['source name', 'source ID', 'target ID', 'target name']])

    # Combine all rows from different files
    merged_df = pd.concat(all_rows, ignore_index=True)

    # Build the mapping: group ID -> {name, targets, technique names}
    apt_map = {}
    grouped = merged_df.groupby('source ID')

    for group_id, group_df in grouped:
        group_name = group_df['source name'].iloc[0]
        targets = group_df['target ID'].dropna().unique().tolist()

        # For techniques mapping, we need to include the technique names as well
        if mapping_type == 'techniques':
            technique_names = group_df['target name'].fillna('').unique().tolist()
            apt_map[group_id] = {
                'name': group_name,
                'items': targets,
                'technique name': technique_names
            }
        elif mapping_type == 'software':
            # For software, we continue with the existing logic
            types = group_df['target ref'].apply(lambda x: 'Malware' if 'malware' in str(x).lower() else 'Tool')
            type_mapping = types.value_counts().to_dict()

            apt_map[group_id] = {
                'name': group_name,
                'items': targets,
                'type': type_mapping
            }

    # Count per group (number of unique targets)
    item_counts = merged_df.groupby('source ID')['target ID'].nunique()

    # Stats
    mean_count = item_counts.mean()
    median_count = item_counts.median()
    try:
        mode_count = mode(item_counts)
    except:
        mode_count = 'No unique mode'

    max_apt_group = item_counts.idxmax()
    max_count = item_counts[max_apt_group]

    min_apt_group = item_counts.idxmin()
    min_count = item_counts[min_apt_group]

    stats = {
        'mean_count': mean_count,
        'median_count': median_count,
        'mode_count': mode_count,
        'max_count': max_count,
        'max_apt_group': max_apt_group,
        'min_count': min_count,
        'min_apt_group': min_apt_group
    }

    return apt_map, merged_df['source ID'].nunique(), merged_df['target ID'].nunique(), stats


In [30]:
# Example call to the function for techniques mapping
techniques_map, unique_groups_technique, unique_techniques, stats_technique = apt_group_mapping_from_config(
    group_file_paths, mapping_type='techniques'
)

# Example call to the function for software mapping
software_map, unique_groups_software, unique_software, stats_software = apt_group_mapping_from_config(
    group_file_paths, mapping_type='software'
)


In [31]:
def groups_with_unique_techniques(technique_map):
    """
    Counts how many APT groups have at least one unique technique 
    (i.e., not used by any other group).

    Parameters
    ----------
    technique_map : dict
        Mapping of group ID to their techniques.

    Returns
    -------
    dict
        A summary including:
        - number of groups with unique techniques
        - group-to-unique-techniques map (up to 10 sample groups)
        - top 10 most common techniques (optional)
    """
    from collections import defaultdict, Counter

    technique_to_groups = defaultdict(set)
    group_to_techs = {}

    # Build reverse map: technique -> groups
    for group_id, data in technique_map.items():
        techniques = data.get("items", [])
        group_to_techs[group_id] = set(techniques)
        for tech in techniques:
            technique_to_groups[tech].add(group_id)

    # Identify unique techniques per group
    unique_per_group = {}
    for group_id, techs in group_to_techs.items():
        unique_techs = {t for t in techs if len(technique_to_groups[t]) == 1}
        if unique_techs:
            unique_per_group[group_id] = list(unique_techs)

    # Top 10 most used techniques (optional)
    technique_usage = Counter()
    for tech, groups in technique_to_groups.items():
        technique_usage[tech] = len(groups)
    top_10_common_techniques = technique_usage.most_common(10)

    # Print summary
    print(f"\nNumber of groups with unique techniques: {len(unique_per_group)}")
    print("\nExample groups with unique techniques:")
    for group_id, uniq_techs in list(unique_per_group.items())[:5]:  # limit to 5
        print(f"  Group {group_id} ({technique_map[group_id]['name']}):")
        for tech in uniq_techs[:5]:  # show up to 5 techniques
            print(f"    - {tech}")
        if len(uniq_techs) > 5:
            print(f"    ... and {len(uniq_techs) - 5} more")

    return {
        "num_groups_with_unique_techniques": len(unique_per_group),
        "unique_techniques_per_group": unique_per_group,
        "top_10_common_techniques": top_10_common_techniques
    }

In [32]:
#technique_stats = groups_with_unique_techniques(techniques_map)

In [55]:
def load_json_metadata(metadata_path):
    """
    Loads the group metadata JSON file.

    Parameters
    ----------
    metadata_path : str
        Path to the JSON file containing group metadata.

    Returns
    -------
    dict
        Group metadata loaded from the file.
    """
    with open(metadata_path, 'r', encoding='utf-8') as f:
        metadata = json.load(f)
    return metadata

In [56]:
# Path to group_metadata.json
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))  # move up from notebooks/
file_mitre_ttp = os.path.join(project_root, "group_analysis_json_outputs", "MITRE_ttp_group_analysis.json")
file_malpedia_ttp = os.path.join(project_root, "group_analysis_json_outputs", "Malpedia_ttp_group_analysis.json")

# Load the file
mitre_ttp_metadata = load_json_metadata(file_mitre_ttp)
malpedia_ttp_metadata = load_json_metadata(file_malpedia_ttp)



In [57]:
def compare_group_sets(techniques_map, ttp_data):
    """
    Compare group sets between techniques_map (MITRE) and ttp_data (Threat Reports).
    
    Prints:
    - Total groups in techniques_map
    - Total groups in ttp_data
    - Intersection (common groups)
    - Groups only in techniques_map
    - Groups only in ttp_data
    """
    techniques_groups = set(techniques_map.keys())
    ttp_groups = set(ttp_data.keys())

    intersection = techniques_groups & ttp_groups
    only_in_techniques = techniques_groups - ttp_groups
    only_in_ttp_data = ttp_groups - techniques_groups

    print(f"Total groups in techniques_map: {len(techniques_groups)}")
    print(f"Total groups in ttp_data (Threat Reports): {len(ttp_groups)}")
    print(f"Groups present in both (intersection): {len(intersection)}\n")

    print(f"Groups only in techniques_map (not observed in reports): {len(only_in_techniques)}")
    print(f"Groups only in ttp_data (not in MITRE mapping): {len(only_in_ttp_data)}\n")

    return {
        "techniques_groups": techniques_groups,
        "ttp_groups": ttp_groups,
        "intersection": intersection,
        "only_in_techniques": only_in_techniques,
        "only_in_ttp_data": only_in_ttp_data
    }


In [58]:
group_stats = compare_group_sets(techniques_map, mitre_ttp_metadata)

Total groups in techniques_map: 148
Total groups in ttp_data (Threat Reports): 63
Groups present in both (intersection): 62

Groups only in techniques_map (not observed in reports): 86
Groups only in ttp_data (not in MITRE mapping): 1



In [59]:
def enhance_techniques_map_with_observed_ttps(techniques_map, ttp_data):
    """
    Enhances the techniques_map by adding new TTPs from ttp_data.
    Renames 'items' key to 'techniques' for better clarity.
    
    Parameters
    ----------
    techniques_map : dict
        Original techniques mapping from MITRE ATT&CK Excel sheets.
    ttp_data : dict
        Observed TTPs from threat report hashes.

    Returns
    -------
    tuple
        - Updated techniques_map with observed TTPs added.
        - A diff dictionary showing newly added TTPs per group.
    """
    enhanced_map = {}

    diff_per_group = {}

    for group_id, group_info in techniques_map.items():
        # Rename 'items' to 'techniques'
        enhanced_map[group_id] = {
            "name": group_info.get("name", ""),
            "techniques": group_info.get("items", []),  # use existing items under new key
            "technique name": group_info.get("technique name", [])
        }

    for group_id, threat_data in ttp_data.items():
        observed_ttps = set()
        for hash_entry in threat_data.get('hashes', []):
            observed_ttps.update(t.upper().strip() for t in hash_entry.get('ttps', []))

        if group_id in enhanced_map:
            official_ttps = set(t.upper().strip() for t in enhanced_map[group_id].get('techniques', []))
            new_ttps = observed_ttps - official_ttps

            if new_ttps:
                enhanced_map[group_id]["techniques"].extend(new_ttps)
                diff_per_group[group_id] = list(new_ttps)

        else:
            # Group missing in MITRE, create new entry
            enhanced_map[group_id] = {
                "name": "",
                "techniques": list(observed_ttps),
                "technique name": []
            }
            diff_per_group[group_id] = list(observed_ttps)

    return enhanced_map, diff_per_group


In [60]:
def print_diff_summary(diff_per_group, techniques_map):
    """
    Prints a summary of newly added techniques per group.

    Parameters
    ----------
    diff_per_group : dict
        Mapping of group ID to list of newly added TTPs.
    techniques_map : dict
        Mapping to get the group names.
    """
    if not diff_per_group:
        print("No new TTPs were added.")
        return

    print(f"New TTPs added to {len(diff_per_group)} groups:\n")
    for group_id, new_ttps in diff_per_group.items():
        group_name = techniques_map.get(group_id, {}).get("name", "Unknown Group")
        print(f"Group {group_id} ({group_name}): {len(new_ttps)} new TTPs")
        for ttp in new_ttps[:5]:  # show up to 5 per group
            print(f"  - {ttp}")
        if len(new_ttps) > 5:
            print(f"  ... and {len(new_ttps) - 5} more")
        print()


In [61]:
enhanced_techniques_map, diff_per_group = enhance_techniques_map_with_observed_ttps(techniques_map, mitre_ttp_metadata)

#print_diff_summary(diff_per_group, enhanced_techniques_map)


In [62]:
# Function to normalize group names
def normalize_group_name(name):
    # Convert to lowercase for case-insensitive comparison
    name = name.lower().strip()

    # Remove 'team' from names like 'Sandworm Team'
    if name.endswith(' team'):
        name = name.replace(' team', '')

    # Replace 'threat group-' with 'tg-' (e.g., 'Threat Group-1314' -> 'TG-1314')
    name = re.sub(r'threat group[- ]', 'tg-', name)

    # Remove 'temp.' or similar prefixes (e.g., 'Temp.Pittytiger' -> 'Pittytiger')
    name = re.sub(r'^temp[\. ]+', '', name)

    # Normalize spaces and dots (e.g., 'pitty tiger' == 'pitty.tiger')
    name = re.sub(r'[\. ]+', ' ', name)

    # Remove common suffixes like 'framework' or 'group' (e.g., 'Inception Framework' -> 'Inception')
    name = re.sub(r' (framework|group)$', '', name)

    # Standardize 'Confucius' and 'Confucious' to 'confucius'
    name = re.sub(r'confucious', 'confucius', name)

    return name

In [63]:
def build_enhanced_techniques_map_with_group_aliases(techniques_map, group_metadata):
    """
    Builds an enhanced techniques map by enriching with MITRE and Malpedia metadata.
    Also prints a warning if any group ID is missing metadata.

    Parameters
    ----------
    techniques_map : dict
        Original or enhanced techniques map (already with techniques list).
    group_metadata : dict
        Metadata mapping containing MITRE and Malpedia names and aliases.

    Returns
    -------
    dict
        Enhanced techniques map with all fields populated.
    """
    enhanced_map = {}
    missing_metadata_groups = []

    for group_id, group_info in techniques_map.items():
        meta_info = group_metadata.get(group_id, {})
        
        if not meta_info:
            missing_metadata_groups.append(group_id)

        enhanced_map[group_id] = {
            'mitre name': meta_info.get('MITRE Group Name', ''),
            'mitre alias': meta_info.get('MITRE Associated Names', []),
            'malpedia name': meta_info.get('Malpedia Actor Name', ''),
            'malpedia alias': meta_info.get('Malpedia Aliases', []),
            'attack techniques': group_info.get('techniques', []),
            'technique name': group_info.get('technique name', [])
        }

    if missing_metadata_groups:
        print(f"Warning: Metadata not found for {len(missing_metadata_groups)} groups.")
        print(f"Examples: {', '.join(missing_metadata_groups[:10])}")  # show first 10 examples only

    return enhanced_map


In [64]:
# Path to group_metadata.json
metadata_file_group_name_map = os.path.join(project_root, "attack_malpedia_intersection", "attack_malpedia_group_mapping.json")

# Load metadata
group_name_metadata = load_json_metadata(metadata_file_group_name_map)

In [65]:
enhanced_techniques_map_alias = build_enhanced_techniques_map_with_group_aliases(enhanced_techniques_map, group_name_metadata)

Examples: G0028, G0089, G0108, G0114, G0124, G0140, G1027


In [71]:
def update_with_malpedia_ttps(enhanced_techniques_map_alias, malpedia_ttps):
    """
    Updates the enhanced_techniques_map by adding 'malpedia techniques' field
    based on matching names or aliases from malpedia_ttps.

    Parameters
    ----------
    enhanced_techniques_map : dict
        The main dictionary where each group has MITRE and Malpedia metadata.
    malpedia_ttps : dict
        Dictionary from malpedia_ttps.json, containing techniques per actor name.

    Returns
    -------
    dict
        Updated enhanced_techniques_map.
    """
    unmatched_malpedia_names = []

    for malpedia_group, malpedia_data in malpedia_ttps.items():
        malpedia_group_name = normalize_group_name(malpedia_group)

        # Collect all TTPs across all hashes
        all_ttps = set()
        for hash_entry in malpedia_data.get('hashes', []):
            all_ttps.update(hash_entry.get('ttps', []))

        matched_group_id = None

        for group_id, group_info in enhanced_techniques_map_alias.items():
            mitre_group_name = group_info.get('mitre name', '').lower().strip()
            mitre_aliases = [alias.lower().strip() for alias in group_info.get('mitre alias', [])]
            malpedia_actor_name = group_info.get('malpedia name', '').lower().strip()
            malpedia_aliases = [alias.lower().strip() for alias in group_info.get('malpedia alias', [])]

            if (mitre_group_name == malpedia_group_name or
                malpedia_group_name in mitre_aliases or
                malpedia_actor_name == malpedia_group_name or
                malpedia_group_name in malpedia_aliases):
                matched_group_id = group_id
                break

        if matched_group_id:
            enhanced_techniques_map_alias[matched_group_id]['malpedia techniques'] = list(all_ttps)
            print(matched_group_id, malpedia_group_name)
        else:
            unmatched_malpedia_names.append(malpedia_group_name)

    if unmatched_malpedia_names:
        print(f"Warning: {len(unmatched_malpedia_names)} Malpedia groups not matched to any MITRE group.")
        print(f"Examples: {', '.join(unmatched_malpedia_names[:33])} ...")

    return enhanced_techniques_map_alias


In [72]:
complete_technique_map = update_with_malpedia_ttps(enhanced_techniques_map_alias, malpedia_ttp_metadata)

G0008 fin7
G0092 ta505
G0094 kimsuky
G0038 stealth falcon
G1020 gold prelude
G0050 apt32
G0098 blacktech
G1007 aoqin dragon
G0078 the gorgon
G0010 turla
G0032 lazarus
G0128 apt31
G0102 wizard spider
G0127 gold cabin
G0016 apt29
G1016 fin13
G0059 apt35
G0069 muddywater
G0064 apt33
G0066 beijing
G0136 indigozebra
G0115 pinchy spider
G0067 apt37
G0034 sandworm
G0003 oilrig
G0081 pirate panda
G0068 platinum
G0135 backdoordiplomacy
G0052 copykittens
G0044 apt41
G0008 anunak
G0070 dark caracal
G0027 apt27
G0102 unc1878
G0080 cobalt
G0021 molerats
G0047 gamaredon
G0012 darkhotel
G0007 apt28
G0035 energetic bear
G0016 unc2452
G0001 apt17
G1028 aridviper
G0071 orangeworm
G0009 apt19
G0023 apt16
G0054 sowbug
G0138 silent chollima
G0026 apt18
G0065 apt40
G0006 apt1
G0081 apt23
G0053 fin5
G0056 promethium
G0126 higaisa
G0018 temper panda
G0060 tick
G0095 el machete
G1009 mosesstaff
G0106 rocke
G0129 mustang panda
G0121 razor tiger
G0087 apt39
G0019 naikon
G0117 fox kitten
G0048 rtm
G1001 lyceum
G0

In [74]:
len(malpedia_ttp_metadata), len(enhanced_techniques_map_alias) 

(591, 149)

In [31]:
#list(enhanced_techniques_map_alias.items())[:2]

In [27]:
cve_map_file = os.path.join(project_root, "group_analysis_json_outputs", "MITRE_cve_group_analysis.json")
cve_map_metdata = load_json_metadata(cve_map_file)


In [43]:
def load_cves_from_json(json_file):
    """
    Loads the CVE data from a JSON file and returns a dictionary of group IDs to associated CVEs.
    
    Parameters
    ----------
    json_file : str
        Path to the JSON file containing APT group CVE data.
    
    Returns
    -------
    dict
        A dictionary with group IDs as keys and a list of CVEs as values.
    """
    with open(json_file, 'r') as file:
        data = json.load(file)

    cve_map = {}
    for group_id, details in data.items():
        # Flatten the CVEs associated with all hashes for each group
        cves = set()
        for item in details.get('hashes', []):
            cves.update(item.get('cves', []))
        cve_map[group_id] = cves
    
    return cve_map

In [18]:
def compare_apt_groups_detailed(techniques_map, software_map, cve_map, group_ids):
    """
    Compare APT groups based on their techniques, software, and CVEs.
    Provides detailed differences including common, unique, and union, as well as the Jaccard Index.
    
    Parameters:
    ----------
    techniques_map : dict
        A mapping of APT group IDs to techniques used.
    software_map : dict
        A mapping of APT group IDs to software used.
    cve_map : dict
        A mapping of APT group IDs to CVEs.
    group_ids : list
        A list of APT group IDs to compare.

    Returns:
    -------
    dict
        A dictionary containing:
        - common_techniques: Common techniques used by all the selected groups.
        - common_software: Common software used by all the selected groups.
        - common_cves: Common CVEs between all the selected groups.
        - unique_techniques: Unique techniques for each group.
        - unique_software: Unique software for each group.
        - unique_cves: Unique CVEs for each group.
        - union_techniques: Union of all techniques used by the selected groups.
        - union_software: Union of all software used by the selected groups.
        - union_cves: Union of all CVEs used by the selected groups.
        - jaccard_index_techniques: Jaccard Index for techniques.
        - jaccard_index_software: Jaccard Index for software.
        - jaccard_index_cves: Jaccard Index for CVEs.
    """
    common_techniques = None
    common_software = None
    common_cves = None
    unique_techniques = {}
    unique_software = {}
    unique_cves = {}

    # To keep track of all techniques, software, and CVEs for union and jaccard index
    all_techniques_sets = []
    all_software_sets = []
    all_cves_sets = []

    # Loop over the selected group IDs
    for group_id in group_ids:
        if group_id not in techniques_map or group_id not in software_map:
            raise ValueError(f"Group ID {group_id} not found in techniques_map or software_map")

        # Get the techniques and software for the current group
        group_techniques = set(techniques_map.get(group_id, {}).get('items', []))
        group_software = set(software_map.get(group_id, {}).get('items', []))

        # Handle cases where a group might not have CVEs
        group_cves = cve_map.get(group_id, set())  # Default to an empty set if not found

        # Update common_techniques, common_software, and common_cves by intersecting with the first group
        if common_techniques is None:
            common_techniques = group_techniques
        else:
            common_techniques &= group_techniques

        if common_software is None:
            common_software = group_software
        else:
            common_software &= group_software

        if common_cves is None:
            common_cves = group_cves
        else:
            common_cves &= group_cves

        # Collect unique techniques, software, and CVEs for the current group
        unique_techniques[group_id] = group_techniques - common_techniques
        unique_software[group_id] = group_software - common_software
        unique_cves[group_id] = group_cves - common_cves

        # Collect sets of techniques, software, and CVEs for Jaccard index and union
        all_techniques_sets.append(group_techniques)
        all_software_sets.append(group_software)
        all_cves_sets.append(group_cves)

    # Union of all techniques, software, and CVEs
    union_techniques = set.union(*all_techniques_sets)
    union_software = set.union(*all_software_sets)
    union_cves = set.union(*all_cves_sets)

    # Jaccard Index for techniques, software, and CVEs
    jaccard_index_techniques = len(common_techniques) / len(union_techniques) if len(union_techniques) > 0 else 0
    jaccard_index_software = len(common_software) / len(union_software) if len(union_software) > 0 else 0
    jaccard_index_cves = len(common_cves) / len(union_cves) if len(union_cves) > 0 else 0

    # Prepare the result
    result = {
        'common_techniques': list(common_techniques),
        'common_software': list(common_software),
        'common_cves': list(common_cves),
        'unique_techniques': unique_techniques,
        'unique_software': unique_software,
        'unique_cves': unique_cves,
        'union_techniques': list(union_techniques),
        'union_software': list(union_software),
        'union_cves': list(union_cves),
        'jaccard_index_techniques': jaccard_index_techniques,
        'jaccard_index_software': jaccard_index_software,
        'jaccard_index_cves': jaccard_index_cves
    }

    return result


In [19]:
malpedia_mitre_map = {
    'apt17': ['G0025', 'G0001'],
    'apt19': ['G0073', 'G0009'],
    'apt30': ['G0013', 'G0030'],
    'lazarus': ['G0082', 'G0138', 'G0032'],
    'apt41': ['G0096', 'G0044'],
    'earth lusca': ['G0143', 'G1006'],
    'fin7': ['G0008', 'G0046'],
    'dragonok': ['G0017', 'G0002'],
    'mustang panda': ['G1014', 'G0129']
}

In [20]:
for actor, group_ids in malpedia_mitre_map.items():
    try:
        result = compare_apt_groups_detailed(techniques_map, software_map, cve_map, group_ids)

        print(f"Comparison for {actor} ({', '.join(group_ids)})")
        print(f"  - Common Techniques: {len(result['common_techniques'])}")
        print(f"  - Common Software: {len(result['common_software'])}")
        print(f"  - Common CVEs: {len(result['common_cves'])}")
        print(f"  - Jaccard (Techniques): {result['jaccard_index_techniques']:.2f}")
        print(f"  - Jaccard (Software): {result['jaccard_index_software']:.2f}")
        print(f"  - Jaccard (CVEs): {result['jaccard_index_cves']:.2f}")

    except ValueError as e:
        print(f"Skipping {actor} due to error: {e}")

Comparison for apt17 (G0025, G0001)
  - Common Techniques: 0
  - Common Software: 0
  - Common CVEs: 0
  - Jaccard (Techniques): 0.00
  - Jaccard (Software): 0.00
  - Jaccard (CVEs): 0.00
Comparison for apt19 (G0073, G0009)
  - Common Techniques: 3
  - Common Software: 0
  - Common CVEs: 0
  - Jaccard (Techniques): 0.30
  - Jaccard (Software): 0.00
  - Jaccard (CVEs): 0.00
Skipping apt30 due to error: Group ID G0030 not found in techniques_map or software_map
Comparison for lazarus (G0082, G0138, G0032)
  - Common Techniques: 7
  - Common Software: 0
  - Common CVEs: 0
  - Jaccard (Techniques): 0.07
  - Jaccard (Software): 0.00
  - Jaccard (CVEs): 0.00
Comparison for apt41 (G0096, G0044)
  - Common Techniques: 4
  - Common Software: 1
  - Common CVEs: 0
  - Jaccard (Techniques): 0.67
  - Jaccard (Software): 0.33
  - Jaccard (CVEs): 0.00
Comparison for earth lusca (G0143, G1006)
  - Common Techniques: 7
  - Common Software: 1
  - Common CVEs: 0
  - Jaccard (Techniques): 0.16
  - Jaccard

In [18]:
# URL of the MITRE ATT&CK Groups page
#url = "https://attack.mitre.org/groups/"
url = "https://attack.mitre.org/versions/v15/groups/"

# Send a GET request to fetch the page content
response = requests.get(url)
if response.status_code == 200:
    # Use regular expression to find all group IDs (e.g., G0001, G1000)
    group_ids = re.findall(r'G\d{4}', response.text)
    unique_group_ids = sorted(set(group_ids))
    
    print(f"Total groups found: {len(unique_group_ids)}")
    #print("Group IDs:")
    #for gid in unique_group_ids:
    #    print(gid)
else:
    print(f"Failed to retrieve data. Status code: {response.status_code}")


Total groups found: 152


In [None]:
# Assuming 'parsed_group_ids' is a set of group IDs from your Excel data
parsed_group_ids = set(techniques_map.keys())

# 'unique_group_ids' is the set obtained from the MITRE website
missing_in_parsed = set(unique_group_ids) - parsed_group_ids
extra_in_parsed = parsed_group_ids - set(unique_group_ids)

print("Group IDs missing in parsed data:", missing_in_parsed)
print("Unexpected group IDs in parsed data:", extra_in_parsed)