In [1]:
import pandas as pd
import json
import os
from statistics import mode
from collections import Counter


# Load configuration
config_file_path = os.path.join("..", "Malpedia Bib files Analysis", "config.json")

with open(config_file_path, "r") as config_file:
    config = json.load(config_file)

data_directory = config["data_directory"]
groups_file_paths = {
    key: os.path.normpath(os.path.join(data_directory, value))
    for key, value in config["file_paths_groups_v15"].items()
}

In [2]:
groups_file_paths

{'enterprise': '..\\ATTACK Excel sheets\\enterprise-attack-v15.1-groups.xlsx',
 'ics': '..\\ATTACK Excel sheets\\ics-attack-v15.1-groups.xlsx',
 'mobile': '..\\ATTACK Excel sheets\\mobile-attack-v15.1-groups.xlsx'}

In [3]:
def apt_group_mapping_from_config(groups_file_paths, mapping_type='software'):
    """
    Reads multiple Excel files for APT groups and their software or technique mappings.
    Merges all mappings and provides usage statistics.

    Parameters
    ----------
    groups_file_paths : dict
        Dictionary of attack type -> Excel file path.
    mapping_type : str
        Either 'software' or 'techniques' to select the appropriate sheet.

    Returns
    -------
    dict
        Mapping of APT group IDs to names and software/techniques used.
    int
        Number of unique APT groups.
    int
        Number of unique software or techniques used.
    dict
        Statistical summary: mean, median, mode, max/min counts and groups.
    """
    # Select sheet based on mapping_type
    if mapping_type == 'software':
        sheet_name = 'associated software'
    elif mapping_type == 'techniques':
        sheet_name = 'techniques used'
    else:
        raise ValueError("Invalid mapping_type. Choose either 'software' or 'techniques'.")

    all_rows = []

    # Iterate over files and load data
    for attack_type, file_path in groups_file_paths.items():
        xls = pd.ExcelFile(file_path)
        try:
            df = pd.read_excel(xls, sheet_name=sheet_name)
        except ValueError:
            raise ValueError(f"Sheet '{sheet_name}' not found in {file_path}")

        required_columns = {'source name', 'source ID', 'target ID'}
        if not required_columns.issubset(df.columns):
            raise ValueError(f"Missing required columns in {file_path}: {df.columns.tolist()}")

        # For techniques, also require the target name
        if mapping_type == 'techniques' and 'target name' not in df.columns:
            raise ValueError(f"Missing 'target name' column in {file_path} for techniques mapping")

        # Add the target name column if mapping_type is 'techniques'
        all_rows.append(df[['source name', 'source ID', 'target ID', 'target name', 'target ref']] 
                         if mapping_type == 'software' else df[['source name', 'source ID', 'target ID', 'target name']])

    # Combine all rows from different files
    merged_df = pd.concat(all_rows, ignore_index=True)

    # Build the mapping: group ID -> {name, targets, technique names}
    apt_map = {}
    grouped = merged_df.groupby('source ID')

    for group_id, group_df in grouped:
        group_name = group_df['source name'].iloc[0]
        targets = group_df['target ID'].dropna().unique().tolist()

        # For techniques mapping, we need to include the technique names as well
        if mapping_type == 'techniques':
            technique_names = group_df['target name'].fillna('').unique().tolist()
            apt_map[group_id] = {
                'name': group_name,
                'items': targets,
                'technique name': technique_names
            }
        elif mapping_type == 'software':
            # For software, we continue with the existing logic
            types = group_df['target ref'].apply(lambda x: 'Malware' if 'malware' in str(x).lower() else 'Tool')
            type_mapping = types.value_counts().to_dict()

            apt_map[group_id] = {
                'name': group_name,
                'items': targets,
                'type': type_mapping
            }

    # Count per group (number of unique targets)
    item_counts = merged_df.groupby('source ID')['target ID'].nunique()

    # Stats
    mean_count = item_counts.mean()
    median_count = item_counts.median()
    try:
        mode_count = mode(item_counts)
    except:
        mode_count = 'No unique mode'

    max_apt_group = item_counts.idxmax()
    max_count = item_counts[max_apt_group]

    min_apt_group = item_counts.idxmin()
    min_count = item_counts[min_apt_group]

    stats = {
        'mean_count': mean_count,
        'median_count': median_count,
        'mode_count': mode_count,
        'max_count': max_count,
        'max_apt_group': max_apt_group,
        'min_count': min_count,
        'min_apt_group': min_apt_group
    }

    return apt_map, merged_df['source ID'].nunique(), merged_df['target ID'].nunique(), stats


In [4]:
# Example call to the function for techniques mapping
techniques_map, unique_groups_technique, unique_techniques, stats_technique = apt_group_mapping_from_config(
    groups_file_paths, mapping_type='techniques'
)

# Example call to the function for software mapping
software_map, unique_groups_software, unique_software, stats_software = apt_group_mapping_from_config(
    groups_file_paths, mapping_type='software'
)


In [6]:
stats_technique, stats_software, unique_groups_technique, unique_groups_software

({'mean_count': np.float64(23.195945945945947),
  'median_count': np.float64(14.5),
  'mode_count': 9,
  'max_count': np.int64(92),
  'max_apt_group': 'G0032',
  'min_count': np.int64(1),
  'min_apt_group': 'G0002'},
 {'mean_count': np.float64(6.753623188405797),
  'median_count': np.float64(4.0),
  'mode_count': 2,
  'max_count': np.int64(48),
  'max_apt_group': 'G0016',
  'min_count': np.int64(1),
  'min_apt_group': 'G0002'},
 148,
 138)

In [7]:
def get_top_items_from_map(data_map: dict, top_n: int = 10) -> pd.DataFrame:
    """
    Get top N most common items (software or techniques) from a pre-built APT map.

    Parameters
    ----------
    data_map : dict
        Mapping from group ID -> {'name': group name, 'items': list of software or techniques}.
    top_n : int
        Number of top items to return.

    Returns
    -------
    pd.DataFrame
        DataFrame with item ID, count, and optional placeholder name.
    """
    # Flatten all item lists across all groups
    all_items = [item for group in data_map.values() for item in group['items']]

    # Count frequencies
    item_counts = Counter(all_items).most_common(top_n)

    # Build DataFrame
    df = pd.DataFrame(item_counts, columns=['item ID', 'count'])

    return df

In [8]:
# Get top techniques
top_techniques_df = get_top_items_from_map(techniques_map, top_n=10)

# Get top software
top_software_df = get_top_items_from_map(software_map, top_n=10)


In [10]:
top_techniques_df

Unnamed: 0,item ID,count
0,T1204.002,79
1,T1105,76
2,T1566.001,72
3,T1059.001,69
4,T1588.002,66
5,T1059.003,60
6,T1036.005,50
7,T1547.001,50
8,T1071.001,47
9,T1082,46


In [26]:
def load_cves_from_json(json_file):
    """
    Loads the CVE data from a JSON file and returns a dictionary of group IDs to associated CVEs.
    
    Parameters
    ----------
    json_file : str
        Path to the JSON file containing APT group CVE data.
    
    Returns
    -------
    dict
        A dictionary with group IDs as keys and a list of CVEs as values.
    """
    with open(json_file, 'r') as file:
        data = json.load(file)

    cve_map = {}
    for group_id, details in data.items():
        # Flatten the CVEs associated with all hashes for each group
        cves = set()
        for item in details.get('hashes', []):
            cves.update(item.get('cves', []))
        cve_map[group_id] = cves
    
    return cve_map

In [27]:
cve_map = load_cves_from_json(r"../Malpedia Bib files Analysis/MITRE_cve_group_analysis.json")

In [34]:
def compare_apt_groups_detailed(techniques_map, software_map, cve_map, group_ids):
    """
    Compare APT groups based on their techniques, software, and CVEs.
    Provides detailed differences including common, unique, and union, as well as the Jaccard Index.
    
    Parameters:
    ----------
    techniques_map : dict
        A mapping of APT group IDs to techniques used.
    software_map : dict
        A mapping of APT group IDs to software used.
    cve_map : dict
        A mapping of APT group IDs to CVEs.
    group_ids : list
        A list of APT group IDs to compare.

    Returns:
    -------
    dict
        A dictionary containing:
        - common_techniques: Common techniques used by all the selected groups.
        - common_software: Common software used by all the selected groups.
        - common_cves: Common CVEs between all the selected groups.
        - unique_techniques: Unique techniques for each group.
        - unique_software: Unique software for each group.
        - unique_cves: Unique CVEs for each group.
        - union_techniques: Union of all techniques used by the selected groups.
        - union_software: Union of all software used by the selected groups.
        - union_cves: Union of all CVEs used by the selected groups.
        - jaccard_index_techniques: Jaccard Index for techniques.
        - jaccard_index_software: Jaccard Index for software.
        - jaccard_index_cves: Jaccard Index for CVEs.
    """
    common_techniques = None
    common_software = None
    common_cves = None
    unique_techniques = {}
    unique_software = {}
    unique_cves = {}

    # To keep track of all techniques, software, and CVEs for union and jaccard index
    all_techniques_sets = []
    all_software_sets = []
    all_cves_sets = []

    # Loop over the selected group IDs
    for group_id in group_ids:
        if group_id not in techniques_map or group_id not in software_map:
            raise ValueError(f"Group ID {group_id} not found in techniques_map or software_map")

        # Get the techniques and software for the current group
        group_techniques = set(techniques_map.get(group_id, {}).get('items', []))
        group_software = set(software_map.get(group_id, {}).get('items', []))

        # Handle cases where a group might not have CVEs
        group_cves = cve_map.get(group_id, set())  # Default to an empty set if not found

        # Update common_techniques, common_software, and common_cves by intersecting with the first group
        if common_techniques is None:
            common_techniques = group_techniques
        else:
            common_techniques &= group_techniques

        if common_software is None:
            common_software = group_software
        else:
            common_software &= group_software

        if common_cves is None:
            common_cves = group_cves
        else:
            common_cves &= group_cves

        # Collect unique techniques, software, and CVEs for the current group
        unique_techniques[group_id] = group_techniques - common_techniques
        unique_software[group_id] = group_software - common_software
        unique_cves[group_id] = group_cves - common_cves

        # Collect sets of techniques, software, and CVEs for Jaccard index and union
        all_techniques_sets.append(group_techniques)
        all_software_sets.append(group_software)
        all_cves_sets.append(group_cves)

    # Union of all techniques, software, and CVEs
    union_techniques = set.union(*all_techniques_sets)
    union_software = set.union(*all_software_sets)
    union_cves = set.union(*all_cves_sets)

    # Jaccard Index for techniques, software, and CVEs
    jaccard_index_techniques = len(common_techniques) / len(union_techniques) if len(union_techniques) > 0 else 0
    jaccard_index_software = len(common_software) / len(union_software) if len(union_software) > 0 else 0
    jaccard_index_cves = len(common_cves) / len(union_cves) if len(union_cves) > 0 else 0

    # Prepare the result
    result = {
        'common_techniques': list(common_techniques),
        'common_software': list(common_software),
        'common_cves': list(common_cves),
        'unique_techniques': unique_techniques,
        'unique_software': unique_software,
        'unique_cves': unique_cves,
        'union_techniques': list(union_techniques),
        'union_software': list(union_software),
        'union_cves': list(union_cves),
        'jaccard_index_techniques': jaccard_index_techniques,
        'jaccard_index_software': jaccard_index_software,
        'jaccard_index_cves': jaccard_index_cves
    }

    return result


In [35]:
malpedia_mitre_map = {
    'apt17': ['G0025', 'G0001'],
    'apt19': ['G0073', 'G0009'],
    'apt30': ['G0013', 'G0030'],
    'lazarus': ['G0082', 'G0138', 'G0032'],
    'apt41': ['G0096', 'G0044'],
    'earth lusca': ['G0143', 'G1006'],
    'fin7': ['G0008', 'G0046'],
    'dragonok': ['G0017', 'G0002'],
    'mustang panda': ['G1014', 'G0129']
}

In [36]:
for actor, group_ids in malpedia_mitre_map.items():
    try:
        result = compare_apt_groups_detailed(techniques_map, software_map, cve_map, group_ids)

        print(f"Comparison for {actor} ({', '.join(group_ids)})")
        print(f"  - Common Techniques: {len(result['common_techniques'])}")
        print(f"  - Common Software: {len(result['common_software'])}")
        print(f"  - Common CVEs: {len(result['common_cves'])}")
        print(f"  - Jaccard (Techniques): {result['jaccard_index_techniques']:.2f}")
        print(f"  - Jaccard (Software): {result['jaccard_index_software']:.2f}")
        print(f"  - Jaccard (CVEs): {result['jaccard_index_cves']:.2f}")

    except ValueError as e:
        print(f"Skipping {actor} due to error: {e}")

Comparison for apt17 (G0025, G0001)
  - Common Techniques: 0
  - Common Software: 0
  - Common CVEs: 0
  - Jaccard (Techniques): 0.00
  - Jaccard (Software): 0.00
  - Jaccard (CVEs): 0.00
Comparison for apt19 (G0073, G0009)
  - Common Techniques: 3
  - Common Software: 0
  - Common CVEs: 0
  - Jaccard (Techniques): 0.30
  - Jaccard (Software): 0.00
  - Jaccard (CVEs): 0.00
Skipping apt30 due to error: Group ID G0030 not found in techniques_map or software_map
Comparison for lazarus (G0082, G0138, G0032)
  - Common Techniques: 7
  - Common Software: 0
  - Common CVEs: 0
  - Jaccard (Techniques): 0.07
  - Jaccard (Software): 0.00
  - Jaccard (CVEs): 0.00
Comparison for apt41 (G0096, G0044)
  - Common Techniques: 4
  - Common Software: 1
  - Common CVEs: 0
  - Jaccard (Techniques): 0.67
  - Jaccard (Software): 0.33
  - Jaccard (CVEs): 0.00
Comparison for earth lusca (G0143, G1006)
  - Common Techniques: 7
  - Common Software: 1
  - Common CVEs: 0
  - Jaccard (Techniques): 0.16
  - Jaccard

In [17]:
software_map.get('G0025')

{'name': 'APT17', 'items': ['S0069'], 'type': {'Malware': 1}}