In [34]:
import pandas as pd
import json
import os
from statistics import mode
from collections import Counter, defaultdict
import requests
import re

In [25]:
def load_json_metadata(metadata_path):
    """
    Read a JSON metadata file from disk and return its parsed content.

    Parameters
    ----------
    metadata_path : str
        Location of the JSON file to read. The file is opened as UTF-8 text.

    Returns
    -------
    dict
        The value produced by ``json.load``; this is a dict when the file's
        top-level JSON value is an object.
    """
    # The context manager closes the file even if parsing raises.
    with open(metadata_path, mode='r', encoding='utf-8') as handle:
        return json.load(handle)

In [26]:
# Location of the Malpedia family/actor dump (family_actor_info.json).
# The project root is taken to be the parent of the current working directory
# (presumably the notebook is run from notebooks/ — confirm before running elsewhere).
project_root = os.path.abspath(
    os.path.join(os.getcwd(), '..')
)
file_malpedia_family_actor = os.path.join(
    project_root,
    "malpedia_api_responses",
    "family_actor_info.json",
)

# Parse the JSON dump into memory
malpedia_software_metadata = load_json_metadata(file_malpedia_family_actor)


In [27]:
def build_software_to_actor_map(family_data_dict):
    """
    Builds a mapping from software/family name to its associated actors and alternate names.

    Missing or null (``None``) values are tolerated: a null family entry, a
    null ``alt_names`` or a null ``actors`` value is treated as empty, and null
    items inside the ``actors`` list are skipped.

    Parameters
    ----------
    family_data_dict : dict
        Dictionary where keys are software names and values are metadata
        dictionaries that may contain ``alt_names`` (list) and ``actors``
        (list of dicts carrying an ``actor_name`` key).

    Returns
    -------
    dict
        Mapping from software name to a dictionary with:
            - 'alt_names': list of alternate names (empty list if absent/null)
            - 'actors': list of non-empty actor names, in input order
    """
    software_to_actor_map = {}

    for software_name, info in family_data_dict.items():
        # JSON `null` is loaded as None; normalise it so .get() and iteration are safe.
        info = info or {}
        alt_names = info.get("alt_names") or []
        actors_block = info.get("actors") or []

        # Keep only entries that carry a non-empty actor name.
        actor_names = [
            actor["actor_name"]
            for actor in actors_block
            if actor and actor.get("actor_name")
        ]

        software_to_actor_map[software_name] = {
            "alt_names": alt_names,
            "actors": actor_names
        }

    return software_to_actor_map


In [28]:
def build_actor_to_software_map(software_to_actor_map):
    """
    Invert a software-to-actors mapping into an actor-to-software mapping.

    Parameters
    ----------
    software_to_actor_map : dict
        Mapping from software name to a dictionary whose optional 'actors'
        entry lists the actor names associated with that software.

    Returns
    -------
    dict
        Mapping from actor name to the list of software names that list that
        actor, in the order the software entries were visited.
    """
    reverse_index = {}

    for family_name, entry in software_to_actor_map.items():
        linked_actors = entry.get("actors", [])
        for actor_name in linked_actors:
            # First time this actor is seen: start its software list.
            if actor_name not in reverse_index:
                reverse_index[actor_name] = []
            reverse_index[actor_name].append(family_name)

    return reverse_index


In [30]:
# Forward mapping: software/family name -> {"alt_names": [...], "actors": [...]}
software_to_actor_map = build_software_to_actor_map(malpedia_software_metadata)

# Reverse mapping: actor name -> list of software/family names
actor_to_software_map = build_actor_to_software_map(software_to_actor_map)

# Number of distinct actors present in the reverse mapping
print(len(actor_to_software_map))

247


In [39]:
def analyze_software_usage(actor_to_software_map, top_n=10):
    """
    Analyzes software usage across actors.

    Each (actor, software) pair is counted once, so a software name that
    appears several times in one actor's list does not inflate the number of
    actors reported as using it.

    Parameters
    ----------
    actor_to_software_map : dict
        Mapping from actor names to list of software used. A ``None`` list is
        treated as empty.
    top_n : int
        Number of top common software to return.

    Returns
    -------
    dict
        Dictionary containing:
            - software_frequency: Counter mapping software to the number of
              distinct actors using it
            - top_common_software: List of top N (software, count) tuples
            - unique_software_per_actor: defaultdict(list) mapping an actor to
              the software used by that actor only
    """
    software_frequency = Counter()
    software_to_actors = defaultdict(set)

    # Record which actors use each software; bump the count only the first
    # time a given actor is seen for that software. Counter keys are still
    # inserted on first encounter, so tie ordering in most_common() is stable.
    for actor, software_list in actor_to_software_map.items():
        for software in software_list or ():
            users = software_to_actors[software]
            if actor not in users:
                users.add(actor)
                software_frequency[software] += 1

    # Software with exactly one distinct user is unique to that actor.
    unique_software_per_actor = defaultdict(list)
    for software, actors in software_to_actors.items():
        if len(actors) == 1:
            only_actor = next(iter(actors))
            unique_software_per_actor[only_actor].append(software)

    return {
        "software_frequency": software_frequency,
        "top_common_software": software_frequency.most_common(top_n),
        "unique_software_per_actor": unique_software_per_actor
    }


In [40]:
# Run the usage analysis with the default top_n
results = analyze_software_usage(actor_to_software_map)

# Count the actors that have a non-empty list of unique software
unique_software_map = results['unique_software_per_actor']
num_actors_with_unique_software = sum(
    bool(softwares) for softwares in unique_software_map.values()
)

print(f"Number of actors with at least one unique software: {num_actors_with_unique_software}")

# Most widely shared software first
print("Top 10 Most Common Software Across Groups:")
for sw, count in results['top_common_software']:
    print(f"{sw}: used by {count} groups")

# Show only the first 15 actors to keep the output short
print("\n Example Unique Software Per Actor:")
for actor, unique_sw in list(unique_software_map.items())[:15]:
    print(f"{actor}: {unique_sw}")


Number of actors with at least one unique software: 208
Top 10 Most Common Software Across Groups:
win.plugx: used by 19 groups
win.cobalt_strike: used by 18 groups
win.poison_ivy: used by 8 groups
win.8t_dropper: used by 8 groups
win.shadowpad: used by 7 groups
win.ghost_rat: used by 6 groups
win.chinachopper: used by 6 groups
win.quasar_rat: used by 4 groups
win.njrat: used by 4 groups
win.derusbi: used by 4 groups

 Example Unique Software Per Actor:
Lazarus Group: ['aix.fastcash', 'apk.badcall', 'apk.hardrain', 'elf.badcall', 'elf.simpletea', 'elf.spectral_blur', 'js.quickcafe', 'osx.3cx_backdoor', 'osx.applejeus', 'osx.casso', 'osx.dacls', 'osx.hloader', 'osx.interception', 'osx.kandykorn', 'osx.manuscrypt', 'osx.poolrat', 'osx.rustbucket', 'osx.simpletea', 'osx.spectral_blur', 'osx.sugarloader', 'osx.unidentified_001', 'osx.watchcat', 'osx.yort', 'php.redhat_hacker', 'ps1.powerbrace', 'ps1.powerspritz', 'win.3cx_backdoor', 'win.alphanc', 'win.alreay', 'win.anchormtea', 'win.apple

In [42]:
# Destination for the exported mapping; "output" is resolved relative to the
# current working directory.
output_dir = "output"
output_file = "malpedia_actor_to_software_map.json"
output_path = os.path.join(output_dir, output_file)

# Create the folder if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# ensure_ascii=False keeps non-ASCII characters readable in the file
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(actor_to_software_map, f, indent=2, ensure_ascii=False)

# Report the path actually written (directory included), not just the file name
print(f"actor_to_software_map has been saved to: {output_path}")


actor_to_software_map has been saved to: malpedia_actor_to_software_map.json
