In [26]:
### Malpedia API interaction

In [33]:
import requests
import json

# Malpedia API base URL
base_url = "https://malpedia.caad.fkie.fraunhofer.de/api"

# Your Malpedia API key
api_key = ""  # Replace with your actual API key

# Headers with authentication
headers = {
    "Authorization": f"Bearer {api_key}"
}
# Function to get details of actors from Malpedia
def get_actors():
    endpoint = f"{base_url}/get/actors"
    response = requests.get(endpoint, headers=headers)
    
    # Check if the request was successful
    if response.status_code == 200:
        actors_data = response.json()  # Parse JSON response
        
        # Store the actors data in a JSON file
        with open("actors_data.json", "w") as json_file:
            json.dump(actors_data, json_file, indent=4)  # Save JSON to file with indentation for readability
        
        print("Actors data has been stored in 'actors_data.json'.")
        return actors_data
    else:
        print(f"Failed to retrieve actors: {response.status_code}")
        return None

# Call the function to get actors and store them in a file
#actors = get_actors()

Actors data has been stored in 'actors_data.json'.


In [2]:
import json

# Function to read the actors data from the JSON file
def read_actors_from_file(filename="actors_data.json"):
    try:
        with open(filename, "r") as json_file:
            actors_data = json.load(json_file)  # Load the JSON data from the file
            return actors_data
    except FileNotFoundError:
        print(f"File {filename} not found.")
        return None
    except json.JSONDecodeError:
        print("Error decoding the JSON data.")
        return None

# Call the function to read the actors from the JSON file
actors_data = read_actors_from_file()

In [9]:
from collections import defaultdict

# Function to get unique references along with the associated actor name, and count references before and after filtering
def get_unique_references_and_counts():
    actors_data = read_actors_from_file()
    if not actors_data:
        return None, None, None, 0, 0

    reference_count = defaultdict(list)  # Dictionary to store actors associated with each reference
    unique_references = []  # List to store unique references
    actor_reference_count_before = defaultdict(int)  # Count of references per actor before filtering
    actor_reference_count_after = defaultdict(int)   # Count of references per actor after filtering

    total_references_before = 0  # Total reference count before filtering
    total_references_after = 0   # Total reference count after filtering

    # Loop through actors and collect references
    for actor_uuid, actor in actors_data.items():
        actor_name = actor.get("value", "N/A")  # Use 'value' as the actor's name
        references = actor.get("meta", {}).get("refs", [])
        
        # Count references for each actor before filtering
        actor_reference_count_before[actor_name] += len(references)
        total_references_before += len(references)  # Increment total reference count before filtering

        # Store actor name with each reference
        for ref in references:
            reference_count[ref].append(actor_name)

    # Now filter to keep only references that appear with a single actor
    for ref, associated_actors in reference_count.items():
        if len(associated_actors) == 1:  # Check if the reference is associated with only one actor
            unique_references.append((ref, associated_actors[0]))  # Store the reference with the actor name
            actor_reference_count_after[associated_actors[0]] += 1  # Increment the count for unique references
            total_references_after += 1  # Increment total reference count after filtering

    return unique_references, actor_reference_count_before, actor_reference_count_after, total_references_before, total_references_after

# Extract unique references along with the actor name and the reference counts
unique_references, reference_count_before, reference_count_after, total_before, total_after = get_unique_references_and_counts()

# Print summary of reference counts
print(f"\nTotal reference count before filtering: {total_before}")
print(f"Total reference count after filtering unique references: {total_after}")

# Identify and print actors with no unique references after filtering
no_unique_references_count = 0  # Count of actors without any unique references

print("\nActors without any unique references or URLs:")
for actor_name, count_before in reference_count_before.items():
    count_after = reference_count_after.get(actor_name, 0)  # Get the count after filtering (default 0 if not found)
    
    if count_after == 0:
        print(f"Actor: {actor_name} (References before: {count_before}, References after: {count_after})")
        no_unique_references_count += 1

# Print the count of actors without any unique references or URLs
print(f"\nTotal number of threat actors without any unique references or URLs: {no_unique_references_count}")

# # Print some example unique references with actor names
# print("\nExample of unique references with associated actor names:")
# for ref, actor_name in unique_references[:5]:  # Limiting to 5 for display
#     print(f"Reference: {ref} -> Actor: {actor_name}")



Total reference count before filtering: 2852
Total reference count after filtering unique references: 2443

Actors without any unique references or URLs:
Actor: ANDROMEDA SPIDER (References before: 1, References after: 0)
Actor: ArcaneDoor (References before: 1, References after: 0)
Actor: Aslan Neferler Tim (References before: 0, References after: 0)
Actor: Ayyıldız Tim (References before: 0, References after: 0)
Actor: BadRory (References before: 1, References after: 0)
Actor: Blackatom (References before: 1, References after: 0)
Actor: Boulder Bear (References before: 0, References after: 0)
Actor: BRONZE EDGEWOOD (References before: 2, References after: 0)
Actor: Cadelle (References before: 1, References after: 0)
Actor: CardinalLizard (References before: 1, References after: 0)
Actor: Cyber Army of Russia Reborn (References before: 0, References after: 0)
Actor: Danti (References before: 1, References after: 0)
Actor: DarkCasino (References before: 1, References after: 0)
Actor: 

In [10]:
import requests
import json
# Base URL for Malpedia API
base_url = "https://malpedia.caad.fkie.fraunhofer.de/api"

# Replace with your actual API key
api_key = "5d9f217ae97ab6bf82ba7e32c863a772717f74c0"

# Set up headers for authentication
headers = {
    "Authorization": f"Bearer {api_key}"
}

# Function to retrieve malware families
def get_malware_families():
    try:
        # Endpoint to get malware families
        endpoint = f"{base_url}/get/families"
        response = requests.get(endpoint, headers=headers)
        
        # Check if the request was successful
        if response.status_code == 200:
            families_data = response.json()  # Parse JSON response
            
            # Save the data to a JSON file
            with open('malware_families.json', 'w') as json_file:
                json.dump(families_data, json_file, indent=4)
            
            print("Malware families data has been saved to 'malware_families.json'.")
            return families_data
        else:
            print(f"Failed to retrieve families. Status code: {response.status_code}")
            return None
    except requests.exceptions.RequestException as e:
        print(f"Error occurred while fetching families: {e}")
        return None


# Function to retrieve and save the BibTeX references for a specific actor
def fetch_actor_bibtex(actor_id):
    try:
        # Endpoint to get BibTeX references for the specified actor
        endpoint = f"{base_url}/get/bib/actor/{actor_id}"
        response = requests.get(endpoint, headers=headers)
        
        # Check if the request was successful
        if response.status_code == 200:
            bib_data = response.text  # Get the BibTeX data as plain text
            
            # Save the .bib data to a file
            filename = f"{actor_id}.bib"
            with open(filename, 'w') as file:
                file.write(bib_data)
            
            print(f"BibTeX references for actor '{actor_id}' saved to {filename}.")
        else:
            print(f"Failed to retrieve BibTeX for actor {actor_id}. Status code: {response.status_code}")
    except requests.exceptions.RequestException as e:
        print(f"Error occurred while fetching BibTeX for actor {actor_id}: {e}")


In [11]:
# Call the function to retrieve and save malware families
#malware_families = get_malware_families()

In [12]:
# Function to read malware families from the JSON file
def read_malware_families_from_file():
    try:
        with open('malware_families.json', 'r') as json_file:
            families_data = json.load(json_file)
            return families_data
    except FileNotFoundError:
        print("Malware families JSON file not found.")
        return None

In [13]:
import re

# Function to clean and normalize actor names (remove spaces, ignore "group")
def normalize_actor_name(name):
    # Remove spaces and convert to lowercase
    normalized_name = re.sub(r"\s+", "", name.lower())
    
    # Remove common terms like 'group' (you can extend this list as needed)
    normalized_name = re.sub(r"\bgroup\b", "", normalized_name)
    
    return normalized_name

# Function to retrieve actor details by name or check in synonyms (case-insensitive and normalized)
def get_actor_by_name_or_synonym(actor_name):
    actors_data = read_actors_from_file()  # Load actor data from JSON file
    
    # Normalize the input actor name (remove spaces, ignore common terms like 'group')
    actor_name_normalized = normalize_actor_name(actor_name)
    
    # Step 1: Try to find the actor directly by name (normalized and case-insensitive)
    for key, actor in actors_data.items():
        actor_main_name_normalized = normalize_actor_name(actor.get("value", ""))
        if actor_main_name_normalized == actor_name_normalized:
            return actor  # Found the actor
    
    # Step 2: If not found by name, search through the synonyms (normalized and case-insensitive)
    for actor in actors_data.values():
        synonyms = actor.get("meta", {}).get("synonyms", [])
        for synonym in synonyms:
            if normalize_actor_name(synonym) == actor_name_normalized:
                return actor  # Found the actor by synonym

    # If no match is found, return None and log a warning
    print(f"Actor '{actor_name}' not found in JSON file or synonyms.")
    return None


In [15]:
import json

# Main function to extract attribution, retrieve actor details, and store them in a dictionary
def extract_actor_info_from_malware():
    # Step 1: Get malware families data
    malware_families = read_malware_families_from_file()
    
    if not malware_families:
        return None
    
    malware_actor_info = {}  # Dictionary to store the information
    
    # Step 2: Loop through each malware family and extract attributions
    for family_name, family_data in malware_families.items():
        family_info = {
            "alt_names": family_data.get("alt_names", []),
            "urls": family_data.get("urls", []),
            "actors": []
        }

        attributions = family_data.get("attribution", [])
        
        # Step 3: For each attribution, retrieve the actor details
        for actor_name in attributions:
            actor_data = get_actor_by_name_or_synonym(actor_name)
            
            if not actor_data:
                continue

            # Step 4: Extract synonyms and references from the actor data
            meta = actor_data.get("meta", {})
            actor_info = {
                "actor_name": actor_name,
                "synonyms": meta.get("synonyms", []),
                "actor_references": meta.get("refs", [])
            }
            
            family_info["actors"].append(actor_info)  # Store actor information under the malware family
        
        # Add the family info to the main dictionary
        malware_actor_info[family_name] = family_info
    
    return malware_actor_info

# Example usage of storing the extracted data
malware_actor_info = extract_actor_info_from_malware()

# #Output the data into a JSON file
# if malware_actor_info:
#     with open('family_actor_info.json', 'w') as json_file:
#         json.dump(malware_actor_info, json_file, indent=4)

#     print("Malware and actor information stored successfully in 'family_actor_info.json'.")


Actor 'Sphinx (APT-C-15)' not found in JSON file or synonyms.
Actor 'Stealth Mango' not found in JSON file or synonyms.
Actor 'cron' not found in JSON file or synonyms.
Actor 'Winnti Umbrella' not found in JSON file or synonyms.
Actor 'Anunak' not found in JSON file or synonyms.
Actor 'Anunak' not found in JSON file or synonyms.
Actor 'Anunak' not found in JSON file or synonyms.
Actor 'Anunak' not found in JSON file or synonyms.
Actor 'Anunak' not found in JSON file or synonyms.
Actor 'APOTHECARY SPIDER' not found in JSON file or synonyms.
Actor 'Anunak' not found in JSON file or synonyms.
Actor 'Anunak' not found in JSON file or synonyms.
Actor 'Anunak' not found in JSON file or synonyms.
Actor 'Oktropys' not found in JSON file or synonyms.
Actor 'Anunak' not found in JSON file or synonyms.
Actor 'Anunak' not found in JSON file or synonyms.
Actor 'Anunak' not found in JSON file or synonyms.
Actor 'Anunak' not found in JSON file or synonyms.
Actor 'Calypso group' not found in JSON file

In [18]:
import json
from collections import defaultdict

# Function to read the stored malware actor info from the JSON file
def read_malware_actor_info(file_path):
    try:
        with open(file_path, 'r') as file:
            malware_actor_info = json.load(file)
        return malware_actor_info
    except FileNotFoundError:
        print(f"File {file_path} not found.")
        return None

# Perform statistical analysis on malware_actor_info data
def analyze_malware_actor_info(file_path):
    # Step 1: Read the data
    malware_actor_info = read_malware_actor_info(file_path)
    
    if not malware_actor_info:
        return

    # Initialize counters and storage
    total_malware_families = 0
    families_with_attribution = 0
    families_with_urls = 0
    references_per_family = {}
    total_references_count = 0
    reference_occurrences = defaultdict(int)  # Dictionary to track how many times each reference appears
    unique_actors = set()  # Set to track unique actors

    # Step 2: Analyze each malware family
    for family_name, family_data in malware_actor_info.items():
        total_malware_families += 1
        
        # Check if there is attribution data
        if family_data.get('actors'):
            families_with_attribution += 1
            # Add actors to the unique actors set
            actors = family_data.get('actors', [])
            for actor in actors:
                actor_name = actor.get("actor_name")
                if actor_name:
                    unique_actors.add(actor_name)
        
        # Check if there are URLs (references)
        urls = family_data.get('urls', [])
        if urls:
            families_with_urls += 1
            references_per_family[family_name] = len(urls)  # Store the count of references for each family
            total_references_count += len(urls)  # Increment the total references count

            # Track how many times each reference appears
            for url in urls:
                reference_occurrences[url] += 1

    # Step 3: Count unique references (references that appear only once)
    unique_references = [url for url, count in reference_occurrences.items() if count == 1]
    unique_references_count = len(unique_references)

    # Step 4: Display the results
    print(f"Total malware families: {total_malware_families}")
    print(f"Malware families with attribution: {families_with_attribution}")
    print(f"Malware families with references (URLs): {families_with_urls}")
    print(f"Total references (URLs) across all families: {total_references_count}")
    print(f"Unique references (URLs) appearing only in one family: {unique_references_count}")
    print(f"Unique actors: {', '.join(unique_actors)}")
    print("Total unique actors:", len(unique_actors))
    

# Specify the file path for the stored JSON data
file_path = "family_actor_info.json"

# Run the analysis
analyze_malware_actor_info(file_path)

Total malware families: 3367
Malware families with attribution: 963
Malware families with references (URLs): 3234
Total references (URLs) across all families: 20073
Unique references (URLs) appearing only in one family: 9810
Unique actors: Lazarus Group, 1937CN, Water Barghest, Chafer, PLATINUM, GreyEnergy, SNOWGLOBE, APT37,  Scarred Manticore, AQUATIC PANDA, Dust Storm, UAC-0006, WindShift, UNC1860, MageCart, RANCOR, APT-C-27, Infy, Lotus Blossom, CeranaKeeper, Silent Chollima, Water Orthrus, IXESHE, APT27, Leafminer, Sea Turtle, Hellsing, Groundbait, Volt Typhoon, RIDDLE SPIDER, Shell Crew, Domestic Kitten, Winter Vivern, Unit 8200, The Gorgon Group, Cobalt, Mirage, Blackwood, SloppyLemming, TA410, Rebel Jackal, SCULLY SPIDER, Skeleton Spider, Gelsemium, SMOKY SPIDER,  Silent Chollima, Xcatze, Boss Spider, Pacha Group, NetTraveler, MuddyWater, PROMETHIUM, Molerats, Charming Kitten, MirrorFace, TA578, MUMMY SPIDER, YoroTrooper, Cleaver, DarkHotel, VICEROY TIGER, Void Arachne, Samurai 

In [21]:
# Function to find and print malware families without URLs
def print_families_without_urls(malware_actor_info, max_examples=10):
    print("\nMalware families without URLs:")
    count = 0
    for family_name, family_data in malware_actor_info.items():
        urls = family_data.get('urls', [])  # Check if URLs are present
        if not urls:  # If the URLs list is empty
            print(f"- {family_name}")
            count += 1
            if count == max_examples:  # Limit the number of examples printed
                break
    print(f"\nTotal malware families without URLs: {count}")

# Path to the stored JSON data
file_path = "family_actor_info.json"

# Load data and perform analysis
malware_actor_info = read_malware_actor_info(file_path)

# Print a few malware families without URLs (default 5 examples)
print_families_without_urls(malware_actor_info, max_examples=10)



Malware families without URLs:
- apk.pornhub
- apk.smsspy
- apk.triout
- apk.unidentified_002
- apk.unidentified_005
- asp.unidentified_001
- elf.corona
- elf.dofloo
- js.unidentified_js_002
- win.abantes

Total malware families without URLs: 10
