In [7]:
import os
import json

# Load the config file (if you have a config.json file)
with open('config.json', 'r') as config_file:
    config = json.load(config_file)

In [8]:
# Function to read JSONL file
def read_jsonl(file_path):
    """
    Reads a JSONL file and returns a list of JSON objects.

    Parameters
    ----------
    file_path : str
        The path to the JSONL file.

    Returns
    -------
    list
        A list of JSON objects.
    """
    data = []
    
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            data.append(json.loads(line))
    
    return data

# File paths for the JSONL files
malpedia_2022_file = config['jsonl_files']['Malpedia_2022']
malpedia_2024_file = config['jsonl_files']['Malpedia_2024']

# Ensure the file paths are relative to the running script's directory
malpedia_2022_path = os.path.join(os.getcwd(), malpedia_2022_file)
malpedia_2024_path = os.path.join(os.getcwd(), malpedia_2024_file)

# Read the files
malpedia_2022_data = read_jsonl(malpedia_2022_path)
malpedia_2024_data = read_jsonl(malpedia_2024_path)

# Concatenate the data from both JSONL files
combined_data = malpedia_2022_data + malpedia_2024_data

# Example: print the length of the combined data and the first entry
print(f"Total number of entries: {len(combined_data)}")
print("First entry from the combined data:")
print(combined_data[0])

Total number of entries: 15765
First entry from the combined data:
{'author': 'CERT Division', 'date': '2000', 'download_mime': 'text/html', 'download_redirects': ['https://resources.sei.cmu.edu/library/asset-view.cfm?assetID=496186'], 'download_sha256': 'f2c405b383ebaf4d0793f8d5162841b953d06947a711f7d34242faa20e285a04', 'download_size': 41745, 'download_status': 200, 'download_ts': '2022-07-19 12:43:41.400938+00:00', 'language': 'English', 'organization': 'Carnegie Mellon University', 'origin': ['malpedia:CarnegieMellonUniversity'], 'title': '2000 CERT Advisories', 'url': 'https://resources.sei.cmu.edu/library/asset-view.cfm?assetID=496186'}


In [9]:
# Function to extract file hashes from filenames
def get_file_hashes(directory):
    """
    Extracts file hashes from filenames in the given directory.

    Parameters
    ----------
    directory : str
        The path to the directory containing the files.

    Returns
    -------
    list
        A list of file hashes extracted from the filenames.
    """
    file_hashes = []
    for directory in directories:
        for filename in os.listdir(directory):
            if filename.endswith(".iocs"):
                # Extract the hash from the filename (everything before '.iocs')
                file_hash = filename.split('.')[0]
                file_hashes.append(file_hash)
    return file_hashes

In [10]:
# Function to map file hash to url
def create_hash_to_url_map(file_hashes, combined_data):
    """
    Maps file hashes to URLs based on the download_sha256 field in the combined_data.

    Parameters
    ----------
    file_hashes : list
        A list of file hashes.
    combined_data : list
        A list of dictionaries (combined JSONL data) with download_sha256 and url fields.

    Returns
    -------
    dict
        A dictionary mapping file hashes to URLs.
    """
    hash_to_url = {}
    for file_hash in file_hashes:
        for entry in combined_data:
            if entry.get("download_sha256") == file_hash:
                hash_to_url[file_hash] = entry.get("url")
                break  # Stop searching once we find the match for the hash
    return hash_to_url

In [11]:
# Get directories from the config
directories = [
    config["directory_paths"]["CVE_Malpedia_2022"],
    config["directory_paths"]["CVE_Malpedia_2024"]
]

# Extract file hashes from the directory
file_hashes = get_file_hashes(directories)

# Create the mapping of file hashes to URLs
hash_to_url_map = create_hash_to_url_map(file_hashes, combined_data)

first_key, first_value = next(iter(hash_to_url_map.items()))
print(first_key, first_value)

000c1ee0c1bab222569623c47397f634b77c4124ad8bb0b0d2533ec98fcc6a16 https://attack.mitre.org/groups/G0046/


In [12]:
##Common file hashes found: {'bd90e5d64d43cd326049d739d519c270d9f2856db6c1d140569f152b0fa3b757', 
##'acd626acf50af8e30a681ccf88662b2bcecd5ec6053c18d6b460a42d9d726764', 
##'a71555ff127721ad3f47e0427411dde35ec792889c2778ba43571d3a4b3f5cca'}
unique_hashes = len(hash_to_url_map)
print(f"Total unique file hashes: {unique_hashes}")

# Check if the URLs are unique
urls = list(hash_to_url_map.values())
unique_urls = len(set(urls))

print(f"Total URLs: {len(urls)}")
print(f"Total unique URLs: {unique_urls}")

# Check if there are any duplicate URLs
if len(urls) == unique_urls:
    print("All URLs are unique.")
else:
    print(f"There are {len(urls) - unique_urls} duplicate URLs.")

    # Find and print duplicate URLs
    url_counts = {}
    
    # Count the occurrences of each URL
    for url in urls:
        if url in url_counts:
            url_counts[url] += 1
        else:
            url_counts[url] = 1

    # Filter out the duplicate URLs (those that appear more than once)
    duplicate_urls = {url: count for url, count in url_counts.items() if count > 1}

    print("Duplicate URLs and their occurrences:")
    for url, count in duplicate_urls.items():
        print(f"URL: {url} -> Occurrences: {count}")

    # # Find the hashes associated with the duplicate URLs
    # print("\nHashes associated with duplicate URLs:")
    # for hash_key, url in hash_to_url_map.items():
    #     if url in duplicate_urls:
    #         print(f"Hash: {hash_key} -> URL: {url}")

Total unique file hashes: 2832
Total URLs: 2832
Total unique URLs: 2811
There are 21 duplicate URLs.
Duplicate URLs and their occurrences:
URL: https://www.fortinet.com/blog/threat-research/new-strrat-rat-phishing-campaign -> Occurrences: 2
URL: https://www.systemtek.co.uk/2018/07/luoxk-malware-exploiting-cve-2018-2893/ -> Occurrences: 2
URL: https://www.cisa.gov/uscert/ncas/alerts/aa22-264a -> Occurrences: 2
URL: https://www.cisa.gov/uscert/ncas/alerts/aa22-249a -> Occurrences: 2
URL: https://www.cisa.gov/news-events/alerts/2023/07/28/cisa-releases-malware-analysis-reports-barracuda-backdoors -> Occurrences: 2
URL: https://www.cisa.gov/news-events/analysis-reports/ar23-250a-0 -> Occurrences: 2
URL: https://www.cisa.gov/uscert/ncas/alerts/aa22-277a -> Occurrences: 2
URL: https://www.cisa.gov/news-events/cybersecurity-advisories/aa23-347a -> Occurrences: 2
URL: https://www.cisa.gov/uscert/ncas/alerts/aa22-279a -> Occurrences: 2
URL: https://www.trellix.com/about/newsroom/stories/researc

In [13]:
# Function to load the actors data from the actors_data.json file and malware families data from malware_families.json
def load_actors_data(file_path):
    """
    Loads the actors data from a JSON file.

    Parameters
    ----------
    file_path : str
        The path to the actors_data.json file.

    Returns
    -------
    dict
        A dictionary containing the actors data.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        actors_data = json.load(f)
    return actors_data


def load_families_data(file_path):
    """
    Loads the malware families data from a JSON file.

    Parameters
    ----------
    file_path : str
        The path to the actors_data.json file.

    Returns
    -------
    dict
        A dictionary containing the families data.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        families_data = json.load(f)
    return families_data

In [15]:
# Function to create a URL-to-group mapping
def create_url_to_group_map(actors_data, families_data):
    """
    Creates a mapping from URLs to group names based on the actors data.
    If a URL is not found in the actors data, it checks in the families data.

    Parameters
    ----------
    actors_data : dict
        A dictionary containing the actors data.
    families_data : dict
        A dictionary containing the families data.

    Returns
    -------
    dict
        A dictionary mapping URLs to group names.
    """
    url_to_group_map = {}

    # First, check the actors_data for URLs
    for group_name, group_info in actors_data.items():
        refs = group_info.get("meta", {}).get("refs", [])
        for url in refs:
            url_to_group_map[url] = group_name

    # If a URL is not found in actors_data, check in families_data
    for family_name, family_info in families_data.items():
        urls = family_info.get("urls", [])
        for url in urls:
            # Only add the URL if it isn't already in the map
            if url not in url_to_group_map:
                url_to_group_map[url] = family_name

    return url_to_group_map

In [16]:
# Function to update the hash-to-group map with hashes and URLs grouped by group name
def update_group_to_hash_url_map(hash_to_url_map, url_to_group_map):
    """
    Updates the hash-to-group map with group names as keys and hashes/URLs as associated values.

    Parameters
    ----------
    hash_to_url_map : dict
        A dictionary mapping file hashes to URLs.
    url_to_group_map : dict
        A dictionary mapping URLs to group names.

    Returns
    -------
    dict
        A dictionary where group names are keys, and associated values are lists of hashes and URLs.
    """
    group_to_hash_url_map = {}

    for file_hash, url in hash_to_url_map.items():
        group_name = url_to_group_map.get(url, "Unknown")  # If no group found, mark as 'Unknown'
        
        if group_name not in group_to_hash_url_map:
            group_to_hash_url_map[group_name] = {"hashes": [], "urls": []}

        group_to_hash_url_map[group_name]["hashes"].append(file_hash)
        group_to_hash_url_map[group_name]["urls"].append(url)

    return group_to_hash_url_map


In [17]:
##Load the actors data
actors_data_file = "actors_data.json"  # Update with the actual file path
actors_data = load_actors_data(actors_data_file)
families_data_file = "malware_families.json"
families_data = load_families_data(families_data_file)
# Create URL-to-group mapping
url_to_group_map = create_url_to_group_map(actors_data, families_data)


In [18]:
len(url_to_group_map)

14301

In [21]:
# Assuming `hash_to_url_map` is already created (from previous steps)
# Update the hash-to-URL map with group names
hash_to_group_map = update_group_to_hash_url_map(hash_to_url_map, url_to_group_map)

first_key, first_value = next(iter(hash_to_group_map.items()))
print(first_key, first_value)

FIN7 {'hashes': ['000c1ee0c1bab222569623c47397f634b77c4124ad8bb0b0d2533ec98fcc6a16', '33a41a05c070a7a339f8f76bb20cc7f79aa9f19359fe4298c162adea20ccde1d', '791661450f54c3e286e3140373d3df095d7b27bf052a4f1d4d8f37472f4dbb94', '7d2d812d40a62f3288883afcb14ee275716af1871f68587e12ed2485cddf9b9d', 'ebdada9350107eba5023f3fa116379b04ff1302d8ffdb62f70d1c4d6596771b0'], 'urls': ['https://attack.mitre.org/groups/G0046/', 'https://www.group-ib.com/resources/threat-research/Anunak_APT_against_financial_institutions.pdf', 'https://www.crowdstrike.com/blog/arrests-put-new-focus-on-carbon-spider-adversary-group/', 'https://media.kasperskycontenthub.com/wp-content/uploads/sites/43/2018/03/08064518/Carbanak_APT_eng.pdf', 'https://securelist.com/fin7-5-the-infamous-cybercrime-rig-fin7-continues-its-activities/90703/']}


In [22]:
len(hash_to_group_map)

1095

In [24]:
# Calculate statistics
def calculate_statistics(group_map):
    # Number of unique groups
    num_unique_groups = len(group_map)
    
    # Number of unique URLs
    unique_urls = set()
    for data in group_map.values():
        unique_urls.update(data['urls'])
    num_unique_urls = len(unique_urls)
    
    # Length of the hash-to-group map (total hashes)
    num_hashes = sum(len(data['hashes']) for data in group_map.values())
    
    # Number of hashes with 'Unknown' group
    unknown_groups_count = len(group_map.get('Unknown', {}).get('hashes', []))
    
    return num_unique_groups, num_hashes, num_unique_urls, unknown_groups_count

# Get the statistics
num_unique_groups, num_hashes, num_unique_urls, unknown_groups_count = calculate_statistics(hash_to_group_map)

# Print the statistics
print(f"Number of unique groups: {num_unique_groups}")
print(f"Length of the hash-to-group map: {num_hashes}")
print(f"Number of unique URLs: {num_unique_urls}")
print(f"Number of hashes with 'Unknown' group: {unknown_groups_count}")

Number of unique groups: 1095
Length of the hash-to-group map: 2832
Number of unique URLs: 2811
Number of hashes with 'Unknown' group: 484


In [26]:
def get_file_hashes_and_cves(directory):
    """
    Extracts file hashes from filenames and reads the content of the files to
    extract CVEs associated with each hash.

    Parameters
    ----------
    directory : str
        The path to the directory containing the files.

    Returns
    -------
    dict
        A dictionary where the key is the file hash, and the value is a list
        of CVEs associated with that hash.
    """
    hash_to_cves = {}
    for directory in directories:
        # Iterate over the files in the directory
        for filename in os.listdir(directory):
            if filename.endswith(".iocs"):
                # Extract the hash from the filename (everything before '.iocs')
                file_hash = filename.split('.')[0]
    
                # Open and read the content of the file
                cves = []
                file_path = os.path.join(directory, filename)
                with open(file_path, 'r', encoding='utf-8') as file:
                    for line in file:
                        # Assuming each line starts with 'cve' followed by the CVE identifier
                        parts = line.strip().split("\t")
                        if parts[0] == "cve" and len(parts) > 1:
                            cves.append(parts[1])
    
                # Store the hash and its associated CVEs in the dictionary
                hash_to_cves[file_hash] = cves

    return hash_to_cves


In [27]:
###Read directory path from config
directories = [
    config["directory_paths"]["CVE_Malpedia_2022"],
    config["directory_paths"]["CVE_Malpedia_2024"]
]

hash_to_cves = get_file_hashes_and_cves(directories)

print(len(hash_to_cves))

first_key, first_value = next(iter(hash_to_cves.items()))
print(first_key, first_value)

2832
000c1ee0c1bab222569623c47397f634b77c4124ad8bb0b0d2533ec98fcc6a16 ['CVE-2020-1472']


In [29]:
def combine_hash_data(hash_to_cves, hash_to_group_map):
    """
    Combines the CVE and group information for each hash.

    Parameters
    ----------
    hash_to_cves : dict
        A dictionary where the key is the file hash and the value is a list of CVEs associated with that hash.
    hash_to_group_map : dict
        A dictionary where the key is the file hash and the value is a dictionary containing the URL and group name for that hash.

    Returns
    -------
    dict
        A combined dictionary where the key is the file hash, and the value is a dictionary containing
        both the CVEs and group information for that hash.
    """
    combined_data = {}

    # Iterate over the hashes in hash_to_cves
    for file_hash, cves in hash_to_cves.items():
        # Check if the hash exists in hash_to_group_map
        if file_hash in hash_to_group_map:
            # If both data exist, combine them into a new dictionary
            combined_data[file_hash] = {
                'cves': cves,
                'group_info': hash_to_group_map[file_hash]
            }
        else:
            # If no group info is found, use None or a default value for group_info
            combined_data[file_hash] = {
                'cves': cves,
                'group_info': None
            }

    # If you want to include hashes that exist in hash_to_group_map but not in hash_to_cves, 
    # you can add that logic here too.
    for file_hash, group_info in hash_to_group_map.items():
        if file_hash not in combined_data:
            # If no CVEs are found for this hash, use None or an empty list for CVEs
            combined_data[file_hash] = {
                'cves': None,
                'group_info': group_info
            }

    return combined_data


In [35]:
# Assuming you have hash_to_cves and hash_to_group_map already populated

combined_data = combine_hash_data(hash_to_cves, hash_to_group_map)

first_key, first_value = next(iter(combined_data.items()))
print(first_key, first_value)

000c1ee0c1bab222569623c47397f634b77c4124ad8bb0b0d2533ec98fcc6a16 {'cves': ['CVE-2020-1472'], 'group_info': None}


In [43]:
# Function to combine CVE and group information for each hash with Group ID as the main key
def combine_cve_group_info_by_group_id(hash_to_group_map, hash_to_cves):
    combined_info_by_group = {}

    for group, group_data in hash_to_group_map.items():
        combined_info_by_group[group] = []
        for i, hash_value in enumerate(group_data['hashes']):
            cves = hash_to_cves.get(hash_value, [])  # Get CVEs for the current hash
            url = group_data['urls'][i] if i < len(group_data['urls']) else None  # Get corresponding URL

            # Append the group info with hashes, CVEs, and URLs to the group entry
            combined_info_by_group[group].append({
                'hash': hash_value,
                'cves': cves,
                'url': url
            })


    return combined_info_by_group

# Combine the CVE and group information with Group ID as the main key
combined_cve_group_info_by_group_id = combine_cve_group_info_by_group_id(hash_to_group_map, hash_to_cves)

In [44]:
# Assuming combined_data is already populated
#group_to_data = group_data_by_group_id(combined_data)


first_key, first_value = next(iter(combined_cve_group_info_by_group_id.items()))
print(first_key, first_value)

FIN7 [{'hash': '000c1ee0c1bab222569623c47397f634b77c4124ad8bb0b0d2533ec98fcc6a16', 'cves': ['CVE-2020-1472'], 'url': 'https://attack.mitre.org/groups/G0046/'}, {'hash': '33a41a05c070a7a339f8f76bb20cc7f79aa9f19359fe4298c162adea20ccde1d', 'cves': ['CVE-2012-0158', 'CVE-2012-2539'], 'url': 'https://www.group-ib.com/resources/threat-research/Anunak_APT_against_financial_institutions.pdf'}, {'hash': '791661450f54c3e286e3140373d3df095d7b27bf052a4f1d4d8f37472f4dbb94', 'cves': ['CVE-2014-4114', 'CVE-2015-1641', 'CVE-2015-1770', 'CVE-2015-2545'], 'url': 'https://www.crowdstrike.com/blog/arrests-put-new-focus-on-carbon-spider-adversary-group/'}, {'hash': '7d2d812d40a62f3288883afcb14ee275716af1871f68587e12ed2485cddf9b9d', 'cves': ['CVE-2012-0158', 'CVE-2013-3660', 'CVE-2013-3906'], 'url': 'https://media.kasperskycontenthub.com/wp-content/uploads/sites/43/2018/03/08064518/Carbanak_APT_eng.pdf'}, {'hash': 'ebdada9350107eba5023f3fa116379b04ff1302d8ffdb62f70d1c4d6596771b0', 'cves': ['CVE-2017-11882']

In [46]:
from collections import Counter

def compute_cve_statistics(group_to_data):
    """
    Compute statistics about the CVEs, groups, and other relevant data.
    Identify CVEs that are unique to a group (excluding 'Unknown') and 
    count the top 10 most occurring CVEs.

    Parameters
    ----------
    group_to_data : dict
        A dictionary where the key is the group name, and the value contains hashes, CVEs, and URLs for that group.

    Returns
    -------
    dict
        A dictionary containing the computed statistics, including groups with unique CVEs and top 10 common CVEs.
    """
    cve_to_groups = {}  # Map each CVE to a set of groups
    group_unique_cves = {}  # Store the unique CVEs for each group
    cve_counter = Counter()  # Count how many times each CVE occurs

    # Iterate over each group and their data
    for group, data_list in group_to_data.items():
        group_cves = set()  # Track the CVEs for this group
        for entry in data_list:
            cves = entry.get('cves', [])
            for cve in cves:
                group_cves.add(cve)
                if cve not in cve_to_groups:
                    cve_to_groups[cve] = set()
                cve_to_groups[cve].add(group)
                cve_counter[cve] += 1  # Count the occurrence of this CVE
        
        # Store all CVEs for this group temporarily
        group_unique_cves[group] = group_cves

    # Now find which CVEs are unique to a group, excluding 'Unknown'
    final_unique_cves_per_group = {}
    for group, cves in group_unique_cves.items():
        if group.lower() == 'unknown':
            continue  # Skip the 'Unknown' group
        unique_cves = {cve for cve in cves if len(cve_to_groups[cve]) == 1}
        if unique_cves:
            final_unique_cves_per_group[group] = unique_cves

    # Limit the list to only 10 groups for analysis
    limited_unique_cves_per_group = dict(list(final_unique_cves_per_group.items())[:10])

    # Compute the number of groups with unique CVEs
    num_groups_with_unique_cves = len(final_unique_cves_per_group)

    # Find the top 10 most common CVEs
    top_10_common_cves = cve_counter.most_common(10)

    # Gather statistics
    statistics = {
        "num_groups_with_unique_cves": num_groups_with_unique_cves,
        "unique_cves_per_group": limited_unique_cves_per_group,
        "top_10_common_cves": top_10_common_cves
    }

    return statistics


# Example usage with the combined data
statistics = compute_cve_statistics(combined_cve_group_info_by_group_id)

# Print the statistics
print(f"Number of groups with unique CVEs (excluding 'Unknown'): {statistics['num_groups_with_unique_cves']}")
print("Unique CVEs for up to 10 groups:")
for group, cves in statistics['unique_cves_per_group'].items():
    print(f"  Group: {group}")
    print(f"    Unique CVEs: {cves}")

print("\nTop 10 most common CVEs across groups:")
for cve, count in statistics['top_10_common_cves']:
    print(f"  {cve}: {count} occurrences")


Number of groups with unique CVEs (excluding 'Unknown'): 239
Unique CVEs for up to 10 groups:
  Group: HAFNIUM
    Unique CVEs: {'CVE-2020-147212', 'CVE-2021-337716', 'CVE-2022-27511'}
  Group: win.8t_dropper
    Unique CVEs: {'CVE-2018-8570'}
  Group: win.crat
    Unique CVEs: {'CVE-2018-9375'}
  Group: win.cobalt_strike
    Unique CVEs: {'CVE-2019-0567', 'CVE-2020-116511', 'CVE-2021-36798', 'CVE-2009-3960', 'CVE-2021-26868', 'CVE-2017-11318', 'CVE-2021-1844'}
  Group: win.clop
    Unique CVEs: {'CVE-2020-12061', 'CVE-2023-35036'}
  Group: elf.mirai
    Unique CVEs: {'CVE-2017-16725', 'CVE-2020-29957', 'CVE-2021-1498', 'CVE-2020-7115', 'CVE-2018-15716', 'CVE-2020-29557', 'CVE-2021-22991', 'CVE-2018-19276', 'CVE-2020-1937', 'CVE-2020-1956', 'CVE-2016-5674', 'CVE-2017-18377', 'CVE-2021-38649', 'CVE-2021-25502', 'CVE-2020-1017', 'CVE-2021-38645', 'CVE-2019-16057', 'CVE-2019-7276', 'CVE-2013-2251', 'CVE-2009-4490', 'CVE-2021-38648'}
  Group: win.shadowpad
    Unique CVEs: {'CVE-2018-8872'

In [None]:
Total number of CVEs: 11336
Total unique CVEs: 2019
Unique groups (excluding 'Unknown'): 258
Most common CVEs: [('CVE-2017-11882', 263), ('CVE-2021-44228', 183), ('CVE-2017-0199', 155), ('CVE-2022-30190', 152), ('CVE-2012-0158', 135),
                   ('CVE-2021-26855', 124), ('CVE-2021-27065', 103), ('CVE-2020-1472', 85), ('CVE-2019-19781', 84), ('CVE-2022-26134', 81)