In [1]:
import os
import json

# Load the notebook configuration. config.json must exist in the current
# working directory; open() raises FileNotFoundError otherwise.
# The encoding is given explicitly (JSON text is UTF-8) so the result does not
# depend on the platform's default encoding.
with open('config.json', 'r', encoding='utf-8') as config_file:
    config = json.load(config_file)

In [2]:
# Function to read JSONL file
def read_jsonl(file_path):
    """
    Reads a JSONL file and returns a list of JSON objects.

    Blank or whitespace-only lines (for example an empty line at the end of
    the file) are skipped instead of being handed to the JSON parser.

    Parameters
    ----------
    file_path : str
        The path to the JSONL file.

    Returns
    -------
    list
        A list of JSON objects, one per non-blank line, in file order.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    data = []

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # json.loads('') raises, so lines without content are ignored
            if not line.strip():
                continue
            data.append(json.loads(line))

    return data

# Relative locations of the two Malpedia JSONL exports, as declared in the config
malpedia_2022_file, malpedia_2024_file = (
    config['jsonl_files'][key] for key in ('Malpedia_2022', 'Malpedia_2024')
)

# Anchor both locations at the current working directory
malpedia_2022_path, malpedia_2024_path = (
    os.path.join(os.getcwd(), relative)
    for relative in (malpedia_2022_file, malpedia_2024_file)
)

# Parse each export into a list of records
malpedia_2022_data, malpedia_2024_data = map(
    read_jsonl, (malpedia_2022_path, malpedia_2024_path)
)

# One list holding the 2022 records followed by the 2024 records
combined_data = [*malpedia_2022_data, *malpedia_2024_data]

# Quick look at the result: size and first record
print(f"Total number of entries: {len(combined_data)}")
print("First entry from the combined data:")
print(combined_data[0])

Total number of entries: 15765
First entry from the combined data:
{'author': 'CERT Division', 'date': '2000', 'download_mime': 'text/html', 'download_redirects': ['https://resources.sei.cmu.edu/library/asset-view.cfm?assetID=496186'], 'download_sha256': 'f2c405b383ebaf4d0793f8d5162841b953d06947a711f7d34242faa20e285a04', 'download_size': 41745, 'download_status': 200, 'download_ts': '2022-07-19 12:43:41.400938+00:00', 'language': 'English', 'organization': 'Carnegie Mellon University', 'origin': ['malpedia:CarnegieMellonUniversity'], 'title': '2000 CERT Advisories', 'url': 'https://resources.sei.cmu.edu/library/asset-view.cfm?assetID=496186'}


In [3]:
def get_file_hashes(directories):
    """
    Extracts file hashes from filenames in the given directories.

    Two filename shapes are recognised:

    - ``<hash>.download.iocs``: the hash is the name without that suffix.
    - any other ``*.iocs`` name: the hash is the part before the first dot.

    Files with other extensions are ignored. Hashes are returned in the order
    the files are listed; a hash found in several directories appears once
    per occurrence.

    Parameters
    ----------
    directories : list
        A list of directory paths containing the files.

    Returns
    -------
    list
        A list of file hashes extracted from the filenames.
    """
    file_hashes = []

    for directory in directories:
        for filename in os.listdir(directory):
            # The longer suffix must be tested first: every '*.download.iocs'
            # name also ends with '.iocs', so the generic test would otherwise
            # always win and this branch could never run.
            if filename.endswith(".download.iocs"):
                file_hashes.append(filename.removesuffix(".download.iocs"))
            elif filename.endswith(".iocs"):
                # Hash is everything before the first dot of the filename
                file_hashes.append(filename.split('.')[0])

    return file_hashes

In [4]:
# Indicator directories taken from the config ## use CVE_Malpedia if querying CVE
directories = [
    config["directory_paths"][key]
    for key in ("TTP_Malpedia_2022", "TTP_Malpedia_2024")
]

# Collect the hash encoded in each indicator filename
file_hashes = get_file_hashes(directories)

In [5]:
# Function to map file hash to url
def create_hash_to_url_map(file_hashes, combined_data):
    """
    Maps file hashes to URLs based on the download_sha256 field in the combined_data.

    When several entries share the same download_sha256, the URL of the first
    such entry in combined_data is used. Hashes without a matching entry are
    left out of the result.

    Parameters
    ----------
    file_hashes : list
        A list of file hashes.
    combined_data : list
        A list of dictionaries (combined JSONL data) with download_sha256 and url fields.

    Returns
    -------
    dict
        A dictionary mapping file hashes to URLs, keyed in the order the hashes
        first appear in file_hashes. The value is None when the matching entry
        has no url field.
    """
    # Index the entries once (first occurrence wins) instead of rescanning
    # combined_data for every hash.
    first_url_by_hash = {}
    for entry in combined_data:
        sha256 = entry.get("download_sha256")
        if isinstance(sha256, str) and sha256 not in first_url_by_hash:
            first_url_by_hash[sha256] = entry.get("url")

    hash_to_url = {}
    for file_hash in file_hashes:
        if file_hash in first_url_by_hash:
            hash_to_url[file_hash] = first_url_by_hash[file_hash]
    return hash_to_url

In [6]:
# Create the mapping of file hashes to URLs
hash_to_url_map = create_hash_to_url_map(file_hashes, combined_data)

# Show one (hash, URL) pair as a sanity check.
# next() raises StopIteration here if the mapping is empty.
first_key, first_value = next(iter(hash_to_url_map.items()))
print(first_key, first_value)

00090904f5cf8855553fb323ee9a1d1fc089e75c948f560ed5b95eaa914a61de https://www.infinitumit.com.tr/en/conti-ransomware-group-behind-the-karakurt-hacking-team/


In [7]:
# Number of file hashes for which a URL was found
len(hash_to_url_map)

1637

In [8]:
## Common file hashes found: {'bd90e5d64d43cd326049d739d519c270d9f2856db6c1d140569f152b0fa3b757',
## 'acd626acf50af8e30a681ccf88662b2bcecd5ec6053c18d6b460a42d9d726764',
## 'a71555ff127721ad3f47e0427411dde35ec792889c2778ba43571d3a4b3f5cca'}
unique_hashes = len(hash_to_url_map)
print(f"Total unique file hashes: {unique_hashes}")

# Compare the number of mapped URLs with the number of distinct ones
urls = list(hash_to_url_map.values())
unique_urls = len(set(urls))

print(f"Total URLs: {len(urls)}")
print(f"Total unique URLs: {unique_urls}")

# Report duplicates only when the two counts disagree
if len(urls) != unique_urls:
    print(f"There are {len(urls) - unique_urls} duplicate URLs.")

    # Tally how many times each URL occurs
    url_counts = {}
    for url in urls:
        url_counts[url] = url_counts.get(url, 0) + 1

    # Keep only the URLs that occur more than once
    duplicate_urls = {url: count for url, count in url_counts.items() if count > 1}

    # print("Duplicate URLs and their occurrences:")
    # for url, count in duplicate_urls.items():
    #     print(f"URL: {url} -> Occurrences: {count}")

    # # Find the hashes associated with the duplicate URLs
    # print("\nHashes associated with duplicate URLs:")
    # for hash_key, url in hash_to_url_map.items():
    #     if url in duplicate_urls:
    #         print(f"Hash: {hash_key} -> URL: {url}")
else:
    print("All URLs are unique.")

Total unique file hashes: 1637
Total URLs: 1637
Total unique URLs: 1617
There are 20 duplicate URLs.


In [9]:
# Function to load the actors data from the actors_data.json file and malware families data from malware_families.json
def load_actors_data(file_path):
    """
    Parses the actors JSON file and returns its decoded content.

    Parameters
    ----------
    file_path : str
        Location of the actors JSON file (e.g. actors_data.json).

    Returns
    -------
    dict
        The decoded actors data.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        return json.load(handle)

def load_families_data(file_path):
    """
    Parses the malware families JSON file and returns its decoded content.

    Parameters
    ----------
    file_path : str
        Location of the malware families JSON file (e.g. malware_families.json).

    Returns
    -------
    dict
        The decoded families data.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        return json.load(handle)

In [10]:
import re
# Function to normalize group names
def normalize_group_name(name):
    """
    Normalizes a group name so that known spelling variants compare equal.

    The steps are applied in this order:

    1. lowercase and strip surrounding whitespace
    2. drop a trailing ' team' ('Sandworm Team' -> 'sandworm')
    3. 'threat group-' / 'threat group ' -> 'tg-'
    4. drop a leading 'temp.' / 'temp ' prefix
    5. collapse runs of dots and spaces into one space, then strip again
    6. drop a trailing ' framework' or ' group'
    7. rewrite the misspellings 'confucious' / 'conficious' to 'confucius'

    Parameters
    ----------
    name : str
        The group name to normalize.

    Returns
    -------
    str
        The normalized, lowercase group name.
    """
    # Convert to lowercase for case-insensitive comparison
    name = name.lower().strip()

    # Remove only a trailing ' team' (e.g. 'Sandworm Team'); str.replace would
    # also delete occurrences in the middle of the name.
    name = name.removesuffix(' team')

    # Replace 'threat group-' with 'tg-' (e.g., 'Threat Group-1314' -> 'TG-1314')
    name = re.sub(r'threat group[- ]', 'tg-', name)

    # Remove 'temp.' or similar prefixes (e.g., 'Temp.Pittytiger' -> 'Pittytiger')
    name = re.sub(r'^temp[\. ]+', '', name)

    # Normalize spaces and dots (e.g., 'pitty tiger' == 'pitty.tiger').
    # A leading/trailing dot becomes a space here, so strip once more;
    # otherwise the suffix check below would not match.
    name = re.sub(r'[\. ]+', ' ', name).strip()

    # Remove common suffixes like 'framework' or 'group' (e.g., 'Inception Framework' -> 'Inception')
    name = re.sub(r' (framework|group)$', '', name)

    # Standardize the misspellings 'Confucious' and 'Conficious' to 'confucius'
    name = re.sub(r'conf[iu]cious', 'confucius', name)

    return name


In [11]:
def create_url_to_group_map(actors_data, families_data):
    """
    Creates a mapping from URLs to group names based on the actors data.
    If a URL is tagged to multiple actors, it is stored as a list of names.

    Names are compared after normalization and each distinct name is recorded
    once per URL. A URL that one actor references several times, or that is
    referenced by two spellings of the same actor, therefore still counts as
    a single-group URL.

    When processing families_data:
    - If the URL is already present in the map, it is skipped.
    - Otherwise the family's attribution field is used (normalized).
    - If no attribution is present, the family name is used as-is.

    Parameters
    ----------
    actors_data : dict
        A dictionary containing the actors data. Each value may carry a
        ``meta`` dict with a ``refs`` list of URLs.
    families_data : dict
        A dictionary containing the families data. Each value may carry a
        ``urls`` list and an ``attribution`` list.

    Returns
    -------
    dict
        A dictionary mapping each URL to a single group name (str) or, when
        several distinct groups apply, to a list of group names.
    """
    # Collect every distinct normalized actor name per URL, in first-seen order
    actor_names_by_url = {}
    for group_name, group_info in actors_data.items():
        refs = group_info.get("meta", {}).get("refs", [])
        normalized_name = normalize_group_name(group_name)

        for url in refs:
            names = actor_names_by_url.setdefault(url, [])
            # Skip repeats so duplicates are not mistaken for multiple groups
            if normalized_name not in names:
                names.append(normalized_name)

    # A single name is stored as a plain string, several names as a list
    url_to_group_map = {
        url: (names[0] if len(names) == 1 else names)
        for url, names in actor_names_by_url.items()
    }

    # Families only fill in URLs that have no mapping yet
    for family_name, family_info in families_data.items():
        for url in family_info.get("urls", []):
            if url in url_to_group_map:
                # Skip processing if already mapped
                continue

            attribution = family_info.get("attribution", [])
            if attribution:
                # dict.fromkeys removes duplicates while keeping the original order
                mapped_names = list(dict.fromkeys(normalize_group_name(attr) for attr in attribution))
            else:
                mapped_names = [family_name]

            # Same convention as above: str for one name, list for several
            url_to_group_map[url] = mapped_names if len(mapped_names) > 1 else mapped_names[0]

    return url_to_group_map


In [12]:
## Input files (update with the actual file paths)
actors_data_file = "actors_data.json"
families_data_file = "malware_families.json"

# Load the actors and the malware families
actors_data = load_actors_data(actors_data_file)
families_data = load_families_data(families_data_file)

# Create URL-to-group mapping from both sources
url_to_group_map = create_url_to_group_map(actors_data, families_data)


In [13]:
def count_single_mapping_urls(url_to_group_map):
    """
    Counts the URLs whose mapping names exactly one group (threat group or family).

    A value counts as a single mapping when it is a string, or a list that
    holds exactly one element.

    Parameters
    ----------
    url_to_group_map : dict
        The final dictionary mapping URLs to group names (or lists of names).

    Returns
    -------
    int
        The count of URLs that are mapped to exactly one group.
    """
    def _is_single(groups):
        # A bare string is one group; a list qualifies only with one element
        if isinstance(groups, str):
            return True
        return isinstance(groups, list) and len(groups) == 1

    return sum(1 for groups in url_to_group_map.values() if _is_single(groups))


# Count the URLs that are attributed to exactly one group or family
result = count_single_mapping_urls(url_to_group_map)
print(result)

11935


In [14]:
def update_group_to_hash_url_map(hash_to_url_map, url_to_group_map):
    """
    Groups file hashes and their URLs under the group each URL is attributed to.

    A URL without an entry in url_to_group_map, or one attributed to more
    than one group, is placed under the "Unknown" group.

    Parameters
    ----------
    hash_to_url_map : dict
        A dictionary mapping file hashes to URLs.
    url_to_group_map : dict
        A dictionary mapping URLs to group names (can be a list of names).

    Returns
    -------
    dict
        A dictionary keyed by group name; each value is a dict with the
        parallel lists "hashes" and "urls".
    """
    group_to_hash_url_map = {}

    for file_hash, url in hash_to_url_map.items():
        groups = url_to_group_map.get(url, ["Unknown"])

        # Resolve the mapping value to one group name
        if not isinstance(groups, list):
            group_name = groups
        elif len(groups) > 1:
            # Ambiguous attribution is not assigned to any single group
            group_name = "Unknown"
        else:
            group_name = groups[0]

        # Fetch the group's bucket, creating it on first use
        bucket = group_to_hash_url_map.setdefault(group_name, {"hashes": [], "urls": []})
        bucket["hashes"].append(file_hash)
        bucket["urls"].append(url)

    return group_to_hash_url_map


In [15]:
# Group the matched hashes and URLs by attributed group and print the number
# of groups (an "Unknown" bucket, when present, counts as one group).
group_to_hash_url_map = update_group_to_hash_url_map(hash_to_url_map, url_to_group_map)
print(len(group_to_hash_url_map))
# first_key, first_value = next(iter(group_to_hash_url_map.items()))
# print(first_key, first_value)

589


In [16]:
# Calculate statistics
def calculate_statistics(group_map):
    """
    Summarizes a group-to-hash/URL map with four counts.

    Parameters
    ----------
    group_map : dict
        A dictionary where group names are keys, and each value contains a list of hashes and URLs.

    Returns
    -------
    tuple
        A tuple containing, in this order:
        - num_unique_groups: Number of group names (including 'Unknown')
        - num_hashes: Total number of hashes across all groups
        - num_unique_urls: Number of distinct URLs across all groups
        - unknown_groups_count: Number of hashes associated with the 'Unknown' group
    """
    per_group = list(group_map.values())

    # Distinct URLs over every group
    distinct_urls = {url for data in per_group for url in data['urls']}

    # Hashes summed over every group
    total_hashes = 0
    for data in per_group:
        total_hashes += len(data['hashes'])

    # Hashes filed under 'Unknown' (zero when that group does not exist)
    unknown_bucket = group_map.get('Unknown', {})
    unknown_hashes = unknown_bucket.get('hashes', [])

    return len(group_map), total_hashes, len(distinct_urls), len(unknown_hashes)

# Example usage:
# 'group_to_hash_url_map' is the result of update_group_to_hash_url_map
num_unique_groups, num_hashes, num_unique_urls, unknown_groups_count = calculate_statistics(group_to_hash_url_map)

# Print the statistics
# (the second line reports the total number of hashes summed over all groups)
print(f"Number of unique groups: {num_unique_groups}")
print(f"Length of the hash-to-group map: {num_hashes}")
print(f"Number of unique URLs: {num_unique_urls}")
print(f"Number of hashes with 'Unknown' group: {unknown_groups_count}")


Number of unique groups: 589
Length of the hash-to-group map: 1637
Number of unique URLs: 1617
Number of hashes with 'Unknown' group: 503


In [17]:
def get_file_hashes_and_indicators(directories, indicator_type="cves"):
    """
    Extracts file hashes from filenames and reads the content of the files to
    extract either CVEs or TTPs associated with each hash, based on the indicator_type parameter.

    Each file is read as tab-separated rows; a row contributes its second
    field when its first field equals the tag of the requested type.

    - "cves": reads every ``*.iocs`` file, keeps rows tagged ``cve``; the hash
      is the part of the filename before the first dot.
    - "ttps": reads every ``*.download.iocs`` file, keeps rows tagged ``ttp``;
      the hash is the filename without the ``.download.iocs`` suffix.
      Undecodable bytes in these files are ignored.

    Parameters
    ----------
    directories : list
        A list of directory paths containing the files.
    indicator_type : str, optional
        The type of indicators to extract. Accepts "cves" or "ttps". Default is "cves".

    Returns
    -------
    dict
        A dictionary where the key is the file hash, and the value is a list
        of CVEs or TTPs associated with that hash. Files that yield no
        indicator are omitted.

    Raises
    ------
    ValueError
        If indicator_type is neither "cves" nor "ttps".
    """
    # Per type: filename suffix, row tag, and how decoding errors are handled
    if indicator_type == "cves":
        suffix, tag, decode_errors = ".iocs", "cve", "strict"
    elif indicator_type == "ttps":
        suffix, tag, decode_errors = ".download.iocs", "ttp", "ignore"
    else:
        # An unsupported value used to produce an empty result silently
        raise ValueError("Invalid indicator_type. Must be 'cves' or 'ttps'.")

    hash_to_indicators = {}

    for directory in directories:
        for filename in os.listdir(directory):
            if not filename.endswith(suffix):
                continue

            if indicator_type == "ttps":
                file_hash = filename.removesuffix(".download.iocs")
            else:
                file_hash = filename.split('.')[0]

            indicators = []
            file_path = os.path.join(directory, filename)

            with open(file_path, 'r', encoding='utf-8', errors=decode_errors) as file:
                for line in file:
                    parts = line.strip().split("\t")
                    # Need both the tag and a value on the row
                    if len(parts) > 1 and parts[0] == tag:
                        indicators.append(parts[1])

            if indicators:
                hash_to_indicators[file_hash] = indicators

    return hash_to_indicators

In [21]:
# Indicator directories taken from the config ## use CVE_Malpedia if querying CVE
directories = [
    config["directory_paths"][key]
    for key in ("TTP_Malpedia_2022", "TTP_Malpedia_2024")
]

# Read the per-hash indicators; swap the two calls below to work on CVEs instead
#cve_data = get_file_hashes_and_indicators(directories, "cves")
ttp_data = get_file_hashes_and_indicators(directories, "ttps")


print(len(ttp_data))

1637


In [22]:
def update_group_hash_with_data(group_hash_to_url_map, data, data_type):
    """
    Updates the group_hash_to_url_map by adding TTPs or CVEs from the respective data dictionary.

    Every entry of a group's 'hashes' list is replaced by a dict of the form
    ``{"hash": <hash>, <data_type>: [...]}``. The map is modified in place and
    also returned.

    The function can be called repeatedly on the same map: entries that are
    already dicts (from an earlier call) keep their existing keys and get the
    ``data_type`` key added or refreshed. This makes re-running safe and lets
    both 'ttps' and 'cves' be attached to the same map.

    Parameters
    ----------
    group_hash_to_url_map : dict
        A dictionary where each group contains hashes and URLs.
    data : dict
        A dictionary where the keys are hashes and the values are lists of TTPs or CVEs.
    data_type : str
        The type of data being processed ('cves' or 'ttps').

    Returns
    -------
    dict
        The updated group_hash_to_url_map with TTPs or CVEs added for each hash.

    Raises
    ------
    ValueError
        If data_type is neither 'cves' nor 'ttps'.
    """
    if data_type not in ('cves', 'ttps'):
        raise ValueError("Invalid data_type. Must be 'cves' or 'ttps'.")

    for group_name, group_data in group_hash_to_url_map.items():
        updated_hashes = []

        for hash_entry in group_data['hashes']:
            if isinstance(hash_entry, dict):
                # Already annotated by a previous call: keep what is there
                record = dict(hash_entry)
                hash_value = record["hash"]
            else:
                hash_value = hash_entry
                record = {"hash": hash_value}

            if hash_value in data:
                record[data_type] = data[hash_value]  # key is 'cves' or 'ttps'
            else:
                print(f"Hash {hash_value} not found in {data_type}_data.")
                record[data_type] = []

            updated_hashes.append(record)

        group_data['hashes'] = updated_hashes

    return group_hash_to_url_map


In [23]:
#ttp_data.get("00090904f5cf8855553fb323ee9a1d1fc089e75c948f560ed5b95eaa914a61de")

In [24]:
# Attach the per-hash indicators to every group.
# NOTE: the update happens in place, so group_to_hash_url_map itself is
# modified by this call and updated_data refers to the same object.
# Re-run the earlier cells first if a pristine map is needed.

updated_data = update_group_hash_with_data(group_to_hash_url_map, ttp_data, "ttps")

# CVE variant (requires cve_data to be loaded):
#updated_data = update_group_hash_with_data(group_to_hash_url_map, cve_data, "cves")



In [25]:
# Number of groups in the annotated map
len(updated_data)

589

In [26]:
import json

def save_json(data, data_type):
    """
    Saves the given data to a JSON file with a dynamic filename based on the data type.

    The file is written to the current working directory as
    ``Malpedia_<data_type>_group_analysis.json`` and overwrites any existing
    file of that name.

    Parameters
    ----------
    data : dict
        The dictionary data to be saved.
    data_type : str
        The type of data being saved, e.g. 'cves' or 'ttps'; it becomes part
        of the filename.

    Returns
    -------
    str
        The name of the file that was written.
    """
    # Build the name here rather than reading a notebook-level `filename`
    # variable, so the function works on its own and honours data_type.
    filename = f"Malpedia_{data_type}_group_analysis.json"
    with open(filename, 'w', encoding='utf-8') as json_file:
        json.dump(data, json_file, indent=4)
    return filename  # Return filename for further use

def count_json_keys(json_file_path):
    """
    Loads a JSON file and reports how many top-level items it holds.

    Parameters
    ----------
    json_file_path : str
        The path to the JSON file.

    Returns
    -------
    int
        ``len()`` of the decoded document, i.e. the number of keys when the
        top-level value is an object.
    """
    with open(json_file_path, 'r') as handle:
        return len(json.load(handle))

# Example usage
# Indicator type of the analysis file to inspect: "ttps" or "cves".
# It is defined here so the cell also runs on a fresh kernel.
data_type = "ttps"
filename = f"Malpedia_{data_type}_group_analysis.json"
# Uncomment to (re)write the file from the in-memory result before counting:
#filename = save_json(updated_data, data_type)

num_keys = count_json_keys(filename)
print(f"Number of keys in the JSON file: {num_keys}")


Number of keys in the JSON file: 589


In [32]:
#combined_cve_group_info_by_group_id

In [28]:
from collections import Counter

def analyze_data(group_data_map, data_type="cves"):
    """
    Analyzes the given group data (CVE or TTP) and provides statistical insights.

    The results are printed and also returned.

    Args:
        group_data_map (dict): Dictionary where keys are group IDs and each value has a
            'hashes' list of dicts; every such dict holds a list under the key `data_type`.
        data_type (str): Either "cves" or "ttps" to specify the type of data being analyzed.

    Returns:
        dict: Analysis results with the keys
            - "total_count": number of items including repeats
            - "unique_count": number of distinct items
            - "top_10": up to ten (item, count) pairs, most frequent first
            - "unique_per_group": for each group that has any, the sorted list of
              items that occur in no other group

    Raises:
        KeyError: If a hash entry has no `data_type` key.
    """
    # One pass over the data: every item (with repeats) and the item set per group
    all_items = []
    items_by_group = {}
    for group_id, group_data in group_data_map.items():
        group_items = set()
        for hash_entry in group_data['hashes']:
            entry_items = hash_entry[data_type]
            all_items.extend(entry_items)
            group_items.update(entry_items)
        items_by_group[group_id] = group_items

    # Totals and the most frequent items
    total_items = len(all_items)
    item_counter = Counter(all_items)
    total_unique_items = len(item_counter)
    top_10_items = item_counter.most_common(10)

    # Count in how many groups each item occurs. An item is unique to a group
    # exactly when that number is 1, which avoids rebuilding the union of all
    # other groups for every single group.
    group_membership = Counter()
    for group_items in items_by_group.values():
        group_membership.update(group_items)

    unique_items_per_group = {}
    for group_id, group_items in items_by_group.items():
        # Sorted so the output does not depend on set iteration order
        exclusive_items = sorted(item for item in group_items if group_membership[item] == 1)
        if exclusive_items:
            unique_items_per_group[group_id] = exclusive_items

    # Print analysis results
    print(f"Total number of {data_type.upper()}: {total_items}")
    print(f"Total number of unique {data_type.upper()}: {total_unique_items}")
    print(f"Top 10 most common {data_type.upper()}:")
    for item, count in top_10_items:
        print(f"{item}: {count}")
    print(f"Number of groups with unique {data_type.upper()}: {len(unique_items_per_group)}")

    # Return analysis results as a dictionary
    return {
        "total_count": total_items,
        "unique_count": total_unique_items,
        "top_10": top_10_items,
        "unique_per_group": unique_items_per_group
    }

In [30]:
# Example usage: analyze the TTP-annotated group map
# (the CVE variant is kept below for reference)
#cve_analysis = analyze_data(updated_data, data_type="cves")
ttp_analysis = analyze_data(updated_data, data_type="ttps")

Total number of TTPS: 24398
Total number of unique TTPS: 876
Top 10 most common TTPS:
T1082: 530
T1083: 454
T1140: 429
T1027: 418
T1059: 367
T1041: 333
T1105: 326
T1057: 325
T1486: 320
T1071.001: 303
Number of groups with unique TTPS: 83
