In [38]:
import os
import json

# Load the config file (if you have a config.json file)
with open('config.json', 'r') as config_file:
    config = json.load(config_file)

In [39]:
# Function to read JSONL file
def read_jsonl(file_path):
    """
    Reads a JSONL file and returns a list of JSON objects.

    Parameters
    ----------
    file_path : str
        The path to the JSONL file.

    Returns
    -------
    list
        A list of JSON objects.
    """
    data = []

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            data.append(json.loads(line))

    return data


# Use os.path.join to construct paths properly
base_dir = os.getcwd()
malpedia_2022_path = os.path.join(base_dir, *config['jsonl_files']['Malpedia_2022'].split('/'))
malpedia_2024_path = os.path.join(base_dir, *config['jsonl_files']['Malpedia_2024'].split('/'))

# Print to verify
print(f"2022 path: {malpedia_2022_path}")
print(f"2024 path: {malpedia_2024_path}")

# Read the files
malpedia_2022_data = read_jsonl(malpedia_2022_path)
malpedia_2024_data = read_jsonl(malpedia_2024_path)

# Combine
combined_data = malpedia_2022_data + malpedia_2024_data
print(f"Total number of entries: {len(combined_data)}")
print("First entry from the combined data:")
print(combined_data[0])


2022 path: C:\Users\ricewater\Documents\CTITTP\Malpedia Bib files Analysis\data\malpedia-db_2022-07-18_downloader.jsonl
2024 path: C:\Users\ricewater\Documents\CTITTP\Malpedia Bib files Analysis\data\20241204_downloads.jsonl
Total number of entries: 15768
First entry from the combined data:
{'author': 'CERT Division', 'date': '2000', 'download_mime': 'text/html', 'download_redirects': ['https://resources.sei.cmu.edu/library/asset-view.cfm?assetID=496186'], 'download_sha256': 'f2c405b383ebaf4d0793f8d5162841b953d06947a711f7d34242faa20e285a04', 'download_size': 41745, 'download_status': 200, 'download_ts': '2022-07-19 12:43:41.400938+00:00', 'language': 'English', 'organization': 'Carnegie Mellon University', 'origin': ['malpedia:CarnegieMellonUniversity'], 'title': '2000 CERT Advisories', 'url': 'https://resources.sei.cmu.edu/library/asset-view.cfm?assetID=496186'}


In [41]:
def get_file_hashes(directories):
    """
    Extracts file hashes from filenames in the given directories.

    Parameters
    ----------
    directories : list
        A list of directory paths containing the files.

    Returns
    -------
    list
        A list of file hashes extracted from the filenames.
    """
    file_hashes = []
    
    for directory in directories:
        for filename in os.listdir(directory):
            if filename.endswith(".iocs"):
                # Extract the hash from the filename (everything before '.iocs')
                file_hash = filename.split('.')[0]
                file_hashes.append(file_hash)

            elif filename.endswith(".download.iocs"):
                # Remove the '.download.iocs' suffix from the filename
                file_hash = filename.removesuffix(".download.iocs")
                file_hashes.append(file_hash)

    # Remove empty values or invalid hashes
    #file_hashes = [h for h in file_hashes if h.strip()]
    
    return file_hashes

In [48]:
# Get directories from the config ## use CVE_Malpedia if querying CVE
directories = [
    config["directory_paths"]["TTP_Malpedia_2022"],
    config["directory_paths"]["TTP_Malpedia_2024"]
]

# Extract file hashes from the directory
file_hashes = get_file_hashes(directories)

In [49]:
len(file_hashes)

1650

In [50]:
# Function to map file hash to url
def create_hash_to_url_map(file_hashes, combined_data):
    """
    Maps file hashes to URLs based on the download_sha256 field in the combined_data.

    Parameters
    ----------
    file_hashes : list
        A list of file hashes.
    combined_data : list
        A list of dictionaries (combined JSONL data) with download_sha256 and url fields.

    Returns
    -------
    dict
        A dictionary mapping file hashes to URLs.
    """
    hash_to_url = {}
    for file_hash in file_hashes:
        for entry in combined_data:
            if entry.get("download_sha256") == file_hash:
                hash_to_url[file_hash] = entry.get("url")
                break  # Stop searching once we find the match for the hash
    return hash_to_url

In [51]:
# Create the mapping of file hashes to URLs
hash_to_url_map = create_hash_to_url_map(file_hashes, combined_data)

first_key, first_value = next(iter(hash_to_url_map.items()))
print(first_key, first_value)

00090904f5cf8855553fb323ee9a1d1fc089e75c948f560ed5b95eaa914a61de https://www.infinitumit.com.tr/en/conti-ransomware-group-behind-the-karakurt-hacking-team/


In [52]:
hash_to_url_map.get("cb10915f45e3c27ccd203dd3f69aad162802d8db568c9010ee696ff631caa41e")

'https://www.clearskysec.com/wp-content/uploads/2021/01/Lebanese-Cedar-APT.pdf'

In [53]:
##Common file hashes found: {'bd90e5d64d43cd326049d739d519c270d9f2856db6c1d140569f152b0fa3b757', 
##'acd626acf50af8e30a681ccf88662b2bcecd5ec6053c18d6b460a42d9d726764', 
##'a71555ff127721ad3f47e0427411dde35ec792889c2778ba43571d3a4b3f5cca'}
unique_hashes = len(hash_to_url_map)
print(f"Total unique file hashes: {unique_hashes}")

# Check if the URLs are unique
urls = list(hash_to_url_map.values())
unique_urls = len(set(urls))

print(f"Total URLs: {len(urls)}")
print(f"Total unique URLs: {unique_urls}")

# Check if there are any duplicate URLs
if len(urls) == unique_urls:
    print("All URLs are unique.")
else:
    print(f"There are {len(urls) - unique_urls} duplicate URLs.")

    # Find and print duplicate URLs
    url_counts = {}
    
    # Count the occurrences of each URL
    for url in urls:
        if url in url_counts:
            url_counts[url] += 1
        else:
            url_counts[url] = 1

    # Filter out the duplicate URLs (those that appear more than once)
    duplicate_urls = {url: count for url, count in url_counts.items() if count > 1}

    # print("Duplicate URLs and their occurrences:")
    # for url, count in duplicate_urls.items():
    #     print(f"URL: {url} -> Occurrences: {count}")

    # #Find the hashes associated with the duplicate URLs
    # print("\nHashes associated with duplicate URLs:")
    # for hash_key, url in hash_to_url_map.items():
    #     if url in duplicate_urls:
    #         print(f"Hash: {hash_key} -> URL: {url}")

Total unique file hashes: 1639
Total URLs: 1639
Total unique URLs: 1619
There are 20 duplicate URLs.


In [54]:
# Function to load the actors data from the actors_data.json file and malware families data from malware_families.json
def load_actors_data(file_path):
    """
    Loads the actors data from a JSON file.

    Parameters
    ----------
    file_path : str
        The path to the actors_data.json file.

    Returns
    -------
    dict
        A dictionary containing the actors data.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        actors_data = json.load(f)
    return actors_data

def load_families_data(file_path):
    """
    Loads the malware families data from a JSON file.

    Parameters
    ----------
    file_path : str
        The path to the actors_data.json file.

    Returns
    -------
    dict
        A dictionary containing the families data.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        families_data = json.load(f)
    return families_data

In [55]:
import re
# Function to normalize group names
def normalize_group_name(name):
    # Convert to lowercase for case-insensitive comparison
    name = name.lower().strip()

    # Remove 'team' from names like 'Sandworm Team'
    if name.endswith(' team'):
        name = name.replace(' team', '')

    # Replace 'threat group-' with 'tg-' (e.g., 'Threat Group-1314' -> 'TG-1314')
    name = re.sub(r'threat group[- ]', 'tg-', name)

    # Remove 'temp.' or similar prefixes (e.g., 'Temp.Pittytiger' -> 'Pittytiger')
    name = re.sub(r'^temp[\. ]+', '', name)

    # Normalize spaces and dots (e.g., 'pitty tiger' == 'pitty.tiger')
    name = re.sub(r'[\. ]+', ' ', name)

    # Remove common suffixes like 'framework' or 'group' (e.g., 'Inception Framework' -> 'Inception')
    name = re.sub(r' (framework|group)$', '', name)

    # Standardize 'Confucius' and 'Confucious' to 'confucius'
    name = re.sub(r'confucious', 'confucius', name)

    return name


In [60]:
def create_url_to_group_map(actors_data, families_data):
    """
    Creates a mapping from URLs to group names based on the actors and families data,
    and records the origin of each mapping: 'actor', 'attribution', or 'family'.

    Returns
    -------
    dict
        A dictionary mapping each URL to a dict with 'group' and 'source'.
    """
    url_to_group_map = {}

    # First: process actors_data
    for group_name, group_info in actors_data.items():
        refs = group_info.get("meta", {}).get("refs", [])
        normalized_name = normalize_group_name(group_name)

        for url in refs:
            if url not in url_to_group_map:
                url_to_group_map[url] = {
                    "group": normalized_name,
                    "source": "actor"
                }
            else:
                # Support multiple mappings from different sources
                existing = url_to_group_map[url]
                if isinstance(existing["group"], list):
                    existing["group"].append(normalized_name)
                else:
                    existing["group"] = [existing["group"], normalized_name]
                # Keep the first source, or you can also store all sources as a list if needed

    # Then: process families_data
    for family_name, family_info in families_data.items():
        for url in family_info.get("urls", []):
            if url in url_to_group_map:
                continue  # already mapped via actor

            attribution = family_info.get("attribution", [])
            if attribution:
                mapped_names = [normalize_group_name(attr) for attr in attribution]
                source_type = "attribution"
            else:
                mapped_names = [family_name]
                source_type = "family"

            group_value = mapped_names if len(mapped_names) > 1 else mapped_names[0]
            url_to_group_map[url] = {
                "group": group_value,
                "source": source_type
            }

    return url_to_group_map


In [61]:
##Load the actors data
actors_data_file = "actors_data.json"  # Update with the actual file path
actors_data = load_actors_data(actors_data_file)
families_data_file = "malware_families.json"
families_data = load_families_data(families_data_file)
# Create URL-to-group mapping
url_to_group_map = create_url_to_group_map(actors_data, families_data)


In [62]:
from collections import Counter
source_counter = Counter()
for entry in url_to_group_map.values():
    source = entry.get("source", "unknown")
    source_counter[source] += 1

In [59]:
source_counter

Counter({'family': 6640, 'attribution': 5104, 'actor': 2557})

In [70]:
def update_group_to_hash_url_map(hash_to_url_map, url_to_group_map):
    """
    Updates the group-to-hash-and-url map with group names as keys.
    Each entry includes:
      - list of file hashes
      - list of URLs with their source attribution ('actor', 'attribution', 'family')

    If a URL is associated with multiple groups, it's assigned to 'Unknown'.

    Parameters
    ----------
    hash_to_url_map : dict
        A dictionary mapping file hashes to URLs.

    url_to_group_map : dict
        A dictionary mapping URLs to dicts with keys 'group' and 'source'.

    Returns
    -------
    dict
        A dictionary where group names are keys and values include lists of hashes and URL-source pairs.
    """
    group_to_hash_url_map = {}

    for file_hash, url in hash_to_url_map.items():
        group_entry = url_to_group_map.get(url)

        if not group_entry:
            group_name = "Unknown"
            source = "unknown"
        else:
            group_info = group_entry.get("group", "Unknown")
            source = group_entry.get("source", "unknown")

            if isinstance(group_info, list):
                group_name = "Unknown" if len(group_info) > 1 else group_info[0]
            else:
                group_name = group_info

        # Initialize if not yet present
        if group_name not in group_to_hash_url_map:
            group_to_hash_url_map[group_name] = {
                "hashes": [],
                "urls": []
            }

        group_to_hash_url_map[group_name]["hashes"].append(file_hash)
        group_to_hash_url_map[group_name]["urls"].append({
            "url": url,
            "source": source
        })

    return group_to_hash_url_map


In [71]:
group_to_hash_url_map = update_group_to_hash_url_map(hash_to_url_map, url_to_group_map)
print(len(group_to_hash_url_map))
#first_key, first_value = next(iter(group_to_hash_url_map.items()))
#print(first_key, first_value)

591


In [73]:
def calculate_statistics(group_map):
    """
    Calculate statistics for the group-to-hash-url map.

    Parameters
    ----------
    group_map : dict
        A dictionary where group names are keys. Each value is a dict with:
        - 'hashes': list of hashes
        - 'urls': list of dicts with keys 'url' and 'source'

    Returns
    -------
    dict
        A dictionary with:
        - num_unique_groups
        - num_unique_families
        - num_hashes
        - num_unique_urls
        - unknown_groups_count
    """
    unique_urls = set()
    num_hashes = 0
    unknown_groups_count = 0

    unique_group_names = set()
    unique_family_names = set()

    for group_name, data in group_map.items():
        urls = data.get("urls", [])
        hashes = data.get("hashes", [])

        num_hashes += len(hashes)
        unique_urls.update(url_entry['url'] for url_entry in urls)

        if group_name.lower() == "unknown":
            unknown_groups_count += len(hashes)
            continue

        # Determine source types
        for url_entry in urls:
            source = url_entry.get("source", "unknown")
            if source == "family":
                unique_family_names.add(group_name)
            else:  # actor or attribution
                unique_group_names.add(group_name)

    return {
        "num_unique_groups": len(unique_group_names),
        "num_unique_families": len(unique_family_names),
        "num_hashes": num_hashes,
        "num_unique_urls": len(unique_urls),
        "unknown_groups_count": unknown_groups_count
    }


In [74]:
stats = calculate_statistics(group_to_hash_url_map)

print(f"Number of unique groups (actor/attribution): {stats['num_unique_groups']}")
print(f"Number of unique families: {stats['num_unique_families']}")
print(f"Total number of hashes: {stats['num_hashes']}")
print(f"Total unique URLs: {stats['num_unique_urls']}")
print(f"Number of hashes with 'Unknown' group: {stats['unknown_groups_count']}")


Number of unique groups (actor/attribution): 215
Number of unique families: 375
Total number of hashes: 1639
Total unique URLs: 1619
Number of hashes with 'Unknown' group: 503


In [78]:
def get_file_hashes_and_indicators(directories, indicator_type="cves"):
    """
    Extracts file hashes from filenames and reads the content of the files to
    extract either CVEs or TTPs associated with each hash, based on the indicator_type parameter.

    Parameters
    ----------
    directories : list
        A list of directory paths containing the files.
    indicator_type : str, optional
        The type of indicators to extract. Accepts "cve" or "ttp". Default is "cve".

    Returns
    -------
    dict
        A dictionary where the key is the file hash, and the value is a list
        of CVEs or TTPs associated with that hash.
    """
    hash_to_indicators = {}
    
    for directory in directories:
        for filename in os.listdir(directory):
            file_path = os.path.join(directory, filename)
            
            if indicator_type == "cves" and filename.endswith(".iocs"):
                file_hash = filename.split('.')[0]
                indicators = []
                
                with open(file_path, 'r', encoding='utf-8') as file:
                    for line in file:
                        parts = line.strip().split("\t")
                        if parts[0] == "cve" and len(parts) > 1:
                            indicators.append(parts[1])
                
                if indicators:
                    hash_to_indicators[file_hash] = indicators

            elif indicator_type == "ttps" and filename.endswith(".download.iocs"):
                # Remove the '.download.iocs' suffix from the filename
                file_hash = filename.removesuffix(".download.iocs")
                file_hashes.append(file_hash) # Remove ".download.iocs" from filename
                indicators = []
                
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
                    for line in file:
                        parts = line.strip().split("\t")
                        if len(parts) > 1 and parts[0] == "ttp":
                            indicators.append(parts[1])
                
                if indicators:
                    hash_to_indicators[file_hash] = indicators
    
    return hash_to_indicators

In [79]:
#cve_data = get_file_hashes_and_indicators(directories, "cves")
ttp_data = get_file_hashes_and_indicators(directories, "ttps")


print(len(ttp_data))

1639


In [81]:
def update_group_hash_with_data(group_hash_to_url_map, data, data_type):
    """
    Updates the group_hash_to_url_map by adding TTPs or CVEs from the respective data dictionary.
    
    Parameters
    ----------
    group_hash_to_url_map : dict
        A dictionary where each group contains hashes and URLs.
    data : dict
        A dictionary where the keys are hashes and the values are lists of TTPs or CVEs.
    data_type : str
        The type of data being processed ('cve' or 'ttp').
    
    Returns
    -------
    dict
        The updated group_hash_to_url_map with TTPs or CVEs added for each hash.
    """
    if data_type not in ('cves', 'ttps'):
        raise ValueError("Invalid data_type. Must be 'cve' or 'ttp'.")
    
    for group_name, group_data in group_hash_to_url_map.items():
        updated_hashes = []
        
        for hash_value in group_data['hashes']:
            if hash_value in data:
                updated_hashes.append({
                    "hash": hash_value,
                    data_type: data[hash_value]  # Dynamically set key as 'cve' or 'ttp'
                })
            else:
                print(f"Hash {hash_value} not found in {data_type}_data.")
                updated_hashes.append({
                    "hash": hash_value,
                    data_type: []
                })
        
        group_hash_to_url_map[group_name]['hashes'] = updated_hashes
    
    return group_hash_to_url_map


In [82]:
# Call the function to update the map  --> May need to restart it if already ran once or else it flashes error


updated_data = update_group_hash_with_data(group_to_hash_url_map, ttp_data, "ttps")

#updated_data = update_group_hash_with_data(group_to_hash_url_map, cve_data, "cves")

In [28]:
def save_json(data, data_type):
    """
    Saves the given data to a JSON file with a dynamic filename based on the data type.

    Parameters
    ----------
    data : dict
        The dictionary data to be saved.
    data_type : str
        The type of data being saved, either 'cve' or 'ttp'.
    """
    with open(filename, 'w') as json_file:
        json.dump(data, json_file, indent=4)
    return filename  # Return filename for further use

def count_json_keys(json_file_path):
    """
    Reads a JSON file and counts the number of keys in it.

    Parameters
    ----------
    json_file_path : str
        The path to the JSON file.

    Returns
    -------
    int
        The number of keys in the JSON file.
    """
    with open(json_file_path, 'r') as file:
        data = json.load(file)
    
    return len(data)

# Example usage
data_type = "cve"
#data_type = "ttp" # Change to "cve" when dealing with CVE data
filename = f"Malpedia_{data_type}_group_analysis.json"
filename = save_json(updated_data, data_type)

num_keys = count_json_keys(filename)
print(f"Number of keys in the JSON file: {num_keys}")


Number of keys in the JSON file: 591


In [83]:
from collections import Counter, defaultdict

def analyze_data(group_data_map, data_type="cves"):
    """
    Analyzes CVEs or TTPs across groups with breakdowns for actor/attribution and family sources.

    Parameters
    ----------
    group_data_map : dict
        Dictionary with group names as keys. Each value contains:
        - 'hashes': list of dicts with 'hash' and data_type (e.g., 'cves')
        - 'urls': list of dicts with 'url' and 'source'

    data_type : str
        Type of data to analyze: 'cves' or 'ttps'

    Returns
    -------
    dict
        Dictionary with analysis statistics.
    """
    all_items = []
    group_item_map = defaultdict(set)
    group_source_type = {}  # group_id -> "family" or "group" (actor/attribution)

    for group_id, group_data in group_data_map.items():
        # Determine if group is family-based or actor/attribution
        sources = {entry["source"] for entry in group_data.get("urls", [])}
        if sources == {"family"}:
            group_source_type[group_id] = "family"
        else:
            group_source_type[group_id] = "group"

        # Collect data items
        for hash_data in group_data["hashes"]:
            items = hash_data.get(data_type, [])
            group_item_map[group_id].update(items)
            all_items.extend(items)

    # === Global stats ===
    total_items = len(all_items)
    unique_items = set(all_items)
    total_unique_items = len(unique_items)
    item_counter = Counter(all_items)
    top_10_items = item_counter.most_common(10)


    # === Per-group unique items (actor/attribution only, excluding "Unknown") ===
    unique_items_per_group = {}
    for group_id, items in group_item_map.items():
        if group_source_type.get(group_id) != "group" or group_id.lower() == "unknown":
            continue  # Skip family and 'Unknown' groups
    
        other_items = set()
        for other_group, other_group_items in group_item_map.items():
            if other_group != group_id:
                other_items.update(other_group_items)
    
        unique = items - other_items
        if unique:
            unique_items_per_group[group_id] = list(unique)


    # === Per-item group appearances ===
    item_group_count = Counter()
    for item in unique_items:
        for group_id, items in group_item_map.items():
            if item in items:
                item_group_count[item] += 1
    top_10_common_across_groups = item_group_count.most_common(10)

    # === Source-aware stats ===
    group_items = []
    family_items = []

    for group_id, items in group_item_map.items():
        if group_source_type[group_id] == "group":
            group_items.extend(items)
        elif group_source_type[group_id] == "family":
            family_items.extend(items)

    top_10_group_only = Counter(group_items).most_common(10)
    top_10_family_only = Counter(family_items).most_common(10)

    # === Output summary ===
    print(f"Total number of {data_type.upper()}: {total_items}")
    print(f"Total number of unique {data_type.upper()}: {total_unique_items}")

    print(f"\nTop 10 most common {data_type.upper()} (overall occurrences):")
    for item, count in top_10_items:
        print(f"{item}: {count}")

    print(f"\nTop 10 most common {data_type.upper()} (across groups):")
    for item, count in top_10_common_across_groups:
        print(f"{item}: {count} groups")

    print(f"\nTop 10 {data_type.upper()} in groups (actor/attribution):")
    for item, count in top_10_group_only:
        print(f"{item}: {count}")

    print(f"\nTop 10 {data_type.upper()} in families:")
    for item, count in top_10_family_only:
        print(f"{item}: {count}")

    print(f"\n{data_type.upper()} count per group:")
    # Uncomment below if you want a full printout
    # for group_id, count in count_per_group.items():
    #     print(f"{group_id}: {count}")

    print(f"\nNumber of groups with unique {data_type.upper()}: {len(unique_items_per_group)}")

    # Show a few example groups and their unique items
    print(f"\nExample groups with unique {data_type.upper()}:")
    for group_id, uniques in list(unique_items_per_group.items())[:10]:  # Show first 5
        print(f"  Group: {group_id}")
        for item in uniques[:5]:  # Show up to 5 CVEs/TTPs per group
            print(f"    - {item}")
        if len(uniques) > 5:
            print(f"    ... and {len(uniques) - 5} more")



    return {
        "total_count": total_items,
        "unique_count": total_unique_items,
        "top_10_overall": top_10_items,
        "top_10_across_groups": top_10_common_across_groups,
        "top_10_groups_only": top_10_group_only,
        "top_10_families_only": top_10_family_only,
        "unique_per_group": unique_items_per_group,
        #"count_per_group": count_per_group
    }


In [84]:
# Example usage
#cve_analysis = analyze_data(updated_data, data_type="cves")
ttp_analysis = analyze_data(updated_data, data_type="ttps")

Total number of TTPS: 24457
Total number of unique TTPS: 879

Top 10 most common TTPS (overall occurrences):
T1082: 531
T1083: 455
T1140: 430
T1027: 419
T1059: 368
T1041: 334
T1105: 327
T1057: 325
T1486: 320
T1071.001: 303

Top 10 most common TTPS (across groups):
T1082: 269 groups
T1083: 243 groups
T1140: 236 groups
T1027: 216 groups
T1059: 187 groups
T1041: 176 groups
T1105: 170 groups
T1057: 168 groups
T1071.001: 149 groups
T1486: 149 groups

Top 10 TTPS in groups (actor/attribution):
T1082: 102
T1027: 101
T1140: 100
T1083: 97
T1059: 97
T1105: 88
T1041: 79
T1071: 75
T1057: 71
T1005: 70

Top 10 TTPS in families:
T1082: 167
T1083: 146
T1140: 136
T1486: 116
T1027: 115
T1057: 97
T1041: 97
T1059: 90
T1112: 86
T1071.001: 83

TTPS count per group:

Number of groups with unique TTPS: 33

Example groups with unique TTPS:
  Group: ta505
    - T1488
    - T1138
  Group: kimsuky
    - T1035.005
  Group: apt32
    - T1493
    - T1216.001
    - T1150
  Group: the gorgon
    - T1011
  Group: turla