In [22]:
import json
import os
import pandas as pd
import tldextract
from collections import Counter
from urllib.parse import urlparse

In [3]:
# Step 1: config.json is expected one directory above the current working directory
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
config_path = os.path.normpath(os.path.join(parent_dir, 'config.json'))

with open(config_path, mode='r', encoding='utf-8') as f:
    config = json.loads(f.read())

# Step 2: Paths stored in the config are resolved against the folder holding config.json
config_dir = os.path.split(config_path)[0]

In [4]:
def extract_fqdn_and_date(file_paths: list):
    """
    Collect the host (FQDN) and date of every entry in one or more JSONL files.

    Each non-blank line is parsed as a JSON object. The FQDN is the ``netloc``
    component of the entry's 'url' field as returned by ``urlparse``; the date
    is the raw 'date' field. Missing fields default to an empty string, so an
    entry without a URL contributes an empty FQDN.

    Args:
        file_paths (list): Paths to JSONL files (one JSON object per line).

    Returns:
        tuple: ``(results, urls)`` where ``results`` is a list of
        ``(fqdn, date)`` tuples and ``urls`` is the list of raw 'url' values.
        Both follow file/line order and keep duplicates.

    Raises:
        OSError: If a file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    results = []
    urls = []

    for file_path in file_paths:
        # Explicit UTF-8 so decoding does not depend on the platform default
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                # Tolerate blank lines (e.g. an empty line at the end of the file)
                if not line.strip():
                    continue

                entry = json.loads(line)

                url = entry.get("url", "")
                urls.append(url)

                date = entry.get("date", "")

                # netloc is the host part of the URL (it keeps any port)
                results.append((urlparse(url).netloc, date))

    return results, urls

In [5]:
# Resolve the two Malpedia JSONL files relative to the config directory
malpedia_2022_jsonl = os.path.normpath(
    os.path.join(config_dir, config["jsonl_files"]["Malpedia_2022"])
)
malpedia_2024_jsonl = os.path.normpath(
    os.path.join(config_dir, config["jsonl_files"]["Malpedia_2024"])
)

# Parse both files in a single call
malpedia_fqdn_and_dates, malpedia_urls = extract_fqdn_and_date(
    [malpedia_2022_jsonl, malpedia_2024_jsonl]
)

# First line: number of distinct FQDNs; second line: number of distinct URLs
print(len({fqdn for fqdn, _ in malpedia_fqdn_and_dates}))
print("Total unique URLs:", len(set(malpedia_urls)))

2002
Total unique URLs: 15699


In [23]:
def extract_successful_urls(file_paths: "list | str"):
    """
    Collect the URLs and FQDNs of entries whose 'download_status' is 200.

    Each non-blank line of every file is parsed as a JSON object. Entries
    whose 'download_status' does not equal the integer 200 are ignored.

    Args:
        file_paths (list | str): Paths to JSONL files. A single path given as
            a string is treated as a one-element list.

    Returns:
        tuple: ``(successful_urls, successful_fqdn)``. ``successful_urls``
        holds the raw 'url' value of every successful entry (an empty string
        when the field is missing). ``successful_fqdn`` holds the ``netloc``
        of those URLs, omitting empty ones. Duplicates are kept in both lists.

    Raises:
        OSError: If a file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # A bare string would otherwise be iterated character by character
    if isinstance(file_paths, str):
        file_paths = [file_paths]

    successful_urls = []
    successful_fqdn = []

    for file_path in file_paths:
        # Explicit UTF-8 so decoding does not depend on the platform default
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                # Tolerate blank lines (e.g. an empty line at the end of the file)
                if not line.strip():
                    continue

                entry = json.loads(line)
                if entry.get('download_status') != 200:
                    continue

                url = entry.get("url", "")
                successful_urls.append(url)

                # Only non-empty hosts are recorded
                fqdn = urlparse(url).netloc
                if fqdn:
                    successful_fqdn.append(fqdn)

    return successful_urls, successful_fqdn

In [24]:
# Resolve the two MITRE JSONL files named in the config, relative to the config directory
mitre_enterprise_jsonl = os.path.normpath(
    os.path.join(config_dir, config["jsonl_files"]["MITRE_enterprise"])
)
mitre_mobileics_jsonl = os.path.normpath(
    os.path.join(config_dir, config["jsonl_files"]["MITRE_mobileics"])
)
# Both files are processed together
combined_json_files = [mitre_enterprise_jsonl, mitre_mobileics_jsonl]



In [25]:
## The following analysis is on the URLs from the MITRE JSONL files
# Only entries with download_status 200 are returned by extract_successful_urls
mitre_successful_urls, mitre_successful_fqdns = extract_successful_urls(
    combined_json_files
)
print("FQDNS:", len(set(mitre_successful_fqdns)))
print("URLS:", len(set(mitre_successful_urls)))

FQDNS: 183
URLS: 809


In [29]:
# ATTACK group file paths: one spreadsheet per domain, all under the configured data directory
group_paths = [
    os.path.normpath(
        os.path.join(
            config_dir,
            config["data_directory"],
            config["file_paths_groups_v15"][domain],
        )
    )
    for domain in ("enterprise", "mobile", "ics")
]

In [30]:
## The following analysis is from the citation URLs in the ATTACK group spreadsheets
# Read the 'citations' sheet of each spreadsheet and collect the host (netloc)
# of every non-null value in its 'url' column.
mitre_all_fqdns = []

for path in group_paths:
    try:
        # NOTE: despite the name, `df` is the 'url' column (a Series), not a DataFrame
        df = pd.read_excel(path, sheet_name='citations')['url'].dropna()
    except Exception as e:
        # Best effort: report the unreadable spreadsheet and continue with the others
        print(f"Error processing {path}: {e}")
        continue

    print(f"{os.path.basename(path)}: {len(df)} URLs, {df.nunique()} unique URLs")

    for url in df:
        try:
            # str() guards against non-text cells, which urlparse cannot handle
            fqdn = urlparse(str(url)).netloc
        except ValueError as e:
            # urlparse rejects some malformed URLs; skip only this one so the
            # rest of the spreadsheet is still counted
            print(f"Error processing {path}: {e}")
            continue
        if fqdn:
            mitre_all_fqdns.append(fqdn)

print(f"Total unique FQDNs across all files: {len(set(mitre_all_fqdns))}")

enterprise-attack-v15.1-groups.xlsx: 1372 URLs, 905 unique URLs
mobile-attack-v15.1-groups.xlsx: 85 URLs, 74 unique URLs
ics-attack-v15.1-groups.xlsx: 159 URLs, 131 unique URLs
Total unique FQDNs across all files: 218


In [32]:
def compare_fqdns(malpedia_fqdn_and_dates: list, mitre_successful_fqdns: list, top_n: int = 10):
    """
    Compare the FQDNs of two sources (A = Malpedia, B = MITRE).

    Computes the union, the overlap and both set differences of the unique
    FQDNs, plus the most frequent FQDNs of each source. Empty FQDN strings
    are not filtered here; they are treated like any other value.

    Args:
        malpedia_fqdn_and_dates (list): ``(fqdn, date)`` tuples (source A).
            Only the FQDN of each tuple is used.
        mitre_successful_fqdns (list): FQDN strings (source B).
        top_n (int): How many of the most frequent FQDNs to report per
            source. Defaults to 10.

    Returns:
        dict: With the keys
            'union', 'overlap', 'A-B', 'B-A': sets of FQDNs;
            'union_count', 'overlap_count', 'A-B_count', 'B-A_count': their sizes;
            'top_malpedia_fqdns', 'top_mitre_fqdns': lists of ``(fqdn, count)``
            tuples, most frequent first.
    """
    # One pass per source: the counters provide both frequencies and unique values
    malpedia_fqdn_counter = Counter(fqdn for fqdn, _ in malpedia_fqdn_and_dates)
    mitre_fqdn_counter = Counter(mitre_successful_fqdns)

    unique_malpedia = set(malpedia_fqdn_counter)
    unique_mitre = set(mitre_fqdn_counter)

    union_set = unique_malpedia | unique_mitre
    overlap = unique_malpedia & unique_mitre
    # A - B: FQDNs only in Malpedia; B - A: FQDNs only in MITRE
    a_minus_b = unique_malpedia - unique_mitre
    b_minus_a = unique_mitre - unique_malpedia

    return {
        'union': union_set,
        'overlap': overlap,
        'A-B': a_minus_b,
        'B-A': b_minus_a,
        'union_count': len(union_set),
        'overlap_count': len(overlap),
        'A-B_count': len(a_minus_b),
        'B-A_count': len(b_minus_a),
        'top_malpedia_fqdns': malpedia_fqdn_counter.most_common(top_n),
        'top_mitre_fqdns': mitre_fqdn_counter.most_common(top_n),
    }

In [33]:
# Compare Malpedia FQDNs (A) with the FQDNs cited in the MITRE spreadsheets (B)
comparison_results = compare_fqdns(malpedia_fqdn_and_dates, mitre_all_fqdns)

# Set sizes; the overlap and B - A sets are also printed in full
print("Union (A  U B) Count: {}".format(comparison_results['union_count']))
print("Overlap (A ∩ B): {}, Count: {}".format(
    comparison_results['overlap'], comparison_results['overlap_count']
))
print("A - B (In Malpedia but not in MITRE) Count: {}".format(comparison_results['A-B_count']))
print("B - A (In MITRE but not in Malpedia): {}, Count: {}".format(
    comparison_results['B-A'], comparison_results['B-A_count']
))

# Most frequent FQDNs per source, one "<fqdn>: <count> times" line each
print("\nTop FQDNs in Malpedia (by frequency):")
for fqdn, count in comparison_results['top_malpedia_fqdns']:
    print("{}: {} times".format(fqdn, count))

print("\nTop FQDNs in MITRE (by frequency):")
for fqdn, count in comparison_results['top_mitre_fqdns']:
    print("{}: {} times".format(fqdn, count))


Union (A  U B) Count: 2026
Overlap (A ∩ B): {'unit42.paloaltonetworks.com', 'paper.seebug.org', 'www.rapid7.com', 'www.rewterz.com', 'thedfirreport.com', 'blogs.blackberry.com', 'www.darkreading.com', 'blog.trendmicro.com', 'www.proofpoint.com', 'arstechnica.com', 'www.cert.ssi.gouv.fr', 'www.mandiant.com', 'www.slideshare.net', 'www.issuemakerslab.com', 'blog-assets.f-secure.com', 'cybleinc.com', 'www.anomali.com', 'www.group-ib.com', 'go.recordedfuture.com', 'www.gov.uk', 'www.wired.com', 'www.trustwave.com', 'www.trellix.com', 'www.uptycs.com', 'symantec-enterprise-blogs.security.com', 'www.f-secure.com', 'www.lookout.com', 'www.welivesecurity.com', 'msrc-blog.microsoft.com', 'blog.netlab.360.com', 'www.accenture.com', 'www.cylance.com', 'blog.checkpoint.com', 'reaqta.com', 'www.cybercom.mil', 'www2.fireeye.com', 'www.blackberry.com', 'www.cyfirma.com', 'www.leonardo.com', 'research.nccgroup.com', 'www.reuters.com', 'global.ahnlab.com', 'f.hubspotusercontent30.net', 'blogs.cisco.com

In [34]:
def extract_second_level_domain(fqdn):
    """
    Return the ``domain`` component that tldextract finds in an FQDN.

    Args:
        fqdn (str): Fully Qualified Domain Name (FQDN).

    Returns:
        str: The ``domain`` attribute of ``tldextract.extract(fqdn)``. Only
        that attribute is returned; the subdomain and suffix parts of the
        extraction result are not included.
    """
    return tldextract.extract(fqdn).domain


# Reduce every FQDN of both sources to its second-level domain and deduplicate
unique_second_level_malpedia = {
    extract_second_level_domain(fqdn) for fqdn, _ in malpedia_fqdn_and_dates
}
unique_second_level_mitre = {
    extract_second_level_domain(fqdn) for fqdn in mitre_all_fqdns
}

# Domains seen in both sources, and domains seen only in MITRE
overlap = unique_second_level_malpedia & unique_second_level_mitre
b_minus_a = unique_second_level_mitre - unique_second_level_malpedia

# Print both sets in full
print(f"Intersection of second-level domains: {overlap}")
print(f"Domains in MITRE but not in Malpedia: {b_minus_a}")

Intersection of second-level domains: {'welivesecurity', 'securityweek', 'operationblockbuster', 'justice', 'securityintelligence', 'threatconnect', 'f-secure', 'fox-it', 'therecord', 'trendmicro', 'dragos', 'certfa', 'security', 'lab52', 'twitter', 'scilabs', 'morphisec', 'fb', 'broadcom', 'qualys', 'securityaffairs', 'trellix', 'domaintools', 'lookout', 'ncsc', 'threatpost', 'noticeofpleadings', 'intezer', 'cisa', 'phishlabs', 'seebug', 'sans', 'pylos', 'cybleinc', 'bitdefender', 'wordpress', 'thehackernews', 'cylance', 'deepinstinct', 'eweek', 'aquasec', 'talent-jump', 'flashpoint-intel', 'forbes', 'sentinelone', 'geminiadvisory', 'blog', 'arcticwolf', 'bushidotoken', 'paloaltonetworks', 'us-cert', 'coresecurity', 'hubspot', 'jpcert', '360totalsecurity', 'ssi', 'dbappsecurity', 'whitehouse', 'att', 'sophos', 'redcanary', 'esentire', 'www', 'fsec', 'pwc', 'talosintelligence', 'ironnet', 'arstechnica', 'zdnet', 'kasperskycontenthub', 'objective-see', 'github', 'bleepingcomputer', 'ncc