In [8]:
import json
from urllib.parse import urlparse

def extract_fqdn_and_date(file_paths: list):
    """
    Collects the (FQDN, date) pair of every JSON entry in one or more JSONL files.

    Each file is read line by line and every non-blank line is parsed as one
    JSON object. The FQDN is the ``netloc`` component that ``urlparse`` reports
    for the entry's 'url' field (so it still carries a port or userinfo if the
    URL has one); the date is the entry's 'date' field, returned as stored.
    A missing or null 'url' yields an empty FQDN and a missing 'date' yields an
    empty string, so exactly one tuple is produced per JSON entry.

    Args:
        file_paths (list): Paths to the JSONL files, processed in the given order.

    Returns:
        list: ``(fqdn, date)`` tuples, ordered by file and then by line.

    Raises:
        OSError: If one of the files cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    results = []

    for file_path in file_paths:
        # JSON is UTF-8 by specification; being explicit avoids depending on the
        # platform default encoding (e.g. cp1252 on Windows).
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                # Blank lines (typically a trailing newline at end of file)
                # carry no entry and would make json.loads raise.
                if not line.strip():
                    continue

                entry = json.loads(line)

                # `or ""` also covers an explicit JSON null, which would
                # otherwise make urlparse return bytes components.
                url = entry.get("url") or ""
                fqdn = urlparse(url).netloc

                date = entry.get("date", "")

                results.append((fqdn, date))

    return results

In [9]:
# Locations of the two Malpedia download logs (JSONL, one entry per line).
# NOTE: these are absolute paths on the author's machine; adjust before re-running elsewhere.
file_path1 = r"C:\Users\ricewater\Documents\CTIDownloads\malpedia_20220718\malpedia_20220718\malpedia-db_2022-07-18_downloader.jsonl"
file_path2 = r"C:\Users\ricewater\Documents\CTIDownloads\20241204_malpedia_downloads\20241204_downloads.jsonl"

# One (fqdn, date) tuple per entry, with both files concatenated in order
malpedia_fqdn_and_dates = extract_fqdn_and_date([file_path1, file_path2])

# Number of distinct FQDNs seen across the two files
print(len({fqdn for fqdn, _ in malpedia_fqdn_and_dates}))


2002


In [10]:
import json
from urllib.parse import urlparse

def extract_successful_urls(file_path: str):
    """
    Extracts the FQDN of every entry whose 'download_status' is 200 (successful).

    The file is read line by line and every non-blank line is parsed as one
    JSON object. Only entries whose 'download_status' compares equal to the
    integer 200 are kept (a string such as "200" does not match). The FQDN is
    the ``netloc`` component that ``urlparse`` reports for the entry's 'url'
    field; entries whose URL has no netloc (missing, null or relative URL) are
    dropped. Duplicates are preserved, so the result can be used for frequency
    counts.

    Args:
        file_path (str): Path to the JSONL file.

    Returns:
        list: Non-empty FQDN strings of the successful entries, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    successful_urls = []

    # JSON is UTF-8 by specification; being explicit avoids depending on the
    # platform default encoding (e.g. cp1252 on Windows).
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Blank lines (typically a trailing newline at end of file)
            # carry no entry and would make json.loads raise.
            if not line.strip():
                continue

            entry = json.loads(line)

            if entry.get('download_status') != 200:
                continue

            # `or ""` also covers an explicit JSON null, which would
            # otherwise make urlparse return bytes components.
            fqdn = urlparse(entry.get("url") or "").netloc

            if fqdn:
                successful_urls.append(fqdn)

    return successful_urls

In [14]:
# MITRE download log (JSONL). Only entries with download_status == 200 are kept.
# NOTE: absolute path on the author's machine; adjust before re-running elsewhere.
file_path = r"C:\Users\ricewater\Documents\CTIDownloads\downloads\downloads\20241008_downloads.jsonl"
mitre_successful_fqdns = extract_successful_urls(file_path)

# Number of distinct FQDNs among the successful MITRE downloads
print(len(frozenset(mitre_successful_fqdns)))

# Uncomment to list every FQDN (duplicates included):
#for fqdn in mitre_successful_fqdns:
#    print(f"FQDN: {fqdn}")

179


In [23]:
from collections import Counter

def compare_fqdns(malpedia_fqdn_and_dates: list, mitre_successful_fqdns: list, top_n: int = 10):
    """
    Compares the FQDNs of two sources: set overlap, set differences and top frequencies.

    "A" is the set of FQDNs in ``malpedia_fqdn_and_dates`` and "B" the set of
    FQDNs in ``mitre_successful_fqdns``. Frequencies are counted over the raw
    lists, i.e. duplicates are what make an FQDN rank higher. Values are
    compared exactly as given (no case folding or other normalization).

    Args:
        malpedia_fqdn_and_dates (list): ``(fqdn, date)`` tuples; only the FQDN
            is used.
        mitre_successful_fqdns (list): FQDN strings.
        top_n (int): How many of the most frequent FQDNs to report per source.
            Defaults to 10. ``None`` reports every FQDN, ordered by frequency.

    Returns:
        dict: With the keys
            'overlap' (set): FQDNs present in both sources (A ∩ B).
            'A-B' (set): FQDNs only in the Malpedia list.
            'B-A' (set): FQDNs only in the MITRE list.
            'overlap_count', 'A-B_count', 'B-A_count' (int): Sizes of those sets.
            'top_malpedia_fqdns', 'top_mitre_fqdns' (list): ``(fqdn, count)``
                tuples of the ``top_n`` most frequent FQDNs, most frequent first.
    """
    # Count once; the counter keys double as the set of unique FQDNs.
    malpedia_fqdn_counter = Counter(fqdn for fqdn, _date in malpedia_fqdn_and_dates)
    mitre_fqdn_counter = Counter(mitre_successful_fqdns)

    unique_malpedia_fqdns = set(malpedia_fqdn_counter)
    unique_mitre_fqdns = set(mitre_fqdn_counter)

    overlap = unique_malpedia_fqdns & unique_mitre_fqdns
    a_minus_b = unique_malpedia_fqdns - unique_mitre_fqdns  # in Malpedia, not in MITRE
    b_minus_a = unique_mitre_fqdns - unique_malpedia_fqdns  # in MITRE, not in Malpedia

    return {
        'overlap': overlap,
        'A-B': a_minus_b,
        'B-A': b_minus_a,
        'overlap_count': len(overlap),
        'A-B_count': len(a_minus_b),
        'B-A_count': len(b_minus_a),
        'top_malpedia_fqdns': malpedia_fqdn_counter.most_common(top_n),
        'top_mitre_fqdns': mitre_fqdn_counter.most_common(top_n),
    }


In [24]:
# Run the comparison once; everything below only reads from the returned dict.
comparison_results = compare_fqdns(malpedia_fqdn_and_dates, mitre_successful_fqdns)

# Set-level summary. The A - B set itself is not printed, only its size.
print(f"Overlap (A ∩ B): {comparison_results['overlap']}, Count: {comparison_results['overlap_count']}")
print(f"A - B (In Malpedia but not in MITRE) Count: {comparison_results['A-B_count']}")
print(f"B - A (In MITRE but not in Malpedia): {comparison_results['B-A']}, Count: {comparison_results['B-A_count']}")

# Frequency rankings: a heading, then one "<fqdn>: <n> times" line per entry.
for heading, result_key in (
    ("\nTop FQDNs in Malpedia (by frequency):", 'top_malpedia_fqdns'),
    ("\nTop FQDNs in MITRE (by frequency):", 'top_mitre_fqdns'),
):
    print(heading)
    for fqdn, count in comparison_results[result_key]:
        print(f"{fqdn}: {count} times")




Overlap (A ∩ B): {'www.recordedfuture.com', 'lab52.io', 'www.sentinelone.com', 'www.cybercom.mil', 'www.anomali.com', 'securityintelligence.com', 'attack.mitre.org', 'threatpost.com', 'www.forcepoint.com', 'blog.360totalsecurity.com', 'www.gov.uk', 'securingtomorrow.mcafee.com', 'www.esentire.com', 'www.ic3.gov', 'go.crowdstrike.com', 'msrc.microsoft.com', 'iranthreats.github.io', 'assets.sentinelone.com', 'www.cisa.gov', 'thedfirreport.com', 'www.mandiant.com', 'www.zscaler.com', 'unit42.paloaltonetworks.com', 'blog.qualys.com', 'blog.certfa.com', 'www.welivesecurity.com', 'msrc-blog.microsoft.com', 'citizenlab.ca', 'www.us-cert.gov', 'blog.netlab.360.com', 'docs.broadcom.com', 'www.blackberry.com', 'www.ironnet.com', 'www.microsoft.com', 'www.lacework.com', 'www.crowdstrike.com', 'cycraft.com', 'www.darkreading.com', 'info.phishlabs.com', 'www.dragos.com', 'cdn2.hubspot.net', 'arcticwolf.com', 'query.prod.cms.rt.microsoft.com', 'www.malwarebytes.com', 'news.sophos.com', 'www.wired.co

In [25]:
import tldextract

def extract_second_level_domain(fqdn):
    """
    Returns the ``domain`` part that tldextract reports for a host name.

    NOTE(review): only the bare label is returned, without the suffix, so hosts
    that share a label under different suffixes would map to the same value.
    Confirm this is intended wherever the result is used for set comparisons.

    Args:
        fqdn (str): Fully Qualified Domain Name (FQDN).

    Returns:
        str: The ``domain`` attribute of ``tldextract.extract(fqdn)``.
    """
    return tldextract.extract(fqdn).domain


# Reduce every FQDN to its second-level label, de-duplicated per source
unique_second_level_malpedia = {
    extract_second_level_domain(fqdn) for fqdn, _ in malpedia_fqdn_and_dates
}
unique_second_level_mitre = {
    extract_second_level_domain(fqdn) for fqdn in mitre_successful_fqdns
}

# Labels shared by both sources, and labels that only MITRE references
overlap = unique_second_level_malpedia & unique_second_level_mitre
b_minus_a = unique_second_level_mitre - unique_second_level_malpedia

# Show both sets
print(f"Intersection of second-level domains: {overlap}")
print(f"Domains in MITRE but not in Malpedia: {b_minus_a}")

Intersection of second-level domains: {'noticeofpleadings', 'paloaltonetworks', 'bbc', 'cylance', 'pwc', 'sygnia', 'f-secure', 'jpcert', 'cybereason', 'mandiant', 'defense', 'citizenlab', 'symantec', 'malwarebytes', 'forbes', 'lookout', 'github', 'issuemakerslab', 'cycraft', 'dragos', 'archive', 'nttsecurity', 'bitdefender', 'thehackernews', '360', 'rewterz', 'darkreading', 'att', 'blog', 'deepinstinct', 'infosecurity-magazine', 'welivesecurity', 'kaspersky', 'logrhythm', '360totalsecurity', 'trustwave', 'lab52', 'certfa', 'arstechnica', 'twitter', 'trendmicro', 'rsa', 'recordedfuture', 'youtube', 'morphisec', 'whitehouse', 'kasperskycontenthub', 'scilabs', 'secureworks', 'ncsc', 'broadcom', 'europa', 'securityintelligence', 'uptycs', 'checkpoint', 'virusbulletin', 'brighttalk', 'flashpoint-intel', 'ssi', 'wordpress', 'eweek', 'fox-it', 'securelist', 'zscaler', 'vblocalhost', 'esentire', 'clearskysec', 'hubspotusercontent30', 'fireeye', 'justice', 'mcafee', 'cybercom', 'sentinelone', '