In [24]:
import json
from urllib.parse import urlparse

def extract_fqdn_and_date(file_path: str):
    """
    Extracts the FQDN from the 'url' field and the 'date' from the provided JSON lines.

    Args:
        file_path (str): Path to the file containing JSON entries.

    Returns:
        list: A list of tuples containing the FQDN and date for each entry.
    """
    results = []

    with open(file_path, 'r') as file:
        # Read each line of the JSONL file
        for line in file:
            entry = json.loads(line)
            # Extract the FQDN from the 'url'
            url = entry.get("url", "")
            parsed_url = urlparse(url)
            fqdn = parsed_url.netloc  # Get the FQDN part from the URL

            # Extract the 'date'
            date = entry.get("date", "")

            # Append the FQDN and date to the results
            results.append((fqdn, date))

    return results

In [25]:
# Example Usage
file_path = r"C:\Users\Aakanksha Saha\Documents\CTI_downloads\malpedia_20220718\malpedia_20220718\malpedia-db_2022-07-18_downloader.jsonl"  # Replace with the actual file path
malpedia_fqdn_and_dates = extract_fqdn_and_date(file_path)
print(len(set(fqdn for fqdn, date in malpedia_fqdn_and_dates)))
# Display the results
#for fqdn, date in fqdn_and_dates:
#    print(f"FQDN: {fqdn}, Date: {date}")


1666


In [22]:
import json
from urllib.parse import urlparse

def extract_successful_urls(file_path: str):
    """
    Extracts the FQDN from the 'url' field only for entries where the 'download_status' is 200 (successful).

    Args:
        file_path (str): Path to the file containing JSON entries.

    Returns:
        list: A list of FQDNs for entries with a successful download status.
    """
    successful_urls = []

    with open(file_path, 'r') as file:
        # Read each line of the JSONL file
        for line in file:
            entry = json.loads(line)

            # Check if 'download_status' is 200
            if entry.get('download_status') == 200:
                # Extract the FQDN from the 'url'
                url = entry.get("url", "")
                parsed_url = urlparse(url)
                fqdn = parsed_url.netloc  # Get the FQDN part from the URL

                # Append the FQDN to the list
                if fqdn:
                    successful_urls.append(fqdn)

    return successful_urls

In [23]:
# Example Usage
file_path = r"C:\Users\Aakanksha Saha\Documents\CTI_downloads\downloads\20241008_downloads.jsonl"  # Replace with the actual file path
mitre_successful_fqdns = extract_successful_urls(file_path)
print(len(set(mitre_successful_fqdns)))
# Display the results
#for fqdn in successful_fqdns:
#    print(f"FQDN: {fqdn}")

179


In [28]:
def compare_fqdns(malpedia_fqdn_and_dates: list, mitre_successful_fqdns: list):
    """
    Compares two sets of FQDNs, returning the overlap, A-B, B-A, and their counts.

    Args:
        fqdn_and_dates (list): A list of tuples containing FQDN and date.
        successful_fqdns (list): A list of FQDNs with successful download statuses.

    Returns:
        dict: A dictionary with the overlap, A-B, B-A, and counts for each.
    """
    # Extract unique FQDNs from both lists
    unique_fqdn_dates = set(fqdn for fqdn, date in malpedia_fqdn_and_dates)  # Unique FQDNs from fqdn_and_dates
    unique_successful_fqdns = set(mitre_successful_fqdns)  # Unique FQDNs from successful_fqdns

    # Find the overlap
    overlap = unique_fqdn_dates.intersection(unique_successful_fqdns)

    # Find A - B (FQDNs in fqdn_and_dates but not in successful_fqdns)
    a_minus_b = unique_fqdn_dates.difference(unique_successful_fqdns)

    # Find B - A (FQDNs in successful_fqdns but not in fqdn_and_dates)
    b_minus_a = unique_successful_fqdns.difference(unique_fqdn_dates)

    # Return the results along with the counts
    return {
        'overlap': overlap,
        'A-B': a_minus_b,
        'B-A': b_minus_a,
        'overlap_count': len(overlap),
        'A-B_count': len(a_minus_b),
        'B-A_count': len(b_minus_a),
    }


In [29]:
# Compare the FQDNs
comparison_results = compare_fqdns(malpedia_fqdn_and_dates, mitre_successful_fqdns)

# Display the results
print(f"Overlap (A ∩ B): {comparison_results['overlap']}, Count: {comparison_results['overlap_count']}")
#print(f"Overlap (A ∩ B) Count: {comparison_results['overlap_count']}")
#print(f"A - B (In malpedia but not in MITRE): {comparison_results['A-B']}, Count: {comparison_results['A-B_count']}")
print(f"A - B (In malpedia but not in MITRE) Count: {comparison_results['A-B_count']}")
print(f"B - A (In MITRE but not in Malpedia): {comparison_results['B-A']}, Count: {comparison_results['B-A_count']}")


Overlap (A ∩ B): {'www.trendmicro.de', 'citizenlab.ca', 'query.prod.cms.rt.microsoft.com', 'blog.checkpoint.com', 'www.rapid7.com', 'businessinsights.bitdefender.com', 'adversary.crowdstrike.com', 'blog.morphisec.com', 'www.fox-it.com', 'blog.qualys.com', 'www.trustwave.com', 'www.cybercom.mil', 'blog.talosintelligence.com', 'www.cylance.com', 'www.slideshare.net', 'www.lacework.com', 'blogs.microsoft.com', 'blog.certfa.com', 'www.ironnet.com', 'blog.aquasec.com', 'www.proofpoint.com', 'www.recordedfuture.com', 'blog.malwarebytes.com', 'www.talent-jump.com', 'www.gov.uk', 'documents.trendmicro.com', 'noticeofpleadings.com', 'www.youtube.com', 'www.symantec.com', 'securityaffairs.co', 'home.treasury.gov', 'threatpost.com', 'unit42.paloaltonetworks.com', 'msrc.microsoft.com', 'blogs.cisco.com', 'thehackernews.com', 'www.bbc.com', 'cybleinc.com', 'cdn0.vox-cdn.com', 'www.us-cert.gov', 'www.forcepoint.com', 'blog-assets.f-secure.com', 'download.bitdefender.com', 'community.broadcom.com', '