In [1]:
import json
import requests
import re

In [2]:
def extract_urls_from_bib(file_path):
    """
    Extracts all URL entries from a .bib (BibTeX) file.

    Parameters
    ----------
    file_path : str
        Path to the .bib file.

    Returns
    -------
    list
        List of extracted URL strings.
    """
    urls = []

    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

        # Regex to match lines like: url = {https://...}
        url_matches = re.findall(r'url\s*=\s*{([^}]+)}', content)

        urls.extend(url_matches)

    return urls


def save_urls_to_file(urls, output_path):
    """
    Saves a list of URLs to a text file, one per line.

    Parameters
    ----------
    urls : list
        List of URL strings.

    output_path : str
        Path to output .txt file.
    """
    with open(output_path, 'w', encoding='utf-8') as f:
        for url in urls:
            f.write(url + '\n')


# Example usage
if __name__ == "__main__":
    bib_path = r"C:\Users\ricewater\Downloads\malpedia.bib"         
    output_path = r"C:\Users\ricewater\Downloads\malpedia_urls_2025.txt" # Output text file

    urls = extract_urls_from_bib(bib_path)
    #save_urls_to_file(urls, output_path)

    print(f"Extracted {len(urls)} URLs and saved to '{output_path}'")


Extracted 17242 URLs and saved to 'C:\Users\ricewater\Downloads\malpedia_urls_2025.txt'


In [3]:
def merge_jsonl(source_file, target_file):
    """
    Appends the contents of one JSONL file to another.

    Args:
        source_file (str): Path to the source JSONL file whose content will be appended.
        target_file (str): Path to the target JSONL file to which data will be appended.
    """
    with open(source_file, 'r', encoding='utf-8') as src, open(target_file, 'a', encoding='utf-8') as tgt:
        for line in src:
            tgt.write(line)

    print(f"Contents of '{source_file}' have been appended to '{target_file}'.")


# Example usage
#merge_jsonl(r"C:\Users\ricewater\Documents\CTIDownloads\malpedia_20220718\malpedia_20220718\malpedia-db_2022-07-18_downloader.jsonl", r"C:\Users\ricewater\Documents\CTIDownloads\downloads\downloads\20241008_downloads.jsonl")


In [4]:
file_path_malpedia_2022 = r"C:\Users\ricewater\Documents\CTITTP\group_profile_analysis\data\malpedia-db_2022-07-18_downloader.jsonl"

In [5]:
file_path_malpedia_2024 = r"C:\Users\ricewater\Documents\CTITTP\group_profile_analysis\data\20241008_downloads.jsonl"

In [3]:
def read_urls_from_file(filepath):
    """Reads URLs from a text file and returns a set of unique URLs."""
    with open(filepath, 'r', encoding='utf-8') as file:
        return set(line.strip() for line in file if line.strip())

In [23]:
# Paths to your two text files
file1 = r"C:\Users\ricewater\Documents\CTIDownloads\downloads\downloads\20240930_urls.txt"
file2 = r"C:\Users\ricewater\Documents\CTIDownloads\downloads\downloads\mobile-ics\mobile-ics\urlsfromMobileandICS.txt"

# Read and combine URLs
urls_file1 = read_urls_from_file(file1)
urls_file2 = read_urls_from_file(file2)

all_unique_urls_attack = urls_file1.union(urls_file2)

print(f"Unique URLs in file 1: {len(urls_file1)}")
print(f"Unique URLs in file 2: {len(urls_file2)}")
print(f"Total unique URLs across both files: {len(all_unique_urls_attack)}")

Unique URLs in file 1: 905
Unique URLs in file 2: 25
Total unique URLs across both files: 930


In [46]:
all_urls_malpedia = r"C:\Users\ricewater\Documents\CTIDownloads\all_unique_urls_malpedia.txt"
urls_file_malpedia = read_urls_from_file(all_urls_malpedia)

In [47]:
len(urls_file_malpedia)

15699

In [48]:
len(all_unique_urls_attack.union(urls_file_malpedia))

16107

In [24]:
all_urls_malpedia_2025 = r"C:\Users\ricewater\Downloads\malpedia_urls_2025.txt"
urls_file_malpedia_2025 = read_urls_from_file(all_urls_malpedia_2025)
len(urls_file_malpedia_2025)

17233

In [25]:
# Load the MISP threat actor references JSON file
with open("threat_actor_report_misp.json", "r", encoding="utf-8") as f:
    misp_data = json.load(f)

# Extract all URLs from the references
misp_urls = set()
for actor in misp_data:
    misp_urls.update(actor.get("references", []))


# Compute union
all_combined_urls = misp_urls.union(urls_file_malpedia_2025)

# Print result
print(f"MISP URLs: {len(misp_urls)}")
print(f"Combined total URLs: {len(all_combined_urls)}")


MISP URLs: 2660
Combined total URLs: 17462


In [26]:
# Step 4: Compute intersections
#intersect_attack = misp_urls.intersection(all_unique_urls_attack)
intersect_malpedia = misp_urls.intersection(urls_file_malpedia_2025)


# Step 5: Output results
#print(f"Intersection with attack source: {len(intersect_attack)} URLs")
print(f"Intersection with malpedia source: {len(intersect_malpedia)} URLs")


Intersection with malpedia source: 2431 URLs


In [27]:
len(misp_urls)

2660

In [28]:
# Step 1: Union of attack and malpedia sources
#non_misp_union = all_unique_urls_attack.union(urls_file_malpedia)
#print(f"Union of attack & malpedia sources: {len(non_misp_union)} URLs")

# Step 2: Intersection with MISP
misp_intersection = misp_urls.intersection(urls_file_malpedia_2025)
print(f"URLs in both MISP and malpedia): {len(misp_intersection)}")

# Step 3: URLs in MISP but not in either attack or malpedia
misp_only = misp_urls.difference(urls_file_malpedia_2025)
print(f"URLs unique to MISP: {len(misp_only)}")


print("\nExample of URLs only in MISP:")
for url in list(misp_only)[:20]:
    print(" ", url)


URLs in both MISP and malpedia): 2431
URLs unique to MISP: 229

Example of URLs only in MISP:
  http://www.ptsecurity.com/upload/corporate/ww-en/analytics/APT-Attacks-eng.pdf
  https://www.ptsecurity.com/ww-en/analytics/pt-esc-threat-intelligence/hellhounds-operation-lahat-part-2/
  https://www.redpacketsecurity.com/wildpressure-targets-industrial-related-entities-in-the-middle-east/
  https://www.cisa.gov/news-events/cybersecurity-advisories/aa22-152a
  https://www.dragos.com/blog/responding-to-chernovites-pipedream-with-dragos-global-services/
  https://www.rewterz.com/rewterz-news/rewterz-threat-alert-new-ransomware-actor-oldgremlin-hits-multiple-organizations
  https://www.fortinet.com/blog/threat-research/guidance-on-hacktivist-operation-opspatuk-by-dragonforce
  https://www.mandiant.com/resources/blog/investigating-ivanti-exploitation-persistence
  https://www.securityweek.com/palestinian-hackers-hit-100-israeli-organizations-in-destructive-attacks/
  https://cert.gov.ua/article/

In [29]:
# Make sure these are already defined:
# misp_urls: set of URLs from MISP
# non_misp_union: set of all your other known URLs (e.g., attack ∪ malpedia)

# Intersection
misp_intersection = misp_urls.intersection(urls_file_malpedia_2025)

# Percentage of MISP URLs that are already in your dataset
if misp_urls:
    percentage_overlap = (len(misp_intersection) / len(misp_urls)) * 100
else:
    percentage_overlap = 0.0

print(f"{len(misp_intersection)} out of {len(misp_urls)} MISP URLs are already in your dataset.")
print(f"That’s approximately {percentage_overlap:.2f}% overlap.")


2431 out of 2660 MISP URLs are already in your dataset.
That’s approximately 91.39% overlap.


In [18]:
def extract_fqdns(urls: set) -> set:
    """
    Extract domains (FQDNs) from a set of URLs.

    Parameters
    ----------
    urls : set
        A set of full URLs

    Returns
    -------
    set
        A set of unique FQDNs (domains)
    """
    domains = set()
    for url in urls:
        try:
            parsed = urlparse(url)
            if parsed.netloc:
                domains.add(parsed.netloc.lower())
        except Exception:
            continue
    return domains


# Step 1: Extract FQDNs from each source
misp_fqdns = extract_fqdns(misp_urls)
#attack_fqdns = extract_fqdns(all_unique_urls_attack)
malpedia_fqdns = extract_fqdns(urls_file_malpedia_2025)

# Step 2: Union of other sources
#non_misp_fqdns = attack_fqdns.union(malpedia_fqdns)

# Step 3: Intersection and difference
fqdns_intersection = misp_fqdns.intersection(malpedia_fqdns)
fqdns_unique_to_misp = misp_fqdns.difference(malpedia_fqdns)

# Step 4: Coverage percentage
if misp_fqdns:
    fqdn_coverage = (len(fqdns_intersection) / len(misp_fqdns)) * 100
else:
    fqdn_coverage = 0.0

# Step 5: Output results
print(f"Total unique FQDNs in MISP: {len(misp_fqdns)}")
print(f"FQDNs also in your dataset: {len(fqdns_intersection)}")
print(f"FQDNs unique to MISP: {len(fqdns_unique_to_misp)}")
print(f"FQDN coverage: {fqdn_coverage:.2f}%")

Total unique FQDNs in MISP: 549
FQDNs also in your dataset: 527
FQDNs unique to MISP: 22
FQDN coverage: 95.99%


In [21]:
len(misp_fqdns),  len(malpedia_fqdns)

(549, 2174)

In [22]:
fqdns_unique_to_misp

{'darknetlive.com',
 'dev.ua',
 'files.truesec.com',
 'forescoutstage.wpengine.com',
 'internal-www.fireeye.com',
 'mybroadband.co.za',
 'nsi-globalcounterintelligence.com',
 'packetstormsecurity.com',
 'securitybrief.asia',
 'static.fortra.com',
 'thecyberwire.com',
 'titaniam.io',
 'www.chronline.com',
 'www.cybersecurityintelligence.com',
 'www.facct.ru',
 'www.ibtimes.com',
 'www.itpro.com',
 'www.loginradius.com',
 'www.newslocker.com',
 'www.phishlabs.com',
 'www.privacyaffairs.com',
 'zimpstage.wpengine.com'}

In [49]:
len(all_unique_urls_attack.intersection(urls_file_malpedia))

522

In [53]:
unique_url_malpedia = r"group_profile_analysis/malpedia_api_responses/malpedia_unique_actor_url.txt"

In [54]:
unique_urls_file_malpedia = read_urls_from_file(unique_url_malpedia)

In [55]:
len(unique_urls_file_malpedia)

2443

In [56]:
len(all_unique_urls_attack.union(unique_urls_file_malpedia))

3160

In [42]:

# URL of the MITRE ATT&CK Groups page
#url = "https://attack.mitre.org/groups/"
url = "https://attack.mitre.org/versions/v15/groups/"

# Send a GET request to fetch the page content
response = requests.get(url)
if response.status_code == 200:
    # Use regular expression to find all group IDs (e.g., G0001, G1000)
    group_ids = re.findall(r'G\d{4}', response.text)
    unique_group_ids = sorted(set(group_ids))
    
    print(f"Total groups found: {len(unique_group_ids)}")
    #print("Group IDs:")
    #for gid in unique_group_ids:
    #    print(gid)
else:
    print(f"Failed to retrieve data. Status code: {response.status_code}")


Total groups found: 152
