In [22]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time
import json

In [23]:
# Cell 2: Load the threat-actor galaxy JSON
def load_threat_actor_galaxy(url: str) -> list:
    """
    Load the MISP Galaxy threat actor data.

    Parameters
    ----------
    url : str
        Raw GitHub URL to the threat-actor.json file

    Returns
    -------
    list
        List of threat actor clusters
    """
    resp = requests.get(url)
    data = resp.json()
    return data.get('values', [])


# Cell 3: Extract actor name + report URLs
def extract_actor_references(actors: list) -> list:
    """
    Extract references from each actor.

    Parameters
    ----------
    actors : list
        List of threat actor dicts

    Returns
    -------
    list
        List of dicts with actor name and references
    """
    results = []
    for actor in actors:
        name = actor.get('value')
        refs = actor.get('meta', {}).get('refs', []) + actor.get('meta', {}).get('references', [])
        results.append({
            'actor': name,
            'references': sorted(set(refs))
        })
    return results


# Cell 4: Run it
galaxy_url = "https://raw.githubusercontent.com/MISP/misp-galaxy/main/clusters/threat-actor.json"

actors = load_threat_actor_galaxy(galaxy_url)
actor_data = extract_actor_references(actors)

# Preview
print(f"Extracted {len(actor_data)} actors with reference links.")
for item in actor_data[:5]:
    print(json.dumps(item, indent=2))


# Cell 5: Save to JSON
with open("threat_actor_report_misp.json", "w", encoding="utf-8") as f:
    json.dump(actor_data, f, indent=2, ensure_ascii=False)


Extracted 857 actors with reference links.
{
  "actor": "APT1",
  "references": [
    "http://intelreport.mandiant.com/Mandiant_APT1_Report.pdf",
    "https://attack.mitre.org/groups/G0006/",
    "https://blog.trendmicro.com/trendlabs-security-intelligence/the-siesta-campaign-a-new-targeted-attack-awakens/",
    "https://community.broadcom.com/symantecenterprise/communities/community-home/librarydocuments/viewdocument?DocumentKey=f1265df5-6e5e-4fcc-9828-d4ddbbafd3d7&CommunityKey=1ecf5f55-9545-44d6-b0f4-4e4a7f5f5e68&tab=librarydocuments",
    "https://en.wikipedia.org/wiki/PLA_Unit_61398",
    "https://securingtomorrow.mcafee.com/other-blogs/mcafee-labs/operation-oceansalt-delivers-wave-after-wave/",
    "https://www.cfr.org/interactive/cyber-operations/pla-unit-61398",
    "https://www.fireeye.com/blog/threat-research/2014/03/a-detailed-examination-of-the-siesta-campaign.html",
    "https://www.fireeye.com/content/dam/fireeye-www/services/pdfs/mandiant-apt1-report.pdf",
    "https://ww

In [24]:
API_KEY = "496049f8c49bca7e56f746690699bae70ad00b26971dfedc0d0177f4397963a3"
HEADERS = {
    'X-OTX-API-KEY': API_KEY
}
BASE_URL = "https://otx.alienvault.com/api/v1"

In [25]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time

def setup_driver():
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    return webdriver.Chrome(options=options)

def get_adversary_urls(driver, pages=1):
    adversary_urls = []

    for page in range(1, pages + 1):
        url = f"https://otx.alienvault.com/browse/global/adversaries?page={page}&limit=10&include_inactive=0&sort=-modified"
        driver.get(url)
        time.sleep(4)
        soup = BeautifulSoup(driver.page_source, "html.parser")

        for a in soup.select("a[href^='/adversary/']"):
            href = a.get("href")
            if href and href.startswith("/adversary/"):
                full_url = f"https://otx.alienvault.com{href}"
                if full_url not in adversary_urls:
                    adversary_urls.append(full_url)

    return adversary_urls

def extract_pulse_urls(driver, adversary_url):
    driver.get(adversary_url)
    time.sleep(5)
    soup = BeautifulSoup(driver.page_source, "html.parser")

    pulse_urls = []
    for a in soup.select("a[href^='/pulse/']"):
        href = a.get("href")
        if href:
            pulse_urls.append(f"https://otx.alienvault.com{href}")

    return list(set(pulse_urls))  # Remove duplicates

def extract_references_from_pulse(driver, pulse_url):
    driver.get(pulse_url)
    time.sleep(4)
    soup = BeautifulSoup(driver.page_source, "html.parser")

    refs = []
    for a in soup.select("a[href^='http']"):
        text = a.get_text(strip=True).lower()
        if "report" in text or "source" in text or "read more" in text or "reference" in text:
            refs.append(a.get("href"))

    return refs

def main():
    driver = setup_driver()
    all_references = {}

    try:
        adversaries = get_adversary_urls(driver, pages=1)
        print(f"Found {len(adversaries)} adversaries.")

        for adv_url in adversaries:
            print(f"\nProcessing: {adv_url}")
            pulses = extract_pulse_urls(driver, adv_url)
            print(f"  Found {len(pulses)} pulses.")

            for pulse_url in pulses:
                print(f"    Pulse: {pulse_url}")
                refs = extract_references_from_pulse(driver, pulse_url)
                if refs:
                    all_references[pulse_url] = refs
                    for ref in refs:
                        print(f"      ➤ {ref}")
    finally:
        driver.quit()

    print("\nDone extracting threat report references.")
    return all_references

if __name__ == "__main__":
    refs = main()


Found 20 adversaries.

Processing: https://otx.alienvault.com/adversary/%20Stealth%20Mango%20and%20Tangelo%20
  Found 1 pulses.
    Pulse: https://otx.alienvault.com/pulse/create

Processing: https://otx.alienvault.com/adversary/ALLANITE
  Found 1 pulses.
    Pulse: https://otx.alienvault.com/pulse/create

Processing: https://otx.alienvault.com/adversary/ANDROMEDA%20SPIDER
  Found 1 pulses.
    Pulse: https://otx.alienvault.com/pulse/create

Processing: https://otx.alienvault.com/adversary/ANTHROPOID%20SPIDER
  Found 1 pulses.
    Pulse: https://otx.alienvault.com/pulse/create

Processing: https://otx.alienvault.com/adversary/APT%2016
  Found 21 pulses.
    Pulse: https://otx.alienvault.com/pulse/67d125d710400e52fc220c20
    Pulse: https://otx.alienvault.com/pulse/67d12c351f230e14a80688b1
    Pulse: https://otx.alienvault.com/pulse/67d12eba43e7f548d0ba3679
    Pulse: https://otx.alienvault.com/pulse/67d12ebb27ff2248db7568ae
    Pulse: https://otx.alienvault.com/pulse/create
    Pulse: 