# MITRE Source Knowledge
Author:
- Leo Szalkowski (@sk1s3c)

References:
- [Demystifying Generative AI](https://blog.openthreatresearch.com/demystifying-generative-ai-a-security-researchers-notes/)
- [AttackCTI Python](https://attackcti.com/intro.html)



In [1]:
# Install attackcti & other libraries
%pip install attackcti -q
%pip install jinja2 -q

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
from attackcti import attack_client
from jinja2 import Environment, FileSystemLoader
from collections import defaultdict
import os
import re

### Create Directories

In [3]:
# The argument below is the literal string "__file__" (not the variable), so
# dirname() returns "" and every derived path is relative to the current
# working directory.
current_dir = os.path.dirname("__file__")
docs_dir = os.path.join(current_dir, "knowledge")
templates_dir = os.path.join(current_dir, "templates")

# One output subfolder per kind of markdown file written further down.
for subfolder in ('tactics', 'techniques', 'groups'):
    os.makedirs(os.path.join(docs_dir, subfolder), exist_ok=True)

# Jinja2 environment that loads templates from templates_dir.
# trim_blocks / lstrip_blocks strip the whitespace around block tags so the
# rendered markdown is not padded with stray blank lines.
template_loader = FileSystemLoader(templates_dir)
env = Environment(
    loader=template_loader,
    trim_blocks=True,
    lstrip_blocks=True,
)

### Initialize ATT&CK Client

In [4]:
# Initialize the ATT&CK client.
# The instance is bound to `lift`; the data-retrieval cell calls its
# get_enterprise_* methods to download the ATT&CK collections.
lift = attack_client()

### Get ATT&CK Data

This may take a few minutes to run.

In [7]:
# Download every enterprise collection the rest of the notebook consumes.
# Each call passes stix_format=False; the cells below read keys such as
# 'technique_id', 'group' and 'tactic_shortname' from the returned items --
# presumably this flag is what yields those friendly key names (confirm against
# the attackcti documentation).
tactics = lift.get_enterprise_tactics(stix_format=False)
techniques = lift.get_enterprise_techniques(stix_format=False)
groups = lift.get_enterprise_groups(stix_format=False)
mitigations = lift.get_enterprise_mitigations(stix_format=False)
data_sources = lift.get_enterprise_data_sources(stix_format=False)
data_components = lift.get_enterprise_data_components(stix_format=False)
# Relationships link the objects above to each other by STIX id
# (read later through 'source_object' / 'target_object' / 'relationship').
relationships = lift.get_enterprise_relationships(stix_format=False)

### Create Mappings and Pre-Process Details

In [84]:
# --- Helper Functions ---
def mitre_link_replacer(match):
    """Turn one matched markdown link into plain text, keeping the ATT&CK ID.

    Intended as the replacement callback for ``re.sub``.

    Args:
        match: Regex match whose group 1 is the link text and group 2 is the
            link URL.

    Returns:
        ``"<text> (<ID>)"`` when the URL ends in a technique, tactic, group,
        mitigation or data source path; otherwise just the link text.
    """
    label, target = match.group(1), match.group(2)

    # Techniques: /techniques/Txxxx with an optional /xxx sub-technique suffix,
    # which is rendered in dotted form (Txxxx.xxx).
    technique = re.search(r'/techniques/(T\d{4})(?:/(\d{3}))?/?$', target)
    if technique is not None:
        parent_part, sub_part = technique.groups()
        attack_id = parent_part if not sub_part else f"{parent_part}.{sub_part}"
        return f"{label} ({attack_id})"

    # Other entity kinds carry their ID as the last path segment.
    entity = re.search(r'/(?:tactics|groups|mitigations|datasources)/([A-Z]{1,2}\d{4})/?$', target)
    if entity is None:
        # Unrecognised URL: drop the link, keep only its text.
        return label
    return f"{label} ({entity.group(1)})"

def process_description(description_text):
    """Clean an ATT&CK description so it reads as plain markdown.

    Three rewrites are applied in order:
      1. markdown links to attack.mitre.org are collapsed by
         ``mitre_link_replacer``;
      2. ``(Citation: ...)`` markers are removed;
      3. ``<code>...</code>`` spans are re-delimited with triple backticks.

    Args:
        description_text: Raw description; may be ``None`` or a non-string.

    Returns:
        The cleaned text, or ``""`` when the input is not a non-empty string.
    """
    if not isinstance(description_text, str) or not description_text:
        return ""

    processed_text = description_text

    # 1. Convert MITRE markdown links ([text](url)) using the replacer.
    #    The dots in the host name are escaped so that only the literal
    #    attack.mitre.org host matches, not look-alikes such as
    #    "attackXmitre.org".
    markdown_link_pattern = r'\[([^\]]+?)\]\((https?://attack\.mitre\.org/[^)]+)\)'
    processed_text = re.sub(markdown_link_pattern, mitre_link_replacer, processed_text)

    # 2. Remove (Citation: ...) patterns
    citation_pattern = r'\(\s*Citation:[^)]+\)'
    processed_text = re.sub(citation_pattern, '', processed_text)

    # 3. Replace <code>...</code> HTML tags with triple-backtick delimiters.
    #    DOTALL lets a single code span cover several lines.
    code_tag_pattern = r'<code>(.*?)</code>'
    processed_text = re.sub(code_tag_pattern, r'```\1```', processed_text, flags=re.DOTALL)

    return processed_text

In [60]:
# --- STIX ID to ATT&CK ID Mappings ---
# Each dict is keyed by an object's STIX 'id' and holds its ATT&CK-style id.
# Objects missing either key are left out.
technique_stix_to_attack = {}
for tech_obj in techniques:
    if 'id' in tech_obj and 'technique_id' in tech_obj:
        technique_stix_to_attack[tech_obj['id']] = tech_obj['technique_id']

group_stix_to_attack = {}
for group_obj in groups:
    if 'id' in group_obj and 'group_id' in group_obj:
        group_stix_to_attack[group_obj['id']] = group_obj['group_id']

mitigation_stix_to_attack = {}
for mitigation_obj in mitigations:
    if 'id' in mitigation_obj and 'mitigation_id' in mitigation_obj:
        mitigation_stix_to_attack[mitigation_obj['id']] = mitigation_obj['mitigation_id']


In [61]:
# --- STIX ID to Name Mappings ---
# Relationships refer to objects by STIX id only; these dicts recover the
# display name. Objects missing either key are left out.
technique_stix_id_to_name = {}
for tech_obj in techniques:
    if 'id' in tech_obj and 'technique' in tech_obj:
        technique_stix_id_to_name[tech_obj['id']] = tech_obj['technique']

group_stix_id_to_name = {}
for group_obj in groups:
    if 'id' in group_obj and 'group' in group_obj:
        group_stix_id_to_name[group_obj['id']] = group_obj['group']

mitigation_stix_id_to_name = {}
for mitigation_obj in mitigations:
    if 'id' in mitigation_obj and 'mitigation' in mitigation_obj:
        mitigation_stix_id_to_name[mitigation_obj['id']] = mitigation_obj['mitigation']

In [62]:
# --- Tactic & Technique Mappings ---
# Both dicts are keyed by a tactic's 'tactic_shortname' (the value techniques
# list under their 'tactic' key). They are filled in one pass; each entry is
# only added when the tactic carries the key that dict needs.
tactic_shortname_to_name = {}
tactic_shortname_to_id = {}
for tactic_obj in tactics:
    if 'tactic_shortname' not in tactic_obj:
        continue
    shortname = tactic_obj.get('tactic_shortname')
    if 'tactic' in tactic_obj:
        tactic_shortname_to_name[shortname] = tactic_obj['tactic']
    if 'tactic_id' in tactic_obj:
        tactic_shortname_to_id[shortname] = tactic_obj['tactic_id']


In [63]:
# --- ATT&CK ID to Name Mapping (for Techniques) ---
# Lets the sub-technique cell resolve a parent technique's name from its
# ATT&CK id. Techniques missing either key are left out.
technique_attack_id_to_name = {}
for tech_obj in techniques:
    if 'technique_id' in tech_obj and 'technique' in tech_obj:
        technique_attack_id_to_name[tech_obj['technique_id']] = tech_obj['technique']

In [64]:
# --- Parent/Sub-technique Relationship Mappings (using ATT&CK IDs and names) ---
# parent_info_for_subtechnique: sub-technique ATT&CK id -> {'id', 'name'} of its parent
# subtechniques_for_parent:     parent ATT&CK id -> list of {'id', 'name'} sub-techniques
parent_info_for_subtechnique = {}
subtechniques_for_parent = defaultdict(list)

for tech in techniques:
    sub_attack_id = tech.get('technique_id')

    # Only entries flagged 'is_subtechnique' whose id is dotted (Txxxx.yyy) qualify.
    if not (tech.get('is_subtechnique', False) and sub_attack_id and '.' in sub_attack_id):
        continue

    # The parent id is everything before the first dot (T1055.011 -> T1055).
    parent_attack_id = sub_attack_id.partition('.')[0]
    if not parent_attack_id:
        continue

    # Parents without a known name are skipped, so both maps only ever hold
    # complete {'id', 'name'} pairs.
    parent_name = technique_attack_id_to_name.get(parent_attack_id)
    if not parent_name:
        continue

    parent_info_for_subtechnique[sub_attack_id] = {
        'id': parent_attack_id,
        'name': parent_name,
    }
    subtechniques_for_parent[parent_attack_id].append({
        'id': sub_attack_id,
        'name': tech.get('technique', 'Unknown Sub-technique'),
    })


In [65]:
# --- Data Source Mappings (from data_sources list) ---
# datasource_stix_id_to_name:      data source STIX id -> display name
# datasource_stix_id_to_attack_id: data source STIX id -> ATT&CK id (DSxxxx)
datasource_stix_id_to_name = {}
datasource_stix_id_to_attack_id = {}

for ds in data_sources:
    ds_stix_id = ds.get('id')
    ds_name = ds.get('data_source')
    if not (ds_stix_id and ds_name):
        continue
    datasource_stix_id_to_name[ds_stix_id] = ds_name

    # Recompute the ATT&CK id for *this* data source from its first
    # 'mitre-attack' external reference. The value defaults to None on every
    # iteration, so a data source without such a reference can never inherit
    # the id found for the previous one, and a missing or None
    # 'external_references' key is treated as "no references".
    ds_attack_id_val = next(
        (
            ref.get('external_id')
            for ref in (ds.get('external_references') or [])
            if ref.get('source_name') == 'mitre-attack'
        ),
        None,
    )
    if ds_attack_id_val:
        datasource_stix_id_to_attack_id[ds_stix_id] = ds_attack_id_val


In [66]:
# --- Data Component Mappings (from data_components list) ---
# datacomponent_stix_id_to_name: component STIX id -> component name
# datacomponent_stix_id_to_parent_datasource_stix_id: component STIX id -> parent data source STIX id
datacomponent_stix_id_to_name = {}
datacomponent_stix_id_to_parent_datasource_stix_id = {}

for dc_obj in data_components:
    dc_stix_id = dc_obj.get('id')  # matched against a relationship's 'source_object' later on
    dc_name = dc_obj.get('data_component')
    if not dc_stix_id or not dc_name:
        continue

    datacomponent_stix_id_to_name[dc_stix_id] = dc_name

    # NOTE(review): assumes the 'data_source' key holds the parent data
    # source's STIX id -- verify this field name against real data.
    parent_ds_stix_id = dc_obj.get('data_source')
    if parent_ds_stix_id:
        datacomponent_stix_id_to_parent_datasource_stix_id[dc_stix_id] = parent_ds_stix_id

In [67]:
# --- Pre-process Relationships for Specific Detection Details ---
# Maps a technique's STIX id to a list of
# {'source_name', 'source_id', 'description'} dicts, one per qualifying
# 'detects' relationship.
detection_details_by_technique_stix_id = defaultdict(list)

for rel in relationships:
    rel_kind = rel.get('relationship')
    component_stix_id = rel.get('source_object')
    technique_stix_id = rel.get('target_object')
    narrative = rel.get('relationship_description', '')

    # The endpoints are classified by STIX id prefix.
    # NOTE(review): the 'x-mitre-data-component--' prefix is an assumption --
    # verify it against the source ids of real 'detects' relationships.
    targets_technique = technique_stix_id and technique_stix_id.startswith('attack-pattern--')
    from_datacomponent = component_stix_id and component_stix_id.startswith('x-mitre-data-component--')

    # Keep only: data component --detects--> technique, with a description.
    if not (rel_kind == 'detects' and targets_technique and from_datacomponent and narrative):
        continue

    # Resolve the component's parent data source, falling back to placeholders
    # when the component or its parent is unknown.
    parent_ds_stix_id = datacomponent_stix_id_to_parent_datasource_stix_id.get(component_stix_id)
    if parent_ds_stix_id:
        ds_label = datasource_stix_id_to_name.get(parent_ds_stix_id, "Unknown Data Source")
        ds_attack_id = datasource_stix_id_to_attack_id.get(parent_ds_stix_id, "")
    else:
        ds_label = "Unknown Data Source"
        ds_attack_id = ""

    component_label = datacomponent_stix_id_to_name.get(component_stix_id, "Unknown Component")

    detection_details_by_technique_stix_id[technique_stix_id].append({
        'source_name': f"{ds_label}: {component_label}",
        'source_id': ds_attack_id,  # DSxxxx of the parent data source ("" if unknown)
        'description': process_description(narrative),
    })

### Generate Markdown Knowledge Files

In [68]:
# --- Generate markdown files for Tactics ---
# Renders one knowledge/tactics/<tactic id>.md file per tactic.
tactic_template = env.get_template('tactic.md.j2')

for tactic in tactics:
    tactic_attack_id = tactic['tactic_id']
    tactic_name = tactic['tactic']
    tactic_description = tactic.get('tactic_description')
    tactic_shortname = tactic.get('tactic_shortname', '')  # Used to find related techniques

    # A technique belongs to this tactic when the tactic's shortname appears
    # in the technique's 'tactic' value. `or []` also covers a technique whose
    # 'tactic' key is present but None, which would otherwise raise TypeError
    # on the membership test (the technique cell guards the same case).
    related_techniques_data = [
        {'id': tech['technique_id'], 'name': tech['technique']}
        for tech in techniques
        if tactic_shortname in (tech.get('tactic') or [])
    ]

    tactic_data = {
        'id': tactic_attack_id,
        'name': tactic_name,
        'description': process_description(tactic_description),
        'related_techniques': related_techniques_data
    }

    md_content = tactic_template.render(tactic=tactic_data)
    filename = os.path.join(docs_dir, 'tactics', f"{tactic_attack_id}.md")
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(md_content)

In [86]:
# --- Generate markdown files for Techniques ---
# Renders one knowledge/techniques/<technique id>.md file per technique.
technique_template = env.get_template('technique.md.j2')

# Index the relationships once, keyed by target technique STIX id, instead of
# rescanning the whole relationship list twice for every technique.
#   Group      (source_object) --uses-->      Technique (target_object)
#   Mitigation (source_object) --mitigates--> Technique (target_object)
# Source ids are stored in relationship order, so output order is unchanged.
group_sources_by_technique = defaultdict(list)
mitigation_sources_by_technique = defaultdict(list)
for rel in relationships:
    rel_kind = rel.get('relationship')
    rel_source = rel.get('source_object') or ''
    if rel_kind == 'uses' and rel_source.startswith('intrusion-set--'):
        group_sources_by_technique[rel.get('target_object')].append(rel_source)
    elif rel_kind == 'mitigates' and rel_source.startswith('course-of-action--'):
        mitigation_sources_by_technique[rel.get('target_object')].append(rel_source)

for technique in techniques:
    tech_attack_id = technique['technique_id']
    tech_name = technique['technique']
    tech_description = technique.get('technique_description')
    tech_stix_id = technique.get('id')  # STIX ID is needed for finding relationships

    # Tactics: resolve each shortname to {'id', 'name'}; shortnames missing
    # from either lookup are dropped. `or []` covers a missing/None 'tactic'.
    tactics_info = [
        {
            'id': tactic_shortname_to_id[shortname],
            'name': tactic_shortname_to_name[shortname]
        }
        for shortname in (technique.get('tactic') or [])
        if shortname in tactic_shortname_to_id and shortname in tactic_shortname_to_name
    ]

    # Related Groups (groups that USE this technique). Entries whose ATT&CK id
    # or name is unknown/empty are skipped.
    # .get() is used on the defaultdicts so lookups never insert empty entries.
    groups_info = [
        {'id': group_stix_to_attack[src], 'name': group_stix_id_to_name[src]}
        for src in group_sources_by_technique.get(tech_stix_id, [])
        if group_stix_to_attack.get(src) and group_stix_id_to_name.get(src)
    ]

    # Mitigations (mitigations that MITIGATE this technique), same filtering.
    mitigations_info = [
        {'id': mitigation_stix_to_attack[src], 'name': mitigation_stix_id_to_name[src]}
        for src in mitigation_sources_by_technique.get(tech_stix_id, [])
        if mitigation_stix_to_attack.get(src) and mitigation_stix_id_to_name.get(src)
    ]

    # Parent/Sub-technique info
    parent_technique_info = parent_info_for_subtechnique.get(tech_attack_id)  # None if not a sub-technique or parent not found
    current_subtechniques = subtechniques_for_parent.get(tech_attack_id, [])  # Empty list if no sub-techniques

    # Data Sources (for detection info)
    # Format: ["Data Source: Data Component", "Data Source 2: Data Component 2"]
    data_sources_info = technique.get('data_sources', [])
    general_detection_text = technique.get('technique_detection')

    # Specific detection details from the pre-processed relationships. Their
    # descriptions were already passed through process_description() when the
    # mapping was built, so they are used as-is rather than being re-processed
    # (and mutated in place) here.
    specific_detection_narratives = detection_details_by_technique_stix_id.get(tech_stix_id, [])

    technique_data = {
        'id': tech_attack_id,
        'name': tech_name,
        'description': process_description(tech_description),
        'parent_technique': parent_technique_info,
        'sub_techniques': current_subtechniques,
        'tactics': tactics_info,
        'related_groups': groups_info,
        'mitigations': mitigations_info,
        'data_sources': data_sources_info,
        'general_detection_text': process_description(general_detection_text),
        'specific_detection_narratives': specific_detection_narratives,
    }
    md_content = technique_template.render(technique=technique_data)
    # Sanitize ATT&CK ID for filename (e.g. T1234.001 -> T1234_001)
    filename_attack_id = tech_attack_id.replace('.', '_')
    filename = os.path.join(docs_dir, 'techniques', f"{filename_attack_id}.md")
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(md_content)

In [None]:
# --- Generate markdown files for Groups ---
# Renders one knowledge/groups/<group id>.md file per group that has an ATT&CK id.
group_template = env.get_template('group.md.j2')

# Index "Group (source_object) --uses--> Technique (target_object)"
# relationships once, keyed by group STIX id, instead of rescanning the whole
# relationship list for every group. Targets are stored in relationship
# order, so output order is unchanged.
technique_targets_by_group = defaultdict(list)
for rel in relationships:
    rel_target = rel.get('target_object') or ''
    if rel.get('relationship') == 'uses' and rel_target.startswith('attack-pattern--'):
        technique_targets_by_group[rel.get('source_object')].append(rel_target)

for group in groups:
    group_attack_id = group.get('group_id')
    if not group_attack_id:
        # Without an ATT&CK id there is no filename to write to.
        continue

    group_name = group.get('group', 'Unknown Group')
    group_description = group.get('group_description', '')
    group_aliases = group.get('group_aliases', [])
    group_stix_id = group.get('id')  # STIX ID for relationship lookups

    # Techniques used by this group. Entries whose ATT&CK id or name is
    # unknown/empty are skipped.
    # .get() is used on the defaultdict so lookups never insert empty entries.
    technique_info = [
        {'id': technique_stix_to_attack[target], 'name': technique_stix_id_to_name[target]}
        for target in technique_targets_by_group.get(group_stix_id, [])
        if technique_stix_to_attack.get(target) and technique_stix_id_to_name.get(target)
    ]

    group_data = {
        'id': group_attack_id,
        'name': group_name,
        'description': process_description(group_description),
        'aliases': group_aliases,
        'techniques_used': technique_info
    }

    md_content = group_template.render(group=group_data)
    filename = os.path.join(docs_dir, 'groups', f"{group_attack_id}.md")
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(md_content)