In [68]:
import pandas as pd
import re
import json

In [69]:
# Scrape every HTML table from the MITRE ATT&CK Enterprise techniques page
# (requires network access); read_html yields one DataFrame per table.
techniques = pd.read_html('https://attack.mitre.org/techniques/enterprise/')
# The first table is the techniques listing: parent rows carry an 'ID',
# sub-technique rows leave 'ID' empty and only fill 'ID.1'.
tech_df = techniques[0]
tech_df

Unnamed: 0,ID,ID.1,Name,Description
0,T1548,T1548,Abuse Elevation Control Mechanism,Adversaries may circumvent mechanisms designed...
1,,.001,Setuid and Setgid,An adversary may abuse configurations where an...
2,,.002,Bypass User Account Control,Adversaries may bypass UAC mechanisms to eleva...
3,,.003,Sudo and Sudo Caching,Adversaries may perform sudo caching and/or us...
4,,.004,Elevated Execution with Prompt,Adversaries may leverage the AuthorizationExec...
...,...,...,...,...
632,,.001,Dead Drop Resolver,"Adversaries may use an existing, legitimate ex..."
633,,.002,Bidirectional Communication,"Adversaries may use an existing, legitimate ex..."
634,,.003,One-Way Communication,"Adversaries may use an existing, legitimate ex..."
635,T1047,T1047,Windows Management Instrumentation,Adversaries may abuse Windows Management Instr...


In [70]:
# Fold the flat techniques table into {technique name: {'sub-techniques': [...]}}.
# Rows with an 'ID' start a new technique; rows without one are sub-techniques
# belonging to the most recent technique seen.
techniques_dict = {}
current_technique = None

for tech_id, name in zip(tech_df['ID'], tech_df['Name']):
    if pd.isna(tech_id):
        # Sub-technique row: attach it to the open parent, if there is one yet.
        if current_technique is not None:
            techniques_dict[current_technique]['sub-techniques'].append(name)
        continue

    # Parent technique row: open a fresh, empty entry.
    current_technique = name
    techniques_dict[current_technique] = {'sub-techniques': []}

In [71]:
# Inspect the full technique -> sub-techniques mapping.
techniques_dict

{'Abuse Elevation Control Mechanism': {'sub-techniques': ['Setuid and Setgid',
   'Bypass User Account Control',
   'Sudo and Sudo Caching',
   'Elevated Execution with Prompt',
   'Temporary Elevated Cloud Access',
   'TCC Manipulation']},
 'Access Token Manipulation': {'sub-techniques': ['Token Impersonation/Theft',
   'Create Process with Token',
   'Make and Impersonate Token',
   'Parent PID Spoofing',
   'SID-History Injection']},
 'Account Access Removal': {'sub-techniques': []},
 'Account Discovery': {'sub-techniques': ['Local Account',
   'Domain Account',
   'Email Account',
   'Cloud Account']},
 'Account Manipulation': {'sub-techniques': ['Additional Cloud Credentials',
   'Additional Email Delegate Permissions',
   'Additional Cloud Roles',
   'SSH Authorized Keys',
   'Device Registration',
   'Additional Container Cluster Roles']},
 'Acquire Access': {'sub-techniques': []},
 'Acquire Infrastructure': {'sub-techniques': ['Domains',
   'DNS Server',
   'Virtual Private Ser

In [72]:
# Spot-check a single technique's entry.
techniques_dict['Abuse Elevation Control Mechanism']

{'sub-techniques': ['Setuid and Setgid',
  'Bypass User Account Control',
  'Sudo and Sudo Caching',
  'Elevated Execution with Prompt',
  'Temporary Elevated Cloud Access',
  'TCC Manipulation']}

In [73]:
# Scrape the HTML tables from the ATT&CK Enterprise matrix page (requires network access).
tabs = pd.read_html('https://attack.mitre.org/matrices/enterprise/#')

In [74]:
# The first table is the matrix itself: one column per tactic, with a
# two-level header of (tactic name, "<N> techniques").
df = tabs[0]
df.head()

Unnamed: 0_level_0,Reconnaissance,Resource Development,Initial Access,Execution,Persistence,Privilege Escalation,Defense Evasion,Credential Access,Discovery,Lateral Movement,Collection,Command and Control,Exfiltration,Impact
Unnamed: 0_level_1,10 techniques,8 techniques,10 techniques,14 techniques,20 techniques,14 techniques,43 techniques,17 techniques,32 techniques,9 techniques,17 techniques,18 techniques,9 techniques,14 techniques
0,Active Scanning (3) = Scanning IP Blocks Vul...,Acquire Access Acquire Infrastructure (8) = ...,Content Injection Drive-by Compromise Exploi...,Cloud Administration Command Command and Scri...,Account Manipulation (6) = Additional Cloud C...,Abuse Elevation Control Mechanism (6) = Setui...,Abuse Elevation Control Mechanism (6) = Setui...,Adversary-in-the-Middle (3) = LLMNR/NBT-NS Po...,Account Discovery (4) = Local Account Domain...,Exploitation of Remote Services Internal Spea...,Adversary-in-the-Middle (3) = LLMNR/NBT-NS Po...,Application Layer Protocol (4) = Web Protocol...,Automated Exfiltration (1) = Traffic Duplicat...,Account Access Removal Data Destruction Data...
1,Active Scanning (3),=,Scanning IP Blocks Vulnerability Scanning Wo...,,,,,,,,,,,
2,Active Scanning (3),,,,,,,,,,,,,
3,Gather Victim Host Information (4),=,Hardware Software Firmware Client Configura...,,,,,,,,,,,
4,Gather Victim Host Information (4),,,,,,,,,,,,,


In [75]:
# Raw text of the first tactic column in the first data row.
# A single .iloc[row, col] call replaces the chained df.iloc[0][0]: the second
# [0] there is an integer key on a label-indexed Series, which pandas has
# deprecated as positional access.
df.iloc[0, 0]

'Active Scanning\xa0(3)  = Scanning IP Blocks  Vulnerability Scanning  Wordlist Scanning  Gather Victim Host Information\xa0(4)  = Hardware  Software  Firmware  Client Configurations  Gather Victim Identity Information\xa0(3)  = Credentials  Email Addresses  Employee Names  Gather Victim Network Information\xa0(6)  = Domain Properties  DNS  Network Trust Dependencies  Network Topology  IP Addresses  Network Security Appliances  Gather Victim Org Information\xa0(4)  = Determine Physical Locations  Business Relationships  Identify Business Tempo  Identify Roles  Phishing for Information\xa0(4)  = Spearphishing Service  Spearphishing Attachment  Spearphishing Link  Spearphishing Voice  Search Closed Sources\xa0(2)  = Threat Intel Vendors  Purchase Technical Data  Search Open Technical Databases\xa0(5)  = DNS/Passive DNS  WHOIS  Digital Certificates  CDNs  Scan Databases  Search Open Websites/Domains\xa0(3)  = Social Media  Search Engines  Code Repositories  Search Victim-Owned Websites'

In [76]:
def clean_and_split_text(text):
    """Split the flattened text of one matrix cell into individual names.

    The cell text lists technique and sub-technique names separated by runs of
    two or more whitespace characters, with "(N)" sub-technique counts after
    some names and "= " markers before sub-technique lists. Counts and markers
    are stripped.

    Args:
        text: Raw cell text. Non-string values (e.g. the NaN pandas uses for
            empty cells, or None) are treated as an empty cell.

    Returns:
        list[str]: The names found, in order of appearance, without empty
        entries. Technique and sub-technique names are not distinguished.
    """
    # Empty table cells come back from pandas as NaN (a float), which has no
    # .replace(); treat anything that is not text as "no names".
    if not isinstance(text, str):
        return []

    # Drop the "= " markers that introduce each sub-technique list.
    cleaned_text = text.replace('= ', '')

    # Names may contain single spaces, so only 2+ whitespace chars separate them.
    separated_text = re.split(r'\s\s+', cleaned_text)

    # Remove the "(N)" count (and the non-breaking space before it) from each name.
    cleaned_groups = (re.sub(r'\s*\(\d+\)\s*', '', part).strip() for part in separated_text)

    # Leading/trailing separators would otherwise leave '' entries behind.
    return [name for name in cleaned_groups if name]

In [79]:
# Column header of the matrix: a MultiIndex of (tactic name, "<N> techniques") pairs.
multi_index_data = df.columns
multi_index_data

MultiIndex([(      'Reconnaissance', '10 techniques'),
            ('Resource Development',  '8 techniques'),
            (      'Initial Access', '10 techniques'),
            (           'Execution', '14 techniques'),
            (         'Persistence', '20 techniques'),
            ('Privilege Escalation', '14 techniques'),
            (     'Defense Evasion', '43 techniques'),
            (   'Credential Access', '17 techniques'),
            (           'Discovery', '32 techniques'),
            (    'Lateral Movement',  '9 techniques'),
            (          'Collection', '17 techniques'),
            ( 'Command and Control', '18 techniques'),
            (        'Exfiltration',  '9 techniques'),
            (              'Impact', '14 techniques')],
           )

In [90]:
# Map each tactic name to the technique count parsed from the leading number
# of its "<N> techniques" header label.
tactics_dict = {
    tactic_label: int(count_label.split()[0])
    for tactic_label, count_label in multi_index_data
}

print(tactics_dict)

{'Reconnaissance': 10, 'Resource Development': 8, 'Initial Access': 10, 'Execution': 14, 'Persistence': 20, 'Privilege Escalation': 14, 'Defense Evasion': 43, 'Credential Access': 17, 'Discovery': 32, 'Lateral Movement': 9, 'Collection': 17, 'Command and Control': 18, 'Exfiltration': 9, 'Impact': 14}


In [91]:
# Tactic names as a list, in the same order they were inserted into tactics_dict.
t = [*tactics_dict]
print(t)

['Reconnaissance', 'Resource Development', 'Initial Access', 'Execution', 'Persistence', 'Privilege Escalation', 'Defense Evasion', 'Credential Access', 'Discovery', 'Lateral Movement', 'Collection', 'Command and Control', 'Exfiltration', 'Impact']


In [92]:
# Build {tactic: {'techniques': [...]}} by checking which known technique names
# appear in the text of each tactic's column in the first matrix row.
result = {}

for i, tactic_name in enumerate(t):
    # .iloc[row, col] is explicit positional access; the chained df.iloc[0][i]
    # relied on integer keys against a label-indexed Series, which pandas
    # has deprecated.
    chunk = df.iloc[0, i]

    # A set gives constant-time membership tests for the loop over all techniques.
    # NOTE(review): the cell text also contains sub-technique names, so a
    # technique that shares its exact name with a sub-technique listed in this
    # column would be matched too - confirm no such collisions exist.
    names_in_cell = set(clean_and_split_text(chunk))

    # Iterating techniques_dict keeps the techniques in that dict's order.
    result[tactic_name] = {
        'techniques': [name for name in techniques_dict if name in names_in_cell]
    }


result



{'Reconnaissance': {'techniques': ['Active Scanning',
   'Gather Victim Host Information',
   'Gather Victim Identity Information',
   'Gather Victim Network Information',
   'Gather Victim Org Information',
   'Phishing for Information',
   'Search Closed Sources',
   'Search Open Technical Databases',
   'Search Open Websites/Domains',
   'Search Victim-Owned Websites']},
 'Resource Development': {'techniques': ['Acquire Access',
   'Acquire Infrastructure',
   'Compromise Accounts',
   'Compromise Infrastructure',
   'Develop Capabilities',
   'Establish Accounts',
   'Obtain Capabilities',
   'Stage Capabilities']},
 'Initial Access': {'techniques': ['Content Injection',
   'Drive-by Compromise',
   'Exploit Public-Facing Application',
   'External Remote Services',
   'Hardware Additions',
   'Phishing',
   'Replication Through Removable Media',
   'Supply Chain Compromise',
   'Trusted Relationship',
   'Valid Accounts']},
 'Execution': {'techniques': ['Cloud Administration Comma

In [97]:
# Combine the two mappings into {tactic: {technique: {'sub-techniques': [...]}}}.
final_json = {}

for tactic, info in result.items():
    # No intermediate `techniques` variable here: that global already holds the
    # scraped list of tables, and rebinding it would break re-running
    # `tech_df = techniques[0]`.
    final_json[tactic] = {
        technique: {
            # Copy the list so a technique listed under several tactics does not
            # share one mutable list between those entries and techniques_dict.
            'sub-techniques': list(
                techniques_dict.get(technique, {}).get('sub-techniques', [])
            )
        }
        for technique in info['techniques']
    }

final_json_str = json.dumps(final_json, indent=4)

print(final_json_str)

{
    "Reconnaissance": {
        "Active Scanning": {
            "sub-techniques": [
                "Scanning IP Blocks",
                "Vulnerability Scanning",
                "Wordlist Scanning"
            ]
        },
        "Gather Victim Host Information": {
            "sub-techniques": [
                "Hardware",
                "Software",
                "Firmware",
                "Client Configurations"
            ]
        },
        "Gather Victim Identity Information": {
            "sub-techniques": [
                "Credentials",
                "Email Addresses",
                "Employee Names"
            ]
        },
        "Gather Victim Network Information": {
            "sub-techniques": [
                "Domain Properties",
                "DNS",
                "Network Trust Dependencies",
                "Network Topology",
                "IP Addresses",
                "Network Security Appliances"
            ]
        },
        "Gather Vic

In [98]:
def get_tactic_json(tactic_name):
    """Return the techniques of one tactic as a JSON string.

    Args:
        tactic_name: Key to look up in the module-level ``final_json`` mapping.

    Returns:
        str: The tactic's technique mapping serialized with a 4-space indent,
        or a compact ``{"error": ...}`` JSON object when the tactic is unknown.
    """
    if tactic_name not in final_json:
        return json.dumps({"error": f"No data found for tactic '{tactic_name}'"})
    return json.dumps(final_json[tactic_name], indent=4)

In [105]:
# Example lookup: print every technique (with sub-techniques) under one tactic.
print(get_tactic_json('Persistence'))

{
    "Account Manipulation": {
        "sub-techniques": [
            "Additional Cloud Credentials",
            "Additional Email Delegate Permissions",
            "Additional Cloud Roles",
            "SSH Authorized Keys",
            "Device Registration",
            "Additional Container Cluster Roles"
        ]
    },
    "BITS Jobs": {
        "sub-techniques": []
    },
    "Boot or Logon Autostart Execution": {
        "sub-techniques": [
            "Registry Run Keys / Startup Folder",
            "Authentication Package",
            "Time Providers",
            "Winlogon Helper DLL",
            "Security Support Provider",
            "Kernel Modules and Extensions",
            "Re-opened Applications",
            "LSASS Driver",
            "Shortcut Modification",
            "Port Monitors",
            "Print Processors",
            "XDG Autostart Entries",
            "Active Setup",
            "Login Items"
        ]
    },
    "Boot or Logon Initializatio

In [106]:
def get_technique_json(technique_name):
    """Return a technique's tactics and sub-techniques as a JSON string.

    Searches every tactic in the module-level ``final_json`` mapping for an
    entry named ``technique_name``.

    Args:
        technique_name: Exact technique name to look up.

    Returns:
        str: A JSON object (4-space indent) with the keys "Technique",
        "Tactics" (every tactic that lists the technique) and "Sub-techniques"
        (taken from the last matching tactic), or a compact ``{"error": ...}``
        JSON object when no tactic lists the technique.
    """
    # (tactic, technique entry) for every tactic that lists this technique.
    hits = [
        (tactic_label, entries[technique_name])
        for tactic_label, entries in final_json.items()
        if technique_name in entries
    ]

    if not hits:
        return json.dumps({"error": f"No data found for technique '{technique_name}'"})

    last_entry = hits[-1][1]
    payload = {
        "Technique": technique_name,
        "Tactics": [tactic_label for tactic_label, _ in hits],
        "Sub-techniques": last_entry.get('sub-techniques', []),
    }
    return json.dumps(payload, indent=4)

In [107]:
# Example lookup: a technique that is listed under several tactics.
print(get_technique_json('Scheduled Task/Job'))

{
    "Technique": "Scheduled Task/Job",
    "Tactics": [
        "Execution",
        "Persistence",
        "Privilege Escalation"
    ],
    "Sub-techniques": [
        "At",
        "Cron",
        "Scheduled Task",
        "Systemd Timers",
        "Container Orchestration Job"
    ]
}
