# Behavioral Analysis: Raw Response Summaries

This notebook reads VirusTotal behaviour reports under `output/raw_responses`,
extracts indicators covering process, network, filesystem, registry, mutex, service,
scheduled-task, and MITRE ATT&CK activity, and writes the aggregated results to
CSV/JSON for further analysis.

## Imports

In [8]:
from __future__ import annotations

import json
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional

import pandas as pd

In [9]:
def load_hash_signature_lookup(csv_path: Path) -> Dict[str, str]:
    """Build a lowercase-hash -> malware-family mapping from a signature CSV.

    The CSV is read with every column as text and must provide a ``hash``
    column; ``malware_family`` supplies the label. Rows with a missing hash or
    a missing/blank family are skipped, and the first family seen for a hash
    wins.

    Loading is best-effort: a missing file yields an empty mapping, and any
    read/parse error is printed as a warning while whatever was collected up
    to that point is still returned.
    """
    lookup: Dict[str, str] = {}
    if not csv_path.is_file():
        return lookup

    try:
        signature_df = pd.read_csv(csv_path, dtype=str)
        if not signature_df.empty:
            for _, row in signature_df.dropna(subset=['hash']).iterrows():
                family_raw = row.get('malware_family')
                # Empty CSV cells arrive as NaN, which is truthy: without this
                # guard it would be stringified into the bogus family 'nan'.
                if family_raw is None or pd.isna(family_raw):
                    continue
                hash_value = str(row['hash']).strip().lower()
                family_value = str(family_raw).strip()
                if hash_value and family_value:
                    lookup.setdefault(hash_value, family_value)
    except Exception as exc:
        print(f"Warning: could not load {csv_path.name}: {exc}")
    return lookup


hash_signature_path = Path('output/hash_signature_output.csv')
HASH_SIGNATURE_LOOKUP: Dict[str, str] = load_hash_signature_lookup(hash_signature_path)

## Helper: Flatten Behaviour Indicators

In [10]:

# Payload keys that mark a dict as carrying behaviour data worth harvesting.
# NOTE(review): 'attack_techniques' and 'services_opened' take part in
# discovery only; nothing below extracts their values. Confirm whether they
# should feed an indicator.
_BEHAVIOUR_KEYS = frozenset({
    'processes_created', 'processes_tree', 'command_executions', 'http_conversations',
    'dns_lookups', 'ip_traffic', 'files_written', 'files_deleted', 'files_opened',
    'registry_keys_set', 'registry_keys_opened', 'registry_keys_deleted',
    'mutexes_created', 'mutexes_opened', 'services_created', 'services_started',
    'services_opened', 'memory_dumps', 'scheduled_tasks', 'tags', 'signature_matches',
    'mitre_attack_techniques', 'attack_techniques'
})

# (payload key, indicator key): the payload value is a list whose string
# entries are copied verbatim; non-string entries are ignored.
_STRING_LIST_FIELDS = (
    ('command_executions', 'command_executions'),
    ('files_written', 'files_written'),
    ('files_deleted', 'files_deleted'),
    ('files_opened', 'files_read'),
    ('mutexes_created', 'mutexes'),
    ('mutexes_opened', 'mutexes'),
    ('services_created', 'services_created'),
    ('services_started', 'services_started'),
    ('scheduled_tasks', 'scheduled_tasks'),
    ('memory_dumps', 'memory_dumps'),
    ('tags', 'tags'),
)

# Indicator lists whose entries are additionally mirrored into 'file_paths'.
_FILE_INDICATORS = frozenset({'files_written', 'files_deleted', 'files_read'})

# (payload key, indicator key, dict fields tried in order, accept bare strings):
# the payload value is a list of dicts from which the first truthy field is
# taken; some lists may also contain plain strings.
_KEYED_LIST_FIELDS = (
    ('dns_lookups', 'dns_queries', ('hostname', 'domain'), True),
    ('registry_keys_set', 'registry_keys_written', ('key',), True),
    ('registry_keys_opened', 'registry_keys_read', ('key',), True),
    ('registry_keys_deleted', 'registry_keys_deleted', ('key',), True),
    ('mitre_attack_techniques', 'mitre_techniques', ('id', 'technique_id'), False),
    ('signature_matches', 'signature_matches', ('name', 'id'), False),
)

# Indicator lists that are de-duplicated before being returned.
_DEDUPED_INDICATORS = (
    'command_executions', 'dns_queries', 'contacted_ips', 'files_written', 'files_deleted',
    'files_read', 'file_paths', 'registry_keys_written', 'registry_keys_read',
    'registry_keys_deleted', 'mutexes', 'tags', 'signature_matches', 'memory_dumps',
    'services_created', 'services_started', 'scheduled_tasks', 'mitre_techniques'
)


def _collect_attribute_dicts(root: Any) -> List[Dict[str, Any]]:
    """Return every dict under ``root`` that holds at least one behaviour key.

    Lists and dict values are walked recursively. A dict's ``attributes``
    child is visited before the dict itself is considered, so nested attribute
    dicts come first in the result. Each dict object is visited at most once.
    """
    found: List[Dict[str, Any]] = []
    visited = set()

    def walk(node: Any) -> None:
        if isinstance(node, list):
            for item in node:
                walk(item)
            return
        if not isinstance(node, dict) or id(node) in visited:
            return
        visited.add(id(node))

        nested_attrs = node.get('attributes')
        if isinstance(nested_attrs, dict):
            walk(nested_attrs)

        if _BEHAVIOUR_KEYS.intersection(node.keys()):
            found.append(node)

        for value in node.values():
            walk(value)

    walk(root)
    return found


def _first_truthy(entry: Dict[str, Any], fields: Iterable[str]) -> Any:
    """Return the first truthy ``entry[field]`` among ``fields``, else None."""
    for field in fields:
        value = entry.get(field)
        if value:
            return value
    return None


def _unique_strings(values: Iterable[Any]) -> List[Any]:
    """Drop falsy values and duplicates while keeping first-seen order."""
    already_seen = set()
    deduped = []
    for value in values:
        if not value or value in already_seen:
            continue
        already_seen.add(value)
        deduped.append(value)
    return deduped


def extract_behavioral_indicators(behav_resp: Dict[str, Any]) -> Dict[str, Any]:
    """Extract key artefacts from a VirusTotal behaviour response.

    Args:
        behav_resp: Stored response wrapper. It is only processed when its
            ``success`` entry is truthy and its ``json`` entry is a dict; the
            behaviour data is searched for under ``json['data']``, which may
            be a dict or a list and is walked at any nesting depth.

    Returns:
        A dict of lists, one per indicator kind. ``processes`` holds dicts
        with ``name``, ``command_line``, ``process_id``, ``parent_pid`` and
        ``injected``; ``http_requests`` holds dicts with ``method``, ``url``
        and ``host``. Neither of those two is de-duplicated. Every other list
        has empty values and duplicates removed, in first-seen order.
        ``file_paths`` combines written, deleted and opened paths in the
        order they were encountered, so the result is reproducible across
        runs. All lists are empty when the response is unusable.
    """
    indicators: Dict[str, Any] = {
        'processes': [],
        'command_executions': [],
        'http_requests': [],
        'dns_queries': [],
        'contacted_ips': [],
        'files_written': [],
        'files_deleted': [],
        'files_read': [],
        'file_paths': [],
        'registry_keys_written': [],
        'registry_keys_read': [],
        'registry_keys_deleted': [],
        'mutexes': [],
        'tags': [],
        'signature_matches': [],
        'memory_dumps': [],
        'services_created': [],
        'services_started': [],
        'scheduled_tasks': [],
        'mitre_techniques': []
    }

    if not behav_resp or not behav_resp.get('success'):
        return indicators

    payload = behav_resp.get('json')
    if not isinstance(payload, dict):
        return indicators

    def add_process_entry(name: str, command_line: Optional[str], process_id: Optional[str],
                          parent_pid: Optional[str], injected: Optional[bool]) -> None:
        indicators['processes'].append({
            'name': name,
            # Fall back to the process name when no command line was recorded.
            'command_line': command_line or name,
            'process_id': process_id,
            'parent_pid': parent_pid,
            'injected': bool(injected),
        })

    def flatten_process_tree(tree: Any, parent_pid: Optional[str] = None) -> None:
        # Depth-first walk; each child records its parent's process_id.
        if isinstance(tree, dict):
            process_id = tree.get('process_id')
            add_process_entry(tree.get('name') or '', tree.get('command_line'), process_id,
                              parent_pid, tree.get('injected'))
            children = tree.get('children')
            if isinstance(children, list):
                for child in children:
                    flatten_process_tree(child, process_id)
        elif isinstance(tree, list):
            for item in tree:
                flatten_process_tree(item, parent_pid)

    for attrs in _collect_attribute_dicts(payload.get('data')):
        processes_created = attrs.get('processes_created')
        if isinstance(processes_created, list):
            for proc in processes_created:
                if isinstance(proc, str):
                    add_process_entry(proc, proc, None, None, False)
                elif isinstance(proc, dict):
                    add_process_entry(
                        proc.get('name', ''),
                        proc.get('command_line'),
                        proc.get('pid') or proc.get('process_id'),
                        proc.get('ppid'),
                        proc.get('injected'),
                    )

        if attrs.get('processes_tree'):
            flatten_process_tree(attrs['processes_tree'])

        http_conv = attrs.get('http_conversations')
        if isinstance(http_conv, list):
            for conv in http_conv:
                if not isinstance(conv, dict):
                    continue
                headers = conv.get('request_headers')
                if not isinstance(headers, dict):
                    headers = {}
                indicators['http_requests'].append({
                    'method': conv.get('request_method'),
                    'url': conv.get('url'),
                    'host': headers.get('Host'),
                })

        ip_traffic = attrs.get('ip_traffic')
        if isinstance(ip_traffic, list):
            for entry in ip_traffic:
                if isinstance(entry, dict):
                    dst_ip = entry.get('destination_ip') or entry.get('ip')
                    dst_port = entry.get('destination_port') or entry.get('port')
                    if dst_ip:
                        indicators['contacted_ips'].append(
                            f"{dst_ip}:{dst_port}" if dst_port else dst_ip
                        )
                elif isinstance(entry, str):
                    indicators['contacted_ips'].append(entry)

        for source_key, target_key in _STRING_LIST_FIELDS:
            values = attrs.get(source_key)
            if not isinstance(values, list):
                continue
            strings = [value for value in values if isinstance(value, str)]
            indicators[target_key].extend(strings)
            if target_key in _FILE_INDICATORS:
                indicators['file_paths'].extend(strings)

        for source_key, target_key, fields, accept_strings in _KEYED_LIST_FIELDS:
            entries = attrs.get(source_key)
            if not isinstance(entries, list):
                continue
            for entry in entries:
                if isinstance(entry, dict):
                    value = _first_truthy(entry, fields)
                elif accept_strings and isinstance(entry, str):
                    value = entry
                else:
                    value = None
                if value:
                    indicators[target_key].append(value)

    for key in _DEDUPED_INDICATORS:
        indicators[key] = _unique_strings(indicators[key])

    return indicators


## Helper: Derive Family from File Report

In [11]:

def derive_family(hash_value: str, raw_dir: Path) -> Optional[str]:
    """Resolve a malware family label for a sample hash.

    Resolution order:
      1. the lowercase hash in ``HASH_SIGNATURE_LOOKUP``;
      2. ``popular_threat_classification.suggested_threat_label`` from the
         sample's ``<hash>_file.json`` under ``raw_dir``;
      3. the first non-blank string in that report's ``tags`` list.

    Args:
        hash_value: Sample hash; also used verbatim as the report file prefix.
        raw_dir: Directory holding the stored ``*_file.json`` responses.

    Returns:
        The stripped label, or None when no source yields one. A missing,
        unreadable, or unexpectedly shaped report resolves to None and never
        raises.
    """
    mapped_family = HASH_SIGNATURE_LOOKUP.get(hash_value.lower())
    if mapped_family:
        return mapped_family

    file_path = raw_dir / f"{hash_value}_file.json"
    if not file_path.exists():
        return None
    try:
        node: Any = json.loads(file_path.read_text(encoding='utf-8'))
    except Exception:
        # Best-effort: an unreadable or corrupt report simply has no label.
        return None

    # Descend json -> data -> attributes, tolerating levels that are present
    # but null or not objects (chained .get() calls would raise on those).
    for step in ('json', 'data', 'attributes'):
        if not isinstance(node, dict):
            return None
        node = node.get(step)
    attrs = node
    if not isinstance(attrs, dict):
        return None

    ptc = attrs.get('popular_threat_classification')
    if isinstance(ptc, dict):
        label = ptc.get('suggested_threat_label')
        if isinstance(label, str) and label.strip():
            return label.strip()

    tags = attrs.get('tags')
    if isinstance(tags, list):
        for tag in tags:
            if isinstance(tag, str) and tag.strip():
                return tag.strip()
    return None


In [12]:

def derive_threat_names(hash_value: str, raw_dir: Path) -> List[str]:
    """List the popular threat names recorded in a sample's file report.

    Reads ``<hash>_file.json`` under ``raw_dir`` and collects the entries of
    ``popular_threat_classification.popular_threat_name``. An entry may be a
    dict (its ``value`` is used) or a plain string.

    Args:
        hash_value: Sample hash, used verbatim as the report file prefix.
        raw_dir: Directory holding the stored ``*_file.json`` responses.

    Returns:
        Stripped names in report order, de-duplicated case-insensitively with
        the first spelling kept. Empty when the report is missing, unreadable,
        or unexpectedly shaped; this function never raises for those cases.
    """
    file_path = raw_dir / f"{hash_value}_file.json"
    if not file_path.exists():
        return []
    try:
        node: Any = json.loads(file_path.read_text(encoding='utf-8'))
    except Exception:
        # Best-effort: an unreadable or corrupt report contributes no names.
        return []

    # Descend json -> data -> attributes, tolerating levels that are present
    # but null or not objects (chained .get() calls would raise on those).
    for step in ('json', 'data', 'attributes'):
        if not isinstance(node, dict):
            return []
        node = node.get(step)
    if not isinstance(node, dict):
        return []

    ptc = node.get('popular_threat_classification')
    name_entries = ptc.get('popular_threat_name') if isinstance(ptc, dict) else None
    if not isinstance(name_entries, list):
        return []

    ordered: List[str] = []
    seen_lower = set()
    for entry in name_entries:
        candidate = entry.get('value') if isinstance(entry, dict) else entry
        if not isinstance(candidate, str):
            continue
        name = candidate.strip()
        if not name or name.lower() in seen_lower:
            continue
        seen_lower.add(name.lower())
        ordered.append(name)
    return ordered


## Helper: Summarise a Behaviour File

In [13]:

# (record column, indicator key): the column stores the length of the list.
_SUMMARY_COUNT_FIELDS = (
    ('process_count', 'processes'),
    ('command_count', 'command_executions'),
    ('http_request_count', 'http_requests'),
    ('dns_query_count', 'dns_queries'),
    ('ip_connection_count', 'contacted_ips'),
    ('files_written_count', 'files_written'),
    ('files_deleted_count', 'files_deleted'),
    ('files_read_count', 'files_read'),
    ('registry_written_count', 'registry_keys_written'),
    ('registry_read_count', 'registry_keys_read'),
    ('registry_deleted_count', 'registry_keys_deleted'),
    ('mutex_count', 'mutexes'),
    ('services_created_count', 'services_created'),
    ('services_started_count', 'services_started'),
    ('memory_dump_count', 'memory_dumps'),
    ('scheduled_task_count', 'scheduled_tasks'),
    ('mitre_technique_count', 'mitre_techniques'),
)

# (record column, indicator key): the column stores the ' || '-joined values.
_SUMMARY_JOINED_FIELDS = (
    ('command_lines', 'command_executions'),
    ('dns_queries', 'dns_queries'),
    ('contacted_ips', 'contacted_ips'),
    ('files_written', 'files_written'),
    ('files_deleted', 'files_deleted'),
    ('files_read', 'files_read'),
    ('registry_written', 'registry_keys_written'),
    ('registry_read', 'registry_keys_read'),
    ('registry_deleted', 'registry_keys_deleted'),
    ('mutexes', 'mutexes'),
    ('services_created', 'services_created'),
    ('services_started', 'services_started'),
    ('memory_dumps', 'memory_dumps'),
    ('scheduled_tasks', 'scheduled_tasks'),
    ('tags', 'tags'),
    ('signature_matches', 'signature_matches'),
    ('mitre_techniques', 'mitre_techniques'),
)


def summarise_behavior_file(path: Path, raw_dir: Path) -> Dict[str, Any]:
    """Summarise one stored behaviour response as a flat record.

    Args:
        path: Behaviour response file; the text before the first ``_`` in its
            name is taken as the sample hash.
        raw_dir: Directory passed on to ``derive_family`` and
            ``derive_threat_names`` to locate the matching file report.

    Returns:
        A dict with the hash, source path, family, threat names, status
        fields, a ``*_count`` column per indicator kind, and a ``' || '``
        joined text column per indicator kind. ``status`` is ``'success'``
        only when the response's ``success`` entry is truthy and its
        ``status_code`` is 200. When the file cannot be parsed, or does not
        contain a JSON object, the record keeps its defaults and gains an
        ``error`` entry describing the problem.
    """
    record: Dict[str, Any] = {
        'hash': path.name.split('_')[0],
        'file_path': str(path),
        'family': None,
        'status': 'failed',
        'status_code': None,
        'process_count': 0,
        'command_count': 0,
        'http_request_count': 0,
        'dns_query_count': 0,
        'ip_connection_count': 0,
        'files_written_count': 0,
        'files_deleted_count': 0,
        'files_read_count': 0,
        'registry_written_count': 0,
        'registry_read_count': 0,
        'registry_deleted_count': 0,
        'mutex_count': 0,
        'services_created_count': 0,
        'services_started_count': 0,
        'memory_dump_count': 0,
        'scheduled_task_count': 0,
        'mitre_technique_count': 0,
        'process_details': '',
        'command_lines': '',
        'http_requests': '',
        'dns_queries': '',
        'contacted_ips': '',
        'files_written': '',
        'files_deleted': '',
        'files_read': '',
        'registry_written': '',
        'registry_read': '',
        'registry_deleted': '',
        'mutexes': '',
        'services_created': '',
        'services_started': '',
        'memory_dumps': '',
        'scheduled_tasks': '',
        'tags': '',
        'signature_matches': '',
        'mitre_techniques': '',
        'threat_names': []
    }

    try:
        behav_resp = json.loads(path.read_text(encoding='utf-8'))
    except Exception as exc:
        record['error'] = f'json_load_failed: {exc}'
        return record

    # Valid JSON that is not an object (list, string, null, ...) has none of
    # the fields read below; report it instead of failing on .get().
    if not isinstance(behav_resp, dict):
        record['error'] = f'unexpected_payload_type: {type(behav_resp).__name__}'
        return record

    record['status_code'] = behav_resp.get('status_code')
    record['family'] = derive_family(record['hash'], raw_dir)
    record['threat_names'] = derive_threat_names(record['hash'], raw_dir)

    indicators = extract_behavioral_indicators(behav_resp)

    for column, indicator_key in _SUMMARY_COUNT_FIELDS:
        record[column] = len(indicators[indicator_key])

    def join_values(values: Iterable[Any]) -> str:
        # Skip empty entries and stringify the rest so a non-string indicator
        # value cannot make str.join raise.
        return ' || '.join(str(value) for value in values if value)

    for column, indicator_key in _SUMMARY_JOINED_FIELDS:
        record[column] = join_values(indicators[indicator_key])

    def serialise_process(entry: Dict[str, Any]) -> str:
        return (
            f"name={entry.get('name','')} | cmd={entry.get('command_line','')} | "
            f"pid={entry.get('process_id')} | ppid={entry.get('parent_pid')} | "
            f"injected={entry.get('injected')}"
        )

    record['process_details'] = ' || '.join(serialise_process(p) for p in indicators['processes'])
    record['http_requests'] = ' || '.join(
        f"{req.get('method')} {req.get('url')} (host={req.get('host')})"
        for req in indicators['http_requests']
    )

    if behav_resp.get('success') and record['status_code'] == 200:
        record['status'] = 'success'
    else:
        record['status'] = 'failed'

    return record


## Generate Behavioural Summary (status 200 only)

In [14]:

# Directory holding the stored VirusTotal responses; abort when it is absent.
raw_dir = Path('output/raw_responses')
if not raw_dir.is_dir():
    raise SystemExit(f'Behavior directory not found: {raw_dir}')

# Sorted so that the resulting rows come out in a stable, name-based order.
behavior_files = sorted(raw_dir.glob('*_behavior.json'))
print(f'Found {len(behavior_files)} behaviour files')

# Summarise every behaviour file, keeping only those fetched with HTTP 200.
records: List[Dict[str, Any]] = []
for file_path in behavior_files:
    row = summarise_behavior_file(file_path, raw_dir)
    if row.get('status_code') != 200:
        continue
    records.append(row)

print(f'Accepted {len(records)} files with HTTP 200 behaviour responses')

if len(records) == 0:
    raise SystemExit('No behaviour files with status 200 to process.')

results_df = pd.DataFrame(records)
results_df


Found 176 behaviour files
Accepted 101 files with HTTP 200 behaviour responses


Unnamed: 0,hash,file_path,family,status,status_code,process_count,command_count,http_request_count,dns_query_count,ip_connection_count,...,registry_deleted,mutexes,services_created,services_started,memory_dumps,scheduled_tasks,tags,signature_matches,mitre_techniques,threat_names
0,01c47b5968afdc923ec354bc78cfeb490d3733f8dab819...,output/raw_responses/01c47b5968afdc923ec354bc7...,AgentTesla,success,200,0,0,0,0,0,...,,,,,,,,,,agenttesla | drop
1,08039481f17de1a125763d6dadc9a91615fa027ad42a4f...,output/raw_responses/08039481f17de1a125763d6da...,Emotet,success,200,10,1,5,0,41,...,,,,,,,,encode data using XOR || PEB access || contain...,T1027 || T1057 || T1143 || T1096,emotet | botx | cmrx
2,080672b68abf1f03ace8f820214e8fe1efd2b9d84e03f7...,output/raw_responses/080672b68abf1f03ace8f8202...,Heodo,success,200,4,0,5,1,0,...,,OneNoteM:AppShared,,,,,,,T1129 || T1140 || T1027.009 || T1057 || T1060 ...,gdnj | onenote | emotet
3,0a34ca695a121a9757c72fc0b78101eefba974896da691...,output/raw_responses/0a34ca695a121a9757c72fc0b...,IcedID,success,200,31,1,5,2,3,...,HKEY_CURRENT_USER\Software\Microsoft\Office\16...,cversions.3.m || Global\552FFA80-3393-423d-867...,,SecurityHealthService,,,PERSISTENCE,antivm_checks_available_memory || dead_connect...,T1082 || T1071 || T1573 || T1055 || T1129 || T...,setter | onenote
4,0b8682fe1ee1d9a8ad485452179e9c8651c68266059107...,output/raw_responses/0b8682fe1ee1d9a8ad4854521...,Emotet,success,200,817,1,0,0,35,...,,\Sessions\1\BaseNamedObjects\Global\SyncRootMa...,,,,,PERSISTENCE || DETECT_DEBUG_ENVIRONMENT || LON...,link function at runtime on Windows || allocat...,T1129 || T1027 || T1083 || T1082 || T1057 || T...,emotet | dump | cmvq
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,f7ec2c9703f551dda92a7c043b302c5bab26a4f91683f9...,output/raw_responses/f7ec2c9703f551dda92a7c043...,AgentTesla,success,200,77,1,2,1,1,...,HKLM\SYSTEM\ControlSet001\Services\WmiApRpl\Pe...,CTF.LBES.MutexDefaultS-1-5-21-1482476501-16455...,,VaultSvc,,,DETECT_DEBUG_ENVIRONMENT,log keystrokes via application hook || create ...,T1056.001 || T1057 || T1518 || T1012 || T1105 ...,msil | agenttesla | agensla
97,fa11e21144149812a1c61c9bf7f8351753aed2348075ce...,output/raw_responses/fa11e21144149812a1c61c9bf...,Emotet,success,200,16,1,2,0,61,...,,\Sessions\1\BaseNamedObjects\Global\SyncRootMa...,,,,,PERSISTENCE || DETECT_DEBUG_ENVIRONMENT || LON...,,T1027 || T1129 || T1083 || T1082 || T1547.001 ...,emotet | gbkk | botx
98,fc345d151b44639631fc6b88a979462dfba3aa5c281ee3...,output/raw_responses/fc345d151b44639631fc6b88a...,Heodo,success,200,12,3,2,0,2,...,,DBWinMutex,,,,,RUNTIME_MODULES || CHECKS_HOSTNAME,,T1027 || T1027.005 || T1129 || T1497.001 || T1...,emotet | gbma | botx
99,fe135d63b84d72468b5e913b2b59ffd1e52911de7438d7...,output/raw_responses/fe135d63b84d72468b5e913b2...,AgentTesla,success,200,7,9,2,0,1,...,,E7256B3F3AD6FDE5 || IESQMMUTEX_0_208 || Local\...,cscc,,,,,,,agenttesla


## Persist Outputs

In [15]:

# Make sure the destination directory exists before writing anything.
output_dir = Path('output')
output_dir.mkdir(parents=True, exist_ok=True)

json_path = output_dir / 'analysis_results_updated.json'
csv_path = output_dir / 'analysis_results_updated.csv'

# JSON keeps list-valued cells as real arrays, so the frame is written as-is.
results_df.to_json(json_path, orient='records', indent=2)

# CSV has no list type: flatten list-valued cells into ' || '-joined text on a
# copy, leaving results_df itself untouched.
csv_df = results_df.copy()
for column in csv_df.columns:
    series = csv_df[column]
    holds_lists = series.apply(lambda cell: isinstance(cell, list))
    if not holds_lists.any():
        continue
    csv_df[column] = series.apply(
        lambda cell: ' || '.join(map(str, cell)) if isinstance(cell, list) else cell
    )

csv_df.to_csv(csv_path, index=False)

for label, target in (('CSV', csv_path), ('JSON', json_path)):
    print(f'Saved {label} → {target}')


Saved CSV → output/analysis_results_updated.csv
Saved JSON → output/analysis_results_updated.json
