In [1]:
import os
import re
import pandas as pd
from pprint import pprint

# Root directory of the host logs; the cells below read one sub-directory per log type from it.
dataset_csv_path = '/kaggle/input/unraveled-advanced-persistent-threats-dataset/unraveled APT/data/host-logs'
# Directory that receives the combined per-log-type CSV files.
processed_csv_path = '/kaggle/working/processed'

# exist_ok=True creates the directory (and any missing parents) only when needed,
# without a separate existence check that could race with another process.
os.makedirs(processed_csv_path, exist_ok=True)
    
def extract_log_type(filename):
    """Return the ``windows-...`` log-type prefix contained in a CSV file name.

    The prefix starts at ``windows-`` and ends just before the underscore that
    introduces the remainder of the name, e.g. ``windows-eventviewer`` for a
    name such as ``windows-eventviewer_labeled.csv``.

    Args:
        filename: File name (or path) to search.

    Returns:
        The matched prefix, or ``None`` when the name does not contain the
        ``windows-<type>_<suffix>.csv`` layout.
    """
    # The dot is escaped so only a literal ".csv" extension is accepted;
    # an unescaped "." would match any character in front of "csv".
    pattern = r'(windows-\w+-?\w*)_\w+\.csv'
    match = re.search(pattern, filename)
    return match.group(1) if match else None

In [2]:
# Log types that live in their own sub-directory of dataset_csv_path:
# every entry found in a sub-directory is recorded under that type.
host_log_dict = {}
for key in ('audit', 'auth', 'filebeat', 'syslog'):
    dir_path = os.path.join(dataset_csv_path, key)
    host_log_dict[key] = [os.path.join(dir_path, entry)
                          for entry in os.listdir(dir_path)]
        
# Windows log types that are left out of the collection.
skip = ['windows-administrativeevents-ansible', 
        'windows-administrativeevents-user', 
        'windows-applicationevents-ansible', 
        'windows-applicationevents-user', 
        'windows-systemevents-ansible', 
        'windows-systemevents-user', 
        'windows-administrativeevents', 
        'windows-applicationevents', 
        'windows-systemevents']

# All Windows logs share one directory; the log type is recovered from each file name.
dir_path = os.path.join(dataset_csv_path, 'windows')
for filename in os.listdir(dir_path):
    complete_path = os.path.join(dir_path, filename)

    log_type = extract_log_type(filename)
    if log_type is None:
        # The name does not follow the windows-<type>_<suffix>.csv layout.
        # Without this guard the file would be grouped under the key None
        # and later be written out as 'cleaned_None.csv'.
        print('Skipping unrecognised file', filename)
        continue
    if log_type in skip:
        continue
    print(log_type)
    # Several files can share a log type, so append to the existing list if there is one.
    host_log_dict.setdefault(log_type, []).append(complete_path)

windows-eventviewer
windows-applicationlogs
windows-securityevents-ansible
windows-securityevents
windows-securityevents-user
windows-securityevents-user
windows-administrativelogs
windows-securityevents-ansible
windows-securityevents


In [3]:
# Read the four label columns as strings in every file.
dtype = dict.fromkeys(['Activity', 'Stage', 'DefenderResponse', 'Signature'], str)

# Maps each log type to the single DataFrame combined from all of its files.
collections = {}

for key, value in host_log_dict.items():
    print('Prefix', key)
    pprint(value)

    # One frame per source file; lines pandas cannot parse are dropped
    # (on_bad_lines='skip') instead of aborting the load.
    frames = []
    for source_path in value:
        frames.append(pd.read_csv(source_path, dtype=dtype, on_bad_lines='skip'))
    df = pd.concat(frames, ignore_index=True)

    # Persist the combined frame next to the other processed outputs.
    output_path = os.path.join(processed_csv_path, f'cleaned_{key}.csv')
    df.to_csv(output_path, encoding='utf-8', index=False)

    collections[key] = df

Prefix audit
['/kaggle/input/unraveled-advanced-persistent-threats-dataset/unraveled '
 'APT/data/host-logs/audit/10_1_1_11-audit_labeled',
 '/kaggle/input/unraveled-advanced-persistent-threats-dataset/unraveled '
 'APT/data/host-logs/audit/10_1_3_11-audit_labeled',
 '/kaggle/input/unraveled-advanced-persistent-threats-dataset/unraveled '
 'APT/data/host-logs/audit/10_1_2_10-audit_labeled',
 '/kaggle/input/unraveled-advanced-persistent-threats-dataset/unraveled '
 'APT/data/host-logs/audit/10_1_2_17-audit_labeled',
 '/kaggle/input/unraveled-advanced-persistent-threats-dataset/unraveled '
 'APT/data/host-logs/audit/10_1_3_12-audit_labeled',
 '/kaggle/input/unraveled-advanced-persistent-threats-dataset/unraveled '
 'APT/data/host-logs/audit/10_1_1_4-audit_labeled',
 '/kaggle/input/unraveled-advanced-persistent-threats-dataset/unraveled '
 'APT/data/host-logs/audit/10_1_1_12-audit_labeled']
Prefix auth
['/kaggle/input/unraveled-advanced-persistent-threats-dataset/unraveled '
 'APT/data/ho

# Label Distribution in Each Log

In [4]:
# For every label column, show how often each value occurs in each log collection.
labels = ['Activity', 'Stage', 'DefenderResponse', 'Signature']
for label in labels:
    print(label)
    for key, frame in collections.items():
        print(key)
        counts = frame[label].value_counts()
        pprint(counts)
        print()
    print()

Activity
audit
Activity
Normal    197923
Name: count, dtype: int64

auth
Activity
Normal                       72826
Benign                       16302
Network Service Discovery       38
Maintain Access                 36
Name: count, dtype: int64

filebeat
Activity
Normal    975
Name: count, dtype: int64

syslog
Activity
Normal    137273
Name: count, dtype: int64

windows-eventviewer
Activity
Normal    35889
Name: count, dtype: int64

windows-applicationlogs
Activity
Normal      22758
Phishing        6
Name: count, dtype: int64

windows-securityevents-ansible
Activity
Normal                                                                                                                                                                                                                                                                                                                                                                                                                                  

In [5]:
# https://unix.stackexchange.com/questions/293975/undocumented-format-of-linux-audit-log-records

# Turn every audit LogEvent into a dict built from its space-separated key=value tokens.
audit_field_set = set()  # every key seen across all events
audit_arr = []           # one dict per event, in the order of the LogEvent column
for log_event in collections['audit']['LogEvent']:
    log_event_dict = dict()
    for log_event_pair in log_event.split(' '):
        # Tokens without '=' carry no field and are ignored.
        if '=' in log_event_pair:
            # Split on the first '=' only, so a value that itself contains '='
            # is kept whole instead of being cut at its first '='.
            key, val = log_event_pair.split('=', 1)
            audit_field_set.add(key)
            # A key repeated within one event keeps its last value.
            log_event_dict[key] = val
    audit_arr.append(log_event_dict)

In [6]:
# One dict of named groups (date, hostname, process_name_id, event_log) per matching auth event.
auth_arr = []

# Raw string: \w, \d and \[ are regex escapes, not string escapes. In a normal
# string literal they are invalid escape sequences and trigger a warning on
# recent Python versions. The regex itself is unchanged.
pattern = r'(?P<date>\w+ \d+ \d+:\d+:\d+) (?P<hostname>\w+) (?P<process_name_id>\w+\[\d+\]): (?P<event_log>.+)'

for log_event in collections['auth']['LogEvent']:
    match = re.search(pattern, log_event)

    # NOTE(review): events that do not match the pattern are dropped without notice,
    # so auth_arr can be shorter than the LogEvent column — confirm this is intended.
    if match:
        auth_arr.append(match.groupdict())