In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from src.shared.json_tools import load_json_long
from paths import DATA_DIR
import os

# Accumulator filled in place by recursive_load_jsons: one loaded JSON document per file.
known_attack_dataset = []
# Root folder that is searched (sub-folders included) for *.json files.
directory = DATA_DIR / 'commands'

def recursive_load_jsons(directory):
    """Load every ``*.json`` file below ``directory`` into ``known_attack_dataset``.

    Sub-directories are descended into as soon as they are encountered, so
    the order of the collected documents follows the ``os.scandir`` order.
    A file that cannot be loaded is reported on stdout and skipped; it never
    aborts the walk.

    Args:
        directory: Path (str or path-like) of the folder to scan.

    Returns:
        None. Results are appended to the module-level
        ``known_attack_dataset`` list.
    """
    for item in os.scandir(directory):
        if item.is_dir():
            # Finish the whole sub-tree before moving on to the next sibling.
            recursive_load_jsons(item.path)
            continue
        if not (item.is_file() and item.name.endswith('.json')):
            continue
        try:
            known_attack_dataset.append(load_json_long(item.path))
        except Exception as err:
            # Best effort: report the broken file and keep going.
            print(f"Failed to load {item.path}: {err}")
                
recursive_load_jsons(directory)

In [3]:
len(known_attack_dataset)

267

In [12]:
# Collect the "cmd_type" field of every entry across all loaded attack logs.
types = [i["cmd_type"] for logs in known_attack_dataset for i in logs]

In [14]:
set(types)

{'bash-command', 'msf-command'}

In [22]:
import random as rd
def convert_command_entry(entry):
    """Convert one recorded attack command into a syscall-style log entry.

    The result mirrors the layout of the benign records used elsewhere in
    this notebook (string ``uid``/``euid``/``pid``/``ppid``, the executable
    in ``command`` and the full argv in ``arguments``).

    Args:
        entry: Mapping with a ``"cmd"`` string and optional ``"username"``
            and ``"wd"`` keys.

    Returns:
        dict: The converted entry, with an empty ``"timestamp"`` to be
        filled in later.
        int: ``0`` when the entry cannot be converted (missing, empty or
        non-string ``"cmd"``, or an ``entry`` that is not a mapping).
        Callers filter these placeholders out.
    """
    try:
        # Whitespace-split argv; an empty command raises IndexError below.
        cmd_parts = entry['cmd'].split()
        command = cmd_parts[0]

        # uid and euid are both 0 for root and 1000 for every other user.
        user_id = "0" if entry.get("username") == "root" else "1000"

        # The benign records store process ids as strings, so the synthetic
        # id is stringified too; otherwise the value type alone would give
        # away which entries were generated here.
        pid = str(rd.randint(1000, 99999))

        return {
            "timestamp": '',
            "success": 1,
            "uid": user_id,
            "euid": user_id,
            "syscall": "0",
            "ppid": pid,
            "pid": pid,
            "command": command,
            "arguments": cmd_parts,
            "CWD": entry.get("wd", "/"),
        }
    except (KeyError, IndexError, AttributeError, TypeError):
        # Malformed entry: keep the documented 0 placeholder.
        return 0

# Convert every entry of every attack log, keeping the per-log grouping.
# Entries that cannot be converted come back as the placeholder 0.
dt = [[convert_command_entry(entry) for entry in logs] for logs in known_attack_dataset]

In [23]:
# Drop the 0 placeholders left behind by entries that failed conversion.
dt = [[entry for entry in logs if entry != 0 ] for logs in dt]

In [24]:
len([entry for logs in dt for entry in logs])

21089

In [31]:
from pathlib import Path
# Read every JSON log in the 'old' folder and keep, per file, the "content"
# of the records whose "target" score is below 0.5.
good_data = []
old_logs_dir = Path(DATA_DIR / 'old')
for log_file in old_logs_dir.glob('*.json'):
    benign_contents = [
        record["content"]
        for record in load_json_long(log_file)
        if record["target"] < 0.5
    ]
    good_data.append(benign_contents)

In [32]:
len(good_data)

13

In [33]:
len([entry for logs in good_data for entry in logs])

71852

In [34]:
len(dt)

267

In [35]:
# Flatten the per-file lists into one flat list of benign events.
# NOTE(review): good_data is rebound here, so this cell is not idempotent;
# re-running it alone would flatten a second time. Confirm it only runs once
# right after the load cell.
good_data = [entry for logs in good_data for entry in logs]

In [36]:
# Build one mixed sequence per attack log by combining it with a slice of
# benign events taken from the front of good_data.
transformed_mixed_data = []
for bad_subset in dt:
    # Layout of the mixed sequence:
    #   1 = benign events first, 2 = attack events first, 3 = interleaved.
    random_num = rd.randint(1, 3)
    # Take between 0.75x and 1.5x as many benign events as attack events.
    random_length = rd.randint(int(len(bad_subset) * 0.75), int(len(bad_subset) * 1.5))
    good_subset = good_data[:random_length]
    # Consume the slice so no benign event is reused by a later sequence.
    # If good_data runs out, the slices silently become shorter (or empty).
    good_data = good_data[random_length:]

    if random_num == 1:
        mixed = good_subset + bad_subset
    elif random_num == 2:
        mixed = bad_subset + good_subset
    else:
        # rd.shuffle works in place and returns None, so the combined list
        # must be bound to a name first; appending the call's result would
        # store None instead of the interleaved sequence.
        mixed = good_subset + bad_subset
        rd.shuffle(mixed)
    transformed_mixed_data.append(mixed)

In [37]:
len(good_data)

48403

In [38]:
def generate_random_subsets(data, num_subsets=270, min_len=91, max_len=252):
    """Draw ``num_subsets`` independent random samples from ``data``.

    Each subset is sampled without replacement *within* the subset, but the
    subsets are drawn independently from the same pool, so the same element
    can appear in several subsets.

    Args:
        data: Sequence to sample from.
        num_subsets: How many subsets to produce.
        min_len: Smallest subset size.
        max_len: Largest subset size.

    Returns:
        list[list]: ``num_subsets`` lists. Both size bounds are clamped to
        ``len(data)``, so a pool smaller than ``min_len`` yields subsets
        containing the whole pool instead of raising ``ValueError``.
    """
    pool_size = len(data)
    # Clamp BOTH bounds: clamping only the upper one leaves an empty
    # randint range whenever the pool is smaller than min_len.
    shortest = min(min_len, pool_size)
    longest = min(max_len, pool_size)
    return [
        rd.sample(data, rd.randint(shortest, longest))
        for _ in range(num_subsets)
    ]

# Rebind good_data from the remaining pool of benign events to 270 random
# subsets drawn from it (requested sizes: 91 to 252 events each).
# NOTE(review): not idempotent; re-running this cell samples from the subsets.
good_data = generate_random_subsets(good_data, num_subsets=270, min_len=91, max_len=252)

In [39]:
len(good_data)

270

In [40]:
good_data[0][0]

{'timestamp': '1704027385.152',
 'success': 1,
 'uid': '1000',
 'euid': '0',
 'syscall': '0',
 'ppid': '7953',
 'pid': '7953',
 'command': 'mkdir',
 'arguments': ['mkdir',
  '-p',
  '/var/tmp/mkinitramfs_ex6deS//usr/lib/modules/6.2.0-39-generic/kernel/drivers/net/ethernet/stmicro/stmmac'],
 'CWD': '/'}

In [41]:
import random
import time
from datetime import datetime, timedelta

def add_random_timestamps(events):
    """Stamp ``events`` with an increasing synthetic timeline.

    The timeline starts at a random moment in 2025 and advances by a random
    3-240 seconds between consecutive events.

    Each slot of ``events`` is replaced by a shallow copy of the original
    event carrying the new ``"timestamp"``. The event objects themselves are
    never modified, so an event that is shared between several lists (for
    example after repeated ``random.sample`` calls on one pool) keeps an
    independent timestamp in each list.

    Args:
        events: Mutable list of event mappings; updated in place.

    Returns:
        The same ``events`` list, now holding the stamped copies.
    """
    # Day is capped at 28 so the date is valid in every month.
    current_dt = datetime(
        year=2025,
        month=random.randint(1, 12),
        day=random.randint(1, 28),
        hour=random.randint(0, 23),
        minute=random.randint(0, 59),
        second=random.randint(0, 59)
    )

    for index, event in enumerate(events):
        stamped = dict(event)
        # Epoch seconds as a string with millisecond precision.
        stamped["timestamp"] = f"{current_dt.timestamp():.3f}"
        events[index] = stamped
        # Gap to the next event: 3 to 240 seconds.
        current_dt += timedelta(seconds=random.randint(3, 240))

    return events

add_random_timestamps(good_data[0])

[{'timestamp': '1765392099.000',
  'success': 1,
  'uid': '1000',
  'euid': '0',
  'syscall': '0',
  'ppid': '7953',
  'pid': '7953',
  'command': 'mkdir',
  'arguments': ['mkdir',
   '-p',
   '/var/tmp/mkinitramfs_ex6deS//usr/lib/modules/6.2.0-39-generic/kernel/drivers/net/ethernet/stmicro/stmmac'],
  'CWD': '/'},
 {'timestamp': '1765392156.000',
  'success': 1,
  'uid': '1000',
  'euid': '0',
  'syscall': '0',
  'ppid': '14207',
  'pid': '14207',
  'command': 'modinfo',
  'arguments': ['modinfo',
   '-k',
   '6.2.0-36-generic',
   '-F',
   'firmware',
   '/lib/modules/6.2.0-36-generic/kernel/drivers/net/ethernet/sis/sis190.ko'],
  'CWD': '/'},
 {'timestamp': '1765392246.000',
  'success': 1,
  'uid': '4294967295',
  'euid': '0',
  'syscall': '0',
  'ppid': '18548',
  'pid': '18548',
  'command': 'modinfo',
  'arguments': ['modinfo',
   '-k',
   '6.2.0-36-generic',
   '-F',
   'firmware',
   '/lib/modules/6.2.0-36-generic/kernel/drivers/ata/pata_hpt37x.ko'],
  'CWD': '/'},
 {'timestam

In [42]:
# Give every benign subset its own random timeline.
good_data = [add_random_timestamps(logs) for logs in good_data]

In [46]:
# Keep only the mixed sequences that are not None and contain at least one event.
transformed_mixed_data = [i for i in transformed_mixed_data if i is not None and len(i) > 0]

In [47]:
# Give every mixed sequence its own random timeline.
transformed_mixed_data = [add_random_timestamps(logs) for logs in transformed_mixed_data]

In [48]:
from src.shared.json_tools import write_json_long

# Write each dataset under its label folder, one numbered JSON file per
# sequence: the mixed sequences go to 'malicious', the benign-only subsets
# to 'benign'.
for label_dir, sequences in (('malicious', transformed_mixed_data), ('benign', good_data)):
    for i, logs in enumerate(sequences):
        write_json_long(logs, DATA_DIR / 'testing_data' / label_dir / f"{i}.json")