In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
from src.shared.json_tools import load_json_long
from paths import DATA_DIR

# Load the raw records, keep each record's 'content' payload, and order the
# payloads by their timestamp interpreted as a float.
data = load_json_long(DATA_DIR / 'test_out/03-04-January.json')

logs = [record.get('content') for record in data]
logs.sort(key=lambda entry: float(entry['timestamp']))

In [4]:
# Ordering sanity check. The timestamps are stored as strings (see the two
# outputs that follow), so this is a string comparison rather than a numeric
# one; it evaluates to False here because the first two logs share the same
# timestamp value.
logs[0]['timestamp'] < logs[1]['timestamp']

False

In [5]:
logs[0]['timestamp']

'1704303002.205'

In [6]:
logs[1]['timestamp']

'1704303002.205'

In [17]:
def find_idx(logs, short_window=30, long_window=330):
    """Compute look-back window boundaries for every log entry.

    For each entry the preceding entries are scanned backwards (nearest
    first). The scan stops at the first earlier entry whose timestamp is more
    than ``long_window`` seconds before the current one. While scanning:

    * ``inter_idx`` is moved to every scanned index whose timestamp is at
      most ``short_window`` seconds before the current entry;
    * ``start_idx`` is moved to every scanned index whose timestamp is at
      most ``long_window`` seconds before the current entry.

    Both boundaries default to the entry's own index when no earlier entry
    qualifies.

    Args:
        logs: Sequence of mappings, each with a ``'timestamp'`` value that
            ``float()`` can parse. The early exit of the backward scan only
            makes sense if the entries are in ascending timestamp order
            (the notebook sorts them before calling).
        short_window: Size in seconds of the inner look-back window.
        long_window: Size in seconds of the outer look-back window; should be
            at least ``short_window``.

    Returns:
        A list with one ``(start_idx, inter_idx, idx)`` tuple per entry, or
        ``None`` when ``logs`` is empty.
    """
    # Parse every timestamp once instead of re-parsing inside the inner loop.
    times = [float(log['timestamp']) for log in logs]

    log_indexes = []
    for idx, log_time in enumerate(times):
        inter_idx = idx
        start_idx = idx

        # i strictly decreases from idx - 1, so it is always below the current
        # boundaries; no extra index comparison is needed.
        for i in range(idx - 1, -1, -1):
            delta = log_time - times[i]

            if delta <= short_window:
                inter_idx = i
            if delta <= long_window:
                start_idx = i
            else:
                # Older than the outer window: stop scanning further back.
                break

        log_indexes.append((start_idx, inter_idx, idx))

    # Callers of the original function receive None (not []) for empty input.
    return log_indexes if log_indexes else None

# One (start_idx, inter_idx, event_idx) tuple per log entry; note find_idx
# returns None instead of a list when `logs` is empty.
log_window_idx = find_idx(logs)

In [18]:
log_window_idx[0]

(0, 0, 0)

In [9]:
from sklearn.decomposition import PCA
import joblib

# Fit a 10-component PCA on the command embedding of every log, then persist
# the fitted model next to the notebook.
pca = PCA(n_components=10).fit([entry['embedded_command'] for entry in logs])

joblib.dump(pca, 'pca_model.pkl')

['pca_model.pkl']

In [34]:
import numpy as np
def _window_feature_dict(window):
    """Summarise one window of logs as a feature dict.

    Uses the notebook-level ``pca`` model to project the mean command
    embedding of the window. An empty window yields zero placeholders under
    the same keys.
    """
    if len(window) == 0:
        return {
            'log_count': 0,
            'cwd_avg_risk_score': 0,
            'avg_arg_count': 0,
            'avg_flag_count': 0,
            'bash_count_rate': 0,
            'success_rate': 0,
            'unique_pids': 0,
            'avg_embedded_command': [0] * 10
        }

    log_count = len(window)
    return {
        'log_count': log_count,
        'cwd_avg_risk_score': np.mean([entry['cwd_risk'] for entry in window]),
        'avg_arg_count': np.mean([entry['args_count'] for entry in window]),
        # NOTE(review): despite the key name this is a sum, not a mean. Kept
        # unchanged so feature values stay the same - confirm the intent.
        'avg_flag_count': np.sum([entry['flag_count'] for entry in window]),
        'bash_count_rate': np.sum([entry['is_bash_command'] for entry in window]) / log_count,
        'success_rate': np.sum([entry['success'] for entry in window]) / log_count,
        'unique_pids': len({entry['pid'] for entry in window}),
        'avg_embedded_command': pca.transform(
            np.mean([entry['embedded_command'] for entry in window], axis=0).reshape(1, -1)
        )
    }


def get_features(logs, log_indexes):
    """Build event-level and windowed features for each indexed log entry.

    Args:
        logs: Sequence of log dicts. Each entry is read for the keys
            ``cwd_risk``, ``args_count``, ``flag_count``, ``is_bash_command``,
            ``success``, ``pid`` and ``embedded_command``.
        log_indexes: Iterable of ``(start_idx, inter_idx, event_idx)`` tuples
            indexing into ``logs``.

    Returns:
        A list with one dict per tuple, holding the keys ``'event'``,
        ``'five_min'`` and ``'thirty_sec'``.

    Window selection per tuple:
        * all three indices equal: both windows contain only the event;
        * ``start_idx == inter_idx``: both windows are
          ``logs[start_idx:event_idx + 1]``;
        * otherwise the five-minute window is ``logs[start_idx:inter_idx]``
          and the thirty-second window is ``logs[inter_idx:event_idx + 1]``.

    The notebook-level ``pca`` model must already be fitted.
    """
    # Local import: only needed to parse string pids without executing them.
    import ast

    all_metrics = []

    for start_idx, inter_idx, event_idx in log_indexes:
        if start_idx == inter_idx == event_idx:
            five_min_window = [logs[event_idx]]
            thirty_sec_window = [logs[event_idx]]
        elif start_idx == inter_idx:
            five_min_window = logs[start_idx:event_idx + 1]
            thirty_sec_window = five_min_window
        else:
            five_min_window = logs[start_idx:inter_idx]
            thirty_sec_window = logs[inter_idx:event_idx + 1]

        # Each window is summarised from its own entries. Previously the
        # thirty-second count/rates were derived from the five-minute window
        # length, and its empty placeholder used a different key name.
        five_min_metrics = _window_feature_dict(five_min_window)
        thirty_sec_metrics = _window_feature_dict(thirty_sec_window)

        event = logs[event_idx]
        raw_pid = event['pid']
        event_metrics = {
            "success": event['success'],
            # SECURITY: pids come from log data, so they are parsed as a plain
            # literal instead of being passed to eval().
            "pid": raw_pid if isinstance(raw_pid, int) else ast.literal_eval(raw_pid),
            "embedded_command": pca.transform(np.array(event['embedded_command']).reshape(1, -1)),
        }

        all_metrics.append({"event": event_metrics,
                            "five_min": five_min_metrics,
                            "thirty_sec": thirty_sec_metrics})

    return all_metrics

# Trial run restricted to the first 100 logs and their window indices.
event_metrics = get_features(logs[:100], log_window_idx[:100])

In [56]:
import numpy as np
def compute_window_metrics(window, pca, default_dim=10):
    """Aggregate a window of log dicts into a dict of summary features.

    Args:
        window: Sequence of log dicts providing ``embedded_command``,
            ``cwd_risk``, ``args_count``, ``flag_count``,
            ``is_bash_command``, ``success`` and ``pid``.
        pca: Object whose ``transform`` is applied to the window's mean
            embedding (reshaped to a single row).
        default_dim: Length of the zero list used as the embedding
            placeholder when the window is empty.

    Returns:
        Dict with the keys ``log_count``, ``cwd_avg_risk_score``,
        ``avg_arg_count``, ``avg_flag_count`` (a sum), ``bash_count_rate``,
        ``success_rate``, ``unique_pids`` and ``avg_embedded_command``.
    """
    if not window:
        # Nothing to aggregate: every scalar is 0 and the embedding is zeros.
        placeholder = dict.fromkeys(
            (
                'log_count',
                'cwd_avg_risk_score',
                'avg_arg_count',
                'avg_flag_count',
                'bash_count_rate',
                'success_rate',
                'unique_pids',
            ),
            0,
        )
        placeholder['avg_embedded_command'] = [0] * default_dim
        return placeholder

    def column(key):
        # Values of one field across the whole window, in window order.
        return [entry[key] for entry in window]

    n_logs = len(window)
    mean_embedding = np.mean(column('embedded_command'), axis=0).reshape(1, -1)
    projected = pca.transform(mean_embedding)

    metrics = {'log_count': n_logs}
    metrics['cwd_avg_risk_score'] = np.mean(column('cwd_risk'))
    metrics['avg_arg_count'] = np.mean(column('args_count'))
    metrics['avg_flag_count'] = np.sum(column('flag_count'))
    metrics['bash_count_rate'] = np.sum(column('is_bash_command')) / n_logs
    metrics['success_rate'] = np.sum(column('success')) / n_logs
    metrics['unique_pids'] = len({entry['pid'] for entry in window})
    metrics['avg_embedded_command'] = projected
    return metrics

def get_features(logs, log_indexes):
    """Build event-level and windowed feature dicts for each indexed log.

    Args:
        logs: Sequence of log dicts; the event entry is read for
            ``success``, ``pid`` and ``embedded_command``.
        log_indexes: Iterable of ``(start_idx, inter_idx, event_idx)`` tuples
            indexing into ``logs``.

    Returns:
        A list with one dict per tuple, holding the keys ``'event'``,
        ``'five_min'`` and ``'thirty_sec'``; the two window entries are the
        results of ``compute_window_metrics``.

    Window selection per tuple:
        * all three indices equal: both windows contain only the event;
        * ``start_idx == inter_idx``: both windows are
          ``logs[start_idx:event_idx + 1]``;
        * otherwise the five-minute window is ``logs[start_idx:inter_idx]``
          and the thirty-second window is ``logs[inter_idx:event_idx + 1]``.

    Uses the notebook-level ``pca`` model, which must already be fitted.
    """
    # Local import: only needed to parse string pids without executing them.
    import ast

    all_metrics = []

    for start_idx, inter_idx, event_idx in log_indexes:
        # Define windows
        if start_idx == inter_idx == event_idx:
            five_min_window = [logs[event_idx]]
            thirty_sec_window = [logs[event_idx]]
        elif start_idx == inter_idx:
            five_min_window = logs[start_idx:event_idx + 1]
            thirty_sec_window = five_min_window
        else:
            five_min_window = logs[start_idx:inter_idx]
            thirty_sec_window = logs[inter_idx:event_idx + 1]

        # Compute metrics
        five_min_metrics = compute_window_metrics(five_min_window, pca)
        thirty_sec_metrics = compute_window_metrics(thirty_sec_window, pca)

        # Event-level metrics
        event = logs[event_idx]
        raw_pid = event['pid']
        event_metrics = {
            "success": event['success'],
            # SECURITY: pids come from log data, so they are parsed as a plain
            # literal instead of being passed to eval().
            "pid": raw_pid if isinstance(raw_pid, int) else ast.literal_eval(raw_pid),
            "embedded_command": pca.transform(np.array(event['embedded_command']).reshape(1, -1)),
        }

        all_metrics.append({
            "event": event_metrics,
            "five_min": five_min_metrics,
            "thirty_sec": thirty_sec_metrics
        })

    return all_metrics

# Trial run restricted to the first 100 logs and their window indices.
event_metrics = get_features(logs[:100], log_window_idx[:100])

In [21]:
event_metrics

[{'event': {'success': 1,
   'pid': 4067,
   'embedded_command': array([[ 3.74321558e+00, -6.37168495e-02,  9.91418078e-02,
           -9.54369792e-01,  1.30090456e-01,  6.78650916e-04,
            1.89697108e-01, -8.41446001e-02, -6.19677296e-03,
            8.08371228e-02]])},
  'five_min': {'log_count': 1,
   'cwd_avg_risk_score': np.float64(3.0),
   'avg_arg_count': np.float64(0.0),
   'avg_flag_count': np.int64(0),
   'bash_count_rate': np.float64(1.0),
   'success_rate': np.float64(1.0),
   'unique_pids': 1,
   'avg_embedded_command': array([[ 3.74321558e+00, -6.37168495e-02,  9.91418078e-02,
           -9.54369792e-01,  1.30090456e-01,  6.78650916e-04,
            1.89697108e-01, -8.41446001e-02, -6.19677296e-03,
            8.08371228e-02]])},
  'thirty_sec': {'log_count': 1,
   'cwd_avg_risk_score': np.float64(3.0),
   'avg_arg_count': np.float64(0.0),
   'avg_flag_count': np.int64(0),
   'bash_count_rate': np.float64(1.0),
   'success_rate': np.float64(1.0),
   'unique_pids':

In [61]:
def compute_window_metrics(window, pca, default_dim=10):
    """Aggregate a window of log dicts into a positional tuple of features.

    Args:
        window: Sequence of log dicts providing ``embedded_command``,
            ``cwd_risk``, ``args_count``, ``flag_count``,
            ``is_bash_command``, ``success`` and ``pid``.
        pca: Object whose ``transform`` is applied to the window's mean
            embedding (reshaped to a single row).
        default_dim: Length of the zero list used as the embedding
            placeholder when the window is empty.

    Returns:
        An 8-tuple ordered as: log count, mean cwd risk, mean argument count,
        summed flag count, bash-command rate, success rate, number of
        distinct pids, transformed mean embedding.
    """
    if not window:
        # Seven scalar zeros followed by the zero-list embedding placeholder.
        return (0,) * 7 + ([0] * default_dim,)

    def column(key):
        # Values of one field across the whole window, in window order.
        return [entry[key] for entry in window]

    n_logs = len(window)
    mean_embedding = np.mean(column('embedded_command'), axis=0).reshape(1, -1)
    projected = pca.transform(mean_embedding)

    cwd_risk_mean = np.mean(column('cwd_risk'))
    arg_count_mean = np.mean(column('args_count'))
    flag_count_total = np.sum(column('flag_count'))
    bash_rate = np.sum(column('is_bash_command')) / n_logs
    success_rate = np.sum(column('success')) / n_logs
    distinct_pids = len({entry['pid'] for entry in window})

    return (
        n_logs,
        cwd_risk_mean,
        arg_count_mean,
        flag_count_total,
        bash_rate,
        success_rate,
        distinct_pids,
        projected,
    )

def get_features(logs, log_indexes):
    """Build event-level and windowed feature tuples for each indexed log.

    Args:
        logs: Sequence of log dicts; the event entry is read for
            ``success``, ``pid`` and ``embedded_command``.
        log_indexes: Iterable of ``(start_idx, inter_idx, event_idx)`` tuples
            indexing into ``logs``.

    Returns:
        A list with one ``(event_metrics, five_min_metrics,
        thirty_sec_metrics)`` tuple per index tuple. ``event_metrics`` is
        ``(success, pid, transformed_embedding)``; the two window entries are
        the results of ``compute_window_metrics``.

    Window selection per tuple:
        * all three indices equal: both windows contain only the event;
        * ``start_idx == inter_idx``: both windows are
          ``logs[start_idx:event_idx + 1]``;
        * otherwise the five-minute window is ``logs[start_idx:inter_idx]``
          and the thirty-second window is ``logs[inter_idx:event_idx + 1]``.

    Uses the notebook-level ``pca`` model, which must already be fitted.
    """
    # Local import: only needed to parse string pids without executing them.
    import ast

    all_metrics = []

    for start_idx, inter_idx, event_idx in log_indexes:
        # Define windows
        if start_idx == inter_idx == event_idx:
            five_min_window = [logs[event_idx]]
            thirty_sec_window = [logs[event_idx]]
        elif start_idx == inter_idx:
            five_min_window = logs[start_idx:event_idx + 1]
            thirty_sec_window = five_min_window
        else:
            five_min_window = logs[start_idx:inter_idx]
            thirty_sec_window = logs[inter_idx:event_idx + 1]

        # Compute metrics
        five_min_metrics = compute_window_metrics(five_min_window, pca)
        thirty_sec_metrics = compute_window_metrics(thirty_sec_window, pca)

        # Event-level metrics
        event = logs[event_idx]
        raw_pid = event['pid']
        # SECURITY: pids come from log data, so they are parsed as a plain
        # literal instead of being passed to eval().
        pid = raw_pid if isinstance(raw_pid, int) else ast.literal_eval(raw_pid)
        embedded_command = pca.transform(np.array(event['embedded_command']).reshape(1, -1))

        event_metrics = (
            event['success'],
            pid,
            embedded_command
        )

        all_metrics.append((
            event_metrics,
            five_min_metrics,
            thirty_sec_metrics
        ))

    return all_metrics

# Trial run restricted to the first 100 logs and their window indices.
event_metrics = get_features(logs[:100], log_window_idx[:100])

In [44]:
logs[0]['pid']

'4067'

In [45]:
logs[0].get('pid')

'4067'

In [62]:
event_metrics[0]

((1,
  4067,
  array([[ 3.74321558e+00, -6.37168495e-02,  9.91418078e-02,
          -9.54369792e-01,  1.30090456e-01,  6.78650916e-04,
           1.89697108e-01, -8.41446001e-02, -6.19677296e-03,
           8.08371228e-02]])),
 (1,
  np.float64(3.0),
  np.float64(0.0),
  np.int64(0),
  np.float64(1.0),
  np.float64(1.0),
  1,
  array([[ 3.74321558e+00, -6.37168495e-02,  9.91418078e-02,
          -9.54369792e-01,  1.30090456e-01,  6.78650916e-04,
           1.89697108e-01, -8.41446001e-02, -6.19677296e-03,
           8.08371228e-02]])),
 (1,
  np.float64(3.0),
  np.float64(0.0),
  np.int64(0),
  np.float64(1.0),
  np.float64(1.0),
  1,
  array([[ 3.74321558e+00, -6.37168495e-02,  9.91418078e-02,
          -9.54369792e-01,  1.30090456e-01,  6.78650916e-04,
           1.89697108e-01, -8.41446001e-02, -6.19677296e-03,
           8.08371228e-02]])))