In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
from src.data.aggregator_accelerated import get_labels
from src.shared.json_tools import load_json_long
from paths import DATA_DIR

import numpy as np

# Per-file census of the windowed dataset: how many records count as
# positive (target >= 0.5) versus how many records each file holds.
windowed_dir = DATA_DIR / "windowed"
target_count, dt_count = 0, 0

for json_path in windowed_dir.glob("*.json"):
    records = load_json_long(json_path)
    n_records = len(records)
    n_positive = sum(1 for record in records if record["target"] >= 0.5)

    print(f"{json_path.name}: {n_positive} targets, {n_records} data points")
    target_count, dt_count = target_count + n_positive, dt_count + n_records

print(f"Total: {target_count} targets, {dt_count} data points")

03-04-January_0.json: 8 targets, 60 data points
03-04-January_1000.json: 110 targets, 448 data points
03-04-January_128.json: 4 targets, 8 data points
03-04-January_4294967295.json: 1770 targets, 16440 data points
04-06-January_0.json: 5 targets, 64 data points
04-06-January_1000.json: 144 targets, 488 data points
04-06-January_128.json: 3 targets, 6 data points
04-06-January_4294967295.json: 1398 targets, 8469 data points
06-07-January_0.json: 3 targets, 45 data points
06-07-January_1000.json: 110 targets, 265 data points
06-07-January_128.json: 1 targets, 3 data points
06-07-January_4294967295.json: 741 targets, 6196 data points
21-25-December_0.json: 3 targets, 38 data points
21-25-December_1000.json: 34 targets, 118 data points
21-25-December_4294967295.json: 1004 targets, 7123 data points
25-26-December_0.json: 0 targets, 1 data points
25-26-December_4294967295.json: 2165 targets, 9365 data points
26-27-December_0.json: 1 targets, 46 data points
26-27-December_1000.json: 19 target

In [8]:
np.random.seed(42)  # for reproducibility

# Soft cap on the size of the balanced sample. It is only checked before a
# file is loaded, so the final total can exceed it by whatever the last
# accepted file contributes.
MAX_SAMPLES = 1000

X = []
y = []

# sorted(): glob yields paths in arbitrary, filesystem-dependent order. A fixed
# iteration order is required for the seed above to reproduce the same sample.
for file in sorted((DATA_DIR / "windowed").glob("*.json")):
    if len(X) >= MAX_SAMPLES:
        break
    data = load_json_long(file)
    # Binarise the target at 0.5; the feature vector is the record's "content".
    targets = np.array([int(i["target"] >= 0.5) for i in data])
    features = np.array([i["content"] for i in data])

    pos_indices = np.where(targets == 1)[0]
    neg_indices = np.where(targets == 0)[0]

    # Undersample the larger class so this file contributes equally many
    # positives and negatives (possibly zero of each).
    min_class_count = min(len(pos_indices), len(neg_indices))
    pos_sample = np.random.choice(pos_indices, min_class_count, replace=False)
    neg_sample = np.random.choice(neg_indices, min_class_count, replace=False)

    balanced_indices = np.concatenate([pos_sample, neg_sample])
    np.random.shuffle(balanced_indices)

    features_balanced = features[balanced_indices]
    y_balanced = targets[balanced_indices]

    # extend() appends in place instead of rebuilding the whole list per file.
    X.extend(features_balanced.tolist())
    y.extend(y_balanced.tolist())

In [9]:
# Sanity check: the feature list and the label list must have the same length.
len(X), len(y)

(3784, 3784)

In [10]:
# Peek at the first two feature vectors.
X[:2]

[[-0.4900478422641754,
  0.051543112844228745,
  -0.8481251001358032,
  0.08602021634578705,
  -0.9295556545257568,
  0.12274473905563354,
  -0.2493138164281845,
  1.0020637512207031,
  -0.5552481412887573,
  0.04898757487535477,
  1.0,
  1.6279513835906982,
  -0.0059578754007816315,
  -0.4738290011882782,
  0.5203844308853149,
  -0.39929723739624023,
  0.06124579906463623,
  -0.029299870133399963,
  0.5431172251701355,
  -0.281063973903656,
  0.06522102653980255,
  4.0,
  3.0,
  0.0,
  0.0,
  0.5,
  1.0,
  2.0,
  1.6279513835906982,
  -0.0059578754007816315,
  -0.4738290011882782,
  0.5203844308853149,
  -0.39929723739624023,
  0.06124579906463623,
  -0.029299870133399963,
  0.5431172251701355,
  -0.281063973903656,
  0.06522102653980255,
  4.0,
  3.0,
  0.0,
  0.0,
  0.5,
  1.0,
  2.0],
 [3.741574287414551,
  -0.06362123787403107,
  -0.09852482378482819,
  0.9539062976837158,
  0.1293705403804779,
  -0.0008886530995368958,
  0.18877190351486206,
  0.0841207280755043,
  -0.00588905811

In [11]:
# Peek at the first two labels.
y[:2]

[0, 0]

In [12]:
# Labels are 0/1, so their sum is the number of positive samples.
sum(y)

1892

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the balanced samples for testing, with a fixed seed.
# NOTE(review): rows are split at random; if neighbouring rows of the
# "windowed" data overlap in time, near-duplicates may land on both sides of
# the split and inflate the test scores — confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

def get_metrics(y_true, y_pred):
    """Print a classification summary for a set of predictions.

    Writes accuracy, precision, recall and F1 to stdout, followed by the
    confusion matrix and the full classification report. Returns nothing.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
    """
    # (heading, scorer) pairs in print order; every scorer is called as
    # scorer(y_true, y_pred) and its result is printed after the heading.
    report_sections = (
        ("Accuracy:", accuracy_score),
        ("Precision:", precision_score),
        ("Recall:", recall_score),
        ("F1 Score:", f1_score),
        ("Confusion Matrix:\n", confusion_matrix),
        ("Classification Report:\n", classification_report),
    )
    for heading, scorer in report_sections:
        print(heading, scorer(y_true, y_pred))

In [21]:
from sklearn.ensemble import RandomForestClassifier

# Baseline model: a random forest fitted on every feature column.
clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Per-feature importances of the fitted forest, plus the positions of the
# 15 largest ones ordered from most to least important.
importances = clf.feature_importances_
top_indices = np.argsort(importances)[::-1][:15]

In [22]:
# Training-set metrics: predictions on the same rows the model was fitted on.
y_pred = clf.predict(X_train)
get_metrics(y_train, y_pred)

Accuracy: 0.9983481995374959
Precision: 0.9993293091884641
Recall: 0.9973226238286479
F1 Score: 0.998324958123953
Confusion Matrix:
 [[1532    1]
 [   4 1490]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1533
           1       1.00      1.00      1.00      1494

    accuracy                           1.00      3027
   macro avg       1.00      1.00      1.00      3027
weighted avg       1.00      1.00      1.00      3027



In [23]:
# Held-out metrics: predictions on the test split.
y_pred = clf.predict(X_test)
get_metrics(y_test, y_pred)

Accuracy: 0.8401585204755614
Precision: 0.816933638443936
Recall: 0.8969849246231156
F1 Score: 0.8550898203592814
Confusion Matrix:
 [[279  80]
 [ 41 357]]
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.78      0.82       359
           1       0.82      0.90      0.86       398

    accuracy                           0.84       757
   macro avg       0.84      0.84      0.84       757
weighted avg       0.84      0.84      0.84       757



In [24]:
# Translate the top-ranked importance positions into feature names.
# NOTE(review): assumes get_labels() returns the names in the same order as
# the feature columns — confirm.
labels = np.array(get_labels())
labels[top_indices]

array(['thirty_sec_bash_count_rate', 'thirty_sec_avg_embedded_command_4',
       'thirty_sec_avg_embedded_command_0',
       'thirty_sec_avg_embedded_command_7',
       'thirty_sec_avg_embedded_command_8',
       'thirty_sec_avg_embedded_command_9', 'thirty_sec_log_count',
       'thirty_sec_avg_embedded_command_5',
       'thirty_sec_avg_embedded_command_3',
       'thirty_sec_avg_embedded_command_2',
       'thirty_sec_avg_embedded_command_1', 'thirty_sec_success_rate',
       'thirty_sec_avg_embedded_command_6',
       'cur_event_avg_embedded_command_0',
       'cur_event_avg_embedded_command_6'], dtype='<U33')

In [31]:
# Turn the four split containers into NumPy arrays so that later cells can
# select feature columns with array indexing.
X_train, X_test, y_train, y_test = (
    np.array(split) for split in (X_train, X_test, y_train, y_test)
)

In [32]:
# Keep only the feature columns listed in `top_indices`, for both splits.
X_train_smaller, X_test_smaller = (
    split[:, top_indices] for split in (X_train, X_test)
)

# Refit a fresh forest on the reduced feature set, then report how well it
# reproduces the training labels.
clf = RandomForestClassifier(random_state=42).fit(X_train_smaller, y_train)
y_pred = clf.predict(X_train_smaller)
get_metrics(y_train, y_pred)

Accuracy: 0.998017839444995
Precision: 0.9993288590604027
Recall: 0.9966532797858099
F1 Score: 0.9979892761394102
Confusion Matrix:
 [[1532    1]
 [   5 1489]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1533
           1       1.00      1.00      1.00      1494

    accuracy                           1.00      3027
   macro avg       1.00      1.00      1.00      3027
weighted avg       1.00      1.00      1.00      3027



In [33]:
# Held-out metrics for the forest trained on the reduced feature set.
y_pred = clf.predict(X_test_smaller)
get_metrics(y_test, y_pred)

Accuracy: 0.809775429326288
Precision: 0.7939814814814815
Recall: 0.8618090452261307
F1 Score: 0.8265060240963855
Confusion Matrix:
 [[270  89]
 [ 55 343]]
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.75      0.79       359
           1       0.79      0.86      0.83       398

    accuracy                           0.81       757
   macro avg       0.81      0.81      0.81       757
weighted avg       0.81      0.81      0.81       757



In [34]:
# Rank every feature position from most to least important.
# NOTE: this rebinds `top_indices`, so any cell that slices columns by it will
# now use the full ranking if it is re-run. In the cells shown here,
# `importances` is only assigned from the forest fitted on all features.
top_indices = np.argsort(importances)[::-1]

In [35]:
# Print one "<feature name>: <importance>" line per feature, in ranked order.
for feature_name, importance in zip(labels[top_indices], importances[top_indices]):
    print(f"{feature_name}: {importance}")

thirty_sec_bash_count_rate: 0.06521222196271538
thirty_sec_avg_embedded_command_4: 0.06114340002265136
thirty_sec_avg_embedded_command_0: 0.05747927482001856
thirty_sec_avg_embedded_command_7: 0.05012050140190896
thirty_sec_avg_embedded_command_8: 0.04028451254535801
thirty_sec_avg_embedded_command_9: 0.039826145391094074
thirty_sec_log_count: 0.039706027657303605
thirty_sec_avg_embedded_command_5: 0.038619305396060316
thirty_sec_avg_embedded_command_3: 0.03844376045182208
thirty_sec_avg_embedded_command_2: 0.036866177128832334
thirty_sec_avg_embedded_command_1: 0.03540100503014095
thirty_sec_success_rate: 0.03519792574989458
thirty_sec_avg_embedded_command_6: 0.03478235737465719
cur_event_avg_embedded_command_0: 0.027714960265120107
cur_event_avg_embedded_command_6: 0.024603657868686894
thirty_sec_unique_pids: 0.02399970662509907
five_min_avg_embedded_command_4: 0.023225468085221777
five_min_bash_count_rate: 0.022718658870717636
five_min_avg_embedded_command_5: 0.02225504943965241
cur

In [36]:
# Keep only the positions whose importance is at least 0.015, preserving the
# existing order. The result is a plain Python list.
top_indices = [i for i in top_indices if importances[i] >= 0.015]

In [37]:
# Number of feature positions currently held in `top_indices`.
len(top_indices)

29

In [39]:
# Names of the features currently selected by `top_indices`.
labels[top_indices]

array(['thirty_sec_bash_count_rate', 'thirty_sec_avg_embedded_command_4',
       'thirty_sec_avg_embedded_command_0',
       'thirty_sec_avg_embedded_command_7',
       'thirty_sec_avg_embedded_command_8',
       'thirty_sec_avg_embedded_command_9', 'thirty_sec_log_count',
       'thirty_sec_avg_embedded_command_5',
       'thirty_sec_avg_embedded_command_3',
       'thirty_sec_avg_embedded_command_2',
       'thirty_sec_avg_embedded_command_1', 'thirty_sec_success_rate',
       'thirty_sec_avg_embedded_command_6',
       'cur_event_avg_embedded_command_0',
       'cur_event_avg_embedded_command_6', 'thirty_sec_unique_pids',
       'five_min_avg_embedded_command_4', 'five_min_bash_count_rate',
       'five_min_avg_embedded_command_5',
       'cur_event_avg_embedded_command_9',
       'cur_event_avg_embedded_command_7',
       'cur_event_avg_embedded_command_4',
       'cur_event_avg_embedded_command_2', 'five_min_success_rate',
       'cur_event_avg_embedded_command_5', 'five_min_log_c

In [40]:
# Keep only the feature columns listed in `top_indices`, for both splits.
X_train_smaller, X_test_smaller = (
    split[:, top_indices] for split in (X_train, X_test)
)

# Refit a fresh forest on the reduced feature set, then report how well it
# reproduces the training labels.
clf = RandomForestClassifier(random_state=42).fit(X_train_smaller, y_train)
y_pred = clf.predict(X_train_smaller)
get_metrics(y_train, y_pred)

Accuracy: 0.998017839444995
Precision: 0.9986595174262735
Recall: 0.9973226238286479
F1 Score: 0.9979906229068989
Confusion Matrix:
 [[1531    2]
 [   4 1490]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1533
           1       1.00      1.00      1.00      1494

    accuracy                           1.00      3027
   macro avg       1.00      1.00      1.00      3027
weighted avg       1.00      1.00      1.00      3027



In [41]:
# Held-out metrics for the forest trained on the reduced feature set.
y_pred = clf.predict(X_test_smaller)
get_metrics(y_test, y_pred)

Accuracy: 0.8401585204755614
Precision: 0.8183908045977012
Recall: 0.8944723618090452
F1 Score: 0.8547418967587035
Confusion Matrix:
 [[280  79]
 [ 42 356]]
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.78      0.82       359
           1       0.82      0.89      0.85       398

    accuracy                           0.84       757
   macro avg       0.84      0.84      0.84       757
weighted avg       0.84      0.84      0.84       757



In [44]:
# Show the selected column positions as a plain Python list of ints.
np.array(top_indices).tolist()

[25,
 15,
 11,
 18,
 19,
 20,
 21,
 16,
 14,
 13,
 12,
 26,
 17,
 0,
 6,
 27,
 32,
 42,
 33,
 9,
 7,
 4,
 2,
 43,
 5,
 38,
 3,
 1,
 8]

In [46]:
# Scratch check: .tolist() on one row of a 2-D array gives a plain Python list.
np.array([[1, 2], [3, 4]])[0].tolist()

[1, 2]