# 02 – Feature Engineering with Time Windows

In this notebook, we transform raw CloudTrail events into
time-window–level behavioral samples suitable for supervised learning.

Key concepts:
- One sample = (user, 30-minute time window)
- Features describe aggregated behavior within the window
- Labels are assigned at the window level

This notebook produces the final datasets used for:
- Part A: Binary attack detection
- Part B: Multi-class attack classification


In [1]:
import json
import pandas as pd
import numpy as np


In [2]:
# Load cleaned event-level data
df = pd.read_parquet("../data/processed/clean_events_2.parquet")

print("Event-level data shape:", df.shape)

# A bare `df.head()` used to sit here; its result was discarded because only the
# last expression of a cell is displayed, so it has been removed.

# Number of events with no error code (last expression, so this is what the cell displays)
df["errorCode"].isna().sum()


Event-level data shape: (107116, 10)


np.int64(92422)

In [3]:
# Load suspicious event list (binary signal): every non-blank line, stripped, is one entry.
# NOTE(review): the path ends in ".py" but the file is read as plain text, so any
# non-blank line in it becomes an "event name" — confirm it is a bare list of names.
# The encoding is explicit so the result does not depend on the platform locale.
with open("../src/utils/event_names.py", "r", encoding="utf-8") as f:
    suspicious_events = {line.strip() for line in f if line.strip()}


# Load event → attack category mapping (multiclass signal)
with open("../src/utils/event_category.json", "r", encoding="utf-8") as f:
    EVENT_CATEGORIES = json.load(f)


print("Number of suspicious events:", len(suspicious_events))
print("Number of categorized events:", len(EVENT_CATEGORIES))


Number of suspicious events: 3055
Number of categorized events: 2773


In [4]:
# Binary indicator at event level: 1 when the event name is in the suspicious list
is_suspicious = df["eventName"].isin(suspicious_events)
df["event_is_attack"] = is_suspicious.astype(int)


# Multiclass indicator at event level: the mapped category, or "Normal" when the
# event name has no entry in EVENT_CATEGORIES.
# Note: a name listed in suspicious_events but missing from EVENT_CATEGORIES is
# flagged as an attack above yet typed "Normal" here.
def _category_of(event_name):
    """Return the attack category for one event name, defaulting to "Normal"."""
    return EVENT_CATEGORIES.get(event_name, "Normal")


df["event_attack_type"] = df["eventName"].apply(_category_of)

df[["eventName", "event_is_attack", "event_attack_type"]].head()


Unnamed: 0,eventName,event_is_attack,event_attack_type
0,ListBuckets,1,ListResources
1,ListAccountAliases,1,ListResources
2,GetAccountSummary,1,GetInfo
3,ListMFADevices,1,ListResources
4,ListAccountAliases,1,ListResources


In [5]:
# Floor each event timestamp to the start of its 30-minute window.
# The window size is a single named constant so the comment and the code cannot drift apart.
WINDOW_FREQ = "30min"
df["time_window"] = df["eventTime"].dt.floor(WINDOW_FREQ)

df[["eventTime", "time_window"]].head(10)


Unnamed: 0,eventTime,time_window
0,2017-02-12 19:57:06+00:00,2017-02-12 19:30:00+00:00
1,2017-02-12 19:59:10+00:00,2017-02-12 19:30:00+00:00
2,2017-02-12 19:59:10+00:00,2017-02-12 19:30:00+00:00
3,2017-02-12 19:59:10+00:00,2017-02-12 19:30:00+00:00
4,2017-02-12 19:59:10+00:00,2017-02-12 19:30:00+00:00
5,2017-02-12 19:59:10+00:00,2017-02-12 19:30:00+00:00
6,2017-02-12 19:59:10+00:00,2017-02-12 19:30:00+00:00
7,2017-02-12 19:59:10+00:00,2017-02-12 19:30:00+00:00
8,2017-02-12 19:59:10+00:00,2017-02-12 19:30:00+00:00
9,2017-02-12 19:59:10+00:00,2017-02-12 19:30:00+00:00


In [6]:
# Grouping keys: one aggregated sample per (user name, time window) pair
group_cols = ["userIdentity.userName", "time_window"]

print(group_cols)

['userIdentity.userName', 'time_window']


In [7]:
# Aggregate event-level rows into one row per (user, time window).
# NOTE(review): groupby drops rows whose key is NaN by default — confirm that user
# names and timestamps are never null in the cleaned input.
window_df = df.groupby(group_cols).agg(
    # Volume-based behavior
    num_events=("eventName", "count"),
    num_unique_events=("eventName", "nunique"),
    num_services=("eventSource", "nunique"),
    num_regions=("awsRegion", "nunique"),
    num_source_ips=("sourceIPAddress", "nunique"),

    # Security-relevant behavior (boolean columns, so "sum" counts the True rows)
    num_read_only=("readOnly", "sum"),
    num_management_events=("managementEvent", "sum"),

    # Attack signal (used only to derive labels)
    num_attack_events=("event_is_attack", "sum"),

    # Error behavior: "count" tallies the rows with a non-null errorCode
    num_errors=("errorCode", "count"),
).reset_index()

# A bare `window_df.head()` used to sit here; its result was discarded because it
# was not the last expression of the cell, so it has been removed.

# Column types of the event-level frame that feeds the aggregation
print(df.dtypes)

eventTime                datetime64[ns, UTC]
eventName                             object
eventSource                           object
awsRegion                             object
sourceIPAddress                       object
readOnly                                bool
managementEvent                         bool
userIdentity.userName                 object
userIdentity.type                     object
errorCode                             object
event_is_attack                        int64
event_attack_type                     object
time_window              datetime64[ns, UTC]
dtype: object


In [8]:
# Binary label at window level: 1 when the window contains at least one suspicious event
has_attack_event = window_df["num_attack_events"].gt(0)
window_df["label_binary"] = has_attack_event.astype(int)

window_df["label_binary"].value_counts()


label_binary
0    11843
1     3946
Name: count, dtype: int64

In [9]:
def resolve_window_attack_type(events):
    """Pick a single attack label for one window.

    Parameters
    ----------
    events
        Event-level attack types for the window, where "Normal" marks
        non-attack events. Presumably a pandas Series: the body relies on
        boolean-mask indexing and ``value_counts``.

    Returns
    -------
    "Normal" when no entry differs from "Normal"; otherwise the non-"Normal"
    value that occurs most often in the window.
    """
    # "Normal" entries are removed before counting, so only attack types compete
    non_normal = events[events != "Normal"]
    if len(non_normal) > 0:
        type_counts = non_normal.value_counts()
        return type_counts.idxmax()
    return "Normal"

In [10]:
# Compute the multiclass label for every (user, time window) group
attack_type_per_window = (
    df.groupby(group_cols)["event_attack_type"]
      .apply(resolve_window_attack_type)
      .reset_index(name="label_multiclass")
)

# Drop any label column left by an earlier run of this cell first; otherwise a
# re-run would merge into label_multiclass_x / label_multiclass_y and the
# value_counts() below would raise a KeyError.
# validate="one_to_one" makes the merge fail loudly if either side ever has
# duplicate (user, window) keys instead of silently multiplying rows.
window_df = window_df.drop(columns=["label_multiclass"], errors="ignore").merge(
    attack_type_per_window,
    on=group_cols,
    how="left",
    validate="one_to_one",
)

window_df["label_multiclass"].value_counts()


label_multiclass
Normal                    11847
GetInfo                    1885
ListResources              1079
CreateObject                957
Login                         7
ModifyExistingResource        6
AssociateResources            5
Delete                        2
EnableObjects                 1
Name: count, dtype: int64

In [11]:
# Remove the raw attack-event count: label_binary is derived directly from it
# (count > 0), so leaving it in the feature set would leak the label.
# errors="ignore" keeps this cell safe to re-run once the column is already gone.
window_df = window_df.drop(columns=["num_attack_events"], errors="ignore")


In [12]:
# Preview the first rows of the window-level dataset (features plus both labels)
window_df.head()


Unnamed: 0,userIdentity.userName,time_window,num_events,num_unique_events,num_services,num_regions,num_source_ips,num_read_only,num_management_events,num_errors,label_binary,label_multiclass
0,HIDDEN_DUE_TO_SECURITY_REASONS,2017-05-17 23:00:00+00:00,4,2,1,1,1,0,0,0,1,Login
1,HIDDEN_DUE_TO_SECURITY_REASONS,2018-02-26 01:00:00+00:00,2,2,1,1,1,0,0,0,1,Login
2,HIDDEN_DUE_TO_SECURITY_REASONS,2018-02-26 18:30:00+00:00,7,2,1,1,1,0,0,0,1,Login
3,HIDDEN_DUE_TO_SECURITY_REASONS,2018-02-26 19:00:00+00:00,2,2,1,1,1,0,0,0,1,Login
4,HIDDEN_DUE_TO_SECURITY_REASONS,2018-03-17 17:30:00+00:00,2,2,1,1,1,0,0,0,1,Login


In [13]:
# Summary statistics of the numeric window-level columns
window_df.describe()


Unnamed: 0,num_events,num_unique_events,num_services,num_regions,num_source_ips,num_read_only,num_management_events,num_errors,label_binary
count,15789.0,15789.0,15789.0,15789.0,15789.0,15789.0,15789.0,15789.0,15789.0
mean,6.784217,1.756349,1.310279,1.243461,1.241941,0.467097,0.450694,0.930648,0.249921
std,52.600088,4.161316,1.103147,1.326441,0.577412,24.023833,26.351939,15.141535,0.432981
min,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
50%,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
75%,2.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
max,2728.0,208.0,53.0,15.0,13.0,2728.0,2728.0,1212.0,1.0


In [14]:
# Report the size of the window-level dataset and how many distinct users it covers
n_window_samples = len(window_df)
n_unique_users = window_df["userIdentity.userName"].nunique()

print("Number of window samples:", n_window_samples)
print("Unique users:", n_unique_users)


Number of window samples: 15789
Unique users: 11


In [15]:
# Show the ten windows with the most events, highest first
windows_by_volume = window_df.sort_values(by="num_events", ascending=False)
windows_by_volume.head(10)


Unnamed: 0,userIdentity.userName,time_window,num_events,num_unique_events,num_services,num_regions,num_source_ips,num_read_only,num_management_events,num_errors,label_binary,label_multiclass
15786,sec-check,2023-04-23 10:30:00+00:00,2728,40,9,1,1,2728,2728,90,1,GetInfo
3314,UNKNOWN_USER,2017-05-26 22:30:00+00:00,1524,70,17,14,3,14,0,43,1,GetInfo
3340,UNKNOWN_USER,2017-05-27 15:00:00+00:00,1405,74,16,14,2,14,0,77,1,GetInfo
3319,UNKNOWN_USER,2017-05-27 01:00:00+00:00,1231,72,16,14,2,14,0,77,1,GetInfo
764,Level6,2018-02-25 02:00:00+00:00,1222,5,2,1,1,0,0,1212,1,ListResources
3321,UNKNOWN_USER,2017-05-27 02:00:00+00:00,1215,72,16,14,2,14,0,77,1,GetInfo
3327,UNKNOWN_USER,2017-05-27 05:00:00+00:00,1203,73,17,14,3,14,0,77,1,GetInfo
3317,UNKNOWN_USER,2017-05-27 00:00:00+00:00,1199,72,16,14,2,14,0,77,1,GetInfo
3315,UNKNOWN_USER,2017-05-26 23:00:00+00:00,1196,72,16,14,2,14,0,77,1,GetInfo
3323,UNKNOWN_USER,2017-05-27 03:00:00+00:00,1185,72,16,14,2,14,0,77,1,GetInfo


In [16]:
# Persist the window-level dataset as parquet, without writing the DataFrame index.
# NOTE(review): the "_30" suffix presumably refers to the 30-minute window size —
# rename the file if the window size changes.
output_path = "../data/processed/window_features_30.parquet"
window_df.to_parquet(output_path, index=False)

print(f"Window-level dataset saved to: {output_path}")


Window-level dataset saved to: ../data/processed/window_features_30.parquet
