# 01 – Data Loading and Initial Preprocessing

This notebook loads the raw AWS CloudTrail logs (`attack.json`)
and converts them into a clean, structured event-level table.

At this stage:
- Each row represents a single API event
- No aggregation is performed
- No labels are created
- No machine learning is applied

This notebook serves as the foundation for all subsequent
time-window aggregation and supervised learning steps.


In [1]:
# Core data processing libraries
import json
import pandas as pd
import numpy as np

# Display settings for better readability
pd.set_option("display.max_columns", 50)
pd.set_option("display.width", 1000)


In [2]:
# Load raw CloudTrail data from JSON file
with open("../data/raw/attack.json", "r") as f:
    raw_data = json.load(f)

# CloudTrail records are stored under the "Records" key
records = raw_data["Records"]

print(f"Number of raw events: {len(records)}")


Number of raw events: 107116


In [3]:
# Convert nested JSON records into a flat DataFrame
df = pd.json_normalize(records)

print("DataFrame shape:", df.shape)
df.head()


DataFrame shape: (107116, 946)


Unnamed: 0,eventVersion,eventTime,eventSource,eventName,awsRegion,sourceIPAddress,userAgent,responseElements,requestID,eventID,readOnly,eventType,managementEvent,recipientAccountId,eventCategory,userIdentity.type,userIdentity.principalId,userIdentity.arn,userIdentity.accountId,userIdentity.accessKeyId,userIdentity.userName,requestParameters.maxItems,tlsDetails.tlsVersion,tlsDetails.cipherSuite,tlsDetails.clientProvidedHostHeader,...,requestParameters.domain,responseElements.publicIp,responseElements.domain,responseElements.allocationId,responseElements.publicIpv4Pool,responseElements.networkBorderGroup,requestParameters.enableDnsHostnames.value,requestParameters.maxSessionDuration,requestParameters.instanceTenancy,requestParameters.amazonProvidedIpv6CidrBlock,responseElements.vpc.vpcId,responseElements.vpc.state,responseElements.vpc.ownerId,responseElements.vpc.cidrBlock,responseElements.vpc.cidrBlockAssociationSet.items,responseElements.vpc.dhcpOptionsId,responseElements.vpc.instanceTenancy,responseElements.vpc.tagSet.items,responseElements.vpc.isDefault,responseElements.keyPairId,requestParameters.fullyQualifiedArn.arnPrefix.partition,requestParameters.fullyQualifiedArn.arnPrefix.region,requestParameters.fullyQualifiedArn.arnPrefix.account,requestParameters.fullyQualifiedArn.relativeId.functionName,requestParameters.dryRun
0,1.08,2023-04-23T10:45:02Z,lambda.amazonaws.com,ListFunctions20150331,us-east-1,212.179.179.106,aws-sdk-go-v2/1.17.6 os/macos lang/go/1.19.8 m...,,003011fb-368c-4c9c-be0a-06748de885d8,1c5ce35b-9daf-4edb-9caf-48ba82d71bf1,True,AwsApiCall,True,123456789,Management,IAMUser,AIDA3HQ7JY3C4OYVVNDYF,arn:aws:iam::123456789:user/sec-check,123456789,AKIA3HQ7JY3CRTRROFOD,sec-check,10000.0,TLSv1.2,ECDHE-RSA-AES128-GCM-SHA256,lambda.us-east-1.amazonaws.com,...,,,,,,,,,,,,,,,,,,,,,,,,,
1,1.08,2023-04-23T10:44:40Z,iam.amazonaws.com,GetCredentialReport,us-east-1,212.179.179.106,aws-sdk-go-v2/1.17.6 os/macos lang/go/1.19.8 m...,,3977d934-4be7-40aa-8faf-d747825f8704,e247cda3-35b8-4ef8-a440-c2478b6d400f,True,AwsApiCall,True,123456789,Management,IAMUser,AIDA3HQ7JY3C4OYVVNDYF,arn:aws:iam::123456789:user/sec-check,123456789,AKIA3HQ7JY3CRTRROFOD,sec-check,,TLSv1.2,ECDHE-RSA-AES128-GCM-SHA256,iam.amazonaws.com,...,,,,,,,,,,,,,,,,,,,,,,,,,
2,1.08,2023-04-23T10:44:39Z,iam.amazonaws.com,GetCredentialReport,us-east-1,212.179.179.106,aws-sdk-go-v2/1.17.6 os/macos lang/go/1.19.8 m...,,0e50e798-b866-4351-a5f9-18b5ea76a643,d257ea8a-e162-47d4-8b8e-6ae1d14df751,True,AwsApiCall,True,123456789,Management,IAMUser,AIDA3HQ7JY3C4OYVVNDYF,arn:aws:iam::123456789:user/sec-check,123456789,AKIA3HQ7JY3CRTRROFOD,sec-check,,TLSv1.2,ECDHE-RSA-AES128-GCM-SHA256,iam.amazonaws.com,...,,,,,,,,,,,,,,,,,,,,,,,,,
3,1.08,2023-04-23T10:44:30Z,iam.amazonaws.com,GetCredentialReport,us-east-1,212.179.179.106,aws-sdk-go-v2/1.17.6 os/macos lang/go/1.19.8 m...,,74269e4e-1bd1-49ef-9b63-32b0b72c7595,653e7bd9-6d8b-49b5-8fbf-40d443bc06c4,True,AwsApiCall,True,123456789,Management,IAMUser,AIDA3HQ7JY3C4OYVVNDYF,arn:aws:iam::123456789:user/sec-check,123456789,AKIA3HQ7JY3CRTRROFOD,sec-check,,TLSv1.2,ECDHE-RSA-AES128-GCM-SHA256,iam.amazonaws.com,...,,,,,,,,,,,,,,,,,,,,,,,,,
4,1.08,2023-04-23T10:44:26Z,iam.amazonaws.com,GetAccountPasswordPolicy,us-east-1,212.179.179.106,aws-sdk-go-v2/1.17.6 os/macos lang/go/1.19.8 m...,,b7f8cc32-23a6-4904-ba0b-671f894096fa,ef4679c3-6880-4ae2-8679-96bd8872b068,True,AwsApiCall,True,123456789,Management,IAMUser,AIDA3HQ7JY3C4OYVVNDYF,arn:aws:iam::123456789:user/sec-check,123456789,AKIA3HQ7JY3CRTRROFOD,sec-check,,TLSv1.2,ECDHE-RSA-AES128-GCM-SHA256,iam.amazonaws.com,...,,,,,,,,,,,,,,,,,,,,,,,,,


In [4]:
# Select essential columns only
selected_columns = [
    "eventTime",
    "eventName",
    "eventSource",
    "awsRegion",
    "sourceIPAddress",
    "readOnly",
    "managementEvent",
    "userIdentity.userName",
    "userIdentity.type",
    "errorCode"
]

df = df[selected_columns]
df.head()


Unnamed: 0,eventTime,eventName,eventSource,awsRegion,sourceIPAddress,readOnly,managementEvent,userIdentity.userName,userIdentity.type,errorCode
0,2023-04-23T10:45:02Z,ListFunctions20150331,lambda.amazonaws.com,us-east-1,212.179.179.106,True,True,sec-check,IAMUser,
1,2023-04-23T10:44:40Z,GetCredentialReport,iam.amazonaws.com,us-east-1,212.179.179.106,True,True,sec-check,IAMUser,CredentialReportNotPresentException
2,2023-04-23T10:44:39Z,GetCredentialReport,iam.amazonaws.com,us-east-1,212.179.179.106,True,True,sec-check,IAMUser,CredentialReportNotPresentException
3,2023-04-23T10:44:30Z,GetCredentialReport,iam.amazonaws.com,us-east-1,212.179.179.106,True,True,sec-check,IAMUser,CredentialReportNotPresentException
4,2023-04-23T10:44:26Z,GetAccountPasswordPolicy,iam.amazonaws.com,us-east-1,212.179.179.106,True,True,sec-check,IAMUser,NoSuchEntityException


In [5]:
# Replace missing usernames with a consistent placeholder
df["userIdentity.userName"] = df["userIdentity.userName"].fillna("UNKNOWN_USER")

# Replace missing identity types
df["userIdentity.type"] = df["userIdentity.type"].fillna("Unknown")

# Replace missing booleans with False
df["readOnly"] = df["readOnly"].fillna(False).astype(bool)
df["managementEvent"] = df["managementEvent"].fillna(False).astype(bool)


  df["readOnly"] = df["readOnly"].fillna(False).astype(bool)
  df["managementEvent"] = df["managementEvent"].fillna(False).astype(bool)


In [6]:
# Convert eventTime to pandas datetime
df["eventTime"] = pd.to_datetime(df["eventTime"], utc=True)

# Sort events chronologically (important for later windowing)
df = df.sort_values("eventTime").reset_index(drop=True)

df[["eventTime"]].head()

print(df.dtypes)


eventTime                datetime64[ns, UTC]
eventName                             object
eventSource                           object
awsRegion                             object
sourceIPAddress                       object
readOnly                                bool
managementEvent                         bool
userIdentity.userName                 object
userIdentity.type                     object
errorCode                             object
dtype: object


In [7]:
# Number of unique users
df["userIdentity.userName"].nunique()


11

In [8]:
# Top users by number of events
df["userIdentity.userName"].value_counts().head(12)


userIdentity.userName
UNKNOWN_USER                      41775
backup                            37463
Level6                            16244
sec-check                          7375
cloud_user                         3241
test                                710
piper                               131
flaws                               105
Level5                               39
HIDDEN_DUE_TO_SECURITY_REASONS       17
vpc_peering                          16
Name: count, dtype: int64

In [9]:
# Save cleaned event-level data
output_path = "../data/processed/clean_events_2.parquet"
df.to_parquet(output_path, index=False)

print(f"Clean event table saved to: {output_path}")


Clean event table saved to: ../data/processed/clean_events_2.parquet
