In [None]:
# Install the packages listed in requirements.txt; %pip (rather than !pip) targets the running kernel's environment.
%pip install -r requirements.txt

In [47]:
import polars as pl
import pandas
import os, duckdb

# Data Definition Language

In [15]:
# Materialize every *.gz.parquet file two directory levels below data/route53/ as the `route53` table.
# CREATE OR REPLACE keeps this cell re-runnable (plain CREATE TABLE errors once the table exists).
# read_parquet is used instead of its parquet_scan alias to match the other load cells.
# hive_partitioning=false: directory names are not turned into partition columns.
duckdb.sql("CREATE OR REPLACE TABLE route53 as select * from read_parquet('data/route53/*/*/*.gz.parquet', hive_partitioning=false)")

In [11]:
# Materialize every *.gz.parquet file two directory levels below data/eks_audit/ as the `eks` table.
# CREATE OR REPLACE keeps this cell re-runnable (plain CREATE TABLE errors once the table exists).
# read_parquet is used instead of its parquet_scan alias to match the other load cells.
# hive_partitioning=false: directory names are not turned into partition columns.
duckdb.sql("CREATE OR REPLACE TABLE eks as select * from read_parquet('data/eks_audit/*/*/*.gz.parquet', hive_partitioning=false)")

In [82]:
# Materialize every *.gz.parquet file two directory levels below data/sh_findings/ as the `sechub` table.
# CREATE OR REPLACE keeps this cell re-runnable (plain CREATE TABLE errors once the table exists).
# hive_partitioning=false: directory names are not turned into partition columns.
duckdb.sql("CREATE OR REPLACE TABLE sechub as select * from read_parquet('data/sh_findings/*/*/*.gz.parquet', hive_partitioning=false)")

In [62]:
# Materialize every *.gz.parquet file two directory levels below data/cloudtrail/ as the `cloudtrail` table.
# CREATE OR REPLACE keeps this cell re-runnable (plain CREATE TABLE errors once the table exists).
# hive_partitioning=false: directory names are not turned into partition columns.
duckdb.sql("CREATE OR REPLACE TABLE cloudtrail as select * from read_parquet('data/cloudtrail/*/*/*.gz.parquet', hive_partitioning=false)")

In [102]:
# Materialize every *.gz.parquet file two directory levels below data/vpcflow/ as the `vpcflow` table.
# CREATE OR REPLACE keeps this cell re-runnable (plain CREATE TABLE errors once the table exists).
# hive_partitioning=false: directory names are not turned into partition columns.
duckdb.sql("CREATE OR REPLACE TABLE vpcflow as select * from read_parquet('data/vpcflow/*/*/*.gz.parquet', hive_partitioning=false)")

# Creating helpful views

The AWS Security Hub integration with Amazon Security Lake streams ALL finding changes to our buckets. While this may be helpful for certain analytics, teams often want to understand the current state of Security Hub findings across accounts.

These views can also be replicated in Amazon Athena: [Amazon Athena User Guide - Working with views](https://docs.aws.amazon.com/athena/latest/ug/views.html)

### Creating a view of the current status of all SecurityHub findings based on the latest record per unique finding ID

```sql
    CREATE VIEW sechub_current as 
    SELECT sechub.* FROM sechub 
    JOIN (SELECT MAX(time_dt) AS latest, finding_info.uid FROM sechub GROUP BY finding_info.uid) latest_finding_status 
    ON sechub.finding_info.uid = latest_finding_status.uid AND sechub.time_dt = latest_finding_status.latest;
```


In [None]:
# Define `sechub_current`: for each finding_info.uid, keep only the sechub rows whose
# time_dt equals that finding's MAX(time_dt), i.e. the latest recorded state.
# CREATE OR REPLACE keeps this cell re-runnable (plain CREATE VIEW errors once the view exists).
# Note: if several rows of one finding share the same maximum time_dt, all of them are kept.
duckdb.sql("""
    CREATE OR REPLACE VIEW sechub_current as
    SELECT sechub.* FROM sechub
    JOIN (SELECT MAX(time_dt) AS latest, finding_info.uid FROM sechub GROUP BY finding_info.uid) latest_finding_status
    ON sechub.finding_info.uid = latest_finding_status.uid AND sechub.time_dt = latest_finding_status.latest;
""")

### Active Findings by AWS Account

```sql
SELECT cloud.account.uid, severity, count(*) AS cnt 
FROM sechub_current 
WHERE status IN ('New', 'Notified') 
GROUP BY cloud.account.uid, severity
```

In [104]:
# Count findings in sechub_current whose status is 'New' or 'Notified',
# broken down by account uid and severity.
active_findings_sql = "select cloud.account.uid, severity, count(*) as cnt from sechub_current where status in ('New', 'Notified') group by cloud.account.uid, severity"
duckdb.sql(active_findings_sql)

┌──────────────┬───────────────┬───────┐
│     uid      │   severity    │  cnt  │
│   varchar    │    varchar    │ int64 │
├──────────────┼───────────────┼───────┤
│ 137294155267 │ High          │   141 │
│ 137294155267 │ Informational │     8 │
│ 137294155267 │ Medium        │   154 │
│ 137294155267 │ Critical      │     9 │
│ 137294155267 │ Low           │    27 │
│ 637423320304 │ Medium        │     5 │
└──────────────┴───────────────┴───────┘

In [107]:
# Count the rows of sechub_current per account uid and class_name, ordered by account uid.
findings_by_class_sql = "select cloud.account.uid, class_name, count(*) as count from sechub_current group by cloud.account.uid, class_name order by cloud.account.uid"
duckdb.sql(findings_by_class_sql)

┌──────────────┬───────────────────────┬───────┐
│     uid      │      class_name       │ count │
│   varchar    │        varchar        │ int64 │
├──────────────┼───────────────────────┼───────┤
│ 137294155267 │ Compliance Finding    │     1 │
│ 137294155267 │ Detection Finding     │   288 │
│ 137294155267 │ Vulnerability Finding │    50 │
│ 637423320304 │ Detection Finding     │     4 │
│ 637423320304 │ Compliance Finding    │     3 │
└──────────────┴───────────────────────┴───────┘

# Utilizing Observables for Fun and Profit

OCSF provides an `observables` column, which is designed to surface common entities across an event. In the case of Security Hub findings, Security Lake transforms ASFF findings and includes entities such as AWS resources or GuardDuty finding targets in the observables.

#### Amazon Inspector container vulnerability scan

```json
[
  {
    "name": "resource.uid",
    "value": "arn:aws:ecr:us-east-1:137294155267:repository/juiceshop/sha256:1ee9b8d6e89e8faee4c4c1a7a31931509032f8ac095e1d3664bc5a49c1ee778b",
    "type": "Resource UID",
    "type_id": 10
  }
]
```

#### Amazon GuardDuty finding

```json
[
  {
    "name": "resources[].uid",
    "value": "arn:aws:ec2:us-east-1:137294155267:instance/i-99999999",
    "type": "Resource UID",
    "type_id": 10
  },
  {
    "name": "evidences[].actor.process.user.name",
    "value": "ec2-user",
    "type": "User Name",
    "type_id": 4
  },
  {
    "name": "resources[].uid",
    "value": "GeneratedFindingContainerId",
    "type": "Resource UID",
    "type_id": 10
  }
]
```

In [101]:
# Expand each finding's observables list into one row per observable, keep the values
# that start with 'arn' or 'AWS', and count how often each value occurs (most frequent first).
observable_counts_sql = "select resources.value as value, count(*) as count from (select unnest(observables) as resources from sechub_current) where (value like 'arn%' or value like 'AWS%') group by resources.value order by count desc"
duckdb.sql(observable_counts_sql)

┌──────────────────────────────────────────────────────────────────────────────────────────────────────────────┬───────┐
│                                                    value                                                     │ count │
│                                                   varchar                                                    │ int64 │
├──────────────────────────────────────────────────────────────────────────────────────────────────────────────┼───────┤
│ arn:aws:ec2:us-east-1:137294155267:instance/i-99999999                                                       │   230 │
│ arn:aws:eks:us-east-1:137294155267:cluster/GeneratedFindingEKSClusterName                                    │    73 │
│ AWS::IAM::AccessKey:GeneratedFindingAccessKeyId                                                              │    57 │
│ arn:aws:ecr:us-east-1:137294155267:repository/juiceshop/sha256:1ee9b8d6e89e8faee4c4c1a7a31931509032f8ac095…  │    48 │
│ arn:aws:ecs:region:12345678900

# Event classes across log sources



In [108]:
# Tally cloudtrail rows by class_name (count(class_name) counts non-NULL values only).
cloudtrail_class_counts_sql = 'select class_name, count(class_name) as cnt from cloudtrail group by class_name'
duckdb.sql(cloudtrail_class_counts_sql)

┌────────────────┬────────┐
│   class_name   │  cnt   │
│    varchar     │ int64  │
├────────────────┼────────┤
│ Account Change │     17 │
│ Authentication │  20703 │
│ API Activity   │ 440988 │
└────────────────┴────────┘

In [77]:
# Tally sechub rows by class_name (count(class_name) counts non-NULL values only).
# This reads the full sechub table, i.e. every streamed finding change, not sechub_current.
sechub_class_counts_sql = 'select class_name, count(class_name) as cnt from sechub group by class_name'
duckdb.sql(sechub_class_counts_sql)

┌───────────────────────┬───────┐
│      class_name       │  cnt  │
│        varchar        │ int64 │
├───────────────────────┼───────┤
│ Vulnerability Finding │     2 │
│ Detection Finding     │   579 │
│ Compliance Finding    │    91 │
└───────────────────────┴───────┘

In [76]:
# Tally route53 rows by class_name (count(class_name) counts non-NULL values only).
route53_class_counts_sql = 'select class_name, count(class_name) as cnt from route53 group by class_name'
duckdb.sql(route53_class_counts_sql)

┌──────────────┬───────┐
│  class_name  │  cnt  │
│   varchar    │ int64 │
├──────────────┼───────┤
│ DNS Activity │ 18924 │
└──────────────┴───────┘

In [72]:
# List the distinct class_name values present in the eks table (no per-class counts here).
eks_classes_sql = 'select distinct class_name from eks'
duckdb.sql(eks_classes_sql)

┌──────────────┐
│  class_name  │
│   varchar    │
├──────────────┤
│ API Activity │
└──────────────┘

# Adding Context with External Data Sources

# Threat Hunting Queries!!!!

In [124]:
# List the distinct values of the `operation` field inside the `api` struct column of cloudtrail.
# NOTE(review): the saved output under this cell shows the whole `api` struct rather than a
# single operation column, so it looks like it came from a different query - re-run to confirm.
cloudtrail_operations_sql = 'select distinct api.operation from cloudtrail'
duckdb.sql(cloudtrail_operations_sql)

┌──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐
│                                                         api                                                          │
│ struct(response struct(error varchar, message varchar, "data" varchar), operation varchar, "version" varchar, serv…  │
├──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤
│ {'response': NULL, 'operation': GetBucketAcl, 'version': NULL, 'service': {'name': s3.amazonaws.com}, 'request': {…  │
│ {'response': NULL, 'operation': GetBucketAcl, 'version': NULL, 'service': {'name': s3.amazonaws.com}, 'request': {…  │
│ {'response': NULL, 'operation': GetBucketAcl, 'version': NULL, 'service': {'name': s3.amazonaws.com}, 'request': {…  │
│ {'response': {'error': NULL, 'message': NULL, 'data': {"credentials":{"accessKeyId":"ASIAR7526RYB7JNRDPWR","sessio…  │
│ {'response': {'error': NULL, '