# Snapshots Demo

## Step One: Graph Creation / Importing

In [14]:
import json
import gzip
from torch_geometric.data import Data
import torch
import os
import re
import networkx as nx
import matplotlib.pyplot as plt
from datetime import datetime
from dateutil import parser
from datetime import timedelta

First, we import our data. (If you are using a different dataset, this is probably the section that will need the most editing.)

In [2]:
merged_dataset = []

# os.walk yields directories and files in an arbitrary, filesystem-dependent
# order. Sorting both makes the record order -- and therefore the node indices
# assigned from it later -- reproducible across machines and runs.
for root, dirs, files in os.walk("snapshot_data"):
    dirs.sort()  # in place, so os.walk also descends in sorted order
    for file in sorted(files):
        if not file.endswith(".json.gz"):
            continue
        file_path = os.path.join(root, file)

        # Every line of an archive is parsed as one JSON document. Decode
        # explicitly as UTF-8 instead of relying on the platform default.
        with gzip.open(file_path, "rt", encoding="utf-8") as f:
            for line in f:
                if not line.strip():
                    continue  # a blank line is not a record, nor an error
                try:
                    merged_dataset.append(json.loads(line))
                except json.JSONDecodeError as e:
                    # Best effort: report the bad line and keep loading.
                    print(f"Error parsing line in {file_path}: {e}")

Then we look at the first few entries to understand what we want to use in our graph.

In [3]:
# Peek at the first five records to see which fields are available.
print(merged_dataset[:5])

[{'timestamp': '2019-09-24T10:02:46.358-04:00', 'id': '2aa91c3d-9253-4cb2-8585-1eb43db675ed', 'hostname': 'SysClient0968.systemia.com', 'objectID': 'c4d4e50c-1075-4a35-8331-662db77dc65e', 'object': 'FLOW', 'action': 'INFO', 'actorID': 'e27d5804-74e7-454e-af19-b4501c0e99d2', 'pid': 6012, 'ppid': 6004, 'tid': -1, 'principal': 'SYSTEMIACOM\\fmarisei', 'properties': {'acuity_level': '1', 'bro_uid': 'CJn14UbY78b0N89Sl', 'dest_ip': '98.61.14.5', 'dest_port': '80', 'direction': 'outbound', 'image_path': '\\\\?\\C:\\Program Files (x86)\\Mozilla Firefox\\firefox.exe', 'l4protocol': '6', 'src_ip': '142.20.59.201', 'src_port': '49789'}}, {'timestamp': '2019-09-24T10:02:46.37-04:00', 'id': 'cabf9e99-b3ed-4834-9b54-8f4a0e7bfd25', 'hostname': 'SysClient0968.systemia.com', 'objectID': '515eb9a9-7e63-4821-bc78-a6cde6dc8716', 'object': 'FLOW', 'action': 'INFO', 'actorID': 'e27d5804-74e7-454e-af19-b4501c0e99d2', 'pid': 6012, 'ppid': 6004, 'tid': -1, 'principal': 'SYSTEMIACOM\\fmarisei', 'properties': {'

Then we set up the containers for the graph: a dictionary that maps each IP address to a node index, plus lists for the edges and their timestamps.

In [4]:
# IP address -> integer node index.
ip_nodes = dict()
# Index that the next previously unseen IP address will receive.
node_idx = 0

# `edges` collects [source, destination] node-index pairs and `timestamps`
# collects the parsed flow times.
edges, timestamps = [], []

(The timestamps in this dataset carry a UTC offset and a varying number of fractional-second digits, e.g. `.358` and `.37`, so we parse them with `dateutil`'s `isoparse` and convert them to epoch seconds.)

In [5]:
def parse_timestamp(timestamp):
    """Convert an ISO-8601 timestamp string to POSIX epoch seconds.

    Args:
        timestamp: ISO-8601 formatted string, e.g.
            '2019-09-24T10:02:46.358-04:00'.

    Returns:
        The timestamp as a float number of seconds since the epoch, or
        None (after printing a message) when the string cannot be parsed.
    """
    try:
        epoch_seconds = parser.isoparse(timestamp).timestamp()
    except ValueError:
        print(f"Failed to parse timestamp: {timestamp}")
        epoch_seconds = None
    return epoch_seconds


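Next, we build the graph structure: every distinct IP address becomes a node, and every record becomes a directed edge from its source IP to its destination IP.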

In [6]:
for entry in merged_dataset:
    # Parse the timestamp first. A record whose timestamp cannot be parsed is
    # skipped entirely, so that edges[i] and timestamps[i] always describe the
    # same flow. Appending the edge but not the timestamp would leave the two
    # lists with different lengths and misalign edge_attr against edge_index.
    parsed_time = parse_timestamp(entry['timestamp'])
    if parsed_time is None:
        continue

    src_ip = entry['properties']['src_ip']
    dest_ip = entry['properties']['dest_ip']

    # Assign a node index for each new IP
    if src_ip not in ip_nodes:
        ip_nodes[src_ip] = node_idx
        node_idx += 1
    if dest_ip not in ip_nodes:
        ip_nodes[dest_ip] = node_idx
        node_idx += 1

    # Add an edge (src -> dest) together with its timestamp
    edges.append([ip_nodes[src_ip], ip_nodes[dest_ip]])
    timestamps.append(parsed_time)




And make the graph itself.

In [7]:

# Shape [2, num_edges]: row 0 holds source indices, row 1 destination indices.
# reshape(-1, 2) keeps that shape even when `edges` is empty.
edge_index = torch.tensor(edges, dtype=torch.long).reshape(-1, 2).t().contiguous()

# The timestamps are epoch seconds from 2019 (about 1.57e9). float32 has a
# 24-bit significand, so values of that size are rounded to multiples of 128
# seconds, which would erase the ordering of nearby flows. float64 keeps
# sub-microsecond resolution. Cast or rescale (e.g. subtract the minimum)
# before feeding this into a float32 model.
edge_attr = torch.tensor(timestamps, dtype=torch.float64).unsqueeze(1)

graph = Data(edge_index=edge_index, edge_attr=edge_attr)

print(graph)

Data(edge_index=[2, 4273457], edge_attr=[4273457, 1])


## Step 2: Snapshot Brainstorming

In this section, we are going to first figure out what the best delta for creating snapshots is. This is definitely a subjective question, but we can do a little bit of brainstorming to narrow it down.

Firstly, we can calculate the range of the time for the data.

In [12]:

# Raw timestamp strings, in dataset order.
timestamps = [record['timestamp'] for record in merged_dataset]


def _to_datetime(ts):
    """Parse one timestamp string, with or without fractional seconds."""
    if '.' in ts:
        return datetime.strptime(ts, '%Y-%m-%dT%H:%M:%S.%f%z')
    return datetime.strptime(ts, '%Y-%m-%dT%H:%M:%S%z')


# Replace the strings with timezone-aware datetime objects.
timestamps = list(map(_to_datetime, timestamps))

min_timestamp, max_timestamp = min(timestamps), max(timestamps)

print("Earliest timestamp:", min_timestamp)
print("Latest timestamp:", max_timestamp)

Earliest timestamp: 2019-09-24 10:02:30.039000-04:00
Latest timestamp: 2019-09-25 09:04:49.419000-04:00


In this case the data spans about one day, so a reasonable first guess is one snapshot per hour. We can check how many snapshots that gives with the cell below.

In [15]:
# Total span covered by the dataset.
time_range = max_timestamp - min_timestamp

# Candidate snapshot width: one hour.
t = timedelta(minutes=60)

# divmod on two timedeltas gives (whole windows, leftover time); only the
# count of complete windows is reported here.
num_snapshots, _leftover = divmod(time_range, t)

print(f"Time range: {time_range}")
print(f"Number of snapshots: {num_snapshots}")

Time range: 23:02:19.380000
Number of snapshots: 23


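This shows that the data covers 23 complete one-hour windows. (Because the time range is not an exact multiple of one hour, there is also a partial final window, so the loop in Step 3 produces 24 snapshots.) That may be just right, or it may be too coarse or too fine. Let's check with half-hour windows...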

In [16]:
# Same estimate as before, this time with half-hour windows.
time_range = max_timestamp - min_timestamp

t = timedelta(minutes=30)

# Floor division of two timedeltas yields the number of complete windows.
num_snapshots = time_range // t

print(
    f"Time range: {time_range}",
    f"Number of snapshots: {num_snapshots}",
    sep="\n",
)

Time range: 23:02:19.380000
Number of snapshots: 46


Going with half-hour intervals gets us more snapshots, which may be better or worse. In our experimental case here, we can go with the safe choice of having a time delta of 1 hour.

## Step 3: Snapshot Graph Creation

With the delta hopefully determined, we can now create our snapshot graphs.

First, use the timestamp range and delta

In [17]:
# The snapshot windows tile the interval between the earliest and latest
# timestamps, one hour at a time.
start_time, end_time = min_timestamp, max_timestamp
time_delta = timedelta(minutes=60)

Then create the snapshots list and note the time

In [18]:
# Left edge of the window that will be built next.
current_time = start_time
# One graph per window, appended in chronological order.
snapshots = list()

And create a loop that will go through every hour and create a snapshot graph (no need to go into detail on how a graph is made, just carry over how it was done earlier and you are good to go.)

In [19]:
# --- Pass 1: sort every record into its time window in a single sweep. -------
# Filtering the whole dataset once per window would re-parse every timestamp
# for every snapshot; bucketing parses each record exactly once.
#
# The "+ 1" accounts for the final (usually partial) window and guarantees the
# record at exactly `end_time` lands in a window even when the time range is
# an exact multiple of `time_delta`.
num_windows = (end_time - start_time) // time_delta + 1

# Per-window state. Node indices are local to each snapshot: every window
# numbers the IPs it sees starting from 0.
window_nodes = [{} for _ in range(num_windows)]
window_edges = [[] for _ in range(num_windows)]
window_times = [[] for _ in range(num_windows)]

for entry in merged_dataset:
    # isoparse accepts any number of fractional-second digits (this dataset
    # contains e.g. '.37'); datetime.fromisoformat only does so on Python 3.11+.
    event_time = parser.isoparse(entry['timestamp'])

    # Floor division of two timedeltas gives the integer window index.
    window = (event_time - start_time) // time_delta
    if not 0 <= window < num_windows:
        continue  # outside [start_time, end_time]

    node_ids = window_nodes[window]
    # setdefault evaluates len(node_ids) before inserting, so a new IP gets the
    # next free index and a known IP keeps the index it already has.
    src = node_ids.setdefault(entry['properties']['src_ip'], len(node_ids))
    dst = node_ids.setdefault(entry['properties']['dest_ip'], len(node_ids))

    window_edges[window].append([src, dst])
    # Keep the flow time so each snapshot edge carries its timestamp.
    window_times[window].append(event_time.timestamp())

# --- Pass 2: turn each window into a graph, in chronological order. ----------
while current_time <= end_time:
    next_time = current_time + time_delta
    window = (current_time - start_time) // time_delta

    # reshape(-1, 2) keeps edge_index at shape [2, 0] for an empty window.
    edge_index = (
        torch.tensor(window_edges[window], dtype=torch.long)
        .reshape(-1, 2)
        .t()
        .contiguous()
    )
    # float64: epoch seconds of this size lose sub-minute precision in float32.
    # Shape [num_edges, 1], one timestamp per edge.
    edge_attr = torch.tensor(window_times[window], dtype=torch.float64).unsqueeze(1)

    snapshot = Data(edge_index=edge_index, edge_attr=edge_attr)

    snapshots.append(snapshot)

    current_time = next_time


As we will see...

In [21]:
# Print a 1-based label for every snapshot that was created.
for number in range(1, len(snapshots) + 1):
    print(f"Snapshot {number}")


Snapshot 1
Snapshot 2
Snapshot 3
Snapshot 4
Snapshot 5
Snapshot 6
Snapshot 7
Snapshot 8
Snapshot 9
Snapshot 10
Snapshot 11
Snapshot 12
Snapshot 13
Snapshot 14
Snapshot 15
Snapshot 16
Snapshot 17
Snapshot 18
Snapshot 19
Snapshot 20
Snapshot 21
Snapshot 22
Snapshot 23
Snapshot 24


We have our snapshot graphs.