In [1]:
# First, the necessary modules are imported.
import json
from torch_geometric.data import Data
import torch
import os
import re
import networkx as nx
import matplotlib.pyplot as plt

# Section 1: JSON Processing

Before we build out our dataset in pytorch_geo, we need to make sure our data makes sense contextually.
We first start by extracting the relevant features of the JSON data, including calculating file depth.

In [2]:
# First, we need to grab the features from our JSON data.
def extract_features(json_obj):
    # Of particular intrest to us is the Unix time an action is run, the file object that is acted upon, and the path of the file object.
    unix_time = json_obj["unix_time"]
    file_object = json_obj["object"]
    path = json_obj["path"]
    # As a bonus, we also want to grab file depth.
    depth = path.count('\\')
    # We then return these features for use.
    return {
        "Unix Time" : unix_time,
        "File Object": path + "\\" + file_object,
        "Depth": depth
    }

In [3]:
# This method reads in log files through JSON and makes sure the features are extracted for edges.
def process_log_file(file_path):
    # We first create a dictionary to store the edges.
    log_edges = {}
    # We then open the JSON file and its data...
    with open(file_path) as f:
        json_data = json.load(f)
    # Then grab the features from the data.
    for key, json_obj in json_data.items():
        log_edges[key] = extract_features(json_obj)
    # and return the edges.
    return log_edges

In [4]:
# This method helps us create easier to use keys.
def extract_user_number(file_path):
    start_index = file_path.find('User') + len('User')
    end_index = file_path.find('/User Log')
    user_number = file_path[start_index:end_index]
    return f'User{user_number}'

## Section 1.1: Action Logs

In [5]:
# This method processes file files, creating a dictionary to hold the edge data of all the logs.
def process_logs_directory(directory_path):
    # We first create a dictionary to store all the logs.
    all_logs_data = {}
    total_files = 0
    processed_files = 0
    # Then, we walk through the directory specified...
    for root, dirs, files in os.walk(directory_path):
        for dir_name in dirs:
            # For initial testing, only benign user logs were included. This will be changed upon verification.
            if dir_name == "User Log":
                # We get the path of the user log directory...
                user_log_dir = os.path.join(root, dir_name)
                # Then for all of the user logs...
                for user_root, user_dirs, user_files in os.walk(user_log_dir):
                    # We check for our updated log files...
                    for file in user_files:
                        if file == "updated_log.json":
                            # And then process the logs to get our data in python.
                            file_path = os.path.join(user_root, file)
                            extracted_data = process_log_file(file_path)
                            key = extract_user_number(file_path)
                            all_logs_data[key] = extracted_data
                            total_files += 1
    for file_path, extracted_data in all_logs_data.items():
        processed_files += 1
        print(f"Processed {processed_files}/{total_files} files: {file_path}")
    # And finally return all of the logs in a large dictionary.
    return all_logs_data

In [6]:
# We then process the logs our log directory.
logs_directory = 'WUIL Logs'
all_logs_data = process_logs_directory(logs_directory)

Processed 1/74 files: User4
Processed 2/74 files: User68
Processed 3/74 files: User45
Processed 4/74 files: User48
Processed 5/74 files: User71
Processed 6/74 files: User27
Processed 7/74 files: User38
Processed 8/74 files: User12
Processed 9/74 files: User20
Processed 10/74 files: User49
Processed 11/74 files: User61
Processed 12/74 files: User3
Processed 13/74 files: User60
Processed 14/74 files: User28
Processed 15/74 files: User1
Processed 16/74 files: User40
Processed 17/74 files: User50
Processed 18/74 files: User59
Processed 19/74 files: User29
Processed 20/74 files: User69
Processed 21/74 files: User8
Processed 22/74 files: User41
Processed 23/74 files: User7
Processed 24/74 files: User36
Processed 25/74 files: User33
Processed 26/74 files: User47
Processed 27/74 files: User72
Processed 28/74 files: User56
Processed 29/74 files: User19
Processed 30/74 files: User73
Processed 31/74 files: User35
Processed 32/74 files: User53
Processed 33/74 files: User54
Processed 34/74 files: U

As we can see below, we can easily get all of the data for our edges, those being the time an event occurred, the index of the event, and the file depth of the event.

In [7]:
# And make sure those logs were properly processed.
print(list(all_logs_data.keys()))

['User4', 'User68', 'User45', 'User48', 'User71', 'User27', 'User38', 'User12', 'User20', 'User49', 'User61', 'User3', 'User60', 'User28', 'User1', 'User40', 'User50', 'User59', 'User29', 'User69', 'User8', 'User41', 'User7', 'User36', 'User33', 'User47', 'User72', 'User56', 'User19', 'User73', 'User35', 'User53', 'User54', 'User63', 'User15', 'User34', 'User51', 'User30', 'User14', 'User65', 'User24', 'User74', 'User57', 'User22', 'User25', 'User46', 'User2', 'User58', 'User16', 'User32', 'User70', 'User37', 'User66', 'User5', 'User44', 'User11', 'User18', 'User76', 'User39', 'User75', 'User67', 'User64', 'User6', 'User52', 'User62', 'User42', 'User55', 'User21', 'User9', 'User17', 'User26', 'User23', 'User31', 'User43']


In [8]:
# Some additional verification that User1 has all of their actions logged.
len(all_logs_data['User1'])

274052

In [9]:
# And we print the data from User4 to see if it is properly formatted.
all_logs_data['User4']

{'0': {'Unix Time': 1329245499.0, 'File Object': '0\\1\\2\\3\\3', 'Depth': 3},
 '1': {'Unix Time': 1329245499.0, 'File Object': '0\\1\\2\\3\\3', 'Depth': 3},
 '2': {'Unix Time': 1329245499.0,
  'File Object': '0\\1\\2\\3\\6\\4',
  'Depth': 4},
 '3': {'Unix Time': 1329245499.0,
  'File Object': '0\\1\\2\\3\\6\\4',
  'Depth': 4},
 '4': {'Unix Time': 1329245499.0,
  'File Object': '0\\1\\2\\3\\7\\4',
  'Depth': 4},
 '5': {'Unix Time': 1329245499.0,
  'File Object': '0\\1\\2\\8\\9\\4',
  'Depth': 4},
 '6': {'Unix Time': 1329245499.0,
  'File Object': '0\\1\\2\\8\\9\\4',
  'Depth': 4},
 '7': {'Unix Time': 1329245499.0,
  'File Object': '0\\1\\2\\8\\9\\4',
  'Depth': 4},
 '8': {'Unix Time': 1329245499.0,
  'File Object': '0\\1\\2\\8\\9\\4',
  'Depth': 4},
 '9': {'Unix Time': 1329245499.0,
  'File Object': '0\\1\\2\\8\\9\\4',
  'Depth': 4},
 '10': {'Unix Time': 1329245499.0,
  'File Object': '0\\1\\2\\8\\9\\4',
  'Depth': 4},
 '11': {'Unix Time': 1329245499.0,
  'File Object': '0\\1\\2\\3\\10

## Section 1.2: File System Log

Now that edges have been created, we can also get the node data needed as well.

In [10]:
# The same idea of reading from JSON applies...
def extract_node_data_from_log(file_path):
    nodes = {}
    with open(file_path) as f:
        json_data = json.load(f)
        for key, json_obj in json_data.items():
            # Altough this time we are only grabbing the file object # and the unix time of its first creation.
            file_object = json_obj["object"]
            path = json_obj["path"]
            unix_time = json_obj["unix_time"]
            full_object = path + "\\" + file_object
            # As checked by this if statement.
            if file_object not in nodes:
                nodes[full_object] = unix_time
    return nodes

In [11]:
#  Then processing all of the logs in a similar fashion.
def process_all_user_logs(logs_directory):
    all_nodes_data = {}
    for user_dir in os.listdir(logs_directory):
        user_logs_dir = os.path.join(logs_directory, user_dir, "User Log")
        if os.path.isdir(user_logs_dir):
            user_nodes_data = {}
            for file_name in os.listdir(user_logs_dir):
                if file_name == "updated_log.json":
                    file_path = os.path.join(user_logs_dir, file_name)
                    nodes_data = extract_node_data_from_log(file_path)
                    user_nodes_data.update(nodes_data)
            all_nodes_data[user_dir] = user_nodes_data
    return all_nodes_data

In [12]:
logs_directory = 'WUIL Logs'
all_nodes_data = process_all_user_logs(logs_directory)

An important detail to note is that we have the same keys as our edge dictionary, so that we can properly merge the data later.

In [13]:
print(list(all_nodes_data.keys()))

['User4', 'User68', 'User45', 'User48', 'User71', 'User27', 'User38', 'User12', 'User20', 'User49', 'User61', 'User3', 'User60', 'User28', 'User1', 'User40', 'User50', 'User59', 'User29', 'User69', 'User8', 'User41', 'User7', 'User36', 'User33', 'User47', 'User72', 'User56', 'User19', 'User73', 'User35', 'User53', 'User54', 'User63', 'User15', 'User34', 'User51', 'User30', 'User14', 'User65', 'User24', 'User74', 'User57', 'User22', 'User25', 'User46', 'User2', 'User58', 'User16', 'User13 (NO DATA)', 'User32', 'User70', 'User37', 'User66', 'User5', 'User44', 'User11', 'User18', 'User76', 'User39', 'User75', 'User10 (NO DATA)', 'User67', 'User64', 'User6', 'User52', 'User62', 'User42', 'User55', 'User21', 'User9', 'User17', 'User26', 'User23', 'User31', 'User43']


In [14]:
all_nodes_data['User4']

{'0\\1\\2\\3\\3': 1332550454.0,
 '0\\1\\2\\3\\6\\4': 1332526616.0,
 '0\\1\\2\\3\\7\\4': 1332526616.0,
 '0\\1\\2\\8\\9\\4': 1332550483.0,
 '0\\1\\2\\3\\10\\4': 1332526616.0,
 '0\\1\\2\\3\\11\\4': 1332526616.0,
 '0\\1\\2\\8\\3': 1332550483.0,
 '0\\1\\2\\8\\12\\9\\5': 1332550454.0,
 '0\\1\\2\\8\\13\\9\\5': 1332550454.0,
 '0\\1\\2\\3\\18\\4': 1332527148.0,
 '0\\1\\2\\8\\19\\9\\5': 1332550433.0,
 '0\\1\\2\\8\\20\\4': 1330459838.0,
 '0\\1\\2\\8\\21\\4': 1331169656.0,
 '0\\1\\2\\8\\22\\4': 1331941230.0,
 '0\\1\\2\\8\\23\\4': 1331242721.0,
 '0\\1\\2\\8\\24\\4': 1331242721.0,
 '0\\1\\2\\8\\25\\4': 1331242721.0,
 '0\\1\\2\\8\\26\\4': 1332527313.0,
 '0\\1\\2\\8\\27\\4': 1331319414.0,
 '0\\1\\2\\8\\28\\4': 1331575423.0,
 '0\\1\\2\\8\\29\\4': 1332527308.0,
 '0\\1\\2\\8\\30\\4': 1331169667.0,
 '0\\1\\2\\8\\31\\4': 1331169669.0,
 '0\\1\\2\\8\\32\\4': 1331169680.0,
 '0\\1\\2\\8\\33\\4': 1331169688.0,
 '0\\1\\2\\8\\34\\4': 1332527362.0,
 '0\\1\\2\\8\\35\\4': 1331231293.0,
 '0\\1\\2\\8\\36\\4': 13325273

# Section 2: Graph Creation

Now that we have effectively imported the data, now we create graphs of the data.

In [15]:
# First, we need to be able to know what the numeric part of a node is.
def extract_numeric_part(node):
    numeric_part = re.findall(r'\d+$', node)
    if numeric_part:
        return int(numeric_part[0])
    else:
        return None
# Then, we can create the graphs.
def create_graphs(all_logs_data, all_nodes_data):
    # We create a dictionary to store the graphs, as well as a map to store the file index.
    graphs = []
    file_index_map = {}
    next_index = 0
    # For all of the users in the logs...
    for user, nodes_data in all_nodes_data.items():
        if user in all_logs_data:
            # We create dictionaries to store the edge index, edge attributes, node timestamps, and node indices.
            edge_index = []
            edge_attr = []
            node_timestamps = []
            node_indices = []
            # In the graph, the edges represent the actions taken on the files, while nodes are files themselves.
            prev_file_index = None
            # For each action in the logs...
            for action_index, action_data in all_logs_data[user].items():
                # We record the file path and index it into a map.
                file_path = action_data['File Object']
                # If the path is already in the map, we grab the index.
                if file_path in file_index_map:
                    file_index = file_index_map[file_path]
                # Otherwise, we add the path to the map and grab the index.
                else:
                    file_index_map[file_path] = next_index
                    file_index = next_index
                    next_index += 1
                # We also record the depth of the file.
                depth = action_data['Depth']
                # And then, we record the edge index and attributes, in this case the previous file index, the current file index, the depth, and the Unix time.
                if prev_file_index is not None:
                    edge_index.append([prev_file_index, file_index])
                    edge_attr.append([action_data['Unix Time'], prev_file_index, file_index, depth])
                # And then we update the previous file index.
                prev_file_index = file_index
            # Finally, we record the node timestamps and indices.
            for node, timestamp in nodes_data.items():
                node_index = extract_numeric_part(node)
                if node_index is not None:
                    node_indices.append(node_index)
                    node_timestamps.append(timestamp)
            # And then we create the graph with pytorch geometric tensors.
            edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
            edge_attr = torch.tensor(edge_attr, dtype=torch.float)
            node_timestamps = torch.tensor(node_timestamps, dtype=torch.float)
            node_indices = torch.tensor(node_indices, dtype=torch.long)
            # Save the data to a graph object.
            data = Data(edge_index=edge_index, edge_attr=edge_attr, x=node_timestamps, t=node_indices)
            graphs.append(data)
    # and return the graphs.
    return graphs


In [16]:
# We then create our graphs...
graphs = create_graphs(all_logs_data, all_nodes_data)

# And print a summary to make sure they are properly implemented
for i, graph in enumerate(graphs):
    num_nodes = graph.num_nodes
    num_edges = graph.num_edges
    print(f"Graph {i+1}: Number of nodes: {num_nodes}, Number of edges: {num_edges}")

Graph 1: Number of nodes: 2689, Number of edges: 130614
Graph 2: Number of nodes: 3538, Number of edges: 227621
Graph 3: Number of nodes: 9015, Number of edges: 220375
Graph 4: Number of nodes: 8981, Number of edges: 246342
Graph 5: Number of nodes: 5874, Number of edges: 227056
Graph 6: Number of nodes: 5048, Number of edges: 778115
Graph 7: Number of nodes: 423, Number of edges: 13430
Graph 8: Number of nodes: 6093, Number of edges: 199811
Graph 9: Number of nodes: 1502, Number of edges: 57917
Graph 10: Number of nodes: 97, Number of edges: 475
Graph 11: Number of nodes: 15884, Number of edges: 851724
Graph 12: Number of nodes: 208, Number of edges: 15236
Graph 13: Number of nodes: 3419, Number of edges: 120220
Graph 14: Number of nodes: 17732, Number of edges: 633815
Graph 15: Number of nodes: 8540, Number of edges: 274051
Graph 16: Number of nodes: 1275, Number of edges: 138511
Graph 17: Number of nodes: 6993, Number of edges: 376373
Graph 18: Number of nodes: 53386, Number of edge

In [17]:
# We then create a dataset...
from torch_geometric.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, graphs):
        super().__init__()
        self.graphs = graphs
    def len(self):
        return len(self.graphs)
    def get(self, idx):
        return self.graphs[idx]
dataset = CustomDataset(graphs)

In [18]:
# Save the dataset to a pytorch file.
torch.save(dataset, 'WUIL_Dataset.pt')
# as well as a CSV file.
import pandas as pd
df = pd.DataFrame([graph.num_nodes for graph in graphs], columns=['Number of Nodes'])
df.to_csv('WUIL_Dataset.csv', index=False)

# Section 3: File Trees

In [23]:
def build_tree(logs):
    tree = {}
    level_indexes = {}
    for path, _ in sorted(logs.items()): 
        components = path.split('\\')
        current_node = tree
        for i, component in enumerate(components):
            # Create a unique key for the level index based on the level depth
            level_key = i
            # Assign index based on the level
            level_index = level_indexes.get(level_key, 0)
            # If it's the last component (file), add it without children
            if i == len(components) - 1:
                current_node[component] = {'index': level_index}
            else:
                current_node[component] = current_node.get(component, {'index': level_index, 'children': {}})
            # Increment the level index
            level_indexes[level_key] = level_index + 1
            # Move to the next level in the tree if it's not a file
            if 'children' in current_node[component]:
                current_node = current_node[component]['children']
    return tree

In [24]:
import os
import json

# Specify the folder name
folder_name = 'trees'

# Create the folder if it doesn't exist
os.makedirs(folder_name, exist_ok=True)

for user, data in all_nodes_data.items():
    tree = build_tree(data)
    # Include the folder name in the file path
    with open(os.path.join(folder_name, f'{user}_tree.json'), 'w') as f:
        json.dump(tree, f)

In [25]:
def print_tree(tree, indent=0):
    for key, value in tree.items():
        print('  ' * indent + f'[{key}] (index: {value["index"]})')
        if 'children' in value:
            print_tree(value['children'], indent + 1)

print_tree(tree)

[0] (index: 0)
  [1] (index: 0)
    [2] (index: 0)
      [11] (index: 0)
        [112] (index: 0)
          [112] (index: 0)
            [4831] (index: 0)
              [6] (index: 0)
            [4832] (index: 1)
              [6] (index: 1)
            [4833] (index: 2)
              [6] (index: 2)
            [4834] (index: 3)
              [6] (index: 3)
            [4835] (index: 4)
              [6] (index: 4)
            [4836] (index: 5)
              [6] (index: 5)
            [4837] (index: 6)
              [6] (index: 6)
            [4838] (index: 7)
              [6] (index: 7)
            [4839] (index: 8)
              [6] (index: 8)
            [4840] (index: 9)
              [4548] (index: 9)
                [7] (index: 0)
              [4569] (index: 10)
                [7] (index: 1)
              [4668] (index: 11)
                [7] (index: 2)
              [4674] (index: 12)
                [7] (index: 3)
              [4683] (index: 13)
                [7] (index

In [26]:
def print_indexes(tree, indent=0):
    for value in tree.values():
        if isinstance(value, dict):
            print('  ' * indent + f'(index: {value["index"]})')
            if 'children' in value:
                print_indexes(value['children'], indent + 1)

print_indexes(tree)

(index: 0)
  (index: 0)
    (index: 0)
      (index: 0)
        (index: 0)
          (index: 0)
            (index: 0)
              (index: 0)
            (index: 1)
              (index: 1)
            (index: 2)
              (index: 2)
            (index: 3)
              (index: 3)
            (index: 4)
              (index: 4)
            (index: 5)
              (index: 5)
            (index: 6)
              (index: 6)
            (index: 7)
              (index: 7)
            (index: 8)
              (index: 8)
            (index: 9)
              (index: 9)
                (index: 0)
              (index: 10)
                (index: 1)
              (index: 11)
                (index: 2)
              (index: 12)
                (index: 3)
              (index: 13)
                (index: 4)
              (index: 14)
                (index: 5)
              (index: 15)
                (index: 6)
              (index: 16)
                (index: 7)
              (index: 17)
