In [1]:
from pathlib import Path
import json
import numpy as np
import gc

In [2]:
input_json_path = "./data/json/vocab.json"
try:
    with open(input_json_path, 'r') as json_file:
        system_calls_json = json.load(json_file)
        json_file.close()
except FileNotFoundError:
    print("The file does not exist")
except json.JSONDecodeError:
    print("Error: Failed to decode JSON from the file.")
finally:
    if json_file:
        json_file.close()

In [3]:
normal_input_dir_paths = ["./data/Normal_data/glibc_2884",
                            "./data/Normal_data/ltp_2561",
                            "./data/Normal_data/kselftest_170",
                            "./data/Normal_data/posixtest_1235"]

In [4]:
abnormal_input_dir_paths = ["./data/Abnormal_data/kernel_v5110-536",
                            "./data/Abnormal_data/kernel_v520-729",
                            "./data/Abnormal_data/kernel_v580-313"]

In [5]:
def data_generator_normal(dir_paths, json_map, L, batch_size=32):
    for dir_path in dir_paths:
        dir_path = Path(dir_path)
        for file_path in dir_path.iterdir():
            if not file_path.is_file(): continue
            
            try:
                with open(file_path, 'r') as file:
                    system_calls = file.read().split("|")
                    
                    list_X = []
                    list_y = []
                    batch_X = []
                    batch_y = []

                    for sc in system_calls:
                        if sc in json_map:
                            if len(list_X) < L:
                                list_X.append(json_map[sc])
                                list_y.append(0)
                            
                            if len(list_X) == L:
                                batch_X.append(list_X)
                                batch_y.append(list_y)
                                list_X = []
                                list_y = []

                                # Yield a batch when full
                                if len(batch_X) == batch_size:
                                    yield np.array(batch_X, dtype=np.int32), np.array(batch_y, dtype=np.int32)
                                    batch_X = []
                                    batch_y = []

                    # Handle leftovers
                    if 0 < len(list_X) < L:
                        padding = L - len(list_X)
                        list_X.extend([0] * padding)
                        list_y.extend([0] * padding)
                        batch_X.append(list_X)
                        batch_y.append(list_y)
                    
                    # Yield remaining partial batch
                    if len(batch_X) > 0:
                        yield np.array(batch_X, dtype=np.int32), np.array(batch_y, dtype=np.int32)

            except Exception as e:
                print(f"Error: {e}")
                continue

In [6]:
def data_generator_abnormal(dir_paths, json_map, L, batch_size=32):
    for dir_path in dir_paths:
        dir_path = Path(dir_path)
        for file_path in dir_path.iterdir():
            if not file_path.is_file(): continue
            
            try:
                with open(file_path, 'r') as file:
                    system_calls = file.read().split("|")
                    
                    list_X = []
                    list_y = []
                    batch_X = []
                    batch_y = []

                    for sc in system_calls:
                        if sc in json_map:
                            if len(list_X) < L:
                                list_X.append(json_map[sc])
                                list_y.append(1)
                            
                            if len(list_X) == L:
                                batch_X.append(list_X)
                                batch_y.append(list_y)
                                list_X = []
                                list_y = []

                                # Yield a batch when full
                                if len(batch_X) == batch_size:
                                    yield np.array(batch_X, dtype=np.int32), np.array(batch_y, dtype=np.int32)
                                    batch_X = []
                                    batch_y = []

                    # Handle leftovers
                    if 0 < len(list_X) < L:
                        padding = L - len(list_X)
                        list_X.extend([0] * padding)
                        list_y.extend([1] * padding)
                        batch_X.append(list_X)
                        batch_y.append(list_y)
                    
                    # Yield remaining partial batch
                    if len(batch_X) > 0:
                        yield np.array(batch_X, dtype=np.int32), np.array(batch_y, dtype=np.int32)

            except Exception as e:
                print(f"Error: {e}")
                continue

In [7]:
X_normal = []; y_normal = []
X_abnormal = []; y_abnormal = []

In [8]:
# 2. Loop through the generator
for batch_X, batch_y in data_generator_normal(normal_input_dir_paths, system_calls_json, L=100):
    X_normal.append(batch_X)
    y_normal.append(batch_y)
    
    # Optional: Print progress to see if RAM is filling up
    print(f"Processed batch. Total batches so far: {len(X_normal)}")

# 3. Concatenate all batches into one giant NumPy array
try:
    final_X_np_normal = np.vstack(X_normal)
    final_y_np_normal = np.vstack(y_normal)
    
    print(f"Success! Shape of X: {final_X_np_normal.shape}")
    print(f"Success! Shape of y: {final_y_np_normal.shape}")

except MemoryError:
    print("CRASH: Not enough RAM to combine all batches into one array.")

del X_normal
del y_normal
gc.collect() # Force cleanup
print("Normal data processed and memory freed.")

Processed batch. Total batches so far: 1
Processed batch. Total batches so far: 2
Processed batch. Total batches so far: 3
Processed batch. Total batches so far: 4
Processed batch. Total batches so far: 5
Processed batch. Total batches so far: 6
Processed batch. Total batches so far: 7
Processed batch. Total batches so far: 8
Processed batch. Total batches so far: 9
Processed batch. Total batches so far: 10
Processed batch. Total batches so far: 11
Processed batch. Total batches so far: 12
Processed batch. Total batches so far: 13
Processed batch. Total batches so far: 14
Processed batch. Total batches so far: 15
Processed batch. Total batches so far: 16
Processed batch. Total batches so far: 17
Processed batch. Total batches so far: 18
Processed batch. Total batches so far: 19
Processed batch. Total batches so far: 20
Processed batch. Total batches so far: 21
Processed batch. Total batches so far: 22
Processed batch. Total batches so far: 23
Processed batch. Total batches so far: 24
P

In [9]:
for batch_X, batch_y in data_generator_normal(abnormal_input_dir_paths, system_calls_json, L=100):
    X_abnormal.append(batch_X)
    y_abnormal.append(batch_y)
    
    # Optional: Print progress to see if RAM is filling up
    print(f"Processed batch. Total batches so far: {len(X_abnormal)}")

# 3. Concatenate all batches into one giant NumPy array
try:
    final_X_np_abnormal = np.vstack(X_abnormal)
    final_y_np_abnormal = np.vstack(y_abnormal)
    
    print(f"Success! Shape of X: {final_X_np_abnormal.shape}")
    print(f"Success! Shape of y: {final_y_np_abnormal.shape}")

except MemoryError:
    print("CRASH: Not enough RAM to combine all batches into one array.")

del X_abnormal
del y_abnormal
gc.collect()
print("Abnormal data processed and memory freed.")

Processed batch. Total batches so far: 1
Processed batch. Total batches so far: 2
Processed batch. Total batches so far: 3
Processed batch. Total batches so far: 4
Processed batch. Total batches so far: 5
Processed batch. Total batches so far: 6
Processed batch. Total batches so far: 7
Processed batch. Total batches so far: 8
Processed batch. Total batches so far: 9
Processed batch. Total batches so far: 10
Processed batch. Total batches so far: 11
Processed batch. Total batches so far: 12
Processed batch. Total batches so far: 13
Processed batch. Total batches so far: 14
Processed batch. Total batches so far: 15
Processed batch. Total batches so far: 16
Processed batch. Total batches so far: 17
Processed batch. Total batches so far: 18
Processed batch. Total batches so far: 19
Processed batch. Total batches so far: 20
Processed batch. Total batches so far: 21
Processed batch. Total batches so far: 22
Processed batch. Total batches so far: 23
Processed batch. Total batches so far: 24
P

In [10]:
try:
    final_X_np = np.vstack((final_X_np_normal, final_X_np_abnormal))
    final_y_np = np.vstack((final_y_np_normal, final_y_np_abnormal))
    
    print(f"Success! Shape of X: {final_X_np.shape}")
    print(f"Success! Shape of y: {final_y_np.shape}")

except MemoryError:
    print("CRASH: Not enough RAM to combine all batches into one array.")

del final_X_np_normal
del final_X_np_abnormal
del final_y_np_normal
del final_y_np_abnormal
gc.collect()

Success! Shape of X: (1769598, 100)
Success! Shape of y: (1769598, 100)


0

In [11]:
X_numpy_path = "./data/numpy/dataset_X.npy"
y_numpy_path = "./data/numpy/dataset_y.npy"

np.save(X_numpy_path, final_X_np)
np.save(y_numpy_path, final_y_np)