In [None]:
import os
import time
import numpy as np
from utils import print_, init, close, get_dataset_file_paths, load_models, read_key_files
from timeit import default_timer as timer
# Runs as a side effect of the import cell. Presumably print_() writes to the files set up here,
# so this has to execute before any cell that logs — verify against utils.
init() # Initialize the logging files

In [None]:
def generate_rle_representation(data):
    """Run-length encode which 8-byte rows of ``data`` could hold key material.

    The buffer is viewed as rows of 8 bytes. A row is a candidate when at least 4 of its
    positions differ both from the next byte in the same row and from the same position in
    the next row; repeating patterns are assumed not to be keys. The threshold is
    deliberately conservative to favour recall. The candidate flags are then cleaned with an
    opening-like step so that only runs of at least two adjacent candidate rows survive.

    Args:
        data: Byte buffer of a heap dump (e.g. a ``bytearray``). Trailing bytes that do not
            fill a complete 8-byte row are ignored.

    Returns:
        Tuple ``(characters, counts, offsets)``: ``characters`` and ``counts`` are the run
        flags (0 or 1) and run lengths in rows as produced by ``get_run_length_encoded``;
        ``offsets`` is a list with the starting byte offset of every run inside ``data``.
        All three are empty when ``data`` holds fewer than 8 bytes.
    """
    # Floor division keeps whole rows only; a partial trailing row cannot be reshaped.
    num_row = len(data) // 8
    if num_row == 0:
        return bytearray(), [], []

    # Shape is passed positionally: the ``newshape`` keyword is deprecated in recent NumPy.
    reshaped = np.reshape(np.asarray(data)[:num_row * 8], (num_row, 8))
    # Signed copy so that differences of unsigned bytes cannot wrap around.
    as_int = reshaped.astype(int)

    # Numerical gradient along the row and down the column. A zero is appended on each axis,
    # so the last column / last row is compared against 0.
    x_grad = np.diff(as_int, axis=1, append=0) != 0
    y_grad = np.diff(as_int, axis=0, append=0) != 0

    # A row is a possible key location when at least 4 positions change in both directions.
    poss_key_locs = (np.count_nonzero(x_grad & y_grad, axis=1) >= 4).astype(int)

    # 12-byte keys: the second row holds 4 key bytes followed by 4 zero bytes. Two equal
    # neighbouring key bytes would push that row below the threshold above, so it is
    # re-admitted when it directly follows a candidate row. The loop is sequential on
    # purpose: a row re-admitted here counts as the predecessor of the next row.
    for idx in range(1, num_row):
        if poss_key_locs[idx] == 0 and poss_key_locs[idx - 1] == 1:
            row = reshaped[idx]
            # The last 4 bytes must ALL be zero (the previous ``all(...) == 0`` test only
            # required one zero byte) and the first 4 need at least 3 distinct values.
            if not row[4:].any() and len(set(row[:4].tolist())) > 2:
                poss_key_locs[idx] = 1

    # Keep a row only if the following row is a candidate too. np.roll wraps around, so the
    # wrapped value is cleared: a key cannot start in the last row and still fit in the block.
    has_next = np.roll(poss_key_locs, -1)
    has_next[-1] = 0
    poss_key_locs = poss_key_locs & has_next

    # Roll right and OR to restore the second row of every surviving pair. Together with the
    # step above this behaves like a morphological opening.
    carried = np.roll(poss_key_locs, 1)
    carried[0] = 0
    poss_key_locs = poss_key_locs | carried

    characters, counts = get_run_length_encoded(poss_key_locs)

    # Starting byte offset of each run: rows consumed so far times the row width. The end
    # offset of the final run is not needed.
    offsets = []
    rows_before = 0
    for count in counts:
        offsets.append(rows_before * 8)
        rows_before += count

    return characters, counts, offsets

In [None]:
def get_slices(data, offsets, keys, max_key_size=128):
    """Cut fixed-size windows out of ``data`` and label those that contain a key.

    Args:
        data: Byte buffer the windows are taken from.
        offsets: Start offsets of the windows, processed in the given order.
        keys: Byte sequences to look for inside each window.
        max_key_size: Window length in bytes.

    Returns:
        Tuple ``(data_blocks, labels)``; ``labels[i]`` is 1 when ``data_blocks[i]`` contains
        at least one of ``keys`` and 0 otherwise.

    Raises:
        AssertionError: If no window is positive, if there are fewer positive windows than
            keys, or if some key is not contained in any window.
    """
    data_blocks, labels = [], []
    hits_per_key = [0] * len(keys)

    for start in offsets:
        # A window that would run past the end is replaced by the final max_key_size bytes;
        # nothing after it can add new data, so it is also the last window taken.
        runs_past_end = start + max_key_size > len(data)
        window = data[-max_key_size:] if runs_past_end else data[start:start + max_key_size]
        data_blocks.append(window)

        matched = [position for position, key in enumerate(keys) if key in window]
        for position in matched:
            hits_per_key[position] += 1
        labels.append(1 if matched else 0)

        if runs_past_end:
            break

    positives = sum(labels)
    assert len(data_blocks) == len(labels)
    assert positives > 0 and positives >= len(keys)
    assert min(hits_per_key) != 0
    return data_blocks, labels

In [None]:
def _unlabelled_slices(data, offsets, max_key_size):
    """Cut ``max_key_size``-byte windows at ``offsets`` the same way get_slices does, without labels."""
    windows = []
    for offset in offsets:
        if offset + max_key_size > len(data):
            # Window would run past the end: take the final bytes instead and stop.
            windows.append(data[-max_key_size:])
            break
        windows.append(data[offset:offset + max_key_size])
    return windows


def build_encoded_dataset(heap_paths, key_paths, max_key_size=128, deploy=False):
    """Build the candidate-slice dataset (and labels) for a list of heap dumps.

    Each heap file is read, its candidate offsets are computed with
    ``generate_rle_representation`` and a ``max_key_size``-byte slice is cut at every
    candidate run.

    Args:
        heap_paths: Paths of the raw heap dumps.
        key_paths: Paths of the key files, aligned one-to-one with ``heap_paths``.
            Ignored (may be ``None``) when ``deploy`` is true.
        max_key_size: Length in bytes of every slice.
        deploy: When false, keys are read from ``key_paths`` and the slices are labelled and
            validated by ``get_slices``. When true, no key files are read; the returned
            labels are all-zero placeholders because no ground truth exists.

    Returns:
        Tuple ``(dataset, labels)`` of equal length: the slices of all heaps concatenated in
        input order, and one label per slice.

    Raises:
        AssertionError: In labelled mode, if a key path does not match its heap path or
            ``get_slices`` cannot locate the keys.
    """
    dataset = []
    labels = []

    if deploy:
        # No key files in deployment: previously this path failed on an unbound ``curr_keys``.
        path_pairs = [(heap_path, None) for heap_path in heap_paths]
    else:
        path_pairs = zip(heap_paths, key_paths)

    for heap_path, key_path in path_pairs:
        curr_keys = []
        if not deploy:
            # Check that the key path corresponds to the heap path before reading it
            # (the key path without its last 5 characters must occur in the heap path).
            assert (key_path[:-5] in heap_path)
            curr_keys = read_key_files(key_path)
            # Remove repeated keys. This is an issue for some older versions of OpenSSH
            curr_keys = list(map(bytearray, set(tuple(x) for x in curr_keys)))

        with open(heap_path, "rb") as fp:
            data = bytearray(fp.read())

        characters, counts, cum_sum = generate_rle_representation(data)

        # Only runs flagged 1 are possible key locations.
        viable_offsets = [offset for flag, offset in zip(characters, cum_sum) if flag == 1]

        if deploy:
            slices = _unlabelled_slices(data, viable_offsets, max_key_size)
            curr_labels = [0] * len(slices)
        else:
            slices, curr_labels = get_slices(data=data, offsets=viable_offsets,
                                             max_key_size=max_key_size, keys=curr_keys)

        # extend() avoids re-copying the accumulated lists for every file.
        dataset.extend(slices)
        labels.extend(curr_labels)

    assert len(labels) == len(dataset)
    return dataset, labels

In [None]:
def generate_probable_slices(clf, heap_paths, key_paths, root_dir, write_path=None):
    """Classify the candidate slices of every heap dump and write the positives to text files.

    For each heap/key pair the candidate slices are built with ``build_encoded_dataset``,
    classified with ``clf`` and every slice with a non-zero prediction is written as one
    line of lowercase hex. The output file is named after the key file (extension replaced
    by ``.txt``) and placed under ``write_path`` in the same sub-directory the key file has
    below ``root_dir``.

    Args:
        clf: Fitted classifier exposing ``predict``.
        heap_paths: Paths of the heap dumps. Not modified.
        key_paths: Paths of the key files. Not modified. After sorting, entry ``i`` is
            paired with entry ``i`` of the sorted ``heap_paths``.
        root_dir: Dataset root directory the key paths live under.
        write_path: Root directory for the output files. Defaults to the notebook-level
            ``WRITE_PATH``.

    Side effects:
        Creates directories and files under ``write_path`` and logs the time taken per heap
        through ``print_``.
    """
    if write_path is None:
        write_path = WRITE_PATH

    # Sort copies so files are grouped by version and key length without reordering the
    # caller's lists.
    heap_paths = sorted(heap_paths)
    key_paths = sorted(key_paths)

    # For each of the keys
    for idx, key_path in enumerate(key_paths):
        heap_path = heap_paths[idx]
        start = timer()  # monotonic clock, consistent with the file-discovery timing

        dataset, _ = build_encoded_dataset(heap_paths=[heap_path], key_paths=[key_path])

        x_test = np.array(dataset).astype(int)
        curr_pred = clf.predict(x_test)

        # Mirror the key file's location below root_dir inside the output directory.
        key_dir, key_name = os.path.split(key_path)
        file_name = os.path.splitext(key_name)[0] + ".txt"
        sub_dir = os.path.relpath(key_dir, root_dir) if key_dir else ""
        if sub_dir == os.curdir:
            sub_dir = ""
        dir_path = os.path.join(write_path, sub_dir)
        os.makedirs(dir_path, exist_ok=True)

        with open(os.path.join(dir_path, file_name), 'w') as fp:
            for block, pred in zip(dataset, curr_pred):
                if pred != 0:
                    # Two lowercase hex digits per byte, one slice per line.
                    fp.write(bytes(block).hex() + "\n")

        end = timer()
        print_('Total time taken for file %s: %f' % (heap_path, (end - start)))

In [None]:
def get_run_length_encoded(data_block):
    """Run-length encode a sequence of small integers.

    Args:
        data_block: Iterable of integers in ``range(256)`` (they are packed into a
            ``bytearray``), for example the 0/1 flags of candidate rows.

    Returns:
        Tuple ``(characters, counts)``: ``characters`` is a ``bytearray`` with the value of
        each run and ``counts`` a list with the length of each run, in order. Both are empty
        for empty input.
    """
    characters = []
    counts = []

    for value in data_block:
        if counts and value == characters[-1]:
            # Same value as the current run: extend it.
            counts[-1] += 1
        else:
            # Value changed (or this is the first element): start a new run.
            characters.append(value)
            counts.append(1)

    return bytearray(characters), counts

In [None]:
# Dataset and output locations. Both can be overridden through environment variables so the
# notebook runs on another machine without editing this cell; the defaults are unchanged.
TEST = os.environ.get(
    'KEY_SEARCH_TEST_DIR',
    '/Users/stewartsentanoe/Documents/Uni Passau/projects/SmartVMI/Use Case 1 Data/Performance Test/V_7_1_P1/aes128-ctr',
)  # Path to the dataset
WRITE_PATH = os.environ.get('KEY_SEARCH_WRITE_PATH', '/tmp/output/')  # Output file path

# Load the models
clf = load_models(load_high_recall_only=True)

In [None]:
# Search for all the files within the test directory and log how long the search took.
# heap_paths and key_paths are consumed by the generate_probable_slices call further down.
start = timer()
heap_paths, key_paths = get_dataset_file_paths(TEST)
end = timer()
print_('Time taken for finding all files: %f' % (end - start))




In [None]:
# Classify the candidate slices of every heap/key pair found under TEST and write the
# predicted key slices as hex text files below WRITE_PATH.
generate_probable_slices(clf=clf, heap_paths=heap_paths, key_paths=key_paths, root_dir=TEST)

In [None]:
# Counterpart of the init() call in the first cell; run this last, after all processing cells.
close() # Close the logging file pointers