In [1]:
import os
import time
import numpy as np
from utils import print_, init, close, get_dataset_file_paths, load_models, read_key_files
# Must run before the rest of the notebook; its counterpart close() is called in the last cell.
init() # Initialize the logging files

In [2]:
def generate_rle_representation(data):
    """Flag the 8-byte rows of a buffer that may hold key material and run-length encode the flags.

    The buffer is viewed as rows of 8 bytes. A row is a candidate when at least 4 of its bytes differ
    both from the byte to their right and from the byte directly below them (zeros are assumed past
    the last column and below the last row). Rows with fewer such bytes are treated as repeating
    patterns, i.e. not keys. This is deliberately conservative to favour recall.

    Isolated candidate rows are then removed: only runs of two or more consecutive candidate rows
    survive (an opening-like morphological operation).

    Args:
        data: Sequence of byte values; the caller passes a bytearray read from a heap file. A
            trailing partial row (len(data) % 8 bytes) is ignored.

    Returns:
        A tuple (characters, counts, offsets) where characters is a bytearray holding the flag value
        (0 or 1) of each run of rows, counts is the list of run lengths in rows, and offsets is the
        list of byte offsets into data at which each run starts. All three are empty when data holds
        less than one full row.
    """
    num_row = len(data) // 8
    if num_row == 0:
        # Less than one full row: nothing can be flagged, and the roll/RLE steps below need >= 1 row.
        return bytearray(), [], []

    # Drop a trailing partial row so the reshape cannot fail on lengths that are not a multiple of 8.
    # The reshape method is used instead of np.reshape(..., newshape=...), whose keyword is deprecated.
    reshaped = np.asarray(data)[:num_row * 8].reshape(num_row, 8)
    as_int = reshaped.astype(int)  # signed ints, so the differences below cannot wrap around

    # Row-wise and column-wise numerical gradients, padded with zeros past the last column / row.
    differs_right = np.diff(as_int, axis=1, append=np.zeros((num_row, 1))) != 0
    differs_below = np.diff(as_int, axis=0, append=np.zeros((1, 8))) != 0

    # NOTE(review): an older comment said this threshold was lowered to 3 for 12 byte keys, but the
    # code compares against 4; the threshold is kept at 4 here.
    poss_key_locs = (np.count_nonzero(differs_right & differs_below, axis=1) >= 4).astype(int)

    # This part addresses the issue of 12 byte keys: the second row of such a key can fail the test
    # above, so a rejected row that directly follows a candidate row is re-admitted when its first four
    # bytes hold at least 3 distinct values and its last four bytes contain at least one zero.
    # The scan is sequential on purpose: a row re-admitted here counts as "previous candidate" for
    # the next row.
    # NOTE(review): the original comment said the last four bytes "must be zeros", but the original
    # expression `all(...) == 0` only requires ONE zero byte. That behaviour is preserved; tightening
    # it would change the candidate set and should be validated against the dataset first.
    for idx in range(1, num_row):
        if poss_key_locs[idx] != 0 or poss_key_locs[idx - 1] != 1:
            continue
        tail_has_zero = not all(reshaped[idx][4:])
        head_is_varied = len(set(reshaped[idx][:4])) > 2
        if tail_has_zero and head_is_varied:
            poss_key_locs[idx] = 1

    # Keep a row only if the next row is a candidate too. The last row has no successor, so the
    # wrapped-around value is cleared.
    next_row = np.roll(poss_key_locs, -1)
    next_row[-1] = 0
    poss_key_locs = (poss_key_locs & next_row).astype(int)

    # Re-add the row following every kept row, which restores the tail of each surviving run.
    prev_row = np.roll(poss_key_locs, 1)
    prev_row[0] = 0
    poss_key_locs = poss_key_locs | prev_row

    characters, counts = get_run_length_encoded(poss_key_locs)

    # Byte offset at which each run starts: rows consumed so far times the row width of 8 bytes.
    offsets = []
    rows_seen = 0
    for run_length in counts:
        offsets.append(rows_seen * 8)
        rows_seen += run_length

    return characters, counts, offsets

In [3]:
def get_slices(data, offsets, keys, max_key_size=128):
    """Cut one block of max_key_size bytes per offset and label it by whether it contains a key.

    Args:
        data: Byte buffer to slice.
        offsets: Start offsets of the blocks, visited in the given order.
        keys: Known keys; a block is labelled 1 when at least one of them occurs inside it.
        max_key_size: Length of every block in bytes.

    Returns:
        (data_blocks, labels): the extracted blocks and a parallel list of 0/1 labels.

    Raises:
        AssertionError: if no block is labelled 1, if there are fewer positive blocks than keys,
            or if some key was not found in any block.
    """
    total_len = len(data)
    hits_per_key = [0] * len(keys)
    data_blocks, labels = [], []

    for offset in offsets:
        # A block that would run past the end is replaced by the final max_key_size bytes of the
        # buffer, and no further offsets are processed after it.
        reached_tail = offset + max_key_size > total_len
        if reached_tail:
            window = data[-max_key_size:]
        else:
            window = data[offset:offset + max_key_size]
        data_blocks.append(window)

        matched = [pos for pos, key in enumerate(keys) if key in window]
        labels.append(1 if matched else 0)
        for pos in matched:
            hits_per_key[pos] += 1

        if reached_tail:
            break

    # Sanity checks on the ground truth: every key must have landed in at least one block.
    assert len(data_blocks) == len(labels)
    assert sum(labels) > 0 and sum(labels) >= len(keys)
    assert min(hits_per_key) != 0
    return data_blocks, labels

In [4]:
def _slice_unlabelled_blocks(data, offsets, max_key_size):
    """Cut one max_key_size-byte block per offset without labelling (used when no keys are known)."""
    blocks = []
    for offset in offsets:
        if offset + max_key_size > len(data):
            # Past the end: take the final max_key_size bytes instead and stop.
            blocks.append(data[-max_key_size:])
            break
        blocks.append(data[offset:offset + max_key_size])
    return blocks


def build_encoded_dataset(heap_paths, key_paths, max_key_size=128, deploy=False):
    """Extract candidate key blocks (and, outside deploy mode, their labels) from heap dump files.

    For every heap file the candidate rows are located with generate_rle_representation, and one
    block of max_key_size bytes is cut at the start of every run of candidate rows.

    Args:
        heap_paths: Paths of the raw heap files.
        key_paths: Paths of the key files, in the same order as heap_paths. Not consulted when
            deploy is true.
        max_key_size: Size in bytes of each extracted block.
        deploy: When false (default), the keys of each heap are read with read_key_files and the
            blocks are labelled and validated by get_slices. When true, no key file is read; the
            blocks are returned with placeholder labels of 0 because no ground truth is available.
            (Previously deploy=True failed with UnboundLocalError on the first heap.)

    Returns:
        (dataset, labels): the list of extracted blocks and a parallel list of 0/1 labels.

    Raises:
        AssertionError: outside deploy mode, if a key path does not match its heap path, or if
            get_slices rejects the ground truth.
    """
    dataset = []
    labels = []

    if deploy:
        pairs = ((heap_path, None) for heap_path in heap_paths)
    else:
        pairs = zip(heap_paths, key_paths)

    for heap_path, key_path in pairs:

        curr_keys = None
        if not deploy:
            # Check if the key path corresponds to the heap path, then read the required data
            assert (key_path[:-5] in heap_path)
            curr_keys = read_key_files(key_path)
            # Remove repeated keys. This is an issue for some older versions of OpenSSH
            curr_keys = list(map(bytearray, set(tuple(x) for x in curr_keys)))

        with open(heap_path, "rb") as fp:
            data = bytearray(fp.read())

        characters, counts, cum_sum = generate_rle_representation(data)

        # Offsets of the runs flagged as possible key locations
        viable_offsets = [offset for flag, offset in zip(characters, cum_sum) if flag == 1]

        if deploy:
            slices = _slice_unlabelled_blocks(data, viable_offsets, max_key_size)
            curr_labels = [0] * len(slices)
        else:
            slices, curr_labels = get_slices(data=data, offsets=viable_offsets,
                                             max_key_size=max_key_size, keys=curr_keys)

        # extend() appends in place; `dataset = dataset + slices` re-copied the whole list per file
        dataset.extend(slices)
        labels.extend(curr_labels)

    assert len(labels) == len(dataset)
    return dataset, labels

In [5]:
def generate_probable_slices(clf, heap_paths, key_paths, root_dir, write_path=None):
    """Classify the candidate blocks of every heap dump and write the positively predicted ones.

    Heap and key paths are paired by position after sorting each list. For every pair the candidate
    blocks are built with build_encoded_dataset, classified with clf.predict, and every block whose
    prediction is not 0 is written as one lower-case hex line to a text file. The output file is
    named after the key file (extension replaced by ".txt") and placed under the output root in the
    same sub-directory the key file has below root_dir.

    Args:
        clf: Classifier exposing predict(); called with a 2-D integer array, one block per row.
        heap_paths: Paths of the raw heap files. The caller's list is not modified.
        key_paths: Paths of the matching key files. The caller's list is not modified.
        root_dir: Dataset root that the key paths live under.
        write_path: Output root directory. Defaults to the module-level WRITE_PATH.

    Raises:
        IndexError: if there are more key paths than heap paths.
    """
    out_root = WRITE_PATH if write_path is None else write_path

    # Sort copies, so it is easier to group them by version and key length without reordering the
    # lists owned by the caller.
    heap_paths = sorted(heap_paths)
    key_paths = sorted(key_paths)

    # For each of the keys
    for idx, key_path in enumerate(key_paths):
        heap_path = heap_paths[idx]
        start = time.perf_counter()
        dataset, _ = build_encoded_dataset(heap_paths=[heap_path], key_paths=[key_path])

        x_test = np.array(dataset).astype(int)
        curr_pred = clf.predict(x_test)

        # os.path helpers instead of manual slicing: robust to a trailing separator on root_dir,
        # to platform separators and to key-file extensions of any length.
        key_dir, key_file = os.path.split(key_path)
        file_name = os.path.splitext(key_file)[0] + ".txt"
        sub_dir = os.path.relpath(key_dir or os.curdir, root_dir)
        dir_path = os.path.normpath(os.path.join(out_root, sub_dir))
        # exist_ok avoids the check-then-create race of a separate os.path.exists test
        os.makedirs(dir_path, exist_ok=True)
        path = os.path.join(dir_path, file_name)

        with open(path, 'w') as fp:
            for block, pred in zip(dataset, curr_pred):
                if pred == 0:
                    continue

                fp.write(bytes(block).hex() + "\n")

        elapsed = time.perf_counter() - start
        print_('Total time taken for file %s: %f' % (heap_path, elapsed))

In [6]:
def get_run_length_encoded(data_block):
    """Run-length encode a sequence of small integers.

    Args:
        data_block: Sequence of values in range(256), e.g. a list or 1-D numpy array of 0/1 flags.
            Consecutive equal values form one run.

    Returns:
        (characters, counts): a bytearray with the value of each run and a list with the length of
        each run, in order. Both are empty for an empty input (previously this raised IndexError).
    """
    characters = []
    counts = []

    for value in data_block:
        if characters and value == characters[-1]:
            # Same value as the current run: extend it
            counts[-1] += 1
        else:
            # Value changed (or first element): start a new run
            characters.append(value)
            counts.append(1)

    return bytearray(characters), counts

In [7]:
# Root directory of the dataset. It is searched for heap/key files below and also passed to
# generate_probable_slices as root_dir.
TEST = '../Smart-VMI/data/fast-and-furious'  # Path to the dataset
# Root directory for the output text files. generate_probable_slices reads this name as a global,
# so it must be defined before that function is called.
WRITE_PATH = './models/Slices' # Output file path

# Load the models
# NOTE(review): load_high_recall_only=True presumably restricts loading to the high recall
# classifier (the log line emitted by this cell mentions only that one) - confirm in utils.
clf = load_models(load_high_recall_only=True)

2022-04-25 16:03:27.358025:	Time taken for loading high recall classifier: 0.320688


In [8]:
# Search for all the files within the test directory and log how long the search took.
# get_dataset_file_paths returns two values, unpacked here as the heap paths and the key paths.
start = time.time()
heap_paths, key_paths = get_dataset_file_paths(TEST)
end = time.time()
print_('Time taken for finding all files: %f' % (end - start))




2022-04-25 16:03:27.369629:	Time taken for finding all files: 0.006016


In [9]:
# Classify the candidate blocks of every heap dump with clf and write the hex of each block predicted
# non-zero to a text file under WRITE_PATH, mirroring the sub-directory layout found below TEST.
generate_probable_slices(clf=clf, heap_paths=heap_paths, key_paths=key_paths, root_dir=TEST)

2022-04-25 16:03:27.450261:	Total time taken for file ../Smart-VMI/data/fast-and-furious/aes128-ctr/22938-1650640087-heap.raw: 0.075678
2022-04-25 16:03:27.505606:	Total time taken for file ../Smart-VMI/data/fast-and-furious/aes128-ctr/22941-1650640087-heap.raw: 0.055278
2022-04-25 16:03:27.550040:	Total time taken for file ../Smart-VMI/data/fast-and-furious/aes128-ctr/22944-1650640087-heap.raw: 0.044407
2022-04-25 16:03:27.597283:	Total time taken for file ../Smart-VMI/data/fast-and-furious/aes128-ctr/22947-1650640087-heap.raw: 0.047217
2022-04-25 16:03:27.640428:	Total time taken for file ../Smart-VMI/data/fast-and-furious/aes128-ctr/22950-1650640087-heap.raw: 0.043074
2022-04-25 16:03:27.690320:	Total time taken for file ../Smart-VMI/data/fast-and-furious/aes128-ctr/22953-1650640087-heap.raw: 0.049854
2022-04-25 16:03:27.735957:	Total time taken for file ../Smart-VMI/data/fast-and-furious/aes128-ctr/22956-1650640087-heap.raw: 0.045544
2022-04-25 16:03:27.777900:	Total time taken for

2022-04-25 16:03:30.426489:	Total time taken for file ../Smart-VMI/data/fast-and-furious/aes128-ctr/23121-1650640087-heap.raw: 0.044755
2022-04-25 16:03:30.475730:	Total time taken for file ../Smart-VMI/data/fast-and-furious/aes128-ctr/23124-1650640087-heap.raw: 0.049179
2022-04-25 16:03:30.519680:	Total time taken for file ../Smart-VMI/data/fast-and-furious/aes128-ctr/23127-1650640087-heap.raw: 0.043885
2022-04-25 16:03:30.566112:	Total time taken for file ../Smart-VMI/data/fast-and-furious/aes128-ctr/23130-1650640087-heap.raw: 0.046405
2022-04-25 16:03:30.642121:	Total time taken for file ../Smart-VMI/data/fast-and-furious/aes128-ctr/23133-1650640087-heap.raw: 0.075980
2022-04-25 16:03:30.690515:	Total time taken for file ../Smart-VMI/data/fast-and-furious/aes128-ctr/23136-1650640087-heap.raw: 0.048314
2022-04-25 16:03:30.735193:	Total time taken for file ../Smart-VMI/data/fast-and-furious/aes128-ctr/23139-1650640087-heap.raw: 0.044594
2022-04-25 16:03:30.778746:	Total time taken for

2022-04-25 16:03:33.416413:	Total time taken for file ../Smart-VMI/data/fast-and-furious/aes192-ctr/22011-1650633682-heap.raw: 0.082890
2022-04-25 16:03:33.463392:	Total time taken for file ../Smart-VMI/data/fast-and-furious/aes192-ctr/22014-1650633682-heap.raw: 0.046867
2022-04-25 16:03:33.513388:	Total time taken for file ../Smart-VMI/data/fast-and-furious/aes192-ctr/22017-1650633682-heap.raw: 0.049926
2022-04-25 16:03:33.565549:	Total time taken for file ../Smart-VMI/data/fast-and-furious/aes192-ctr/22020-1650633682-heap.raw: 0.052131
2022-04-25 16:03:33.616995:	Total time taken for file ../Smart-VMI/data/fast-and-furious/aes192-ctr/22023-1650633682-heap.raw: 0.051415
2022-04-25 16:03:33.671643:	Total time taken for file ../Smart-VMI/data/fast-and-furious/aes192-ctr/22026-1650633682-heap.raw: 0.054621
2022-04-25 16:03:33.723137:	Total time taken for file ../Smart-VMI/data/fast-and-furious/aes192-ctr/22029-1650633682-heap.raw: 0.051431
2022-04-25 16:03:33.772526:	Total time taken for

2022-04-25 16:03:36.417433:	Total time taken for file ../Smart-VMI/data/fast-and-furious/aes192-ctr/22197-1650633682-heap.raw: 0.049428
2022-04-25 16:03:36.462497:	Total time taken for file ../Smart-VMI/data/fast-and-furious/aes192-ctr/22200-1650633682-heap.raw: 0.044991
2022-04-25 16:03:36.505184:	Total time taken for file ../Smart-VMI/data/fast-and-furious/aes192-ctr/22203-1650633682-heap.raw: 0.042662
2022-04-25 16:03:36.550536:	Total time taken for file ../Smart-VMI/data/fast-and-furious/aes192-ctr/22206-1650633682-heap.raw: 0.045327
2022-04-25 16:03:36.593279:	Total time taken for file ../Smart-VMI/data/fast-and-furious/aes192-ctr/22209-1650633682-heap.raw: 0.042719
2022-04-25 16:03:36.639867:	Total time taken for file ../Smart-VMI/data/fast-and-furious/aes192-ctr/22212-1650633682-heap.raw: 0.046563
2022-04-25 16:03:36.683591:	Total time taken for file ../Smart-VMI/data/fast-and-furious/aes192-ctr/22215-1650633682-heap.raw: 0.043666
2022-04-25 16:03:36.725645:	Total time taken for

2022-04-25 16:03:39.363927:	Total time taken for file ../Smart-VMI/data/fast-and-furious/aes256-ctr/22775-1650639636-heap.raw: 0.043953
2022-04-25 16:03:39.405923:	Total time taken for file ../Smart-VMI/data/fast-and-furious/aes256-ctr/22778-1650639636-heap.raw: 0.041898
2022-04-25 16:03:39.451007:	Total time taken for file ../Smart-VMI/data/fast-and-furious/aes256-ctr/22781-1650639636-heap.raw: 0.045060
2022-04-25 16:03:39.493051:	Total time taken for file ../Smart-VMI/data/fast-and-furious/aes256-ctr/22784-1650639636-heap.raw: 0.042020
2022-04-25 16:03:39.533995:	Total time taken for file ../Smart-VMI/data/fast-and-furious/aes256-ctr/22787-1650639636-heap.raw: 0.040920
2022-04-25 16:03:39.580873:	Total time taken for file ../Smart-VMI/data/fast-and-furious/aes256-ctr/22790-1650639636-heap.raw: 0.046853
2022-04-25 16:03:39.624912:	Total time taken for file ../Smart-VMI/data/fast-and-furious/aes256-ctr/22793-1650639636-heap.raw: 0.043966
2022-04-25 16:03:39.666203:	Total time taken for

In [10]:
# Last cell: counterpart of the init() call made in the first cell.
close() # Close the logging file pointers