In [1]:
import time
from collections import defaultdict

import numpy as np
import pandas as pd
import tensorflow as tf

import csv

In [2]:
def get_features(features_path='../csv/preprocessed/features_timestamp_model.csv'):
    """Load the per-page feature table from a CSV file.

    The first row is treated as a header and skipped. Every following
    non-empty row must hold exactly six columns, in this order:
    page_id, frequency, timeinterval_avg, timeinterval_std, size_avg,
    last_timestamp.

    Args:
        features_path: CSV file to read. Defaults to the location that used to
            be hard-coded, so a zero-argument call behaves as before.

    Returns:
        A ``defaultdict(list)`` mapping the integer page id to
        ``[frequency, timeinterval_avg, timeinterval_std, size_avg, last_timestamp]``.
        ``frequency`` is an ``int``; the other four values are floats rounded
        to 10 decimal places. A file with no rows yields an empty mapping.

    Raises:
        ValueError: if a non-empty row does not have six columns or a field
            cannot be parsed as a number.
        OSError: if the file cannot be opened.
    """
    def to_rounded_float(text):
        return round(float(text), 10)

    features = defaultdict(list)
    with open(features_path, newline='', mode='r') as csvfile:
        reader = csv.reader(csvfile)
        # Skip the header; the default keeps a completely empty file from
        # raising StopIteration.
        next(reader, None)
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines; they carry no data.
                continue
            page_id, frequency, timeinterval_avg, timeinterval_std, size_avg, last_timestamp = row
            features[int(page_id)] = [
                int(frequency),
                to_rounded_float(timeinterval_avg),
                to_rounded_float(timeinterval_std),
                to_rounded_float(size_avg),
                to_rounded_float(last_timestamp),
            ]
    return features

In [3]:
def read_io_trace(io_trace_path='../csv/preprocessed/iotrace_simulation.csv'):
    """Load the I/O trace to simulate from a CSV file.

    The first row is treated as a header and skipped. Every following
    non-empty row must hold exactly three columns, in this order:
    logical_page_address, size, timestamp.

    Args:
        io_trace_path: CSV file to read. Defaults to the location that used to
            be hard-coded, so a zero-argument call behaves as before.

    Returns:
        A list, in file order, of ``[logical_page_address, size, timestamp]``
        where the first two entries are ``int`` and ``timestamp`` is a float
        rounded to 10 decimal places. A file with no rows yields ``[]``.

    Raises:
        ValueError: if a non-empty row does not have three columns or a field
            cannot be parsed as a number.
        OSError: if the file cannot be opened.
    """
    io_trace = []
    with open(io_trace_path, newline='', mode='r') as csvfile:
        reader = csv.reader(csvfile)
        # Drop the header row; the default keeps a completely empty file from
        # raising StopIteration.
        next(reader, None)
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines; they carry no data.
                continue
            logical_page_address, size, timestamp = row
            io_trace.append([
                int(logical_page_address),
                int(size),
                round(float(timestamp), 10),
            ])

    return io_trace

In [4]:
def update_features(logical_page_address, size, timestamp):
    """Fold one I/O request into the module-level ``features`` table, in place.

    Each entry has the layout
    ``[frequency, timeinterval_avg, timeinterval_std, size_avg, last_timestamp]``.

    * Unseen address: a fresh entry ``[1, 3000, 3000, size, timestamp]`` is
      stored (3000 is the placeholder used for both interval statistics).
    * Known address: the gap to the stored ``last_timestamp`` is merged into
      ``timeinterval_avg``, ``frequency`` is incremented, ``size_avg`` is
      re-averaged over ``frequency + 1`` requests, and ``last_timestamp``
      becomes ``timestamp``.

    NOTE(review): ``timeinterval_std`` (index 2) is never written on the
    known-address path, so it keeps whatever value it already had. Confirm
    whether that is intended.

    Args:
        logical_page_address: key of the page being accessed.
        size: size of this request.
        timestamp: time of this request, on the same scale as the stored
            ``last_timestamp``.

    Returns:
        None. The global ``features`` mapping is mutated.
    """
    if logical_page_address not in features:
        features[logical_page_address] = [1, 3000, 3000, size, timestamp]
        return

    entry = features[logical_page_address]
    seen, mean_gap, _stale_std, mean_size, previous_ts = entry
    gap = timestamp - previous_ts

    # With a single prior access the stored average is only the placeholder,
    # so the first real gap replaces it outright.
    entry[1] = gap if seen == 1 else float((mean_gap * (seen - 1) + gap) / seen)
    entry[0] = seen + 1
    entry[3] = float((mean_size * seen + size) / (seen + 1))
    entry[4] = timestamp

In [5]:
def update_features_nofreq(logical_page_address, size, timestamp):
    """Update the global ``features`` entry for one request.

    This function used to be a line-for-line copy of ``update_features``
    (it maintained ``frequency`` like the original despite the name), so it now
    simply delegates to it. Behaviour is unchanged and the two can no longer
    drift apart.

    NOTE(review): if a variant that really skips the frequency bookkeeping was
    intended, that logic still needs to be written; nothing in this notebook
    defines it.

    Args:
        logical_page_address: key of the page being accessed.
        size: size of this request.
        timestamp: time of this request.

    Returns:
        None. The global ``features`` mapping is mutated by ``update_features``.
    """
    update_features(logical_page_address, size, timestamp)

In [6]:
# Shared notebook state used by the update/predict functions.
#
# features[page_id] = [frequency, timeinterval_avg, timeinterval_std, size_avg, last_timestamp]
features = get_features()

# io_trace[i] = [logical_page_address, size, timestamp]
io_trace = read_io_trace()

# Maps the argmax index of the model output to a readable label.
# NOTE(review): the index order presumably mirrors the label encoding used at
# training time; confirm against the training notebook.
decoded_dict = {
    0: 'Cold',
    1: 'Hot',
    2: 'Warm',
}

In [7]:
def predict_with_h5(model_path) -> list:
    """Replay the global ``io_trace`` and label every request with a Keras model.

    For each trace record the global ``features`` table is updated through
    ``update_features`` and the updated entry, minus its trailing
    ``last_timestamp``, is fed to the model as a ``(1, 1, n_features)`` batch.
    The arg-max of the model output is translated with ``decoded_dict``.

    The model is invoked directly (``model(x, training=False)``) instead of
    through ``model.predict``: ``predict`` is meant for large batches and its
    per-call setup cost dominates when it is called once per single-row batch
    inside a loop.

    Side effects:
        Mutates the global ``features`` table, so calling this twice without
        reloading ``features`` replays the trace on already-updated statistics.
        Prints the record index every 1000 records and the total elapsed time.

    Args:
        model_path: path handed to ``tf.keras.models.load_model``.

    Returns:
        A list with one ``[logical_page_address, label]`` pair per trace
        record, in trace order.
    """
    start = time.time()
    model = tf.keras.models.load_model(model_path)
    labeled_iotrace_simulation = []
    for idx, (logical_page_address, size, timestamp) in enumerate(io_trace):
        if idx % 1000 == 0:
            print(idx)

        update_features(logical_page_address, size, timestamp)

        # Drop only last_timestamp (the final column); frequency stays in.
        # Nesting the row twice yields the (batch=1, timesteps=1, n_features)
        # shape directly; float32 is stated explicitly for the direct call.
        input_features = np.array(
            [[features[logical_page_address][:-1]]], dtype=np.float32
        )

        # Direct call: same forward pass as predict(), without its per-call overhead.
        probabilities = np.asarray(model(input_features, training=False))
        predict_label = np.argmax(probabilities, axis=1)[0]
        label = decoded_dict[predict_label]

        labeled_iotrace_simulation.append([logical_page_address, label])

    print("time elapsed: ", time.time() - start)

    return labeled_iotrace_simulation

In [20]:
def predict_with_pb(model_path) -> list:
    """Replay the global ``io_trace`` and label every request with a SavedModel.

    For each trace record the global ``features`` table is updated through
    ``update_features``; the updated entry without its first column
    (``frequency``) and last column (``last_timestamp``) is sent to the
    model's ``serving_default`` signature as a float32 ``(1, 1, n)`` tensor.
    The arg-max of the first output of the signature is translated with
    ``decoded_dict``.

    Side effects:
        Mutates the global ``features`` table, so calling this twice without
        reloading ``features`` replays the trace on already-updated statistics.
        Prints the index and elapsed seconds every 10000 records, and the total
        elapsed time at the end.

    Args:
        model_path: SavedModel directory handed to ``tf.saved_model.load``.

    Returns:
        A list with one ``[logical_page_address, label]`` pair per trace
        record, in trace order.
    """
    began = time.time()

    # Load the model. The loaded object stays bound to a local for the whole
    # run (NOTE(review): presumably needed so the signature's variables stay
    # alive; confirm before inlining).
    loaded_model = tf.saved_model.load(model_path)

    # Fetch the model's default serving signature.
    serve = loaded_model.signatures["serving_default"]

    results = []
    for position, (logical_page_address, size, timestamp) in enumerate(io_trace):
        if position % 10000 == 0:
            print(position, time.time() - began)

        update_features(logical_page_address, size, timestamp)

        # Exclude both frequency (first) and last_timestamp (last).
        row = features[logical_page_address][1:-1]
        # Double nesting gives shape (batch=1, timesteps=1, n_features).
        batch = tf.constant(tf.cast(np.array([[row]]), tf.float32))

        outputs = serve(batch)
        first_key = list(outputs.keys())[0]
        scores = outputs[first_key].numpy()
        winner = np.argmax(scores, axis=1)[0]

        results.append([logical_page_address, decoded_dict[winner]])

    print("time elapsed: ", time.time() - began)

    return results

In [21]:
# Run the simulation with the SavedModel variant. The Keras .h5 variant is kept
# below, disabled, for reference.
# NOTE: predict_with_pb() updates the global `features` while it runs, so
# re-running this cell without first re-running the cell that loads `features`
# replays the trace on top of already-updated statistics.
# predict_with_h5('./model/lstm.h5')
labeled_iotrace_simulation = predict_with_pb('../model/lstm_model/lstm_noFreq')

0 1.6665937900543213
10000 9.901370763778687
20000 18.681000232696533
30000 27.897451877593994
40000 36.4984917640686
50000 44.74981880187988
60000 52.97277092933655
70000 61.257346630096436
80000 69.83043789863586
90000 78.21106958389282
100000 86.81777358055115
110000 95.69324445724487
120000 103.90110802650452
130000 112.16046786308289
140000 120.68367552757263
150000 129.5092523097992
160000 138.29940724372864
170000 146.8634226322174
180000 155.2385289669037
190000 163.59748530387878
200000 171.97208881378174
210000 180.07063460350037
220000 188.31915378570557
230000 196.60395669937134
240000 205.0013496875763
250000 213.24021291732788
260000 221.51374530792236
270000 229.8020215034485
280000 238.56622576713562
290000 246.95885491371155
300000 255.2697970867157
310000 263.6129455566406
320000 271.87651562690735
330000 280.12328028678894
340000 288.11288142204285
350000 296.1307158470154
360000 304.4663290977478
370000 313.5631170272827
380000 321.87827467918396
390000 330.13824224

In [22]:
# Persist the [page id, predicted label] pairs, one row per trace record,
# without writing the DataFrame index.
column = ['PageId', 'Label']
df = pd.DataFrame(data=labeled_iotrace_simulation, columns=column)
df.to_csv(
    path_or_buf='../csv/preprocessed/labeled_iotrace_simulation.csv',
    index=False,
)

In [23]:
# Timing note: 1,000,000 records take about 812 s (~13.5 min, i.e. roughly 15 minutes).

In [24]:
# Read the labelled trace back from disk (the same path the export cell writes
# to) and preview the first rows. Note that this rebinds `df`.
df = pd.read_csv('../csv/preprocessed/labeled_iotrace_simulation.csv')
df.head()

Unnamed: 0,PageId,Label
0,60637,Cold
1,14956,Warm
2,60638,Cold
3,29750,Warm
4,106213,Hot


In [25]:
# Row count per predicted label, one per line, in the order Cold, Hot, Warm.
print(
    *(len(df[df['Label'] == name]) for name in ('Cold', 'Hot', 'Warm')),
    sep='\n',
)

498981
207168
3233623
