In [3]:
import pandas as pd
from collections import defaultdict
import csv
import numpy as np

In [2]:
# Column names assigned to the trace. read_csv is called with header=None,
# so the first line of the file is read as data, not as a header row.
columns = ['PageId', 'Size', 'Timestamp']
iotrace_path = '../csv/preprocessed/iotrace_model.csv'
iotrace = pd.read_csv(iotrace_path, header=None, names=columns)
# Preview the first rows of the loaded trace.
iotrace.head()

Unnamed: 0,PageId,Size,Timestamp
0,37945,3584,0.0
1,6948,3072,0.0
2,37946,3584,0.026214
3,30105,3072,0.026214
4,37947,3584,0.117964


In [3]:
def calculate_features(io_trace):
    """
    Build one row of summary statistics per PageId from an I/O trace.

    The trace is ordered by 'Timestamp' and then grouped by 'PageId'. For
    every page the result holds the number of accesses, the mean and the
    standard deviation of the gaps between consecutive accesses, and the
    mean 'Size'.

    A page with no usable gap (for example a single access) gets -1 for both
    interval statistics. A page with exactly one gap keeps the NaN that
    pandas returns for the standard deviation of a single value.

    Returns a DataFrame with the columns 'PageId', 'Frequency',
    'Timeinterval-avg', 'Timeinterval-std' and 'Size-avg'.
    """
    ordered_trace = io_trace.sort_values(by='Timestamp')

    # One list per output column, filled in lock-step while walking the groups.
    collected = {
        'PageId': [],
        'Frequency': [],
        'Timeinterval-avg': [],
        'Timeinterval-std': [],
        'Size-avg': [],
    }

    for page_id, accesses in ordered_trace.groupby('PageId'):
        # Gaps between consecutive accesses; the first diff is NaN and is dropped.
        gaps = accesses['Timestamp'].diff().dropna()
        has_gaps = not gaps.empty

        collected['PageId'].append(page_id)
        collected['Frequency'].append(len(accesses))
        # -1 marks "no interval available" for both statistics.
        collected['Timeinterval-avg'].append(gaps.mean() if has_gaps else -1)
        collected['Timeinterval-std'].append(gaps.std() if has_gaps else -1)
        collected['Size-avg'].append(accesses['Size'].mean())

    return pd.DataFrame(collected)

In [4]:
# Build the per-page feature table from the loaded trace.
features = calculate_features(iotrace)
timeinterval_max = features['Timeinterval-avg'].max()

# Every "missing" interval statistic is replaced by a value just above the
# largest observed average interval, so the offset is defined exactly once.
missing_interval_fill = timeinterval_max + 0.1

features = (
    features
    # -1 is the sentinel written by calculate_features when a page has no
    # interval at all (frequency 1), so neither avg nor std could be computed.
    .replace({'Timeinterval-avg': -1, 'Timeinterval-std': -1}, missing_interval_fill)
    # NaN appears when a page has frequency 2: there is a single interval, so
    # its std is NaN. Filling with 0 would make such a page look perfectly
    # regular (and be classified as hot), so the large fill value is used.
    .fillna(missing_interval_fill)
)
features.to_csv('../csv/preprocessed/features_model.csv', index=False)

In [5]:
# Preview the first rows of `features`.
features.head()

Unnamed: 0,PageId,Frequency,Timeinterval-avg,Timeinterval-std,Size-avg
0,0,104,104.177744,599.38597,512.0
1,1,178,61.051303,417.857584,512.0
2,2,128,84.295964,540.087874,512.0
3,3,138,78.143073,413.7773,537.971014
4,4,129,83.637812,535.740946,539.782946


In [None]:
# Next step: append each page's last access timestamp to its feature list.

In [8]:
def get_features(features_path='../csv/preprocessed/features_model.csv'):
    """
    Load the per-page feature CSV into a mapping keyed by page id.

    Parameters
    ----------
    features_path : str, optional
        CSV file to read. The first row is skipped as a header. Every other
        non-empty row must hold exactly five fields, in this order: page id,
        frequency, interval average, interval standard deviation, average
        size. Defaults to '../csv/preprocessed/features_model.csv'.

    Returns
    -------
    collections.defaultdict
        Maps each int page id to
        [frequency (int), timeinterval_avg, timeinterval_std, size_avg],
        with the three float fields rounded to 10 decimal places. The
        mapping is a defaultdict(list), so looking up an unknown page id
        creates and returns an empty list.

    Raises
    ------
    ValueError
        If a non-empty row does not have exactly five fields, or a field
        cannot be parsed as int/float.
    """
    def to_float(text):
        return round(float(text), 10)

    features = defaultdict(list)
    with open(features_path, newline='', mode='r') as csvfile:
        reader = csv.reader(csvfile)
        # Skip the header; the default keeps an empty file from raising StopIteration.
        next(reader, None)
        for row in reader:
            # csv.reader yields [] for blank lines (e.g. a trailing newline).
            if not row:
                continue
            page_id, frequency, timeinterval_avg, timeinterval_std, size_avg = row
            features[int(page_id)] = [
                int(frequency),
                to_float(timeinterval_avg),
                to_float(timeinterval_std),
                to_float(size_avg),
            ]
    return features

In [9]:
# Load the saved features through get_features() and bind the result to `features`.
features = get_features()

In [10]:
iotrace_path = '../csv/preprocessed/iotrace_model.csv'
# page id -> timestamp of the last row for that page, in file order.
# NOTE(review): this is the latest access only if the file is time-ordered - confirm.
timestamp_dict = defaultdict(float)


def str_to_float(x):
    """Parse a CSV field as a float rounded to 10 decimal places."""
    return round(float(x), 10)


with open(iotrace_path, newline='', mode='r') as csvfile:
    reader = csv.reader(csvfile)
    for row_number, row in enumerate(reader):
        if not row:
            continue  # csv.reader yields [] for blank lines
        logical_page_address, size, timestamp = row
        try:
            logical_page_address = int(logical_page_address)
        except ValueError:
            # Only the very first row may be a textual header. This notebook
            # also loads the same file with header=None (no header row), so
            # discarding row 0 unconditionally would drop a real access.
            if row_number == 0:
                continue
            raise
        timestamp_dict[logical_page_address] = str_to_float(timestamp)

# get_features() stores four values per page; rebuilding each list from those
# four keeps this cell idempotent, so re-running it does not keep appending.
base_feature_count = 4
for logical_page_address, timestamp in timestamp_dict.items():
    features[logical_page_address] = (
        features[logical_page_address][:base_feature_count] + [timestamp]
    )

In [26]:
# Names of the five values stored per page, in list order.
feature_columns = ['Frequency', 'Timeinterval-avg', 'Timeinterval-std', 'Size', 'Last-Timestamp']

# Every page must carry exactly one value per column; fail early and clearly
# if a list is short (no timestamp merged) or long (merge cell applied twice).
malformed_pages = [
    page_id for page_id, values in features.items()
    if len(values) != len(feature_columns)
]
if malformed_pages:
    raise ValueError(
        f"{len(malformed_pages)} page(s) do not have {len(feature_columns)} feature values, "
        f"e.g. PageId {malformed_pages[:5]}"
    )

# Build the frame row-wise (one row per page) rather than creating one column
# per page and transposing, and avoid in-place mutation so the cell can be re-run.
df = (
    pd.DataFrame.from_dict(features, orient='index', columns=feature_columns)
    .rename_axis('PageId')
    .reset_index()
    .astype({'PageId': 'int32', 'Frequency': 'int32'})
)

In [27]:
# Write `df` to CSV without the index column.
df.to_csv('../csv/preprocessed/features_timestamp_model.csv', index=False)