# 데이터셋 클래스

직접 구현하신 루틴과 아래 코드를 비교해서 지수만의 Dataset class를 완성하시기 바랍니다.

다음 경로가 존재한다는 가정 하에 개발되었습니다.
- 'dataset/y_test.csv'
- 'dataset/y_train.csv'
- 'dataset/Automotive_Ethernet_with_Attack_original_10_17_20_04_test.pcap'
- 'dataset/Automotive_Ethernet_with_Attack_original_10_17_19_50_training.pcap'

numpy pandas는 이미 있을 테고, `tqdm` 라이브러리 설치가 필요할 수 있습니다.




In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install scapy==2.4.4

Collecting scapy==2.4.4
  Downloading scapy-2.4.4.tar.gz (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scapy
  Building wheel for scapy (setup.py) ... [?25l[?25hdone
  Created wheel for scapy: filename=scapy-2.4.4-py2.py3-none-any.whl size=1189175 sha256=0cf035ff2a76c9d7e9c44ceb8b551b7e162b9c2b97180de7a3199a53efec171e
  Stored in directory: /root/.cache/pip/wheels/53/a6/eb/6f97ac2f21e282974d530a83a76edc9dd97f74e1c94628aa5e
Successfully built scapy
Installing collected packages: scapy
Successfully installed scapy-2.4.4


In [3]:
from IPython.display import display
from pathlib import Path

import numpy as np
import pandas as pd
from scapy.utils import RawPcapReader
from tqdm import tqdm
from scipy.stats import skew

class TimeseriesGenerator:
    pass # 아래에 구현되어 있습니다.

class Dataset:
    def __init__(self, df: pd.DataFrame, trim_etc_protocols=True):
        if trim_etc_protocols:
            self.df = df[df['ProtocolType'] != ''].copy()
        else:
            self.df = df
        assert self.df['abstime'].is_monotonic_increasing
        assert self.df['monotime'].is_monotonic_increasing

    @classmethod
    def _load_towids_dataset(cls, path_pcap, usec_unit, path_csv=None, **kwargs):
        # assert scapy.__version__ == '2.4.4', 'scapy version mismatch.'

        reader = RawPcapReader(str(path_pcap))
        list_output = list()
        for idx, (payload, metadata) in tqdm(enumerate(reader), desc='Parsing the pcap file...'):
            sec, usec, wirelen, caplen = metadata
            list_output.append((sec, usec, wirelen, caplen, payload))
        df_pcap = pd.DataFrame(list_output, columns=['sec', 'usec', 'wirelen', 'caplen', 'payload'])

        if path_csv:
            df_label = pd.read_csv(path_csv, header=None, names=['idx', 'label', 'y_desc'])
            assert df_pcap.shape[0] == df_label.shape[0], \
                f'Record count mismatch. {df_pcap.shape=}, {df_label.shape=}'
            assert (df_label['idx'].diff().bfill() == 1).all(), 'Field `idx` does not increase sequentially.'
            df_label['y'] = df_label['label'].map({'Normal': 0, 'Abnormal': 1})
        else:
            df_label = pd.DataFrame(index=df_pcap.index)
            df_label['y'] = 0
            df_label['y_desc'] = 'Normal'
        abstime = pd.to_datetime(df_pcap['sec'], unit='s') + pd.to_timedelta(df_pcap['usec'], unit=usec_unit)
        dupcounts = abstime.duplicated(keep=False).sum()

        if dupcounts > 0:
            print(f'There were {dupcounts} distinct timestamps.', end=' ')
            for _ in range(100):
                duplicated = abstime.duplicated()
                if duplicated.sum() == 0:
                    break
                abstime[duplicated] += pd.Timedelta(milliseconds=1)
            else:
                raise ValueError('Something went wrong.')
            print(f'-> {_} correction(s).')

        monotime = (abstime - abstime.min()).dt.total_seconds()
        df_pcap['payload'] = df_pcap['payload'].map(lambda x: np.frombuffer(x, dtype='uint8'))

        df: pd.DataFrame = pd.concat([
            abstime.rename('abstime'),
            monotime.rename('monotime'),
            df_pcap[['wirelen', 'caplen', 'payload']],
            df_label[['y', 'y_desc']]
        ], axis=1)

        df = df.sort_values('abstime')
        assert df['abstime'].is_monotonic_increasing
        assert df['monotime'].is_monotonic_increasing

        # Protocol specification
        df['ProtocolType'] = ''
        df.loc[df['wirelen'] == 60, 'ProtocolType'] = 'UDP'
        df.loc[df['wirelen'].isin([68, 90]), 'ProtocolType'] = 'PTP'
        df.loc[df['wirelen'].isin([82, 434]), 'ProtocolType'] = 'AVTP'
        # special treatment
        df.loc[df['y_desc'] == 'P_I', 'ProtocolType'] = 'PTP'

        return cls(df, **kwargs)

    @classmethod
    def towids_train(cls, **kwargs):
        return cls._load_towids_dataset(
            Path("/content/drive/MyDrive/Colab Notebooks/dataset/Automotive_Ethernet_with_Attack_original_10_17_19_50_training.pcap"),
            'ns',
            Path('/content/drive/MyDrive/Colab Notebooks/dataset/y_train.csv'),
            **kwargs
        )

    @classmethod
    def towids_test(cls, **kwargs):
        return cls._load_towids_dataset(
            Path("/content/drive/MyDrive/Colab Notebooks/dataset/Automotive_Ethernet_with_Attack_original_10_17_20_04_test.pcap"),
            'ns',
            Path('/content/drive/MyDrive/Colab Notebooks/dataset/y_test.csv'),
            **kwargs
        )

    def do_label(self, window_size) -> np.ndarray:
        y = self.df.rolling(window=window_size)['y'].max().dropna().astype('int32').values
        assert isinstance(y, np.ndarray)
        return y

    def trim(self, time_start=None, time_end=None, is_absolute=None):
        assert is_absolute is not None
        monotime_min = self.df['monotime'].min()
        monotime_max = self.df['monotime'].max()

        if time_start is not None:
            if is_absolute is False:
                time_start = monotime_min + time_start
            assert monotime_min < time_start
        else:
            time_start = monotime_min

        if time_end is not None:
            if is_absolute is False:
                time_end = monotime_max - time_end
            assert time_end < monotime_max
        else:
            time_end = monotime_max

        df = self.df.query(f'{time_start} <= monotime <= {time_end}').copy()
        # print('Before [{} ~ {}] / Required [{} ~ {}] / After [{} ~ {}]'.format(
        #     monotime_min, monotime_max,
        #     time_start, time_end,
        #     df['monotime'].min(), df['monotime'].max()
        # ))
        return Dataset(df)


    def do_fg1_transition_matrix(self, window_size=2048) -> np.array:
        # 수집된 패킷의 수를 n이라고 할 때, shape=(n, 3, 3)의 numpy array가 나와야 함
        df = self.df
        proto_types = sorted(df['ProtocolType'].unique()) # ex) ['AVTP', 'PTP', 'UDP']
        idx = {proto : i for i, proto in enumerate(proto_types)} # ex) {'AVTP': 0, 'PTP': 1, 'UDP': 2}
        N = len(idx) # ex) 3

        # 1. ProtocolType sequence -> integer index
        proto_seq = df['ProtocolType'].map(idx).values # [2, 0, 0, 1, 2]

        # 2. generate T
        def seq_to_transition_matrix(seq):
          T = np.zeros((N, N), dtype=np.float32)
          for i in range(len(seq) - 1):
            a, b = seq[i], seq[i+1]
            T[a, b] += 1
          T /= (len(seq)-1) # normalization
          return T

        if len(proto_seq) < window_size:
          raise ValueError(f"Insufficient data length ({len(proto_seq)}) for window_size {window_size}")

        # data와 window_size가 적절한지 확인
        print("Data shape:", proto_seq.shape)
        print("Window size:", window_size)

        # 3. sliding window using TimeseriesGenerator
        generator = TimeseriesGenerator(proto_seq, length=window_size, sampling_rate=1, stride=1, batch_size=1, shuffle=False)

        print("Generator length:", len(generator))
        if len(generator) == 0:
          print("Warning: Generator is empty! Check window_size and data length.")
          return np.zeros((0, N, N))

        result = []
        for X, _ in generator:
          # 임시 디버깅 코드...
          if len(X) == 0:  # 또는 X.shape[0] == 0 (numpy array면)
            raise ValueError("FG1: Empty window detected. Skipping...")
          seq = X[0] # (window_size, )
          T = seq_to_transition_matrix(seq)
          result.append(T)

        return np.stack(result) # (num_windows, N, N)



    def do_fg2_payload(self, window_size=2048, byte_start=0x22, byte_end=0x22 + 9) -> np.array:
        # 각 패킷에 탑재된 페이로드에 대해 0x22 번째 byte부터 9개 byte를 취하는 것이 논문의 전략. payload 길이가 짧은 수도 있으니 주의. 짧은 payloaed에 대해서는 0x00으로 패딩하여야 함
        # 수집된 패킷의 수를 n이라고 할 때, shape=(n, 9)의 numpy array가 나와야 함. FG2는 TimeseriesGenerator 적용이 필요 없습니다. ;-)
        assert byte_start < byte_end
        num_bytes = byte_end - byte_start # 9

        payloads = []
        for arr in self.df['payload'].values:
          segment = np.zeros(num_bytes, dtype=np.uint8) # [0, 0, 0, ..., 0]
          arr_len = len(arr)
          for i in range(num_bytes): # 9회
            if byte_start + i < arr_len:
              segment[i] = arr[byte_start + i]
          payloads.append(segment / 255.0)

        return np.array(payloads) # (n ,9)

    def do_fg3_statistics(self, window_size=2048, methods=('mean', 'std', 'skew')) -> np.array:
        # 수집된 패킷의 수를 n이라고 할 때, shape=(n, 3, 3)의 numpy array가 나와야 함
        # 논문 5페이지 우측 하단에 설명 <feature normalization 전략> 역시 구현해 주어야 합니다.
        df = self.df
        proto_types = sorted(df['ProtocolType'].unique()) # ex) ['AVTP', 'PTP', 'UDP']
        idx = {proto : i for i, proto in enumerate(proto_types)} # ex) {'AVTP': 0, 'PTP': 1, 'UDP': 2}
        N = len(idx) # ex) 3

        monotime = df['monotime'].values
        protos = df['ProtocolType'].map(idx).values

        # each window is constructed as [window_size * 2]
        generator = TimeseriesGenerator(
            np.stack([monotime, protos], axis=1), # (n, 2)
            length = window_size,
            sampling_rate = 1,
            stride = 1,
            batch_size = 1,
            shuffle = False
            )

        # data와 window_size가 적절한지 확인
        print("Data shape:", np.stack([monotime, protos], axis=1).shape)
        print("Window size:", window_size)

        result = []
        for X, _ in generator:
          # 임시 디버깅 코드...
          if len(X) == 0:  # 또는 X.shape[0] == 0
            print("FG3: Empty window detected. Skipping...")
            continue
          x_window = X[0] # (window_size, 2)
          t = x_window[:, 0] # monotime 첫 번째 열 [1.0, 1.2, 1.3, 2.0, ...]
          p = x_window[:, 1].astype(int) # protocol index 두 번째 열 [0, 0, 1, 0]

          stat_matrix = np.zeros((N, 3), dtype=np.float32)

          for i in range(N):
            t_i = t[p == i] # time sequence of the ith protocol / t : [1.0, 1.2, 1.3, 2.0, ...] / p==i : [True, True, False, True, ...] / t[p==i] : [1.0, 1.2, 2.0] 로 해당 프로토콜만 선택하는 구조
            if len(t_i) < 2:
              stat_matrix[i] = [1e+7, 1e+7, 1e+7]
            elif len(t_i) == 2:
              # std는 계산 가능하지만 skew가 1e+7인 경우
              diffs = np.diff(t_i) # 인접한 두 값의 시간 차이(b-a)
              mean_val = np.mean(diffs)
              std_val = np.std(diffs)
              stat_matrix[i, 0] = np.log10(mean_val + 1e-7)
              stat_matrix[i, 1] = np.log10(std_val + 1e-7)
              stat_matrix[i, 2] = 1e+7  # skew는 계산 불가
            else:
              diffs = np.diff(t_i) # interval

              mean_val = np.mean(diffs)
              std_val = np.std(diffs)
              skew_val = abs(skew(diffs)) if len(diffs) >= 3 else 1e+7

              stat_matrix[i, 0] = np.log10(mean_val + 1e-7)
              stat_matrix[i, 1] = np.log10(std_val + 1e-7)
              stat_matrix[i, 2] = np.log10(skew_val + 1e-7)

          result.append(stat_matrix)
        return np.stack(result) # (num_windows, N, 3)

dataset_train = Dataset.towids_train()
dataset_test = Dataset.towids_test()

Parsing the pcap file...: 1203737it [00:07, 151121.49it/s]
Parsing the pcap file...: 791611it [00:05, 135647.26it/s]


There were 2 distinct timestamps. -> 1 correction(s).


In [4]:
df = dataset_train.df
proto_types = sorted(df['ProtocolType'].unique()) # ex) ['AVTP', 'PTP', 'UDP']
idx = {proto : i for i, proto in enumerate(proto_types)} # ex) {'AVTP': 0, 'PTP': 1, 'UDP': 2}
N = len(idx) # ex) 3

# 1. ProtocolType sequence -> integer index
proto_seq = df['ProtocolType'].map(idx).values # [2, 0, 0, 1, 2]

In [11]:
proto_seq = proto_seq[:2049] # window 개수 2개가 되도록

In [15]:
window_size = 2048

# 2. generate T
def seq_to_transition_matrix(seq):
  T = np.zeros((N, N), dtype=np.float32)
  for i in range(len(seq) - 1):
    a, b = seq[i], seq[i+1]
    T[a, b] += 1
  T /= (len(seq)-1) # normalization
  return T

if len(proto_seq) < window_size:
  raise ValueError(f"Insufficient data length ({len(proto_seq)}) for window_size {window_size}")

# data와 window_size가 적절한지 확인
print("Data shape:", proto_seq.shape)
print("Window size:", window_size)

# 3. sliding window using TimeseriesGenerator
generator = TimeseriesGenerator(proto_seq, length=window_size, sampling_rate=1, stride=1, batch_size=1, shuffle=False)

print("Generator length:", len(generator))
if len(generator) == 0:
  print("Warning: Generator is empty! Check window_size and data length.")

Data shape: (2049,)
Window size: 2048
Generator length: 2


In [16]:
result = []
for i, (X, _) in enumerate(generator):
  # 임시 디버깅 코드...
  if len(X) == 0:  # 또는 X.shape[0] == 0 (numpy array면)
    raise ValueError(f"FG1: Empty window detected. Skipping...{i}")
  seq = X[0] # (window_size, )
  T = seq_to_transition_matrix(seq)
  result.append(T)

In [17]:
len(result)

2

generator의 i값을 보면 에러값 재현 가능 : g[i] 이용

In [10]:
i # 1201287

922972

# 첫 번째 파일

In [None]:
display(dataset_train.df)
print(dataset_train.df['ProtocolType'].value_counts())
print(dataset_train.df['y_desc'].value_counts())

Unnamed: 0,abstime,monotime,wirelen,caplen,payload,y,y_desc,ProtocolType
0,2020-09-12 09:51:04.715221,0.000000,434,434,"[145, 239, 0, 0, 254, 0, 0, 252, 112, 0, 0, 3,...",0,Normal,AVTP
1,2020-09-12 09:51:04.715245,0.000024,434,434,"[145, 239, 0, 0, 254, 0, 0, 252, 112, 0, 0, 3,...",0,Normal,AVTP
2,2020-09-12 09:51:04.715326,0.000105,434,434,"[145, 239, 0, 0, 254, 0, 0, 252, 112, 0, 0, 3,...",0,Normal,AVTP
3,2020-09-12 09:51:04.715450,0.000229,434,434,"[145, 239, 0, 0, 254, 0, 0, 252, 112, 0, 0, 3,...",0,Normal,AVTP
4,2020-09-12 09:51:04.715559,0.000338,434,434,"[145, 239, 0, 0, 254, 0, 0, 252, 112, 0, 0, 3,...",0,Normal,AVTP
...,...,...,...,...,...,...,...,...
1203732,2020-09-12 10:00:16.911784,552.196563,60,60,"[220, 166, 50, 94, 72, 71, 220, 166, 50, 93, 2...",0,Normal,UDP
1203733,2020-09-12 10:00:16.912231,552.197010,60,60,"[220, 166, 50, 94, 72, 71, 220, 166, 50, 93, 2...",0,Normal,UDP
1203734,2020-09-12 10:00:16.912686,552.197465,60,60,"[220, 166, 50, 94, 72, 71, 220, 166, 50, 93, 2...",0,Normal,UDP
1203735,2020-09-12 10:00:16.913172,552.197951,60,60,"[220, 166, 50, 94, 72, 71, 220, 166, 50, 93, 2...",0,Normal,UDP


ProtocolType
UDP     846647
AVTP    287086
PTP      69601
Name: count, dtype: int64
y_desc
Normal    954509
C_D        85466
P_I        64635
F_I        35112
M_F        33765
C_R        29847
Name: count, dtype: int64


# 두 번째 파일

In [None]:
display(dataset_test.df)
print(dataset_test.df['ProtocolType'].value_counts())
print(dataset_test.df['y_desc'].value_counts())

Unnamed: 0,abstime,monotime,wirelen,caplen,payload,y,y_desc,ProtocolType
0,2020-09-12 10:02:59.795192,0.000000,434,434,"[145, 239, 0, 0, 254, 0, 0, 252, 112, 0, 0, 3,...",0,Normal,AVTP
1,2020-09-12 10:02:59.810189,0.014997,434,434,"[145, 239, 0, 0, 254, 0, 0, 252, 112, 0, 0, 3,...",0,Normal,AVTP
2,2020-09-12 10:02:59.810205,0.015013,434,434,"[145, 239, 0, 0, 254, 0, 0, 252, 112, 0, 0, 3,...",0,Normal,AVTP
3,2020-09-12 10:02:59.810295,0.015103,434,434,"[145, 239, 0, 0, 254, 0, 0, 252, 112, 0, 0, 3,...",0,Normal,AVTP
4,2020-09-12 10:02:59.810414,0.015222,434,434,"[145, 239, 0, 0, 254, 0, 0, 252, 112, 0, 0, 3,...",0,Normal,AVTP
...,...,...,...,...,...,...,...,...
791606,2020-09-12 10:09:36.422031,396.626839,60,60,"[220, 166, 50, 94, 72, 71, 220, 166, 50, 93, 2...",0,Normal,UDP
791607,2020-09-12 10:09:36.422535,396.627343,60,60,"[220, 166, 50, 94, 72, 71, 220, 166, 50, 93, 2...",0,Normal,UDP
791608,2020-09-12 10:09:36.422997,396.627805,60,60,"[220, 166, 50, 94, 72, 71, 220, 166, 50, 93, 2...",0,Normal,UDP
791609,2020-09-12 10:09:36.423462,396.628270,60,60,"[220, 166, 50, 94, 72, 71, 220, 166, 50, 93, 2...",0,Normal,UDP


ProtocolType
UDP     563731
AVTP    198013
PTP      29580
Name: count, dtype: int64
y_desc
Normal    660490
C_D        41203
C_R        29847
P_I        26013
F_I        16962
M_F        16809
Name: count, dtype: int64


In [None]:
import pandas as pd

args = [
    [dataset_train, 'Train', 5, 60, False],
    [dataset_train, 'Validation', 60, 71.11, False],
    [dataset_train, 'Test', 71.11, None, True],
    [dataset_test, 'Train', 5, 80, False],
    [dataset_test, 'Validation', 80, 91.88, False],
    [dataset_test, 'Test', 91.89, None, True],
]

def do(dataset, purpose, time_start, time_end, trim_last_5sec):
    name = 'Packet dump 1' if dataset is dataset_train else 'Packet dump 2'

    dataset = dataset.trim(time_start, time_end, is_absolute=True)
    if trim_last_5sec: # 데이터 수집이 종료된 뒤에 남아있는 노이즈 제거
        dataset = dataset.trim(time_end=5, is_absolute=False)
        time_end = dataset.df['monotime'].max()
    a = dataset.df['y'].value_counts()
    a.name = name
    a['Purpose'] = purpose
    a['Time range'] = '[{:.2f}, {:.2f}]'.format(time_start, time_end)
    a = a.rename({0: 'Benign', 1: 'Intrusion'})
    a = a.reindex(['Purpose', 'Time range', 'Benign', 'Intrusion'], fill_value=0)
    return a, dataset


list_output = list()
list_dataset_sub = list() ######### 여기에서 필요에 따라 Dataset 인스턴스를 꺼내 쓰면 됩니다.
for arg in args:
    output, dataset_sub = do(*arg)
    list_output.append(output)
    list_dataset_sub.append(dataset_sub)

df = pd.DataFrame(list_output)
df.index.name = 'Packet dump'
df[['Benign', 'Intrusion']] = df[['Benign', 'Intrusion']].map('{:,}'.format)
df

y,Purpose,Time range,Benign,Intrusion
Packet dump,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Packet dump 1,Train,"[5.00, 60.00]",97715,0
Packet dump 1,Validation,"[60.00, 71.11]",19606,0
Packet dump 1,Test,"[71.11, 547.20]",819586,248080
Packet dump 2,Train,"[5.00, 80.00]",130520,0
Packet dump 2,Validation,"[80.00, 91.88]",19943,0
Packet dump 2,Test,"[91.89, 391.63]",496151,129226


# 클래스 기반 배치/피쳐 제너레이터 만들기

yield 키워드를 대신해 TimeseriesGenerator를 소개해 드립니다. 클래스 구현물을 모두 읽기보다는 아래 몇가지 example을 먼저 참고해 주세요.

In [14]:
import numpy as np


class TimeseriesGenerator:
    def __init__(self, data, length, sampling_rate=1, stride=1,
                 start_index=0, end_index=None,
                 shuffle=False, reverse=False, batch_size=128, label=None):
        self.data = data
        self.length = length
        self.sampling_rate = sampling_rate
        self.stride = stride
        self.start_index = start_index + length
        if end_index is None:
            end_index = len(data)
        self.end_index = end_index
        self.shuffle = shuffle
        self.reverse = reverse
        self.batch_size = batch_size
        self.label = label if label is None else np.array(label)
        if self.start_index > self.end_index:
            raise ValueError(
                "`start_index+length=%i > end_index=%i` "
                "is disallowed, as no part of the sequence "
                "would be left to be used as current step."
                % (self.start_index, self.end_index)
            )

    def __len__(self):
        return (self.end_index - self.start_index + self.batch_size * self.stride) // (self.batch_size * self.stride)

    def __getitem__(self, index):
        rows = self.__index_to_row__(index)
        samples, y = self.__compile_batch__(rows)
        return samples, y

    def __iter__(self):
        for i in range(len(self)):
            yield self[i]

    def __index_to_row__(self, index):  # 특정 배치 (index)를 구성할 row 목록을 반환한다. len(rows)는 batch size와 같다.
        if self.shuffle:
            rows = np.random.randint(self.start_index, self.end_index + 1, size=self.batch_size)
        else:
            i = self.start_index + self.batch_size * self.stride * index
            rows = np.arange(i, min(i + self.batch_size * self.stride, self.end_index + 1), self.stride)
        return rows

    def __compile_batch__(self, rows):  # 주어진 row 별 time series feature를 생성한다.
        samples = np.array([self.data[row - self.length: row: self.sampling_rate] for row in rows])
        if self.reverse:
            samples = samples[:, ::-1, ...]
        if self.length == 1:
            samples = np.squeeze(samples)

        if self.label is None:
            return samples, samples
        else:
            return samples, self.label[rows - self.length]

    @property
    def output_shape(self):
        x, y = self[0]
        return x.shape, y.shape

    @property
    def num_samples(self):
        count = 0
        for x, y in self:
            count += x.shape[0]
        return count

    def __str__(self):
        return '<TimeseriesGenerator data.shape={} / num_batches={:,} / output_shape={}>'.format(
            self.data.shape, len(self), self.output_shape,
        )

    def __repr__(self):
        return self.__str__()



In [None]:
# 데모를 위한 가상의 데이터
arr = np.arange(1000)
window_size = 10

## 배치 사이즈를 1로 할 경우

In [None]:
print('When batch_size=1:')
batch_size = 1
tg = TimeseriesGenerator(data=arr, length=window_size, batch_size=batch_size)
print(tg)
x, y = tg[0] # x와 y는 같은 값이 반환된다. 오토인코더 학습에 최적화 되어 있다.
print(f'First batch / {x=} / {y=}')
x, y = tg[1]
print(f'First batch / {x=} / {y=}')

When batch_size=1:
<TimeseriesGenerator data.shape=(1000,) / num_batches=991 / output_shape=((1, 10), (1, 10))>
First batch / x=array([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]]) / y=array([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]])
First batch / x=array([[ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10]]) / y=array([[ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10]])


## 배치 사이즈를 2로 할 경우

In [None]:
print('When batch_size=2:')
batch_size = 2
tg = TimeseriesGenerator(data=arr, length=window_size, batch_size=batch_size)
print(tg)
x, y = tg[0]
print(f'First batch / {x=} / {y=}')
x, y = tg[1]
print(f'First batch / {x=} / {y=}')

When batch_size=2:
<TimeseriesGenerator data.shape=(1000,) / num_batches=496 / output_shape=((2, 10), (2, 10))>
First batch / x=array([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9],
       [ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10]]) / y=array([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9],
       [ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10]])
First batch / x=array([[ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11],
       [ 3,  4,  5,  6,  7,  8,  9, 10, 11, 12]]) / y=array([[ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11],
       [ 3,  4,  5,  6,  7,  8,  9, 10, 11, 12]])


## 레이블을 줄 수도 있음. x와 구분하기 위해 1000을 더해 봄

In [None]:
y = (arr + 1000)
print('When batch_size=2 and y is given:')
batch_size = 2
tg = TimeseriesGenerator(data=arr, length=window_size, batch_size=batch_size, label=y)
print(tg)
x, y = tg[0]
print(f'First batch / {x=} / {y=}')
x, y = tg[1]
print(f'First batch / {x=} / {y=}')

When batch_size=2 and y is given:
<TimeseriesGenerator data.shape=(1000,) / num_batches=496 / output_shape=((2, 10), (2,))>
First batch / x=array([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9],
       [ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10]]) / y=array([1000, 1001])
First batch / x=array([[ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11],
       [ 3,  4,  5,  6,  7,  8,  9, 10, 11, 12]]) / y=array([1002, 1003])


# 지수의 mission

- TimeseriesGenerator를 활용하여 Feature Extractor (세 가지 Feature Generator) 구현
- 질문 있으면 아무때나 찾아올 것

In [None]:
dataset = list_dataset_sub[0]
dataset.do_fg1_transition_matrix()

Data shape: (97715,)
Window size: 2048
Generator length: 95668


ValueError: FG1: Empty window detected. Skipping...

In [None]:
dataset.do_fg2_payload()

array([[0.91372549, 0.94509804, 0.91764706, ..., 0.72156863, 0.27058824,
        0.03137255],
       [0.8       , 0.8627451 , 0.91764706, ..., 0.94117647, 0.59607843,
        0.03137255],
       [0.92941176, 0.64313725, 0.91764706, ..., 0.03529412, 0.80784314,
        0.03137255],
       ...,
       [0.77647059, 0.78823529, 0.91764706, ..., 0.62745098, 0.75294118,
        0.03137255],
       [0.        , 0.        , 0.        , ..., 0.37254902, 0.62745098,
        0.24705882],
       [0.        , 0.        , 0.        , ..., 0.37254902, 0.62745098,
        0.24705882]])

In [None]:
dataset.do_fg3_statistics()