# Prepare

In [1]:
# import libraries
import os
import os.path
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
# Dataset locations. Each one can be overridden through an environment
# variable so the notebook can run on another machine without editing this
# cell; when the variable is not set, the original hard-coded path is used.
SOURCE = os.environ.get('TEP_SOURCE_DIR', 'Y:\\ZBU\\_Datasets\\TEP\\cyber-attack')  # dataset source dir
TARGET = os.environ.get('TEP_TARGET_DIR', 'E:\\Datasets\\TEP\\kaspersky\\_pretreated')  # dataset target dir

# Stages

In [3]:
def rename_columns(df: pd.DataFrame) -> None:
    """Overwrite the column labels of ``df`` in place with a fixed list of names.

    Names are assigned purely by position, so the frame's column order must
    already match the order below. The list holds 60 names; pandas raises
    ``ValueError`` if ``df`` has a different number of columns.

    Args:
        df: frame whose ``columns`` attribute is replaced.

    Returns:
        None. ``df`` is modified in place.
    """
    names = (
        "time",
        "inp_a_flow_ksm3h", "inp_d_flow_kgh", "inp_e_flow_kgh", "inp_c_flow_ksm3h",
        "recyl_flow_ksm3h", "react_flow_ksm3h", "react_press_kpa", "react_level_pc",
        "react_temp_gc", "purge_flow_ksm3h", "seprt_temp_gc", "seprt_level_pc",
        "seprt_press_kpa", "seprt_flow_m3h", "strip_level_pc", "strip_press_kpa",
        "prod_flow_m3h", "strip_temp_gc", "steam_flow_kgh", "compr_power_kw",
        "re_cl_temp_gc", "co_cl_temp_gc",
        "react_a_prt_molp", "react_b_prt_molp", "react_c_prt_molp",
        "react_d_prt_molp", "react_e_prt_molp", "react_f_prt_molp",
        "purge_a_prt_molp", "purge_b_prt_molp", "purge_c_prt_molp", "purge_d_prt_molp",
        "purge_e_prt_molp", "purge_f_prt_molp", "purge_g_prt_molp", "purge_h_prt_molp",
        "prod_d_prt_molp", "prod_e_prt_molp", "prod_f_prt_molp",
        "prod_g_prt_molp", "prod_h_prt_molp",
        "inp_d_feed_pc", "inp_e_feed_pc", "inp_a_feed_pc", "inp_c_feed_pc",
        "compr_valv_pc", "purge_feed_pc", "seprt_feed_pc", "strip_feed_pc",
        "steam_feed_pc", "re_cl_feed_pc", "co_cl_feed_pc", "agit_speed_pc",
        "is_mv_attack", "is_meas_attack", "is_sp_attack",
        "state", "product_rate", "hourly_cost",
    )
    df.columns = list(names)

In [4]:
def set_index(df: pd.DataFrame) -> None:
    """Turn the ``time`` column of ``df`` into an unnamed datetime index, in place.

    The values in ``time`` are read as a number of hours counted from
    2017-05-16 00:00:00. After the call the ``time`` column no longer exists
    as a column and the index carries no name.

    Args:
        df: frame containing a numeric ``time`` column.

    Returns:
        None. ``df`` is modified in place.
    """
    start_of_record = '2017-05-16T00:00:00'
    timestamps = pd.to_datetime(df['time'], unit='h', origin=start_of_record)

    df['time'] = timestamps
    df.set_index('time', inplace=True)
    # set_index keeps the column label as the index name; clear it
    df.index.name = None

In [5]:
def downsample(df: pd.DataFrame, rule: str = '1 min') -> pd.DataFrame:
    """Reduce ``df`` to one row per time bin, keeping the first entry of each bin.

    The frame is resampled on its index (which must be datetime-like for
    ``DataFrame.resample`` to accept it) and ``first()`` is taken per bin and
    per column. Bins that contain no rows still appear in the result, filled
    with missing values.

    Args:
        df: time-indexed frame to downsample; it is not modified.
        rule: pandas offset string giving the bin width. Defaults to
            ``'1 min'``, the width this function originally hard-coded.

    Returns:
        A new frame indexed by the bin start times.
    """
    return df.resample(rule).first()

In [6]:
def trim_features(df: pd.DataFrame) -> None:
    """Collapse the three attack indicator columns into ``anomaly`` and drop extras, in place.

    ``anomaly`` is computed as
    ``is_mv_attack*32 + is_meas_attack*64 + is_sp_attack*128``.
    Afterwards the three indicator columns and the ``state``,
    ``product_rate`` and ``hourly_cost`` columns are removed from ``df``.

    Args:
        df: frame containing all six columns named above.

    Returns:
        None. ``df`` is modified in place.
    """
    mv_part = df['is_mv_attack'] * 32
    meas_part = df['is_meas_attack'] * 64
    sp_part = df['is_sp_attack'] * 128
    df['anomaly'] = mv_part + meas_part + sp_part

    obsolete = [
        "is_mv_attack",
        "is_meas_attack",
        "is_sp_attack",
        "state",
        "product_rate",
        "hourly_cost",
    ]
    df.drop(columns=obsolete, inplace=True)

In [7]:
def optimize_dtypes(df: pd.DataFrame) -> None:
    """Shrink the numeric dtypes of ``df`` in place to cut memory usage.

    The ``anomaly`` column is downcast with ``downcast='unsigned'``; every
    other column is downcast with ``downcast='float'``. The actual target
    dtype for each column is chosen by ``pd.to_numeric``.

    Args:
        df: frame with an ``anomaly`` column; all columns must be numeric.

    Returns:
        None. ``df`` is modified in place.
    """
    unsigned_names = ['anomaly']
    float_names = [name for name in df.columns if name not in unsigned_names]

    # same apply-and-assign step for both groups, unsigned first
    plan = (
        ('unsigned', unsigned_names),
        ('float', float_names),
    )
    for kind, names in plan:
        df[names] = df[names].apply(pd.to_numeric, downcast=kind)

# Convert dataset

In [8]:
# Create the output directory together with any missing parent directories.
# exist_ok=True makes this a no-op when the directory is already there.
os.makedirs(TARGET, exist_ok=True)

# Sub-directories of SOURCE that are converted, in this order.
subdirs = ('single_states',
           'transient_processes',
           'attacks',
          )

# NOTE(review): every sub-directory writes into the same flat TARGET folder,
# so two source files with the same base name would overwrite each other.
for sub in subdirs:
    # os.listdir returns entries in arbitrary order; sort so that the run
    # (and the value `p` is left with afterwards) is reproducible.
    files = sorted(os.listdir(os.path.join(SOURCE, sub)))
    # This count includes every directory entry, not only the .csv files
    # that are actually converted below.
    print(f'{len(files)} files in {sub}')
    for f in tqdm([csv for csv in files if csv.endswith('.csv')]):
        # The CSV files carry no header row; names are assigned by position.
        data = pd.read_csv(os.path.join(SOURCE, sub, f), header=None)
        rename_columns(data)
        set_index(data)
        data = downsample(data)
        trim_features(data)
        optimize_dtypes(data)
        # Same base name as the source file, with a '.snappy' extension.
        # `p` stays defined after the loop and is read by the self-check cell.
        p = os.path.splitext(f)[0] + '.snappy'
        data.to_parquet(os.path.join(TARGET, p), compression='snappy')

200 files in single_states


  0%|          | 0/200 [00:00<?, ?it/s]

672 files in transient_processes


  0%|          | 0/346 [00:00<?, ?it/s]

142 files in attacks


  0%|          | 0/142 [00:00<?, ?it/s]

# Self-Check

In [9]:
# Read back the file written last by the conversion loop (`p` still holds its
# file name) and print a summary of its columns, non-null counts and dtypes.
last_written = os.path.join(TARGET, p)
data = pd.read_parquet(last_written)
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 7200 entries, 2017-05-16 00:00:00 to 2017-05-20 23:59:00
Data columns (total 54 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   inp_a_flow_ksm3h  7200 non-null   float32
 1   inp_d_flow_kgh    7200 non-null   float32
 2   inp_e_flow_kgh    7200 non-null   float32
 3   inp_c_flow_ksm3h  7200 non-null   float32
 4   recyl_flow_ksm3h  7200 non-null   float32
 5   react_flow_ksm3h  7200 non-null   float32
 6   react_press_kpa   7200 non-null   float32
 7   react_level_pc    7200 non-null   float32
 8   react_temp_gc     7200 non-null   float32
 9   purge_flow_ksm3h  7200 non-null   float32
 10  seprt_temp_gc     7200 non-null   float32
 11  seprt_level_pc    7200 non-null   float32
 12  seprt_press_kpa   7200 non-null   float32
 13  seprt_flow_m3h    7200 non-null   float32
 14  strip_level_pc    7200 non-null   float32
 15  strip_press_kpa   7200 non-null   float32
 16  prod_f

In [10]:
# Display the index of the reloaded frame to check its time range and spacing.
data.index

DatetimeIndex(['2017-05-16 00:00:00', '2017-05-16 00:01:00',
               '2017-05-16 00:02:00', '2017-05-16 00:03:00',
               '2017-05-16 00:04:00', '2017-05-16 00:05:00',
               '2017-05-16 00:06:00', '2017-05-16 00:07:00',
               '2017-05-16 00:08:00', '2017-05-16 00:09:00',
               ...
               '2017-05-20 23:50:00', '2017-05-20 23:51:00',
               '2017-05-20 23:52:00', '2017-05-20 23:53:00',
               '2017-05-20 23:54:00', '2017-05-20 23:55:00',
               '2017-05-20 23:56:00', '2017-05-20 23:57:00',
               '2017-05-20 23:58:00', '2017-05-20 23:59:00'],
              dtype='datetime64[ns]', length=7200, freq=None)