# Prepare

In [1]:
# import libraries
import json
import os
import os.path
import pandas as pd
import pyreadr

In [2]:
from tqdm.notebook import tqdm

In [3]:
# Directory the raw dataset is read from.
SOURCE = r'Y:\ZBU\_Datasets\TEP\cyber-attack'
# Directory the converted files are written to.
TARGET = r'E:\Datasets\TEP\kaspersky\_pretreated'
# File name for the saved column dtypes; at present it is only referenced by
# the disabled (commented-out) code inside optimize_dtypes.
DTYPES_FILE = 'dtypes.json'

# Stages

In [4]:
def rename_columns(df: pd.DataFrame) -> None:
    """Label the columns of ``df`` in place with the 60 fixed signal names.

    Names are assigned by position, so the frame must hold exactly 60 columns
    in the order assembled below (pandas refuses a label list of the wrong
    length). Nothing is returned; ``df`` itself is modified.
    """

    def composition(stream: str, components: str) -> list:
        # One "<stream>_<component>_prt_molp" name per component letter.
        return [f"{stream}_{letter}_prt_molp" for letter in components]

    leading = [
        "time",
        "inp_a_flow_ksm3h", "inp_d_flow_kgh", "inp_e_flow_kgh", "inp_c_flow_ksm3h",
        "recyl_flow_ksm3h",
        "react_flow_ksm3h", "react_press_kpa", "react_level_pc", "react_temp_gc",
        "purge_flow_ksm3h",
        "seprt_temp_gc", "seprt_level_pc", "seprt_press_kpa", "seprt_flow_m3h",
        "strip_level_pc", "strip_press_kpa",
        "prod_flow_m3h",
        "strip_temp_gc",
        "steam_flow_kgh",
        "compr_power_kw",
        "re_cl_temp_gc", "co_cl_temp_gc",
    ]
    percent_signals = [
        "inp_d_feed_pc", "inp_e_feed_pc", "inp_a_feed_pc", "inp_c_feed_pc",
        "compr_valv_pc", "purge_feed_pc", "seprt_feed_pc", "strip_feed_pc",
        "steam_feed_pc", "re_cl_feed_pc", "co_cl_feed_pc", "agit_speed_pc",
    ]
    trailing = [
        "is_mv_attack", "is_meas_attack", "is_sp_attack",
        "state", "product_rate", "hourly_cost",
    ]

    # Concatenation order defines the positional mapping onto the raw columns.
    df.columns = (
        leading
        + composition("react", "abcdef")
        + composition("purge", "abcdefgh")
        + composition("prod", "defgh")
        + percent_signals
        + trailing
    )

In [5]:
def set_index(df: pd.DataFrame) -> None:
    """Turn the numeric ``time`` column into the frame's datetime index, in place.

    The values of ``time`` are interpreted as hours elapsed since
    2022-08-01 00:00:00. After conversion the column is moved into the index,
    so it no longer appears among the regular columns.
    """
    elapsed_hours = df['time']
    df['time'] = pd.to_datetime(
        elapsed_hours,
        unit='h',
        origin='2022-08-01T00:00:00',
    )
    df.set_index(keys='time', inplace=True)

In [6]:
def downsample(df: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame holding one row per one-minute bin of ``df``.

    ``df`` needs a datetime-like index. Each output row carries, per column,
    the first entry of its bin as produced by the resampler's ``first()``
    (which, by pandas' default, skips missing values column by column).
    The input frame is left untouched.
    """
    per_minute = df.resample(rule='1 min')
    return per_minute.first()

In [7]:
def trim_features(df: pd.DataFrame) -> None:
    """Fold the three attack flags into one ``attack`` column and drop extras.

    ``attack`` is the weighted sum of the flags: ``is_mv_attack`` counts 32,
    ``is_meas_attack`` counts 64 and ``is_sp_attack`` counts 128. The three
    flag columns and ``state``, ``product_rate`` and ``hourly_cost`` are then
    removed. Everything happens in place on ``df``.
    """
    mv_part = df['is_mv_attack'] * 32
    meas_part = df['is_meas_attack'] * 64
    sp_part = df['is_sp_attack'] * 128
    df['attack'] = mv_part + meas_part + sp_part

    # Source flags plus the columns that are not kept in the converted data.
    obsolete = [
        "is_mv_attack",
        "is_meas_attack",
        "is_sp_attack",
        "state",
        "product_rate",
        "hourly_cost",
    ]
    df.drop(columns=obsolete, inplace=True)

In [8]:
def optimize_dtypes(df: pd.DataFrame, uint_columns=('attack',)) -> None:
    """Reduce the memory usage of ``df`` in place by downcasting its columns.

    Columns named in ``uint_columns`` are downcast to the smallest unsigned
    integer dtype able to hold their values; every other column is downcast to
    the smallest float dtype pandas offers for downcasting (float32 at minimum).

    Columns are selected by name rather than by position. The earlier
    positional rule treated the last two columns as unsigned; in this
    notebook's pipeline those are ``agit_speed_pc`` and ``attack``, so
    ``agit_speed_pc`` ended up as an unsigned integer whenever all of its
    values happened to be whole numbers and was left un-downcast otherwise,
    giving the same column different dtypes in different output files.

    Args:
        df: Frame with numeric columns; modified in place.
        uint_columns: Name, or iterable of names, of the columns to store as
            unsigned integers. Names that are absent from ``df`` are ignored.
    """
    if isinstance(uint_columns, str):
        # A bare string would otherwise be iterated character by character.
        uint_columns = (uint_columns,)
    requested = set(uint_columns)

    unsigned_cols = [name for name in df.columns if name in requested]
    float_cols = [name for name in df.columns if name not in requested]

    # Guard against empty selections so that a frame lacking one of the two
    # groups is still handled.
    if unsigned_cols:
        df[unsigned_cols] = df[unsigned_cols].apply(pd.to_numeric, downcast='unsigned')
    if float_cols:
        df[float_cols] = df[float_cols].apply(pd.to_numeric, downcast='float')

    # saving our dtypes description for further use (currently disabled)
#     dtypes_file = os.path.join(TARGET, DTYPES_FILE)
#     if not os.path.isfile(dtypes_file):
#         # we need to create it
#         names = df.dtypes.index  # columns names
#         types = [c.name for c in df.dtypes]  # columns types
#         dtypes_dict = dict(zip(names, types))  # dict for pandas.read_csv
#         with open(dtypes_file, 'w') as f:
#             json.dump(dtypes_dict, f)

    return

# Convert dataset

In [9]:
# Create the output directory together with any missing parent directories;
# nothing happens if it already exists.
os.makedirs(TARGET, exist_ok=True)

subdirs = ('single_states',
           'transient_processes',
           'attacks',
          )

for sub in subdirs:
    files = os.listdir(os.path.join(SOURCE, sub))
    # The printed count covers every directory entry; only the '.csv' entries
    # are converted below, so the progress bar total may be smaller.
    print(f'{len(files)} files in {sub}')
    # os.listdir returns entries in arbitrary order; sort for repeatable runs.
    for f in tqdm(sorted(name for name in files if name.endswith('.csv'))):
        # Raw files carry no header row; names are assigned by rename_columns.
        data = pd.read_csv(os.path.join(SOURCE, sub, f), header=None)
        rename_columns(data)
        set_index(data)
        data = downsample(data)
        trim_features(data)
        optimize_dtypes(data)
        # NOTE(review): every subdirectory writes into the same flat TARGET, so
        # files sharing a base name across subdirectories would overwrite each
        # other - confirm that names are unique across the three subdirectories.
        p = os.path.splitext(f)[0] + '.snappy'
        data.to_parquet(os.path.join(TARGET, p), compression='snappy')

200 files in single_states


  0%|          | 0/200 [00:00<?, ?it/s]

672 files in transient_processes


  0%|          | 0/346 [00:00<?, ?it/s]

142 files in attacks


  0%|          | 0/142 [00:00<?, ?it/s]