# Prepare

In [1]:
# import libraries
import os
import os.path
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
# Windows paths: every backslash is escaped so no sequence is read as an escape code.
SOURCE = 'Y:\\ZBU\\_Datasets\\SWaT\\SWaT.A1 _ A2_Dec 2015\\Physical'  # dataset source dir
TARGET = 'E:\\Datasets\\SWaT\\dataset12'  # dataset target dir

# Stages

In [3]:
def rename_columns(df: pd.DataFrame) -> None:
    """Replace the raw column headers with descriptive names, in place.

    Each header is stripped of surrounding whitespace and then looked up in
    the tag-to-name table below. Headers without an entry keep their
    stripped form.

    Args:
        df: frame whose column labels are strings; modified in place.
    """
    tag_to_name = {
        'Timestamp': 'time',
        # 1xx tags
        'FIT101': 'FIT101_flow_m3h',
        'LIT101': 'LIT101_level_mm',
        'MV101': 'MV101_feed_state',
        'P101': 'P101_pump_state',
        'P102': 'P102_pump_state',
        # 2xx tags
        'AIT201': 'AIT201_conductivity_uScm',
        'AIT202': 'AIT202_acidity_pH',
        'AIT203': 'AIT203_oxidation_mV',
        'FIT201': 'FIT201_flow_m3h',
        'MV201': 'MV201_feed_state',
        'P201': 'P201_pump_state',
        'P202': 'P202_pump_state',
        'P203': 'P203_pump_state',
        'P204': 'P204_pump_state',
        'P205': 'P205_pump_state',
        'P206': 'P206_pump_state',
        # 3xx tags
        'DPIT301': 'DPIT301_diffpressure_kPa',
        'FIT301': 'FIT301_flow_m3h',
        'LIT301': 'LIT301_level_mm',
        'MV301': 'MV301_feed_state',
        'MV302': 'MV302_feed_state',
        'MV303': 'MV303_feed_state',
        'MV304': 'MV304_feed_state',
        'P301': 'P301_pump_state',
        'P302': 'P302_pump_state',
        # 4xx tags
        'AIT401': 'AIT401_hardness_ppm',
        'AIT402': 'AIT402_oxidation_mV',
        'FIT401': 'FIT401_flow_m3h',
        'LIT401': 'LIT401_level_mm',
        'P401': 'P401_pump_state',
        'P402': 'P402_pump_state',
        'P403': 'P403_pump_state',
        'P404': 'P404_pump_state',
        'UV401': 'UV401_sterilizer_state',
        # 5xx tags
        'AIT501': 'AIT501_acidity_pH',
        'AIT502': 'AIT502_oxidation_mV',
        'AIT503': 'AIT503_conductivity_uScm',
        'AIT504': 'AIT504_conductivity_uScm',
        'FIT501': 'FIT501_flow_m3h',
        'FIT502': 'FIT502_flow_m3h',
        'FIT503': 'FIT503_flow_m3h',
        'FIT504': 'FIT504_flow_m3h',
        'P501': 'P501_pump_state',
        'P502': 'P502_pump_state',
        'PIT501': 'PIT501_pressure_kPa',
        'PIT502': 'PIT502_pressure_kPa',
        'PIT503': 'PIT503_pressure_kPa',
        # 6xx tags
        'FIT601': 'FIT601_flow_m3h',
        'P601': 'P601_pump_state',
        'P602': 'P602_pump_state',
        'P603': 'P603_pump_state',
        # label column
        'Normal/Attack': 'attack',
    }
    # Strip first: the lookup below only matches headers without padding.
    stripped = [label.strip() for label in df.columns.values]
    df.columns = [tag_to_name.get(label, label) for label in stripped]

In [4]:
def set_index(df: pd.DataFrame) -> None:
    """Parse the ``time`` column and make it the frame's index, in place.

    Args:
        df: frame with a ``time`` column of date strings written day-first;
            modified in place. The ``time`` column is removed from the
            regular columns once it becomes the index.
    """
    # dayfirst=True: an ambiguous date such as 01/02/2015 is read as 1 February.
    parsed_time = pd.to_datetime(df['time'], dayfirst=True)
    df['time'] = parsed_time
    df.set_index(keys='time', inplace=True)

In [5]:
def downsample(df: pd.DataFrame) -> pd.DataFrame:
    """Reduce a time-indexed frame to one row per minute.

    Args:
        df: frame indexed by timestamps.

    Returns:
        A new frame with one row per one-minute bin, holding the first entry
        of each column within that bin. The input frame is not modified.
    """
    per_minute = df.resample('1 min')
    return per_minute.first()

In [6]:
def trim_features(df: pd.DataFrame) -> None:
    """Encode the textual ``attack`` label as 0/1, in place.

    ``'Normal'`` becomes 0 and ``'Attack'`` becomes 1. Any other value is
    left as it is.

    Args:
        df: frame with an ``attack`` column; modified in place.
    """
    label_codes = {'Normal': 0, 'Attack': 1}
    encoded = df['attack'].replace(label_codes)
    df['attack'] = encoded

In [7]:
def optimize_dtypes(df: pd.DataFrame) -> None:
    """Shrink the frame's memory footprint by downcasting its columns, in place.

    The state columns and the ``attack`` label are passed through
    ``pd.to_numeric`` with ``downcast='unsigned'``; the sensor readings are
    passed through it with ``downcast='float'``.

    Args:
        df: frame that contains every column listed below, with values that
            are numeric or parseable as numbers; modified in place.
    """
    # These columns can be stored as unsigned integers.
    unsigned_columns = [
        'MV101_feed_state',
        'P101_pump_state',
        'P102_pump_state',
        'MV201_feed_state',
        'P201_pump_state',
        'P202_pump_state',
        'P203_pump_state',
        'P204_pump_state',
        'P205_pump_state',
        'P206_pump_state',
        'MV301_feed_state',
        'MV302_feed_state',
        'MV303_feed_state',
        'MV304_feed_state',
        'P301_pump_state',
        'P302_pump_state',
        'P401_pump_state',
        'P402_pump_state',
        'P403_pump_state',
        'P404_pump_state',
        'UV401_sterilizer_state',
        'P501_pump_state',
        'P502_pump_state',
        'P601_pump_state',
        'P602_pump_state',
        'P603_pump_state',
        'attack',
    ]
    # All remaining columns must be floats.
    floating_columns = [
        'FIT101_flow_m3h',
        'LIT101_level_mm',
        'AIT201_conductivity_uScm',
        'AIT202_acidity_pH',
        'AIT203_oxidation_mV',
        'FIT201_flow_m3h',
        'DPIT301_diffpressure_kPa',
        'FIT301_flow_m3h',
        'LIT301_level_mm',
        'AIT401_hardness_ppm',
        'AIT402_oxidation_mV',
        'FIT401_flow_m3h',
        'LIT401_level_mm',
        'AIT501_acidity_pH',
        'AIT502_oxidation_mV',
        'AIT503_conductivity_uScm',
        'AIT504_conductivity_uScm',
        'FIT501_flow_m3h',
        'FIT502_flow_m3h',
        'FIT503_flow_m3h',
        'FIT504_flow_m3h',
        'PIT501_pressure_kPa',
        'PIT502_pressure_kPa',
        'PIT503_pressure_kPa',
        'FIT601_flow_m3h',
    ]
    # Same conversion for both groups; only the downcast target differs.
    plan = (
        (unsigned_columns, 'unsigned'),
        (floating_columns, 'float'),
    )
    for columns, target in plan:
        df[columns] = df[columns].apply(pd.to_numeric, downcast=target)

# Convert dataset

In [8]:
# Create the target directory together with any missing parent directories.
# exist_ok=True makes this a no-op when the directory is already there, so the
# cell can be re-run safely.
os.makedirs(TARGET, exist_ok=True)

# Workbooks to convert, looked up inside SOURCE.
files = (
    'SWaT_Dataset_Normal_v1.xlsx',
    'SWaT_Dataset_Attack_v0.xlsx',
    'SWaT_Dataset_Normal_v0.xlsx',
)

for f in tqdm(files):
    filename = os.path.join(SOURCE, f)
    # skiprows=1: the first spreadsheet row is skipped, so the header is read
    # from the second row.
    data = pd.read_excel(filename, engine='openpyxl', skiprows=1)
    rename_columns(data)
    set_index(data)
    data = downsample(data)
    # Labels must be numeric before optimize_dtypes downcasts the 'attack' column.
    trim_features(data)
    optimize_dtypes(data)
    # Output keeps the workbook's base name, with '.snappy' as the extension.
    p = os.path.splitext(f)[0] + '.snappy'
    data.to_parquet(os.path.join(TARGET, p), compression='snappy')

  0%|          | 0/3 [00:00<?, ?it/s]

  warn(msg)
  warn(msg)
  warn(msg)
