# USAD

## Environment

In [None]:
# Shell escape: recursively delete the `sample_data` directory from the working directory.
!rm -r sample_data

In [None]:
# Shell escape: clone the manigalati/usad repository into the working directory.
!git clone https://github.com/manigalati/usad

In [1]:
import torch.utils.data as data_utils
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
from contextlib2 import redirect_stdout
from utils import *
from usad import *
from pathlib import Path

## 파라미터 세팅

In [2]:
# Experiment configuration shared by the training and testing cells.
DATANAME = 'SWaT'  # dataset to use: one of HAI, SWaT, WADI
FILENUM = 'USAD_front'  # prefix used when composing the run's file name
window_size = 12  # number of consecutive rows in each sliding window
BATCH_SIZE = 2048
N_EPOCHS = 100
hidden_size = 100  # multiplied by the window length to size the latent vector
alpha = 0.5  # passed to testing_stacking together with beta
beta = 0.5

In [None]:
# Release the unused GPU memory cached by PyTorch's CUDA allocator before the run starts.
torch.cuda.empty_cache()

In [None]:
# Shell escape: list the GPUs visible to this runtime.
!nvidia-smi -L

# Device handle later passed to to_device() for the model.
# NOTE(review): get_default_device comes from a star import; presumably it selects CUDA when available — confirm.
device = get_default_device()

In [None]:
def dataframe_from_csv(target):
    """Read one CSV into a DataFrame and strip whitespace around its column names."""
    frame = pd.read_csv(target)
    return frame.rename(columns=str.strip)

def dataframe_from_csvs(targets):
    """Read every CSV in ``targets`` and concatenate the frames row-wise, in order."""
    frames = list(map(dataframe_from_csv, targets))
    return pd.concat(frames)

def normalize(df):
    """Return a min-max scaled copy of ``df`` using the global TAG_MIN / TAG_MAX.

    Each column is shifted by its TAG_MIN entry and divided by the
    TAG_MAX - TAG_MIN range. A column whose minimum equals its maximum is
    only shifted, which avoids a division by zero.
    """
    scaled = df.copy()
    for name in df.columns:
        low, high = TAG_MIN[name], TAG_MAX[name]
        shifted = df[name] - low
        scaled[name] = shifted if low == high else shifted / (high - low)
    return scaled

## Data PreProcessing

### Download dataset

In [None]:
# Data Load & Merge function
def data_load_merge(dataname):
    """Load the normal-operation (training) data for the chosen dataset.

    Args:
        dataname: One of ``'HAI'``, ``'SWaT'`` or ``'WADI'``.

    Returns:
        A DataFrame of training rows. For HAI and WADI only the first 80%
        of the rows are kept; SWaT is returned in full.

    Raises:
        ValueError: If ``dataname`` is not a supported dataset name.
        FileNotFoundError: If no HAI training archives are found.
    """
    if dataname == 'HAI':
        csv_paths = sorted(Path("data/HAI 2.0/training/").glob("*.csv.gz"))
        if not csv_paths:
            raise FileNotFoundError(
                "No *.csv.gz files found under 'data/HAI 2.0/training/'"
            )
        train_data = dataframe_from_csvs(csv_paths)
        # Keep only the first 80% of the training rows.
        length = int(train_data.shape[0] * 0.8)
        train_data = train_data.iloc[:length]

    elif dataname == 'SWaT':
        train_data = pd.read_csv("data/SWaT_Dataset_Normal_v1.csv")

    elif dataname == 'WADI':
        train_data = pd.read_csv("data/WADI_14days_new.csv")
        # Keep only the first 80% of the training rows.
        length = int(train_data.shape[0] * 0.8)
        train_data = train_data.iloc[:length]

    else:
        # Without this branch an unknown name fell through to the return
        # statement and failed with an opaque UnboundLocalError.
        raise ValueError(
            f"Unknown dataset name {dataname!r}; expected 'HAI', 'SWaT' or 'WADI'"
        )

    return train_data

In [None]:
# Load the training split for the configured dataset and print its shape.
normal_original = data_load_merge(DATANAME) 
print(normal_original.shape)

### Normal period

In [None]:

from sklearn import preprocessing

# Scaler fitted on the normal data in the SWaT and WADI branches below.
min_max_scaler = preprocessing.MinMaxScaler()

# HAI data preprocessing
if DATANAME == 'HAI':
    columns = normal_original.columns
    # Columns left out of the model input: the timestamp plus a fixed set of
    # columns selected by position.
    DROP_FIELD = ["time",
                columns[2], columns[3], columns[14], columns[18], columns[19], columns[21], columns[22], columns[25], columns[33], columns[34], columns[35], columns[37], columns[40], columns[43], columns[51], columns[52], columns[59], columns[61], columns[63], columns[64], columns[65], columns[67],
                columns[4], columns[5], columns[6], columns[7], columns[8], columns[10], columns[11], columns[17], columns[24], columns[28], columns[32], columns[44], columns[46], columns[48], columns[49], columns[50], columns[53], columns[58], columns[62], columns[71], columns[76], columns[78], columns[79]]

    # Drop, via DROP_FIELD, the variables that are not used for normalization.
    VALID_COLUMNS_IN_TRAIN_DATASET = normal_original.columns.drop(DROP_FIELD)

    # Per-column extrema read by normalize() through these globals.
    TAG_MIN = normal_original[VALID_COLUMNS_IN_TRAIN_DATASET].min()
    TAG_MAX = normal_original[VALID_COLUMNS_IN_TRAIN_DATASET].max()

    # Min-max normalize, then smooth with an exponentially weighted mean.
    normal = normalize(normal_original[VALID_COLUMNS_IN_TRAIN_DATASET]).ewm(alpha=0.9).mean()

# SWaT data preprocessing
elif DATANAME == 'SWaT':
    columns = normal_original.columns
    DROP_FIELD = ["Timestamp", "Normal/Attack",'P102','P201','P202','P204','P206','P401','P403','P404','P502','P601','P603','P301']

    VALID_COLUMNS_IN_TRAIN_DATASET = normal_original.columns.drop(DROP_FIELD)
    # Work on an explicit copy: assigning columns into a slice of
    # normal_original is a chained assignment that triggers pandas'
    # SettingWithCopyWarning and is not guaranteed to take effect.
    normal = normal_original[VALID_COLUMNS_IN_TRAIN_DATASET].copy()

    # Transform all columns into float64; values may use a decimal comma,
    # so it is replaced with a dot before the cast.
    for col in list(normal):
        normal[col] = normal[col].apply(lambda v: str(v).replace(",", "."))
    normal = normal.astype(float)

    # Min-Max Normalize
    x = normal.values
    x_scaled = min_max_scaler.fit_transform(x)
    normal = pd.DataFrame(x_scaled)

# WADI data preprocessing
elif DATANAME == 'WADI':
    columns = normal_original.columns
    DROP_FIELD = ["Time", "Date","Row", '2_LS_001_AL', '2_LS_002_AL', '2_P_001_STATUS', '2_P_002_STATUS','1_LS_001_AL','1_LS_002_AL','1_MV_002_STATUS','1_MV_003_STATUS','1_P_002_STATUS','1_P_004_STATUS','1_P_006_STATUS','2_P_004_STATUS','2_PIC_003_SP','2_SV_101_STATUS','2_SV_201_STATUS','2_SV_301_STATUS'
    ,'2_SV_401_STATUS','2_SV_501_STATUS','2_SV_601_STATUS','3_LS_001_AL','3_MV_001_STATUS','3_MV_002_STATUS','3_MV_003_STATUS','3_P_001_STATUS','3_P_002_STATUS','3_P_003_STATUS','3_P_004_STATUS','PLANT_START_STOP_LOG',
    '2_MV_001_STATUS', '2_MV_002_STATUS','2_MV_004_STATUS', '2_MV_005_STATUS', '2_MV_009_STATUS']

    # Drop, via DROP_FIELD, the variables that are not used for normalization.
    VALID_COLUMNS_IN_TRAIN_DATASET = normal_original.columns.drop(DROP_FIELD)
    normal = normal_original[VALID_COLUMNS_IN_TRAIN_DATASET]

    # Min-Max Normalize
    x = normal.values
    x_scaled = min_max_scaler.fit_transform(x)
    normal = pd.DataFrame(x_scaled)



In [None]:
# Report the shape of the preprocessed normal data.
print("normal data shape : ", normal.shape)

## Windows_normal

In [None]:
# Slice the normalized rows into overlapping windows of `window_size`
# consecutive rows. Broadcasting a column of start positions against a row of
# in-window offsets yields a 2-D index grid, one row of indices per window.
# Start positions run from 0 to len(normal) - window_size - 1.
window_offsets = np.arange(window_size)[None, :]
window_starts = np.arange(normal.shape[0] - window_size)[:, None]
windows_normal = normal.values[window_starts + window_offsets]

## Training

In [None]:

# Run identifier composed from the experiment settings.
file_name = f"{FILENUM}_{BATCH_SIZE}_{N_EPOCHS}_{hidden_size}"

# Model dimensions: the flattened window length (rows per window x features)
# and the latent size (rows per window x hidden_size).
n_windows, window_rows, n_features = windows_normal.shape[:3]
w_size = window_rows * n_features
z_size = window_rows * hidden_size

# Train on every window and validate on the last 20% of them.
# NOTE(review): the validation windows are also contained in the training
# set — confirm this overlap is intended.
windows_normal_train = windows_normal
val_start = int(np.floor(.8 * n_windows))
windows_normal_val = windows_normal[val_start:n_windows]

In [None]:


# Flatten each window into one vector of length w_size and wrap both splits in
# DataLoaders. shuffle=False keeps the batches in window order.
train_tensor = torch.from_numpy(windows_normal_train).float().view(
    windows_normal_train.shape[0], w_size)
train_loader = torch.utils.data.DataLoader(
    data_utils.TensorDataset(train_tensor),
    batch_size=BATCH_SIZE, shuffle=False, num_workers=0)

val_tensor = torch.from_numpy(windows_normal_val).float().view(
    windows_normal_val.shape[0], w_size)
val_loader = torch.utils.data.DataLoader(
    data_utils.TensorDataset(val_tensor),
    batch_size=BATCH_SIZE, shuffle=False, num_workers=0)

# Build the model for these dimensions and move it to the selected device.
model = to_device(UsadModel(w_size, z_size), device)

In [None]:
# Train the model for N_EPOCHS with the training and validation loaders; the helper's return value is kept as `history`.
history = training(N_EPOCHS,model,train_loader,val_loader)

In [None]:
# Persist the encoder and both decoders as one checkpoint file.
# torch.save does not create missing parent directories, so make sure the
# target directory exists first; otherwise the trained weights are lost.
checkpoint_path = Path('usad_hai/'+'usad_front_'+"model.pth")
checkpoint_path.parent.mkdir(parents=True, exist_ok=True)
torch.save({
            'encoder': model.encoder.state_dict(),
            'decoder1': model.decoder1.state_dict(),
            'decoder2': model.decoder2.state_dict()
            }, str(checkpoint_path))

## training data prediction

In [None]:

# Rebuild a fresh model with the same dimensions and move it to the device.
model = UsadModel(w_size, z_size)
model = to_device(model,device)

# map_location remaps the stored tensors onto `device`, so a checkpoint that
# was saved on a GPU machine can still be loaded on a CPU-only one.
checkpoint = torch.load('usad_hai/'+'usad_front_'+"model.pth", map_location=device)
model.encoder.load_state_dict(checkpoint['encoder'])
model.decoder1.load_state_dict(checkpoint['decoder1'])
model.decoder2.load_state_dict(checkpoint['decoder2'])

In [None]:
# Run the loaded model over the training loader with the configured alpha/beta.
# NOTE(review): the following cells treat the result as a sequence of per-batch tensors — confirm against testing_stacking.
results1=testing_stacking(model, train_loader, alpha=alpha, beta=beta)

In [None]:
# All batches except the last are stacked into a single array; the last batch
# is converted on its own (presumably because it can be smaller than the rest).
stacked_batches = torch.stack(results1[:-1])
y_pred1 = stacked_batches.detach().cpu().numpy()
final_batch = results1[-1]
y_pred2 = final_batch.detach().cpu().numpy()

for label, arr in (('y_pred1.shape', y_pred1), ('y_pred2.shape', y_pred2)):
    print(label, arr.shape)

In [None]:
# Merge the first two axes of y_pred1 so it becomes 2-D like y_pred2.
shape1 = y_pred1.shape[0] * y_pred1.shape[1]
shape2 = y_pred1.shape[2]
y_pred1 = y_pred1.reshape((shape1, shape2))
for arr in (y_pred1, y_pred2):
    print(arr.shape)

In [None]:
# Join both prediction parts row-wise and restore the 3-D layout of the
# training windows.
flat_pred = np.concatenate([y_pred1, y_pred2])
shape1, shape2, shape3 = windows_normal_train.shape[:3]
usad_pred = flat_pred.reshape(shape1, shape2, shape3)
print(usad_pred.shape)

In [None]:
# Store the training-set predictions as a .npy file.
# NOTE(review): the file name is hard-coded to "swat" whatever DATANAME is set to — confirm before running on HAI or WADI.
np.save('usad_hai/usad_swat_train.npy', usad_pred)

## Testing

In [3]:
# Data Load & Merge function
def data_load_merge(dataname):
    """Load the evaluation (attack/validation) data for the chosen dataset.

    Args:
        dataname: One of ``'HAI'``, ``'SWaT'`` or ``'WADI'``.

    Returns:
        A DataFrame with the evaluation rows of the selected dataset.

    Raises:
        ValueError: If ``dataname`` is not a supported dataset name.
        FileNotFoundError: If no HAI validation archives are found.
    """
    if dataname == 'HAI':
        VALIDATION_DATASET = sorted(Path("data/HAI 2.0/validation/").glob("*.csv.gz"))
        if not VALIDATION_DATASET:
            raise FileNotFoundError(
                "No *.csv.gz files found under 'data/HAI 2.0/validation/'"
            )
        validation_data = dataframe_from_csvs(VALIDATION_DATASET)

    elif dataname == 'SWaT':
        # This file is read with a semicolon separator.
        validation_data = pd.read_csv("data/SWaT_Dataset_Attack_v0.csv", sep=";")

    elif dataname == 'WADI':
        # header=1: the second line of the file holds the column names.
        validation_data = pd.read_csv("data/WADI_attackdataLABLE.csv", header=1)

    else:
        # Without this branch an unknown name fell through to the return
        # statement and failed with an opaque UnboundLocalError.
        raise ValueError(
            f"Unknown dataset name {dataname!r}; expected 'HAI', 'SWaT' or 'WADI'"
        )

    return validation_data

In [4]:
def dataframe_from_csv(target):
    """Read one CSV into a DataFrame and strip whitespace around its column names."""
    frame = pd.read_csv(target)
    return frame.rename(columns=str.strip)

def dataframe_from_csvs(targets):
    """Read every CSV in ``targets`` and concatenate the frames row-wise, in order."""
    frames = list(map(dataframe_from_csv, targets))
    return pd.concat(frames)

def normalize(df):
    """Return a min-max scaled copy of ``df`` using the global TAG_MIN / TAG_MAX.

    Each column is shifted by its TAG_MIN entry and divided by the
    TAG_MAX - TAG_MIN range. A column whose minimum equals its maximum is
    only shifted, which avoids a division by zero.
    """
    scaled = df.copy()
    for name in df.columns:
        low, high = TAG_MIN[name], TAG_MAX[name]
        shifted = df[name] - low
        scaled[name] = shifted if low == high else shifted / (high - low)
    return scaled

In [5]:
# Load the evaluation data for the configured dataset.
attack_original= data_load_merge(DATANAME) 

  if (await self.run_code(code, result,  async_=asy)):


In [6]:

from sklearn import preprocessing

# NOTE(review): a new scaler is created here and fitted on the attack data
# below, so the evaluation set is scaled with its own min/max rather than the
# ranges of the data the model was trained on. Confirm this is intended;
# reusing the training-time scaler with .transform() would keep both splits on
# the same scale.
min_max_scaler = preprocessing.MinMaxScaler()

# HAI data preprocessing
if DATANAME == 'HAI':
    columns = attack_original.columns
    # Columns left out of the model input: the timestamp plus a fixed set of
    # columns selected by position.
    DROP_FIELD = ["time",
                columns[2], columns[3], columns[14], columns[18], columns[19], columns[21], columns[22], columns[25], columns[33], columns[34], columns[35], columns[37], columns[40], columns[43], columns[51], columns[52], columns[59], columns[61], columns[63], columns[64], columns[65], columns[67],
                columns[4], columns[5], columns[6], columns[7], columns[8], columns[10], columns[11], columns[17], columns[24], columns[28], columns[32], columns[44], columns[46], columns[48], columns[49], columns[50], columns[53], columns[58], columns[62], columns[71], columns[76], columns[78], columns[79]]

    # Drop, via DROP_FIELD, the variables that are not used for normalization.
    VALID_COLUMNS_IN_TRAIN_DATASET = attack_original.columns.drop(DROP_FIELD)

    # Per-column extrema read by normalize() through these globals.
    # NOTE(review): they are derived from the attack data itself — see the
    # scaling note at the top of this cell.
    TAG_MIN = attack_original[VALID_COLUMNS_IN_TRAIN_DATASET].min()
    TAG_MAX = attack_original[VALID_COLUMNS_IN_TRAIN_DATASET].max()

    # Min-Max Normalize
    # NOTE(review): the training-side HAI branch also applies
    # .ewm(alpha=0.9).mean() after normalizing; that smoothing is absent
    # here — confirm.
    attack = normalize(attack_original[VALID_COLUMNS_IN_TRAIN_DATASET])


# SWaT data preprocessing
elif DATANAME == 'SWaT':
    columns = attack_original.columns
    DROP_FIELD = ["Timestamp", "Normal/Attack",'P102','P201','P202','P204','P206','P401','P403','P404','P502','P601','P603','P301']

    VALID_COLUMNS_IN_TRAIN_DATASET = attack_original.columns.drop(DROP_FIELD)
    # Work on an explicit copy: assigning columns into a slice of
    # attack_original is a chained assignment that triggers pandas'
    # SettingWithCopyWarning and is not guaranteed to take effect.
    attack = attack_original[VALID_COLUMNS_IN_TRAIN_DATASET].copy()

    # Transform all columns into float64; values may use a decimal comma,
    # so it is replaced with a dot before the cast.
    for col in list(attack):
        attack[col] = attack[col].apply(lambda v: str(v).replace(",", "."))
    attack = attack.astype(float)

    # Min-Max Normalize
    x = attack.values
    x_scaled = min_max_scaler.fit_transform(x)
    attack = pd.DataFrame(x_scaled)

# WADI data preprocessing
elif DATANAME == 'WADI':
    columns = attack_original.columns
    DROP_FIELD = ["Time", "Date","Row", '2_LS_001_AL', '2_LS_002_AL', '2_P_001_STATUS', '2_P_002_STATUS','1_LS_001_AL','1_LS_002_AL','1_MV_002_STATUS','1_MV_003_STATUS','1_P_002_STATUS','1_P_004_STATUS','1_P_006_STATUS','2_P_004_STATUS','2_PIC_003_SP','2_SV_101_STATUS','2_SV_201_STATUS','2_SV_301_STATUS'
    ,'2_SV_401_STATUS','2_SV_501_STATUS','2_SV_601_STATUS','3_LS_001_AL','3_MV_001_STATUS','3_MV_002_STATUS','3_MV_003_STATUS','3_P_001_STATUS','3_P_002_STATUS','3_P_003_STATUS','3_P_004_STATUS','PLANT_START_STOP_LOG',
    '2_MV_001_STATUS', '2_MV_002_STATUS','2_MV_004_STATUS', '2_MV_005_STATUS', '2_MV_009_STATUS']
    TIMESTAMP_FIELD = "Time"
    ATTACK_FIELD = "attack"

    # Drop, via DROP_FIELD, the variables that are not used for normalization.
    VALID_COLUMNS_IN_TRAIN_DATASET = attack_original.columns.drop(DROP_FIELD)
    attack = attack_original[VALID_COLUMNS_IN_TRAIN_DATASET]

    # Min-Max Normalize
    x = attack.values
    x_scaled = min_max_scaler.fit_transform(x)
    attack = pd.DataFrame(x_scaled)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  attack[i]=attack[i].apply(lambda x: str(x).replace("," , "."))


In [7]:
# Slice the normalized attack rows into overlapping windows of `window_size`
# consecutive rows. Broadcasting a column of start positions against a row of
# in-window offsets yields a 2-D index grid, one row of indices per window.
# Start positions run from 0 to len(attack) - window_size - 1.
window_offsets = np.arange(window_size)[None, :]
window_starts = np.arange(attack.shape[0] - window_size)[:, None]
windows_attack = attack.values[window_starts + window_offsets]
windows_attack.shape

(449907, 12, 39)

In [8]:
# Derive the model dimensions from the attack windows: the flattened window
# length (rows per window x features) and the latent size (rows per window x
# hidden_size). Then build the model and move it to the selected device.
window_rows, n_features = windows_attack.shape[1], windows_attack.shape[2]
w_size = window_rows * n_features
z_size = window_rows * hidden_size
model = to_device(UsadModel(w_size, z_size), device)

In [9]:

# Flatten each attack window into one vector of length w_size and serve the
# windows in their original order (shuffle=False).
test_tensor = torch.from_numpy(windows_attack).float().view(
    windows_attack.shape[0], w_size)
test_loader = torch.utils.data.DataLoader(
    data_utils.TensorDataset(test_tensor),
    batch_size=BATCH_SIZE, shuffle=False, num_workers=0)

In [10]:
# Restore the trained weights. map_location remaps the stored tensors onto
# `device`, so a checkpoint that was saved on a GPU machine can still be
# loaded on a CPU-only one.
checkpoint = torch.load('usad_hai/'+'usad_front_'+"model.pth", map_location=device)
model.encoder.load_state_dict(checkpoint['encoder'])
model.decoder1.load_state_dict(checkpoint['decoder1'])
model.decoder2.load_state_dict(checkpoint['decoder2'])

# Run the restored model over the attack windows with the configured alpha/beta.
results2=testing_stacking(model, test_loader,alpha=alpha, beta=beta)

In [11]:
# All batches except the last are stacked into a single array; the last batch
# is converted on its own (presumably because it can be smaller than the rest).
stacked_batches = torch.stack(results2[:-1])
y_pred1 = stacked_batches.detach().cpu().numpy()
final_batch = results2[-1]
y_pred2 = final_batch.detach().cpu().numpy()

for label, arr in (('y_pred1.shape', y_pred1), ('y_pred2.shape', y_pred2)):
    print(label, arr.shape)

y_pred1.shape (219, 2048, 468)
y_pred2.shape (1395, 468)


In [12]:
# Merge the first two axes of y_pred1 so it becomes 2-D like y_pred2.
shape1 = y_pred1.shape[0] * y_pred1.shape[1]
shape2 = y_pred1.shape[2]
y_pred1 = y_pred1.reshape((shape1, shape2))
for arr in (y_pred1, y_pred2):
    print(arr.shape)

# Join both prediction parts row-wise and restore the 3-D layout of the
# attack windows.
flat_pred = np.concatenate([y_pred1, y_pred2])
shape1, shape2, shape3 = windows_attack.shape[:3]
usad_test_pred = flat_pred.reshape(shape1, shape2, shape3)
# NOTE(review): the message below says "BACK" although FILENUM in this
# notebook is 'USAD_front', and it is printed before the file is written.
print("BACK 모델 test "+DATANAME+" data 결과 저장",usad_test_pred.shape)

# NOTE(review): the output name is hard-coded to "swat" whatever DATANAME is set to.
np.save('usad_hai/usad_swat_test.npy', usad_test_pred)

(448512, 468)
(1395, 468)
BACK 모델 test SWaT data 결과 저장 (449907, 12, 39)
