In [88]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical

In [4]:
# modify the default parameters of np.load
# np.load defaults to allow_pickle=False; this wrapper flips the default to True
# (presumably because the .npy files read later store pickled object arrays).
# SECURITY: unpickling can execute arbitrary code -- only load trusted files.
if not getattr(np.load, "_allow_pickle_default", False):
    np_load_old = np.load

    def _np_load_allow_pickle(*a, **k):
        # setdefault (instead of forcing the keyword) lets callers pass
        # allow_pickle themselves without a "multiple values" TypeError.
        k.setdefault("allow_pickle", True)
        return np_load_old(*a, **k)

    # The marker keeps this cell idempotent: re-running it must not wrap the
    # already-wrapped loader a second time.
    _np_load_allow_pickle._allow_pickle_default = True
    np.load = _np_load_allow_pickle

In [5]:
def cf(y_true, y_pred, fname, title, class_names=('Normal', 'Attack')):
    '''
    Create a confusion matrix plot and save it into a file
    Inputs:
    y_true: actual labels
    y_pred: prediction labels
    fname: file name
    title: title of the plot
    class_names: tick labels, one per class in np.unique(y_true) order;
                 when the count does not match the number of classes the
                 heatmap keeps the raw label values instead
    Outputs:
    None
    '''
    # Only classes present in y_true get a row/column, so every row sum is > 0
    # and the percentage division below cannot divide by zero.
    labels = np.unique(y_true)
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    cm_sum = np.sum(cm, axis=1, keepdims=True)
    cm_perc = cm / cm_sum.astype(float) * 100
    # Object dtype: a fixed-width string array would silently truncate long
    # annotations.
    annot = np.empty(cm.shape, dtype=object)
    nrows, ncols = cm.shape
    for i in range(nrows):
        for j in range(ncols):
            c = cm[i, j]
            p = cm_perc[i, j]
            if i == j:
                # Diagonal: percentage plus "correct/total" for the class.
                # Index the scalar explicitly; formatting a 1-element array
                # with %d is deprecated in recent NumPy.
                annot[i, j] = '%.1f%%\n%d/%d' % (p, c, cm_sum[i, 0])
            elif c == 0:
                annot[i, j] = '0'
            else:
                annot[i, j] = '%.1f%%\n%d' % (p, c)
    cm = pd.DataFrame(cm, index=labels, columns=labels)
    cm.index.name = 'Actual'
    cm.columns.name = 'Predicted'
    # Draw on a fresh figure so leftovers from a previous plot are never saved.
    fig, ax = plt.subplots()
    sns.heatmap(cm, cmap="Blues", annot=annot, fmt='', ax=ax)
    ax.set_title(title)
    if class_names is not None and len(class_names) == len(labels):
        ax.xaxis.set_ticklabels(list(class_names))
        ax.yaxis.set_ticklabels(list(class_names))
    fig.savefig(fname, dpi=300)
    plt.show()

In [6]:
def preproc(X, y, random_state=None):
    '''
    Pre-process the data: stratified train/validation/test split, mean
    imputation of missing values, and min-max scaling. The imputer and the
    scaler are fitted on the training split only and then applied to the
    validation and test splits.
    Inputs:
    X: input features
    y: input labels
    random_state: optional seed forwarded to both splits for reproducibility
                  (default None keeps the original random behaviour)
    Outputs:
    X_train: training samples
    X_val: validation samples
    X_test: testing samples
    y_train: training labels
    y_val: validation labels
    y_test: testing labels

    '''
    # 20% of the data is held out for testing; 20% of the remaining
    # development set becomes the validation set.
    X_dev, X_test, y_dev, y_test = train_test_split(
        X, y, stratify=y, test_size=0.2, random_state=random_state)
    X_train, X_val, y_train, y_val = train_test_split(
        X_dev, y_dev, stratify=y_dev, test_size=0.2, random_state=random_state)

    # handle missing values
    # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    X_train = imputer.fit_transform(X_train)  # fit on training data only
    X_val = imputer.transform(X_val)
    X_test = imputer.transform(X_test)

    # normalize the data
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)  # fit on training data only
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)

    return X_train, X_val, X_test, y_train, y_val, y_test

In [7]:
def rep_learning(X, rep = 8, epochs = 100, batch_size = 256):
    '''
    This function trains an autoencoder on X and returns its encoder half as
    a representation learning model
    Inputs:
    X: input features (2-D; X.shape[1] is used as the feature count)
    rep: new representation dims
    epochs: number of training epochs for the autoencoder
    batch_size: training batch size for the autoencoder
    Outputs:
    model: encoder of the autoencoder (input -> rep-dimensional code),
           sharing the trained weights

    '''
    # Layers are taken from the already-imported `layers` module; the bare
    # names Input/Dense/Sequential are never imported in this notebook.
    #Encoder part
    input_X = layers.Input(shape = (X.shape[1],)) # Input shape of the data
    encoded = layers.Dense(units = 200, activation = 'tanh')(input_X)
    encoded = layers.Dense(units = 64, activation = 'tanh')(encoded)
    encoded = layers.Dense(units = rep, activation = 'tanh')(encoded) # Representation layer
    #Decoder part
    decoded = layers.Dense(units = 64, activation = 'tanh')(encoded)
    # Chain from `decoded`: feeding `encoded` here would bypass the 64-unit
    # decoder layer and leave it disconnected from the output.
    decoded = layers.Dense(units = 200, activation = 'tanh')(decoded)
    decoded = layers.Dense(units = X.shape[1], activation = 'tanh')(decoded)

    autoencoder = Model(input_X, decoded)
    autoencoder.compile(optimizer = 'adam', loss = 'mean_squared_error', metrics = ['mse'])

    autoencoder.fit(X, X, epochs = epochs, batch_size = batch_size, shuffle = True)

    # Making a model using the encoder part: reuse the graph tensors directly
    # instead of picking layers by position, which silently breaks whenever
    # the architecture changes.
    model = Model(input_X, encoded)

    return model

In [2]:
def load_file(filepath):
    """Load a single file as a numpy array.

    Thin wrapper that forwards `filepath` to `np.load` and returns its result.
    """
    return np.load(filepath)

In [5]:
#Load network data
# (file, columns removed along axis 2) per capture; the last two captures
# drop one extra column (20) compared with the others.
NET_SOURCES = [
    ("d1.npy", [0, 1, 2, 3]),
    ("d2.npy", [0, 1, 2, 3]),
    ("d28.npy", [0, 1, 2, 3]),
    ("d29.npy", [0, 1, 2, 3]),
    ("d30.npy", [0, 1, 2, 3, 20]),
    ("d31.npy", [0, 1, 2, 3, 20]),
]
# Columns of the flattened frame that are label-encoded to integers.
NET_ENCODED_COLS = [0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12]

data1, data2, data3, data4, data5, data6 = [
    np.delete(load_file(path), drop_cols, 2) for path, drop_cols in NET_SOURCES
]

data = np.concatenate((data1, data2, data3, data4, data5, data6), axis = 0)
# Remember the per-sample window length so the 3-D layout can be restored
# below without a hard-coded constant.
window_len = data.shape[1]
data = np.reshape(data, (data.shape[0] * window_len, data.shape[2]))

df = pd.DataFrame(data).drop(columns = 13)

# One encoder instance is refitted per column (fit_transform discards the
# previous fit), so after the loop it only remembers the last column.
labelencoder = LabelEncoder()
for col in NET_ENCODED_COLS:
    df[col] = labelencoder.fit_transform(df[col])

# NOTE(review): the imputer and scaler below are fitted on the full data set,
# before any train/test split, so test-set statistics leak into the features.
# np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean') # Define the imputer with mean strategy
df = imputer.fit_transform(df) # Fit the imputer and transform; returns an ndarray

scaler = MinMaxScaler()
df = scaler.fit_transform(df)

# Restore the (samples, window, features) layout that was flattened above.
data = np.reshape(df, (-1, window_len, df.shape[1]))
data_net = np.asarray(data).astype('float32')

In [33]:
#Load sensor data
SWAT_FILES = ["1.csv", "2.csv", "28.csv", "29.csv", "30.csv", "31.csv"]
# Raw label spellings found in the CSVs (including the 'A ttack' typo) -> class id.
SWAT_LABELS = {'Normal': 0, 'Attack': 1, 'A ttack': 1}
SWAT_SAMPLE_SEED = 42  # fixed so the sampled rows are reproducible across runs

f = [pd.read_csv(path) for path in SWAT_FILES]
data_swat1, data_swat2, data_swat3, data_swat4, data_swat5, data_swat6 = f
data_swat = pd.concat(f)
# Draw as many sensor rows as there are network samples so both arrays can be
# paired index-by-index later.
# NOTE(review): rows are drawn at random, so sensor row i is not time-aligned
# with network sample i -- confirm this pairing is intended.
data_swat = data_swat.sample(data_net.shape[0], random_state = SWAT_SAMPLE_SEED)

# Recode labels with an explicit mapping instead of masked assignment on a
# Series taken from the frame.
y_swat = data_swat["Normal/Attack"].map(SWAT_LABELS)
if y_swat.isna().any():
    unknown = sorted(set(data_swat["Normal/Attack"][y_swat.isna()].astype(str)))
    raise ValueError("Unexpected values in 'Normal/Attack': %s" % unknown)
data_swat = data_swat.drop(['Normal/Attack',' Timestamp'] , axis = 1)

y_swat = np.array(y_swat)
y_swat = y_swat.astype("int64")

data_swat = np.array(data_swat)

In [34]:
# Inspect the network array's shape; its first dimension is the number of
# samples that the pairing loop below iterates over.
data_net.shape

(410422, 10, 16)

In [35]:
# Inspect the sensor array's shape; it needs at least data_net.shape[0] rows
# because the pairing loop below indexes it with the same i.
data_swat.shape

(410422, 51)

In [68]:
# Pair the i-th network window with the i-th sensor row so a single split
# keeps both modalities together.
data = [
    [data_net[idx, :, :], data_swat[idx, :]]
    for idx in range(data_net.shape[0])
]

In [69]:
# Stratified 80/20 split of the [network, sensor] pairs. A fixed seed makes the
# split (and everything trained on it) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    data, y_swat, stratify=y_swat, test_size=0.2, random_state=42
)

In [102]:
# Number of [network, sensor] pairs in the training split.
len(X_train)

328337

In [106]:
# Each sample is a [network_window, sensor_row] pair; separate the two
# modalities again so each can feed its own model input.
X_train_net = [pair[0] for pair in X_train]
X_train_sen = [pair[1] for pair in X_train]

X_test_net = [pair[0] for pair in X_test]
X_test_sen = [pair[1] for pair in X_test]

In [109]:
# Turn the per-sample lists into dense arrays.
X_train_net, X_test_net, X_train_sen, X_test_sen = (
    np.array(part)
    for part in (X_train_net, X_test_net, X_train_sen, X_test_sen)
)

In [110]:
# Show the shape of every split, network arrays first, then sensor arrays.
for split in (X_train_net, X_test_net, X_train_sen, X_test_sen):
    print(split.shape)

(328337, 10, 16)
(82085, 10, 16)
(328337, 51)
(82085, 51)


In [87]:
# One-hot encode the two-class labels for the softmax output layer.
y_train_oh, y_test_oh = (
    to_categorical(labels, num_classes=2) for labels in (y_train, y_test)
)

In [112]:
# Sensor branch: dense stack over one sensor row.
# shape must be a tuple -- `(n)` without the trailing comma is just an int,
# which newer Keras versions reject.
sens_inp = layers.Input(shape = (X_train_sen.shape[1],), name = "Sensor")
x = layers.Dense(512, activation = 'relu')(sens_inp)
x = layers.Dense(128, activation= 'relu')(x)
x = layers.Dense(256, activation = 'relu')(x)
sens_out = layers.Dense(32, activation = 'relu')(x)

# Network branch: stacked LSTMs over one network window, then a dense stack.
net_inp = layers.Input(shape = (X_train_net.shape[1], X_train_net.shape[2]), name = "Network")
x = layers.LSTM(1024, return_sequences= True)(net_inp)
x = layers.LSTM(256)(x)
x = layers.Dense(512, activation = 'relu')(x)
x = layers.Dense(64, activation = 'relu')(x)
net_out = layers.Dense(32, activation= 'relu')(x)

# Fuse the two 32-dim branch embeddings and classify into 2 classes.
x = layers.concatenate([sens_out, net_out])
x = layers.Dense(32, activation = 'relu')(x)

out = layers.Dense(2, activation = 'softmax')(x)

# Input order is [network, sensor]; data passed to fit/predict must follow it.
model = Model(inputs = [net_inp, sens_inp], outputs = out)

model.summary()


Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 Network (InputLayer)           [(None, 10, 16)]     0           []                               
                                                                                                  
 Sensor (InputLayer)            [(None, 51)]         0           []                               
                                                                                                  
 lstm_6 (LSTM)                  (None, 10, 1024)     4263936     ['Network[0][0]']                
                                                                                                  
 dense_35 (Dense)               (None, 512)          26624       ['Sensor[0][0]']                 
                                                                                            

In [114]:
# The head is a 2-unit softmax trained on one-hot labels, so categorical
# cross-entropy is the matching loss.
model.compile(optimizer= 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])
# Validation inputs must mirror the training inputs ([network, sensor] arrays);
# X_test is still the raw list of [network, sensor] pairs and cannot be fed to
# the two-input model.
# NOTE(review): the test split doubles as validation data here, so no
# untouched hold-out set remains for a final evaluation.
model.fit(
    [X_train_net, X_train_sen],
    y_train_oh,
    epochs= 2,
    batch_size = 512,
    validation_data=([X_test_net, X_test_sen], y_test_oh),
)

Epoch 1/2
 94/642 [===>..........................] - ETA: 8:46 - loss: 0.5079 - accuracy: 0.9422

KeyboardInterrupt: 