Loading useful packages

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.model_selection import train_test_split
import networkx as nx
import collections
import math
import matplotlib.pyplot as plt
import glob
from sklearn.preprocessing import StandardScaler

Loading data sets and combining them

In [23]:
g = glob.glob('F:\\SWaT Dataset\\*.csv')

In [24]:
dataframes = []

for filename in g[490:500]:
    print("loading file: {}".format(filename))
    dataframes.append(pd.read_csv(filename,error_bad_lines = False, sep=","))

loading file: F:\SWaT Dataset\2015-12-29_190411_104.log.part01_sorted.csv
loading file: F:\SWaT Dataset\2015-12-29_190411_104.log.part02_sorted.csv
loading file: F:\SWaT Dataset\2015-12-29_190411_104.log.part03_sorted.csv
loading file: F:\SWaT Dataset\2015-12-29_190411_104.log.part04_sorted.csv
loading file: F:\SWaT Dataset\2015-12-29_190411_104.log.part05_sorted.csv
loading file: F:\SWaT Dataset\2015-12-29_190411_104.log.part06_sorted.csv
loading file: F:\SWaT Dataset\2015-12-29_190411_104.log.part07_sorted.csv
loading file: F:\SWaT Dataset\2015-12-29_190411_104.log.part08_sorted.csv
loading file: F:\SWaT Dataset\2015-12-29_190411_104.log.part09_sorted.csv
loading file: F:\SWaT Dataset\2015-12-29_190411_104.log.part10_sorted.csv


In [25]:
df = pd.concat(dataframes,ignore_index=True)
df.head()

Unnamed: 0,num,date,time,orig,type,i/f_name,i/f_dir,src,dst,proto,appi_name,proxy_src_ip,Modbus_Function_Code,Modbus_Function_Description,Modbus_Transaction_ID,SCADA_Tag,Modbus_Value,service,s_port,Tag
0,1,29Dec2015,13:39:41,192.168.1.48,log,eth1,outbound,192.168.1.10,192.168.1.20,tcp,CIP_read_tag_service,192.168.1.10,76.0,Read Tag Service,62639.0,HMI_FIT201,Number of Elements: 1,44818.0,54592.0,0
1,2,29Dec2015,13:39:41,192.168.1.48,log,eth1,outbound,192.168.1.30,192.168.1.40,tcp,CIP_read_tag_service,192.168.1.30,76.0,Read Tag Service - Response,7644.0,HMI_LIT401,0x03 0xa7 0x66 0x44; 0x00 0x00 0x00 0x00; 0x00...,44818.0,52544.0,0
2,3,29Dec2015,13:39:41,192.168.1.48,log,eth1,outbound,192.168.1.60,192.168.1.10,tcp,CIP_read_tag_service,192.168.1.60,76.0,Read Tag Service,13765.0,HMI_LIT101,Number of Elements: 1,44818.0,53260.0,0
3,4,29Dec2015,13:39:41,192.168.1.48,log,eth1,outbound,192.168.1.60,192.168.1.20,tcp,CIP_read_tag_service,192.168.1.60,76.0,Read Tag Service,57718.0,HMI_AIT202,Number of Elements: 1,44818.0,53250.0,0
4,5,29Dec2015,13:39:41,192.168.1.48,log,eth1,outbound,192.168.1.60,192.168.1.10,tcp,CIP_read_tag_service,192.168.1.60,76.0,Read Tag Service - Response,13765.0,HMI_LIT101,0x44 0x5e 0x02 0x44; 0x00 0x00 0x00 0x00; 0x00...,44818.0,53260.0,0


Dropping columns which are not useful for analysis

In [26]:
df = df.drop([ "date", "time",'num'], axis=1)
df = df[df['i/f_dir']=='outbound']
df = df[df['src'].notna()]
df = df[df['dst'].notna()]
df = df[df['orig'].notna()]
df = df[df['proxy_src_ip'].notna()]
df = df[df['Modbus_Function_Code'].notna()]
df = df[df['Modbus_Function_Description'].notna()]
df = df[df['Modbus_Transaction_ID'].notna()]
df = df[df['SCADA_Tag'].notna()]
df = df[df['appi_name'].notna()]
df = df[df['proto'].notna()]
df = df[df['type'].notna()]
df = df[df['Modbus_Value'].notna()]

Converting hexadecimal to numeric using our function

In [27]:
import binascii, struct

def modbus_conversion(value):
    lst = []
    for i in range(len(value.split(";"))):
        x = value.split(";")[i]
        if " " in x:
            x = x.replace(" ","")
        if '0x' in x:
            x = x.replace("0x","")
            x = struct.unpack('<f',binascii.unhexlify(x))[0]
        lst.append(x)
    return lst[0]

In [28]:
df = df[df['Modbus_Value'] != 'Number of Elements: 1']
df['Modbus_Value'] = list(map(modbus_conversion,df['Modbus_Value']))
df = df.drop(['i/f_dir','i/f_name'],axis=1)

Converting IP addresses to numeric

In [29]:
import socket, struct

def ip2long(ip):
    """
    Convert an IP string to long
    """
    packedIP = socket.inet_aton(ip)
    return struct.unpack("!L", packedIP)[0]


df['src'] = list(map(ip2long,df['src']))
df['dst'] = list(map(ip2long,df['dst']))
df['orig'] = list(map(ip2long,df['orig']))
df['proxy_src_ip'] = list(map(ip2long,df['proxy_src_ip']))

df[['src','dst','orig','proxy_src_ip','Modbus_Value','Modbus_Function_Code','Modbus_Transaction_ID','s_port','service']] = StandardScaler().fit_transform(df[['src','dst','orig','proxy_src_ip','Modbus_Value','Modbus_Function_Code','Modbus_Transaction_ID','s_port','service']])

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


Factorising certain columns

In [30]:
df['SCADA_Tag'] = pd.factorize(df.SCADA_Tag)[0]
df['Modbus_Function_Description'] = pd.factorize(df.Modbus_Function_Description)[0]
df['appi_name'] = pd.factorize(df.appi_name)[0]
df['proto'] = pd.factorize(df.proto)[0]
df['type'] = pd.factorize(df.type)[0]

Train test split

In [31]:
y = df['Tag']
df = df.drop('Tag',axis=1)
x = df.values
X_train, X_test, y_train, y_test =train_test_split(x,y, test_size = 0.25)

In [10]:
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense
import numpy as np
from keras.utils import to_categorical
from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from scipy import stats
# fix random seed for reproducibility
np.random.seed(7)

Using TensorFlow backend.


Writing the model

In [32]:
model = Sequential()
model.add(Dense(56, input_dim=14, activation='sigmoid'))
model.add(Dense(56, activation='sigmoid'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=10000)
scaledtorscores = model.evaluate(X_train, y_train)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


Generating predictions and calculating confusion matrix

In [33]:
predictions = model.predict(X_test)

lt = [1]*len(y_test)

for i in range(len(predictions)):
    if predictions[i] >= 0.5:
        lt[i] = 1
    else:
        lt[i] = 0
        

matrix = confusion_matrix(y_test,lt)
matrix

array([[590688,      0],
       [ 34271,      0]], dtype=int64)