In [1]:
import warnings
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import random
import sklearn
import time
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score

In [2]:
warnings.filterwarnings("ignore", category=sklearn.exceptions.UndefinedMetricWarning)
warnings.filterwarnings("ignore", category=sklearn.exceptions.ConvergenceWarning)

In [3]:
df = pd.read_csv("../preprocessing/iot23_combined.csv", skiprows=lambda x: x > 0 and random.random() >= 0.05)
# 13 lakh instances need a too many hours to run
del df["Unnamed: 0"]
df

Unnamed: 0,duration,orig_bytes,resp_bytes,missed_bytes,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,label,proto_icmp,...,conn_state_RSTOS0,conn_state_RSTR,conn_state_RSTRH,conn_state_S0,conn_state_S1,conn_state_S2,conn_state_S3,conn_state_SF,conn_state_SH,conn_state_SHR
0,0.000000,0,0,0.0,1.0,60.0,0.0,0.0,PartOfAHorizontalPortScan,0,...,0,0,0,1,0,0,0,0,0,0
1,0.000000,0,0,0.0,1.0,60.0,0.0,0.0,PartOfAHorizontalPortScan,0,...,0,0,0,1,0,0,0,0,0,0
2,0.000000,0,0,0.0,1.0,60.0,0.0,0.0,PartOfAHorizontalPortScan,0,...,0,0,0,1,0,0,0,0,0,0
3,2.999056,0,0,0.0,3.0,180.0,0.0,0.0,PartOfAHorizontalPortScan,0,...,0,0,0,1,0,0,0,0,0,0
4,2.999050,0,0,0.0,3.0,180.0,0.0,0.0,PartOfAHorizontalPortScan,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67129,0.000000,0,0,0.0,0.0,0.0,0.0,0.0,DDoS,0,...,0,0,0,0,0,0,0,0,0,0
67130,0.000000,0,0,0.0,0.0,0.0,0.0,0.0,DDoS,0,...,0,0,0,0,0,0,0,0,0,0
67131,0.000000,0,0,0.0,0.0,0.0,0.0,0.0,DDoS,0,...,0,0,0,0,0,0,0,0,0,0
67132,0.000000,0,0,0.0,0.0,0.0,0.0,0.0,DDoS,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
df['label'].value_counts()

PartOfAHorizontalPortScan     41094
Benign                         9958
Okiru                          8165
DDoS                           6985
C&C                             733
Attack                          179
C&C-HeartBeat                    13
C&C-FileDownload                  5
C&C-Torii                         1
C&C-HeartBeat-FileDownload        1
Name: label, dtype: int64

In [5]:
X = df[['duration', 'orig_bytes', 'resp_bytes', 'missed_bytes', 'orig_pkts', 'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes', 'proto_icmp', 'proto_tcp', 'proto_udp', 'conn_state_OTH', 'conn_state_REJ', 'conn_state_RSTO', 'conn_state_RSTOS0', 'conn_state_RSTR', 'conn_state_RSTRH', 'conn_state_S0', 'conn_state_S1', 'conn_state_S2', 'conn_state_S3', 'conn_state_SF', 'conn_state_SH', 'conn_state_SHR']]
Y = df['label']

In [6]:
scaler = MinMaxScaler()
normalized_x = scaler.fit_transform(X)
normalized_x

array([[1.98444518e-03, 3.98406375e-04, 4.43285990e-06, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.98444518e-03, 3.98406375e-04, 4.43285990e-06, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.98444518e-03, 3.98406375e-04, 4.43285990e-06, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [1.98444518e-03, 3.98406375e-04, 4.43285990e-06, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.98444518e-03, 3.98406375e-04, 4.43285990e-06, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.98444518e-03, 3.98406375e-04, 4.43285990e-06, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [7]:
X_train, X_test, Y_train, Y_test = train_test_split(normalized_x, Y, random_state= 10, test_size=0.2)

In [8]:
print('program start...')
print()
start = time.time()
SVM_classifier = SVC(C=1.0, cache_size=1000, verbose=True).fit(X_train, Y_train)
print(SVM_classifier.score(X_test, Y_test))
y_pred = SVM_classifier.predict(X_test)
print(y_pred)
end = time.time()
print('program end...')
print()
print('time cost: ')
print(end - start, 'seconds')

program start...

[LibSVM]0.7515453936098905
['PartOfAHorizontalPortScan' 'Attack' 'PartOfAHorizontalPortScan' ...
 'Benign' 'PartOfAHorizontalPortScan' 'PartOfAHorizontalPortScan']
program end...

time cost: 
191.25554037094116 seconds


In [8]:
rep = classification_report(Y_test, y_pred, output_dict = True)
d = pd.DataFrame(rep).transpose()
d.to_csv("../Model Results/SVM.csv")