In [1]:
import warnings
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import sklearn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier  
from sklearn.tree import DecisionTreeClassifier

In [2]:
warnings.filterwarnings("ignore")

In [3]:
df = pd.read_csv("../preprocessing/iot23_combined.csv")
del df["Unnamed: 0"]
df

Unnamed: 0,duration,orig_bytes,resp_bytes,missed_bytes,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,label,proto_icmp,...,conn_state_RSTOS0,conn_state_RSTR,conn_state_RSTRH,conn_state_S0,conn_state_S1,conn_state_S2,conn_state_S3,conn_state_SF,conn_state_SH,conn_state_SHR
0,2.998796,0,0,0.0,3.0,180.0,0.0,0.0,PartOfAHorizontalPortScan,0,...,0,0,0,1,0,0,0,0,0,0
1,0.000000,0,0,0.0,1.0,60.0,0.0,0.0,PartOfAHorizontalPortScan,0,...,0,0,0,1,0,0,0,0,0,0
2,0.000000,0,0,0.0,1.0,60.0,0.0,0.0,PartOfAHorizontalPortScan,0,...,0,0,0,1,0,0,0,0,0,0
3,2.998804,0,0,0.0,3.0,180.0,0.0,0.0,Benign,0,...,0,0,0,1,0,0,0,0,0,0
4,0.000000,0,0,0.0,1.0,60.0,0.0,0.0,Benign,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1344670,0.000000,0,0,0.0,0.0,0.0,0.0,0.0,DDoS,0,...,0,0,0,0,0,0,0,0,0,0
1344671,0.000000,0,0,0.0,0.0,0.0,0.0,0.0,DDoS,0,...,0,0,0,0,0,0,0,0,0,0
1344672,0.000000,0,0,0.0,0.0,0.0,0.0,0.0,DDoS,0,...,0,0,0,0,0,0,0,0,0,0
1344673,0.000000,0,0,0.0,0.0,0.0,0.0,0.0,DDoS,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
X = df[['duration', 'orig_bytes', 'resp_bytes', 'missed_bytes', 'orig_pkts', 'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes', 'proto_icmp', 'proto_tcp', 'proto_udp', 'conn_state_OTH', 'conn_state_REJ', 'conn_state_RSTO', 'conn_state_RSTOS0', 'conn_state_RSTR', 'conn_state_RSTRH', 'conn_state_S0', 'conn_state_S1', 'conn_state_S2', 'conn_state_S3', 'conn_state_SF', 'conn_state_SH', 'conn_state_SHR']]
Y = df['label']

In [5]:
scaler = MinMaxScaler()
normalized_x = scaler.fit_transform(X)
normalized_x

array([[8.16450401e-05, 5.73121586e-10, 8.57558209e-08, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.04174057e-05, 5.73121586e-10, 8.57558209e-08, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.04174057e-05, 5.73121586e-10, 8.57558209e-08, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [2.04174057e-05, 5.73121586e-10, 8.57558209e-08, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.04174057e-05, 5.73121586e-10, 8.57558209e-08, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.04174057e-05, 5.73121586e-10, 8.57558209e-08, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [6]:
X_train, X_test, Y_train, Y_test = train_test_split(normalized_x, Y, random_state= 100, test_size=0.2)

In [7]:
from sklearn.model_selection import GridSearchCV
knn = KNeighborsClassifier(n_neighbors = 15)
# params_knn = {'n_neighbors': np.arange(1, 25)}
# knn_gs = GridSearchCV(knn, params_knn, cv=5)
# knn_gs.fit(X_train, Y_train)
knn.fit(X_train, Y_train)
#save best model
# knn_best = knn_gs.best_estimator_#check best n_neigbors value
# print(knn_gs.best_params_)
# y_pred = knn.predict(X_test)

In [8]:
clf = RandomForestClassifier(n_estimators= 10, criterion="entropy")  
clf.fit(X_train, Y_train) 
# print(clf.score(X_train, Y_train))

In [9]:
from sklearn.linear_model import LogisticRegression#create a new logistic regression model
log_reg = LogisticRegression()#fit the model to the training data
log_reg.fit(X_train, Y_train)

In [10]:
print('knn: {}'.format(knn.score(X_test, Y_test)))
print('rf: {}'.format(clf.score(X_test, Y_test)))
print('log_reg: {}'.format(log_reg.score(X_test, Y_test)))

knn: 0.7840556268243256
rf: 0.7840779370479856
log_reg: 0.7441575101790395


In [11]:
from sklearn.ensemble import VotingClassifier#create a dictionary of our models
estimators=[('knn', knn), ('rf', clf), ('log_reg', log_reg)]#create our voting classifier, inputting our models
ensemble = VotingClassifier(estimators, voting='hard')

In [12]:
#fit model to training data
ensemble.fit(X_train, Y_train)#test our model on the test data
ensemble.score(X_test, Y_test)

0.7840333166006656