In [1]:
import pandas as pd
import numpy as np
import sys
import sklearn
import io
import random
import time
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn import preprocessing
import pickle
from sklearn.externals import joblib
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_validate

In [2]:
data = pd.read_csv('feature_selection.csv')

In [3]:
data.head(5)

Unnamed: 0,src_bytes,num_access_files,num_shells,flag,service,dst_bytes,srv_rerror_rate,logged_in,srv_serror_rate,su_attempted,...,root_shell,count,is_guest_login,wrong_fragment,is_host_login,serror_rate,num_compromised,num_outbound_cmds,protocol_type,label
0,491,0,0,SF,ftp_data,0,0.0,0,0.0,0,...,0,2,0,0,0,0.0,0,0,tcp,normal
1,146,0,0,SF,other,0,0.0,0,0.0,0,...,0,13,0,0,0,0.0,0,0,udp,normal
2,0,0,0,S0,private,0,0.0,0,1.0,0,...,0,123,0,0,0,1.0,0,0,tcp,neptune
3,232,0,0,SF,http,8153,0.0,1,0.2,0,...,0,5,0,0,0,0.2,0,0,tcp,normal
4,199,0,0,SF,http,420,0.0,1,0.0,0,...,0,30,0,0,0,0.0,0,0,tcp,normal


In [4]:
print('Label Distribution Training Set :')
print(data['label'].value_counts())

Label Distribution Training Set :
normal     67343
neptune    41214
smurf       2646
back         956
pod          201
land          18
Name: label, dtype: int64


In [5]:
print('Dataset :')
for col_name in data.columns:
    if data[col_name].dtypes == 'object' :
        unique_cat =len(data[col_name].unique())
        print("Feature '{col_name}' has {unique_cat} categories".format(col_name=col_name, unique_cat=unique_cat))
        
print()
print('Distributed of Categories in Service :')
print(data['service'].value_counts().sort_values(ascending=False).head())

Dataset :
Feature 'flag' has 10 categories
Feature 'service' has 65 categories
Feature 'protocol_type' has 3 categories
Feature 'label' has 6 categories

Distributed of Categories in Service :
http        40304
private     16061
domain_u     9034
smtp         7270
ftp_data     6193
Name: service, dtype: int64


In [6]:
protocol_type_columns=['protocol_type']
protocol_type_values = data[protocol_type_columns]
protocol_type_values.head()

Unnamed: 0,protocol_type
0,tcp
1,udp
2,tcp
3,tcp
4,tcp


In [7]:
service_columns=['service']
service_values = data[service_columns]
service_values.head()

Unnamed: 0,service
0,ftp_data
1,other
2,private
3,http
4,http


In [8]:
flag_columns=['flag']
flag_values = data[flag_columns]
flag_values.head()

Unnamed: 0,flag
0,SF
1,SF
2,S0
3,SF
4,SF


In [9]:
# protocol_type
unique_protocol = sorted(data.protocol_type.unique())
string1 = 'protocol_type_'
unique_protocol2 = [string1 + x for x in unique_protocol]
print(unique_protocol2)

# isi secara bersamaan
dumcols_protocol_type = unique_protocol2
# + unique_service2 + unique_flag2

['protocol_type_icmp', 'protocol_type_tcp', 'protocol_type_udp']


In [10]:
# service
unique_service = sorted(data.service.unique())
string2 = 'service_'
unique_service2 = [string2 + x for x in unique_service]
print(unique_service2)

dumcols_service = unique_service2

['service_IRC', 'service_X11', 'service_Z39_50', 'service_auth', 'service_bgp', 'service_courier', 'service_csnet_ns', 'service_ctf', 'service_daytime', 'service_discard', 'service_domain', 'service_domain_u', 'service_echo', 'service_eco_i', 'service_ecr_i', 'service_efs', 'service_exec', 'service_finger', 'service_ftp', 'service_ftp_data', 'service_gopher', 'service_hostnames', 'service_http', 'service_http_443', 'service_imap4', 'service_iso_tsap', 'service_klogin', 'service_kshell', 'service_ldap', 'service_link', 'service_login', 'service_mtp', 'service_name', 'service_netbios_dgm', 'service_netbios_ns', 'service_netbios_ssn', 'service_netstat', 'service_nnsp', 'service_nntp', 'service_ntp_u', 'service_other', 'service_pop_2', 'service_pop_3', 'service_printer', 'service_private', 'service_red_i', 'service_remote_job', 'service_rje', 'service_shell', 'service_smtp', 'service_sql_net', 'service_ssh', 'service_sunrpc', 'service_supdup', 'service_systat', 'service_telnet', 'service_t

In [11]:
# protocol_type
unique_flag = sorted(data.flag.unique())
string3 = 'flag_'
unique_flag2 = [string3 + x for x in unique_flag]
print(unique_flag2)

dumcols_flag = unique_flag2

['flag_OTH', 'flag_REJ', 'flag_RSTO', 'flag_RSTR', 'flag_S0', 'flag_S1', 'flag_S2', 'flag_S3', 'flag_SF', 'flag_SH']


In [12]:
# konversi fitur katogorial menjadi numerik
protocol_type_values_enc = protocol_type_values.apply(LabelEncoder().fit_transform)
print('Sebelum dikonversi')
print(protocol_type_values.head())
print()
print('Setelah dikonversi')
print(protocol_type_values_enc.head())

Sebelum dikonversi
  protocol_type
0           tcp
1           udp
2           tcp
3           tcp
4           tcp

Setelah dikonversi
   protocol_type
0              1
1              2
2              1
3              1
4              1


In [13]:
# konversi fitur katogorial menjadi numerik
service_values_enc = service_values.apply(LabelEncoder().fit_transform)
print('Sebelum dikonversi')
print(service_values.head())
print()
print('Setelah dikonversi')
print(service_values_enc.head())

Sebelum dikonversi
    service
0  ftp_data
1     other
2   private
3      http
4      http

Setelah dikonversi
   service
0       19
1       40
2       44
3       22
4       22


In [14]:
# konversi fitur katogorial menjadi numerik
flag_values_enc = flag_values.apply(LabelEncoder().fit_transform)
print('Sebelum dikonversi')
print(flag_values.head())
print()
print('Setelah dikonversi')
print(flag_values_enc.head())

Sebelum dikonversi
  flag
0   SF
1   SF
2   S0
3   SF
4   SF

Setelah dikonversi
   flag
0     8
1     8
2     4
3     8
4     8


In [15]:
data['protocol_type'] = protocol_type_values_enc

In [16]:
data['service'] = service_values_enc

In [17]:
data['flag'] = flag_values_enc

In [18]:
label_data = data['label']

# ubah kolom label
new_label_data = label_data.replace({'normal' : 0, 'back' : 1, 'land' : 1, 'pod' : 1, 'neptune' : 1, 'smurf' : 1})
# tambahkan kolom label baru kedalam dataet
data['label'] = new_label_data

print(data)

        src_bytes  num_access_files  num_shells  flag  service  dst_bytes  \
0             491                 0           0     8       19          0   
1             146                 0           0     8       40          0   
2               0                 0           0     4       44          0   
3             232                 0           0     8       22       8153   
4             199                 0           0     8       22        420   
5               0                 0           0     1       44          0   
6               0                 0           0     4       44          0   
7               0                 0           0     4       44          0   
8               0                 0           0     4       46          0   
9               0                 0           0     4       44          0   
10              0                 0           0     1       44          0   
11              0                 0           0     4       44          0   

In [19]:
label = data.iloc[:,-1:].values.ravel()
label

array([0, 0, 1, ..., 0, 1, 0], dtype=int64)

In [20]:
feature = data.iloc[:,:-1]
feature.values

array([[4.910e+02, 0.000e+00, 0.000e+00, ..., 0.000e+00, 0.000e+00,
        1.000e+00],
       [1.460e+02, 0.000e+00, 0.000e+00, ..., 0.000e+00, 0.000e+00,
        2.000e+00],
       [0.000e+00, 0.000e+00, 0.000e+00, ..., 0.000e+00, 0.000e+00,
        1.000e+00],
       ...,
       [2.231e+03, 0.000e+00, 0.000e+00, ..., 0.000e+00, 0.000e+00,
        1.000e+00],
       [0.000e+00, 0.000e+00, 0.000e+00, ..., 0.000e+00, 0.000e+00,
        1.000e+00],
       [1.510e+02, 0.000e+00, 0.000e+00, ..., 0.000e+00, 0.000e+00,
        1.000e+00]])

In [21]:
X_train, X_test, y_train, y_test = train_test_split(feature, label, test_size=0.3)

In [22]:
print("jumlah data latih : ", len(X_train))
print("jumlah data uji : ", len(X_test))

jumlah data latih :  78664
jumlah data uji :  33714


In [23]:
knn = KNeighborsClassifier(n_neighbors=9)
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=9, p=2,
           weights='uniform')

In [24]:
pred = knn.predict(X_test)
pred

array([1, 1, 0, ..., 0, 0, 0], dtype=int64)

In [25]:
cols = {
    'data benar' : y_test,
    'prediksi' : pred,
}
print("hasil")
pd.DataFrame(cols)

hasil


Unnamed: 0,data benar,prediksi
0,1,1
1,1,1
2,0,0
3,1,1
4,1,1
5,0,0
6,0,0
7,0,0
8,1,1
9,1,1


In [26]:
kfold = KFold(n_splits = 5)
scores = ['accuracy']
results = cross_validate(knn, X_test, y_test, cv=kfold, scoring=scores, return_train_score=True)
results_data = pd.DataFrame(results)
results_data

Unnamed: 0,fit_time,score_time,test_accuracy,train_accuracy
0,0.20396,0.142618,0.99822,0.998591
1,0.204545,0.152627,0.996589,0.99785
2,0.225397,0.154125,0.997479,0.998183
3,0.271274,0.160571,0.996737,0.99848
4,0.219413,0.158506,0.997923,0.998109


In [27]:
print("Performance Model KNN")

accuracy = results_data.test_accuracy.mean() * 100

print("accuracy : %0.2f" % accuracy,"%")

Performance Model KNN
accuracy : 99.74 %
