In [1]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from time import time

In [2]:
# Feature names from the file kddcup.names file to be used as cols heading
col_names = ["duration","protocol_type","service","flag","src_bytes",
    "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
    "logged_in","num_compromised","root_shell","su_attempted","num_root",
    "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
    "is_host_login","is_guest_login","count","srv_count","serror_rate",
    "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
    "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
    "dst_host_rerror_rate","dst_host_srv_rerror_rate","label"]

In [3]:
kddcup_data_10_percent_corrected = pd.read_csv("kddcup.data_10_percent_corrected", header=None, names = col_names)

In [4]:
kddcup_data_10_percent_corrected.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,tcp,http,SF,181,5450,0,0,0,0,...,9,1.0,0.0,0.11,0.0,0.0,0.0,0.0,0.0,normal.
1,0,tcp,http,SF,239,486,0,0,0,0,...,19,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,normal.
2,0,tcp,http,SF,235,1337,0,0,0,0,...,29,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
3,0,tcp,http,SF,219,1337,0,0,0,0,...,39,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
4,0,tcp,http,SF,217,2032,0,0,0,0,...,49,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,normal.


In [5]:
kddcup_data_10_percent_corrected

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,tcp,http,SF,181,5450,0,0,0,0,...,9,1.0,0.0,0.11,0.00,0.00,0.00,0.0,0.0,normal.
1,0,tcp,http,SF,239,486,0,0,0,0,...,19,1.0,0.0,0.05,0.00,0.00,0.00,0.0,0.0,normal.
2,0,tcp,http,SF,235,1337,0,0,0,0,...,29,1.0,0.0,0.03,0.00,0.00,0.00,0.0,0.0,normal.
3,0,tcp,http,SF,219,1337,0,0,0,0,...,39,1.0,0.0,0.03,0.00,0.00,0.00,0.0,0.0,normal.
4,0,tcp,http,SF,217,2032,0,0,0,0,...,49,1.0,0.0,0.02,0.00,0.00,0.00,0.0,0.0,normal.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494016,0,tcp,http,SF,310,1881,0,0,0,0,...,255,1.0,0.0,0.01,0.05,0.00,0.01,0.0,0.0,normal.
494017,0,tcp,http,SF,282,2286,0,0,0,0,...,255,1.0,0.0,0.17,0.05,0.00,0.01,0.0,0.0,normal.
494018,0,tcp,http,SF,203,1200,0,0,0,0,...,255,1.0,0.0,0.06,0.05,0.06,0.01,0.0,0.0,normal.
494019,0,tcp,http,SF,291,1200,0,0,0,0,...,255,1.0,0.0,0.04,0.05,0.04,0.01,0.0,0.0,normal.


In [6]:
# Checking whether any nan entires in dataset exists
kddcup_data_10_percent_corrected.isnull().values.any()

False

In [7]:
kddcup_data_10_percent_corrected['label'].value_counts()

smurf.              280790
neptune.            107201
normal.              97278
back.                 2203
satan.                1589
ipsweep.              1247
portsweep.            1040
warezclient.          1020
teardrop.              979
pod.                   264
nmap.                  231
guess_passwd.           53
buffer_overflow.        30
land.                   21
warezmaster.            20
imap.                   12
rootkit.                10
loadmodule.              9
ftp_write.               8
multihop.                7
phf.                     4
perl.                    3
spy.                     2
Name: label, dtype: int64

In [8]:
# Considering all the numerical features for training. Categorical features excluded as of now.

num_features = [
    "duration","src_bytes",
    "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
    "logged_in","num_compromised","root_shell","su_attempted","num_root",
    "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
    "is_host_login","is_guest_login","count","srv_count","serror_rate",
    "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
    "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
    "dst_host_rerror_rate","dst_host_srv_rerror_rate"
]
features = kddcup_data_10_percent_corrected[num_features].astype(float)

In [9]:
features.describe()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
count,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,...,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0
mean,47.979302,3025.61,868.5324,4.5e-05,0.006433,1.4e-05,0.034519,0.000152,0.148247,0.010212,...,232.470778,188.66567,0.75378,0.030906,0.601935,0.006684,0.176754,0.176443,0.058118,0.057412
std,707.746472,988218.1,33040.0,0.006673,0.134805,0.00551,0.782103,0.01552,0.355345,1.798326,...,64.74538,106.040437,0.410781,0.109259,0.481309,0.042133,0.380593,0.380919,0.23059,0.23014
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,45.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,46.0,0.41,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,520.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,255.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,1032.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,255.0,1.0,0.04,1.0,0.0,0.0,0.0,0.0,0.0
max,58329.0,693375600.0,5155468.0,1.0,3.0,3.0,30.0,5.0,1.0,884.0,...,255.0,255.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [10]:
# reduce the outputs to normal and attack.

labels = kddcup_data_10_percent_corrected['label'].copy()
labels[labels!='normal.'] = 'attack.'
labels.value_counts()

attack.    396743
normal.     97278
Name: label, dtype: int64

In [11]:
features.head()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,0.0,181.0,5450.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,9.0,9.0,1.0,0.0,0.11,0.0,0.0,0.0,0.0,0.0
1,0.0,239.0,486.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,19.0,19.0,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0
2,0.0,235.0,1337.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,29.0,29.0,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0
3,0.0,219.0,1337.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,39.0,39.0,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0
4,0.0,217.0,2032.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,49.0,49.0,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0


In [12]:
# Feature scaling

features[num_features] = MinMaxScaler().fit_transform(features[num_features])

In [13]:
features.describe()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
count,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,...,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0
mean,0.000823,4.363595e-06,0.000168,4.5e-05,0.002144,5e-06,0.001151,3e-05,0.148247,1.2e-05,...,0.91165,0.739865,0.75378,0.030906,0.601935,0.006684,0.176754,0.176443,0.058118,0.057412
std,0.012134,0.001425228,0.006409,0.006673,0.044935,0.001837,0.02607,0.003104,0.355345,0.002034,...,0.253903,0.415845,0.410781,0.109259,0.481309,0.042133,0.380593,0.380919,0.23059,0.23014
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,6.489989e-08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.180392,0.41,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,7.499542e-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,1.488371e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.04,1.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [14]:
clf = KNeighborsClassifier(n_neighbors = 5, algorithm = 'ball_tree', leaf_size=500)

In [15]:
t0 = time()
clf.fit(features,labels)
tt = time()-t0
print("Classifier trained in {} seconds".format(round(tt,3)))

Classifier trained in 579.65 seconds


In [24]:
kdd_data_corrected = pd.read_csv("corrected", header=None, names = col_names)
kdd_data_corrected['label'].value_counts()

smurf.              164091
normal.              60593
neptune.             58001
snmpgetattack.        7741
mailbomb.             5000
guess_passwd.         4367
snmpguess.            2406
satan.                1633
warezmaster.          1602
back.                 1098
mscan.                1053
apache2.               794
processtable.          759
saint.                 736
portsweep.             354
ipsweep.               306
httptunnel.            158
pod.                    87
nmap.                   84
buffer_overflow.        22
multihop.               18
named.                  17
sendmail.               17
ps.                     16
xterm.                  13
rootkit.                13
teardrop.               12
xlock.                   9
land.                    9
xsnoop.                  4
ftp_write.               3
phf.                     2
worm.                    2
loadmodule.              2
sqlattack.               2
perl.                    2
udpstorm.                2
i

In [25]:
kdd_data_corrected['label'][kdd_data_corrected['label']!='normal.'] = 'attack.'
kdd_data_corrected['label'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


attack.    250436
normal.     60593
Name: label, dtype: int64

In [26]:
kdd_data_corrected[num_features] = kdd_data_corrected[num_features].astype(float)

In [27]:
kdd_data_corrected[num_features] = MinMaxScaler().fit_transform(kdd_data_corrected[num_features])

In [29]:
kdd_data_corrected[num_features]

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,0.0,0.000002,0.000028,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.996078,1.0,0.01,0.00,0.0,0.0,0.0,0.0,0.0
1,0.0,0.000002,0.000028,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.996078,1.0,0.01,0.00,0.0,0.0,0.0,0.0,0.0
2,0.0,0.000002,0.000028,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.996078,1.0,0.01,0.00,0.0,0.0,0.0,0.0,0.0
3,0.0,0.000002,0.000028,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.996078,1.0,0.01,0.00,0.0,0.0,0.0,0.0,0.0
4,0.0,0.000002,0.000028,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.996078,1.0,0.01,0.01,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
311024,0.0,0.000002,0.000028,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.000000,1.0,0.00,0.01,0.0,0.0,0.0,0.0,0.0
311025,0.0,0.000002,0.000028,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.000000,1.0,0.00,0.01,0.0,0.0,0.0,0.0,0.0
311026,0.0,0.000002,0.000028,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.000000,1.0,0.00,0.01,0.0,0.0,0.0,0.0,0.0
311027,0.0,0.000002,0.000028,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.000000,1.0,0.00,0.01,0.0,0.0,0.0,0.0,0.0


In [33]:
features_train, features_test, labels_train, labels_test = train_test_split(
    kdd_data_corrected[num_features], 
    kdd_data_corrected['label'], 
    test_size=0.1, 
    random_state=42)

In [35]:
t0 = time()
pred = clf.predict(features_test)
tt = time() - t0
print("Predicted in {} seconds".format(round(tt,3)))

Predicted in 211.511 seconds


In [38]:
acc = accuracy_score(pred, labels_test)
print("R squared is {}.".format(round(acc,4)))

R squared is 0.92.


In [40]:
k = 30
km = KMeans(n_clusters = k)


t0 = time()
km.fit(features)
tt = time()-t0
print("Clustered in {} seconds".format(round(tt,3)))

Clustered in 27.015 seconds


In [41]:
pd.Series(km.labels_).value_counts()

1     280648
2      48556
20     38320
11     23620
12     18964
0      11602
26     10471
3      10083
4       9120
5       4272
18      3592
7       3526
16      3215
24      3209
6       3120
15      2919
27      2586
13      1944
23      1919
17      1877
28      1720
29      1348
14      1218
25      1187
8       1138
9       1016
19      1002
21       970
22       680
10       179
dtype: int64

In [45]:
labels = kddcup_data_10_percent_corrected['label']
label_names = list(map(
    lambda x: pd.Series([labels[i] for i in range(len(km.labels_)) if km.labels_[i]==x]), 
    range(k)))

In [46]:
for i in range(k):
    print("Cluster {} labels:".format(i))
    print(label_names[i].value_counts())
    print()

Cluster 0 labels:
normal.    11471
smurf.       130
nmap.          1
dtype: int64

Cluster 1 labels:
smurf.     280615
normal.        33
dtype: int64

Cluster 2 labels:
neptune.      48552
portsweep.        4
dtype: int64

Cluster 3 labels:
neptune.      10059
portsweep.       24
dtype: int64

Cluster 4 labels:
normal.    9120
dtype: int64

Cluster 5 labels:
normal.       4238
satan.          25
portsweep.       9
dtype: int64

Cluster 6 labels:
normal.    3035
back.        84
satan.        1
dtype: int64

Cluster 7 labels:
normal.         3511
back.             14
warezclient.       1
dtype: int64

Cluster 8 labels:
ipsweep.     813
normal.      139
nmap.         99
pod.          72
land.         14
multihop.      1
dtype: int64

Cluster 9 labels:
normal.     1012
ipsweep.       3
back.          1
dtype: int64

Cluster 10 labels:
satan.        175
portsweep.      3
land.           1
dtype: int64

Cluster 11 labels:
normal.    22358
back.       1258
phf.           3
satan.         1
dt

In [48]:
t0 = time()
pred = km.predict(kdd_data_corrected[num_features])
tt = time() - t0
print("Assigned clusters in {} seconds".format(round(tt,3)))

Assigned clusters in 0.121 seconds
