In [1]:
# Mount Google Drive inside the Colab runtime at /content/drive; the dataset
# folder used by the following cells lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import os

In [3]:
# Make the NSL-KDD folder on the mounted Drive the working directory, so the
# bare file names used by the read/write cells below resolve against it.
os.chdir("/content/drive/My Drive/BTP/NSL-KDD")
# Last expression of the cell: displays the folder contents as the cell output.
os.listdir()

['KDDTest+.txt',
 'KDDTest+.arff',
 'KDDTest1.jpg',
 'index.html',
 'KDDTrain+.arff',
 'KDDTest-21.txt',
 'KDDTrain+.txt',
 'KDDTrain+_20Percent.arff',
 'KDDTest-21.arff',
 'KDDTrain1.jpg',
 'KDDTrain+_20Percent.txt',
 'Y_pred.csv',
 'Xtrain_set',
 'Ytrain_set',
 'Xtest_set',
 'Ytest_set',
 'XX',
 'YY',
 'Y5',
 'dataset.csv',
 'tsne.csv',
 'Y_train',
 'Y_test',
 'X_test',
 'Attack_labels',
 'Y_test_attack',
 'Y_train_attack',
 'X_train',
 'tree.dot',
 'tree.png',
 'model.png',
 'checkpoint',
 'final_model',
 'final_model_5',
 'completed_model',
 'only_type1',
 'both_1_and_2',
 'experience_replay_buffer',
 'experience_replay_buffer.pkl',
 'finetune_accuracy.pkl',
 'er_accuracy.pkl',
 'der_accuracy.pkl',
 'finetune_accuracy1.pkl',
 'er_accuracy_final.pkl',
 'lwf.pkl']

In [4]:
# Column names assigned to the header-less data files when they are read:
# 41 feature columns followed by 'attack_type' and 'difficulty_level'.
labels = [
    'duration', 'protocol_type', 'service', 'flag',
    'src_bytes', 'dst_bytes', 'land', 'wrong_fragment',
    'urgent', 'hot', 'num_failed_logins', 'logged_in',
    'num_compromised', 'root_shell', 'su_attempted', 'num_root',
    'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds',
    'is_host_login', 'is_guest_login', 'count', 'srv_count',
    'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
    'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
    'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate',
    'attack_type', 'difficulty_level',
]

In [5]:
# Read both splits with identical options: the files carry no header row, so
# the column names come from `labels`.
train_data, test_data = (
    pd.read_csv(file_name, header=None, names=labels)
    for file_name in ("KDDTrain+.txt", "KDDTest+.txt")
)

In [6]:
# Raw attack-name column of each split (still the fine-grained attack names).
Y_train, Y_test = train_data['attack_type'], test_data['attack_type']

In [7]:
# Distinct attack names occurring in each split.
train_attacks = {*train_data['attack_type']}
test_attacks = {*test_data['attack_type']}

Certain attack types appear only in the test set and are absent from the training set (and a few appear only in the training set).

In [9]:
# s1: attack names seen only in the training split.
# s2: attack names seen only in the test split.
# common: attack names present in both splits.
s1 = train_attacks - test_attacks
s2 = test_attacks - train_attacks
common = test_attacks & train_attacks

In [15]:
# Every attack name seen in either split. The variable is called `all_attacks`
# rather than `all` so that the built-in all() is not shadowed for the rest of
# the kernel session.
all_attacks = train_attacks | test_attacks
# Displayed as (total distinct, distinct in train, distinct in test).
len(all_attacks), len(train_attacks), len(test_attacks)

(40, 23, 38)

In [19]:
# Displayed as (shared by both splits, train-only, test-only) attack-name counts.
len(common), len(s1), len(s2)

(21, 2, 17)

In [18]:
# Print every test-set attack name on one line, each followed by ', '.
print(''.join(f'{attack_name}, ' for attack_name in test_attacks), end='')

nmap, sendmail, teardrop, warezmaster, snmpguess, saint, apache2, perl, ipsweep, worm, land, xterm, satan, udpstorm, normal, guess_passwd, phf, neptune, smurf, httptunnel, named, ps, mailbomb, snmpgetattack, sqlattack, rootkit, loadmodule, xsnoop, processtable, portsweep, mscan, ftp_write, buffer_overflow, pod, xlock, back, imap, multihop, 

About 16.63% of the test datapoints belong to attack types that appear only in the test set. This is very high compared to the training set, where only about 0.7% of the datapoints belong to attack types that appear only in the training set.

In [20]:
# Number of test rows whose attack name also occurs in the training split.
count = sum(1 for attack_name in test_data['attack_type'] if attack_name in common)

# Displayed as the fraction of test rows covered by the shared attack names.
count / test_data.shape[0]

0.8336586231369766

In [21]:
# Number of training rows whose attack name also occurs in the test split.
count = sum(1 for attack_name in train_data['attack_type'] if attack_name in common)

# Displayed as the fraction of training rows covered by the shared attack names.
count / train_data.shape[0]

0.9929191175886897

In [None]:
# Number of training rows; used later to cut the merged matrix back into the
# training and test parts.
train_size = len(train_data)

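The `service` feature takes different sets of values in the train and test sets (70 vs. 64 distinct values), so one-hot encoding the two sets separately would produce a different number of columns for each. The two sets are therefore merged before encoding and split apart again afterwards.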


In [None]:
# Displayed as (distinct 'service' values in train, distinct 'service' values in test).
tuple(len(set(split['service'])) for split in (train_data, test_data))

(70, 64)

Create the datasets for analysing and visualizing the attacks. 

In [None]:
# Stack the training rows on top of the test rows and keep only the feature
# columns (the label and difficulty columns are removed).
combined = pd.concat([train_data, test_data]).drop(
    columns=['attack_type', 'difficulty_level']
)
combined

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0.0,0.0,0.0,0.0,1.00,0.00,0.00,150,25,0.17,0.03,0.17,0.00,0.00,0.00,0.05,0.00
1,0,udp,other,SF,146,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13,1,0.0,0.0,0.0,0.0,0.08,0.15,0.00,255,1,0.00,0.60,0.88,0.00,0.00,0.00,0.00,0.00
2,0,tcp,private,S0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,123,6,1.0,1.0,0.0,0.0,0.05,0.07,0.00,255,26,0.10,0.05,0.00,0.00,1.00,1.00,0.00,0.00
3,0,tcp,http,SF,232,8153,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,5,5,0.2,0.2,0.0,0.0,1.00,0.00,0.00,30,255,1.00,0.00,0.03,0.04,0.03,0.01,0.00,0.01
4,0,tcp,http,SF,199,420,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,30,32,0.0,0.0,0.0,0.0,1.00,0.00,0.09,255,255,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22539,0,tcp,smtp,SF,794,333,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0.0,0.0,0.0,0.0,1.00,0.00,0.00,100,141,0.72,0.06,0.01,0.01,0.01,0.00,0.00,0.00
22540,0,tcp,http,SF,317,938,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2,11,0.0,0.0,0.0,0.0,1.00,0.00,0.18,197,255,1.00,0.00,0.01,0.01,0.01,0.00,0.00,0.00
22541,0,tcp,http,SF,54540,8314,0,0,0,2,0,1,1,0,0,0,0,0,0,0,0,0,5,10,0.0,0.0,0.0,0.0,1.00,0.00,0.20,255,255,1.00,0.00,0.00,0.00,0.00,0.00,0.07,0.07
22542,0,udp,domain_u,SF,42,42,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,6,0.0,0.0,0.0,0.0,1.00,0.00,0.33,255,252,0.99,0.01,0.00,0.00,0.00,0.00,0.00,0.00


In [None]:
# Collect the non-numeric columns; these are the ones passed to one-hot
# encoding in the next step. is_numeric_dtype is used instead of comparing the
# dtype against `object` so that text columns are still detected when pandas
# stores them with a dedicated string dtype rather than `object`.
categorical_features = [
    col for col in combined.columns
    if not pd.api.types.is_numeric_dtype(combined[col])
]
print(categorical_features)

['protocol_type', 'service', 'flag']


In [None]:
# One-hot encode the categorical columns of the merged frame; each generated
# column is named "<original column>_<value>" because the prefixes are the
# column names themselves.
combined_encoded = pd.get_dummies(combined, columns=categorical_features, prefix = categorical_features)

In [None]:
# Keep the encoded column names: `combined_encoded` is turned into a bare NumPy
# array further down, and these names are re-attached before the CSV export.
cc = combined_encoded.columns

In [None]:
# Display the encoded column index (its repr is the cell output).
cc

Index(['duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment',
       'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
       ...
       'flag_REJ', 'flag_RSTO', 'flag_RSTOS0', 'flag_RSTR', 'flag_S0',
       'flag_S1', 'flag_S2', 'flag_S3', 'flag_SF', 'flag_SH'],
      dtype='object', length=122)

In [None]:
# Convert to a plain float64 matrix. The dtype is forced because get_dummies can
# emit boolean indicator columns, and mixing those with the numeric columns
# would otherwise produce an object-dtype array.
combined_encoded = np.asarray(combined_encoded, dtype=np.float64)

# Undo the earlier concatenation: the first `train_size` rows are the training
# split, everything after them is the test split.
X_train = combined_encoded[:train_size, :]
X_test = combined_encoded[train_size:, :]

In [None]:
# Display the training matrix (NumPy abbreviates the repr in the cell output).
X_train

array([[0.000e+00, 4.910e+02, 0.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [0.000e+00, 1.460e+02, 0.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [0.000e+00, 0.000e+00, 0.000e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       ...,
       [0.000e+00, 2.231e+03, 3.840e+02, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [0.000e+00, 0.000e+00, 0.000e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [0.000e+00, 1.510e+02, 0.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00]])

The following are the final processed X datasets. Each sample (row) is rescaled to unit L2 norm; scikit-learn's `Normalizer` works per sample, not per feature.

In [None]:
from sklearn.preprocessing import Normalizer

# Normalizer rescales every sample (row) on its own, so fitting learns nothing
# from the data. It is nevertheless fitted exactly once, on the training split
# only, and that same fitted object is applied to both splits with transform()
# -- the test split is never passed to fit/fit_transform.
sc_x = Normalizer().fit(X_train)

X_train = sc_x.transform(X_train)
X_test = sc_x.transform(X_test)

Performing encoding on the Y datasets. 

In [None]:
def convert(y):
  """Map each attack name in `y` to its five-class code.

  The codes are strings: '0' normal, '1' DoS, '2' Probing, '3' R2L, '4' U2R.

  Args:
    y: iterable of attack-name strings.

  Returns:
    A list of code strings with the same length and order as `y`.

  Raises:
    ValueError: if a name belongs to none of the categories. Skipping such an
      entry would make the result shorter than `y` and shift every later
      label relative to its feature row, so the problem is reported instead.
  """
  categories = {
    '1': ['back','land','neptune','pod','smurf', 'teardrop','mailbomb','processtable','udpstorm','apache2','worm'],
    '2': ['ipsweep' , 'nmap' , 'portsweep' , 'satan', 'mscan', 'saint'],
    '3': ['ftp_write' , 'guess_passwd' , 'imap' , 'multihop' , 'phf' , 'spy' , 'warezclient' , 'warezmaster','xlock','xsnoop','snmpguess','snmpgetattack','httptunnel','sendmail', 'named'],
    '4': ['buffer_overflow' , 'loadmodule' , 'perl' , 'rootkit','sqlattack','xterm','ps'],
  }

  # Flatten into a single name -> code table so each lookup is one dict access
  # instead of scanning up to four lists.
  code_of = {'normal': '0'}
  for code, names in categories.items():
    for name in names:
      code_of[name] = code

  YY = []
  for name in y:
    if name not in code_of:
      raise ValueError('Unknown attack type: %r' % (name,))
    YY.append(code_of[name])

  return YY

In [None]:
import collections


def _print_class_counts(codes):
  """Print one 'code  ->  count' line per class code, in order of first appearance."""
  tally = collections.Counter(np.array(codes))
  for code, n_rows in tally.items():
    print (code, " -> ", n_rows)


# Five-class codes for the training split, followed by their frequencies.
YY_train = convert(Y_train)
_print_class_counts(YY_train)

print('---------------------------------------------------------------')

# Same for the test split.
YY_test = convert(Y_test)
_print_class_counts(YY_test)

0  ->  67343
1  ->  45927
3  ->  995
2  ->  11656
4  ->  52
---------------------------------------------------------------
1  ->  7460
0  ->  9711
2  ->  2421
3  ->  2885
4  ->  67


In [None]:
# Write the five-class label lists to the working directory.
for csv_name, class_codes in (('Y_train', YY_train), ('Y_test', YY_test)):
  pd.DataFrame(class_codes).to_csv(csv_name)

# Wrap the feature matrices in DataFrames carrying the one-hot column names,
# then write them out as well.
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)
X_train.columns = X_test.columns = cc

for csv_name, features in (('X_train', X_train), ('X_test', X_test)):
  features.to_csv(csv_name)

In [None]:
# Fine-grained attack names of both splits, stacked train-first so the encoded
# array can be cut at `train_size` afterwards.
train_attack_points = train_data['attack_type']
test_attack_points = test_data['attack_type']

combined_points = pd.concat([train_attack_points, test_attack_points])

# Fit one encoder on the union so both splits share the same integer codes.
from sklearn import preprocessing
lencoder = preprocessing.LabelEncoder()
encoded_labels = lencoder.fit_transform(np.array(combined_points))

assert (len(set.union(train_attacks, test_attacks)) == np.max(encoded_labels) + 1), "Mistake -- attacks not matching"

pd.DataFrame(encoded_labels[:train_size]).to_csv('Y_train_attack')
pd.DataFrame(encoded_labels[train_size:]).to_csv('Y_test_attack')

# Save the code -> attack-name lookup as a single row (column k holds the name
# for code k). The number of codes is taken from the fitted encoder instead of
# being hard-coded, so this keeps working if the set of attack names changes.
n_attack_types = len(lencoder.classes_)
attacks = [lencoder.inverse_transform(np.arange(n_attack_types))]
pd.DataFrame(attacks).to_csv('Attack_labels')
