<a href="https://colab.research.google.com/github/smmr1405020/NIDS-Project/blob/pranto_1/NSL_KDD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [6]:
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

In [7]:
#metadata and constants
columns = (['duration'
,'protocol_type'
,'service'
,'flag'
,'src_bytes'
,'dst_bytes'
,'land'
,'wrong_fragment'
,'urgent'
,'hot'
,'num_failed_logins'
,'logged_in'
,'num_compromised'
,'root_shell'
,'su_attempted'
,'num_root'
,'num_file_creations'
,'num_shells'
,'num_access_files'
,'num_outbound_cmds'
,'is_host_login'
,'is_guest_login'
,'count'
,'srv_count'
,'serror_rate'
,'srv_serror_rate'
,'rerror_rate'
,'srv_rerror_rate'
,'same_srv_rate'
,'diff_srv_rate'
,'srv_diff_host_rate'
,'dst_host_count'
,'dst_host_srv_count'
,'dst_host_same_srv_rate'
,'dst_host_diff_srv_rate'
,'dst_host_same_src_port_rate'
,'dst_host_srv_diff_host_rate'
,'dst_host_serror_rate'
,'dst_host_srv_serror_rate'
,'dst_host_rerror_rate'
,'dst_host_srv_rerror_rate'
,'attack'
,'level'])

dos_attacks = ['apache2', 'back', 'land', 'neptune', 'mailbomb', 'pod', 'processtable', 'smurf', 'teardrop', 'udpstorm',
               'worm']
probe_attacks = ['ipsweep', 'mscan', 'nmap', 'portsweep', 'saint', 'satan']
privilege_attacks = ['buffer_overflow', 'loadmdoule', 'perl', 'ps', 'rootkit', 'sqlattack', 'xterm']
access_attacks = ['ftp_write', 'guess_passwd', 'http_tunnel', 'imap', 'multihop', 'named', 'phf', 'sendmail',
                  'snmpgetattack', 'snmpguess', 'spy', 'warezclient', 'warezmaster', 'xclock', 'xsnoop']
attack_labels = ['Normal', 'DoS', 'Probe', 'Privilege', 'Access']

features_to_encode = ['protocol_type', 'service', 'flag']
non_neumeric_features = ['attack', 'level','attack_flag', 'attack_map']
neumeric_features = list(set(columns) - set(non_neumeric_features))
print(len(columns))
print(len(neumeric_features))

43
41


In [8]:
root_path = 'gdrive/My Drive/Colab/NSL-KDD'
train_filename = root_path + '/KDDTrain+.txt'
test_filename = root_path + '/KDDTest+.txt'
train_df = pd.read_csv(train_filename)
test_df = pd.read_csv(test_filename)
train_df.columns = columns
test_df.columns = columns

train_df['attack_flag'] = train_df.attack.map(lambda a: 0 if a == 'normal' else 1)
test_df['attack_flag'] = test_df.attack.map(lambda a: 0 if a == 'normal' else 1)

def map_attack(attack):
    if attack in dos_attacks:
        # dos_attacks map to 1
        attack_type = 1
    elif attack in probe_attacks:
        # probe_attacks mapt to 2
        attack_type = 2
    elif attack in privilege_attacks:
        # privilege escalation attacks map to 3
        attack_type = 3
    elif attack in access_attacks:
        # remote access attacks map to 4
        attack_type = 4
    else:
        # normal maps to 0
        attack_type = 0

    return attack_type

train_df['attack_map'] = train_df.attack.apply(map_attack)
test_df['attack_map'] = test_df.attack.apply(map_attack)

print(train_df.shape)
print(test_df.shape)

(125972, 45)
(22543, 45)


In [9]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical

for feature in features_to_encode:
  le = preprocessing.LabelEncoder()
  le.fit(train_df[feature])
  train_df[feature] = pd.DataFrame(le.transform(train_df[feature]))
  test_df[feature] = pd.DataFrame(le.transform(test_df[feature]))

scaler = MinMaxScaler()
scaler.fit(train_df[neumeric_features])
train_df[neumeric_features] = scaler.transform(train_df[neumeric_features])
test_df[neumeric_features] = scaler.transform(test_df[neumeric_features])

train_df, val_df = train_test_split(train_df, test_size=0.2, stratify=train_df['attack_map'])

train_x = train_df[neumeric_features]
test_x = test_df[neumeric_features]
val_x = val_df[neumeric_features]

train_by = train_df['attack_flag']
test_by = test_df['attack_flag']
val_by = val_df['attack_flag']

# train_by = pd.DataFrame(to_categorical(train_by, 2))
# test_by = pd.DataFrame(to_categorical(test_by, 2))
# val_by = pd.DataFrame(to_categorical(val_by, 2))

train_my = train_df['attack_map']
test_my = test_df['attack_map']
val_my = val_df['attack_map']

# train_my = pd.DataFrame(to_categorical(train_my, 5))
# test_my = pd.DataFrame(to_categorical(test_my, 5))
# val_my = pd.DataFrame(to_categorical(val_my, 5))

print(train_x.shape, train_by.shape, train_my.shape)
print(test_x.shape, test_by.shape, test_my.shape)
print(val_x.shape, val_by.shape, val_my.shape)

all_x = pd.concat([train_x, test_x, val_x],ignore_index=True)
all_by = pd.concat([train_by, test_by, val_by],ignore_index=True)
all_my = pd.concat([train_my, test_my, val_my],ignore_index=True)

print(all_x.shape, all_by.shape, all_my.shape)

(100777, 41) (100777,) (100777,)
(22543, 41) (22543,) (22543,)
(25195, 41) (25195,) (25195,)
(148515, 41) (148515,) (148515,)


In [31]:
import lightgbm as lgbm


clf = lgbm.LGBMClassifier()
clf.fit(train_x, train_by)
pred_by = clf.predict(test_x)
accuracy = metrics.accuracy_score(test_by, pred_by)
print(accuracy)

0.7892028567626315


In [10]:
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from keras.utils.np_utils import to_categorical

encoding_dim_11 = 30
encoding_dim_12 = 5

input_11 = Input(shape=(train_x.shape[1],))
encoded_11 = Dense(encoding_dim_11, activation='relu')(input_11)
encoder_11 = Model(input_11, encoded_11)

decoded_11 = Dense(train_x.shape[1], activation='sigmoid')(encoded_11)
autoencoder_11 = Model(input_11, decoded_11)

autoencoder_11.compile(optimizer='adam', loss='mean_squared_error')

autoencoder_11.fit(train_x, train_x, epochs=100, batch_size=256,
 shuffle=True, validation_data=(val_x, val_x))

encoded_train_x = encoder_11.predict(train_x)
encoded_val_x = encoder_11.predict(val_x)

input_12 = Input(shape=(encoding_dim_11,))
encoded_12 = Dense(encoding_dim_12, activation='relu')(input_12)
encoder_12 = Model(input_12, encoded_12)

decoded_12 = Dense(encoding_dim_11, activation='sigmoid')(encoded_12)
autoencoder_12 = Model(input_12, decoded_12)

autoencoder_12.compile(optimizer='adam', loss='mean_squared_error')

autoencoder_12.fit(encoded_train_x, encoded_train_x, epochs=100, batch_size=256,
 shuffle=True, validation_data=(encoded_val_x, encoded_val_x))


Epoch 1/100
Epoch 2/100
  1/394 [..............................] - ETA: 0s - loss: 0.0111

KeyboardInterrupt: ignored

In [11]:
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from keras.utils.np_utils import to_categorical

def train_fold(train_x,train_y,val_x,val_y,test_x,test_y):
  encoding_dim = [100,50,20,10]

  input = Input(train_x.shape[1])
  encoded_0 = Dense(encoding_dim[0], activation='relu')(input)
  encoded_1 = Dense(encoding_dim[1], activation='relu')(encoded_0)
  encoded_2 = Dense(encoding_dim[2], activation='relu')(encoded_1)

  encoded_3 = Dense(encoding_dim[3], activation='relu')(encoded_2)

  decoded_2 = Dense(encoding_dim[2], activation='relu')(encoded_3)
  decoded_1 = Dense(encoding_dim[1], activation='relu')(decoded_2)
  decoded_0 = Dense(encoding_dim[0], activation='relu')(decoded_1)
  output = Dense(train_x.shape[1], activation='sigmoid')(decoded_0)

  autoencoder_1 = Model(input, output)
  autoencoder_1.compile(optimizer='adam', loss='mean_squared_error')

  autoencoder_1.fit(train_x, train_x, epochs=100, batch_size=256,
  shuffle=True, validation_data=(val_x, val_x))

  multi_classifier = Dense(5, activation='softmax')(encoded_3)
  stage_1 = Model(input, multi_classifier)

  stage_1.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
  stage_1.fit(train_x, train_y, epochs=100, batch_size=256,
  shuffle=True, validation_data=(val_x, val_y))
  result=stage_1.predict(test_x)
  print(result)
  return result



In [12]:
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
skf.get_n_splits(all_x, all_my)
results = []
tests = []

for train_index, test_index in skf.split(all_x, all_my):
  train_x = all_x.iloc[train_index, :]
  test_x = all_x.iloc[test_index, :]
  train_y = all_my.iloc[train_index]
  test_y = all_my.iloc[test_index]
  train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size=0.2, stratify=train_y)
  train_y = pd.DataFrame(to_categorical(train_y, 5))
  test_y = pd.DataFrame(to_categorical(test_y, 5))
  val_y = pd.DataFrame(to_categorical(val_y, 5))
  tests.append(test_y)
  result = train_fold(train_x, train_y, val_x, val_y, test_x, test_y)
  results.append(result)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [19]:
preds = []
labels = []
for result in results:
  preds.append(pd.DataFrame(result))

preds = pd.concat(preds)
labels = pd.concat(tests)
preds = preds.values
labels = labels.values
print(preds.shape)
print(labels.shape)

(148515, 5)
(148515, 5)


In [29]:
print(preds[0])
print(labels[0])
# import pickle
# pred_file = root_path + '/preds.pkl'
# label_file = root_path + '/labels.pkl'
# with open(pred_file, "wb") as f:
#     pickle.dump(preds, f)
# with open(label_file, "wb") as f:
#     pickle.dump(labels, f)

pred_category = []
label_category = []

for i in range(preds.shape[0]):
  pred_category.append(np.argmax(preds[i]))
  label_category.append(np.argmax(labels[i]))

print(len(pred_category))
print(len(label_category))



[3.2824542e-02 1.7932927e-05 9.6714938e-01 1.3479242e-06 6.8395361e-06]
[0. 0. 1. 0. 0.]
148515
148515


In [37]:
import sklearn.metrics as metrics
acc = metrics.accuracy_score(label_category, pred_category)
print(acc)
print(metrics.confusion_matrix(label_category, pred_category))
print(metrics.classification_report(label_category, pred_category))
print(metrics.precision_score(label_category, pred_category, average='macro'))

0.9911254755411911
[[76562    82   206    13   343]
 [   60 53303    19     0     4]
 [  159     9 13902     0     7]
 [   46     0     0    50    12]
 [  341     1     9     7  3380]]
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     77206
           1       1.00      1.00      1.00     53386
           2       0.98      0.99      0.99     14077
           3       0.71      0.46      0.56       108
           4       0.90      0.90      0.90      3738

    accuracy                           0.99    148515
   macro avg       0.92      0.87      0.89    148515
weighted avg       0.99      0.99      0.99    148515

0.9180904024290344
