In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from scipy.sparse import issparse
import tensorflow as tf

train_url = 'https://raw.githubusercontent.com/merteroglu/NSL-KDD-Network-Instrusion-Detection/master/NSL_KDD_Train.csv'
test_url = 'https://raw.githubusercontent.com/merteroglu/NSL-KDD-Network-Instrusion-Detection/master/NSL_KDD_Test.csv'

# Column names 
col_names = ["duration", "protocol_type", "service", "flag", "src_bytes",
    "dst_bytes", "land", "wrong_fragment", "urgent", "hot", "num_failed_logins",
    "logged_in", "num_compromised", "root_shell", "su_attempted", "num_root",
    "num_file_creations", "num_shells", "num_access_files", "num_outbound_cmds",
    "is_host_login", "is_guest_login", "count", "srv_count", "serror_rate",
    "srv_serror_rate", "rerror_rate", "srv_rerror_rate", "same_srv_rate",
    "diff_srv_rate", "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count",
    "dst_host_same_srv_rate", "dst_host_diff_srv_rate", "dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate", "dst_host_serror_rate", "dst_host_srv_serror_rate",
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate", "label"]

df_train = pd.read_csv(train_url, header=None, names=col_names)
df_test = pd.read_csv(test_url, header=None, names=col_names)



pd.DataFrame(df_train[:10])


Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal
1,0,udp,other,SF,146,0,0,0,0,0,...,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal
2,0,tcp,private,S0,0,0,0,0,0,0,...,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune
3,0,tcp,http,SF,232,8153,0,0,0,0,...,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal
4,0,tcp,http,SF,199,420,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal
5,0,tcp,private,REJ,0,0,0,0,0,0,...,19,0.07,0.07,0.0,0.0,0.0,0.0,1.0,1.0,neptune
6,0,tcp,private,S0,0,0,0,0,0,0,...,9,0.04,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune
7,0,tcp,private,S0,0,0,0,0,0,0,...,15,0.06,0.07,0.0,0.0,1.0,1.0,0.0,0.0,neptune
8,0,tcp,remote_job,S0,0,0,0,0,0,0,...,23,0.09,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune
9,0,tcp,private,S0,0,0,0,0,0,0,...,13,0.05,0.06,0.0,0.0,1.0,1.0,0.0,0.0,neptune


In [2]:
# Analyze the distribution of labels
print("Label distribution:")
print(df_train['label'].value_counts())


Label distribution:
label
normal             67343
neptune            41214
satan               3633
ipsweep             3599
portsweep           2931
smurf               2646
nmap                1493
back                 956
teardrop             892
warezclient          890
pod                  201
guess_passwd          53
buffer_overflow       30
warezmaster           20
land                  18
imap                  11
rootkit               10
loadmodule             9
ftp_write              8
multihop               7
phf                    4
perl                   3
spy                    2
Name: count, dtype: int64


In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from scipy.sparse import issparse
import tensorflow as tf


df_train = pd.read_csv(train_url, header=None, names=col_names)
df_test = pd.read_csv(test_url, header=None, names=col_names)

# Separate features and labels
X_train = df_train.drop(columns=["label"])
y_train = df_train["label"]
X_test = df_test.drop(columns=["label"])
y_test = df_test["label"]

# Convert labels to binary values (attack or normal)
normal_class = "normal"
y_train_binary = (y_train == normal_class).astype(int)
y_test_binary = (y_test == normal_class).astype(int)

# Identify numeric and categorical features
numeric_features = X_train.select_dtypes(include=["int64", "float64"]).columns
categorical_features = X_train.select_dtypes(include=["object"]).columns

# Preprocess the data (standardize numeric features, one-hot encode categorical features)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Apply preprocessing to the training and test data
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)


# Split the data into training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X_train_preprocessed, y_train_binary, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Train a RandomForest model for feature importance
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Get feature importances
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]

# Select the top N important features
N = 30
important_features_indices = indices[:N]
print("Most important features:")
for i in range(N):
    print(f"Feature {important_features_indices[i]}: Importance = {importances[important_features_indices[i]]}")

# Select the top N features from the preprocessed data
X_train_selected = X_train[:, important_features_indices]
X_val_selected = X_val[:, important_features_indices]
X_test_selected = X_test[:, important_features_indices]

# Create TensorFlow datasets
batch_size = 4
train_dataset = tf.data.Dataset.from_tensor_slices((X_train_selected, y_train)).shuffle(len(X_train)).batch(batch_size)
val_dataset = tf.data.Dataset.from_tensor_slices((X_val_selected, y_val)).batch(batch_size)
test_dataset = tf.data.Dataset.from_tensor_slices((X_test_selected, y_test)).batch(batch_size)

# Display label distribution
print("Label distribution in training data:", np.bincount(y_train))
print("Label distribution in validation data:", np.bincount(y_val))
print("Label distribution in test data:", np.bincount(y_test))

Most important features:
Feature 1: Importance = 0.15520953518611455
Feature 2: Importance = 0.0884779301351478
Feature 120: Importance = 0.07354677360438859
Feature 30: Importance = 0.05761846204844106
Feature 29: Importance = 0.05660369618393082
Feature 25: Importance = 0.0524436978780386
Feature 8: Importance = 0.052407501303191335
Feature 35: Importance = 0.04405410135605501
Feature 26: Importance = 0.041864541998465316
Feature 19: Importance = 0.028800014110174427
Feature 31: Importance = 0.02836411010299619
Feature 32: Importance = 0.0245037658345944
Feature 38: Importance = 0.022862189362426167
Feature 33: Importance = 0.02170416849862373
Feature 90: Importance = 0.01906012068088183
Feature 116: Importance = 0.01839283846458572
Feature 65: Importance = 0.017409809978082575
Feature 34: Importance = 0.016983846171802115
Feature 28: Importance = 0.015532036787499811
Feature 20: Importance = 0.015187459429422302
Feature 55: Importance = 0.014135936101537511
Feature 22: Importance = 

In [4]:

numeric_features = numeric_features.astype(str).tolist()
categorical_features = categorical_features.astype(str).tolist()

transformed_feature_names = (
    numeric_features + 
    list(preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features))
)
print("Transformed feature names:", transformed_feature_names)

print("Most important features:")
for i in range(N):
    feature_index = important_features_indices[i]
    feature_name = transformed_feature_names[feature_index]
    print(f"Feature Index: {feature_index}, Column Number: {feature_index + 1}, Name: {feature_name}, Importance = {importances[feature_index]}")

X_train_selected_df = pd.DataFrame(X_train_selected, columns=[transformed_feature_names[i] for i in important_features_indices])

print("Training data with selected features:")
display(X_train_selected_df.head())

print("Numeric features:", numeric_features)
print("Categorical features:", categorical_features)

Transformed feature names: ['duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'protocol_type_icmp', 'protocol_type_tcp', 'protocol_type_udp', 'service_IRC', 'service_X11', 'service_Z39_50', 'service_aol', 'service_auth', 'service_bgp', 'service_courier', 'service_csnet_ns', 'service_ctf', 'service_daytime', 'service_discard', 'service_domain', 'service_dom

Unnamed: 0,src_bytes,dst_bytes,flag_SF,dst_host_same_srv_rate,dst_host_srv_count,same_srv_rate,logged_in,dst_host_srv_serror_rate,diff_srv_rate,count,...,service_eco_i,srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,service_ecr_i,protocol_type_tcp,num_compromised,serror_rate,hot,protocol_type_udp
0,-0.007726,-0.001204,1.0,1.066401,1.258754,0.771283,1.235694,-0.624871,-0.349683,-0.594783,...,0.0,-0.631929,-0.387635,-0.376387,0.0,1.0,-0.011664,-0.637209,-0.095076,0.0
1,-0.007762,-0.004919,0.0,-1.138756,-1.026654,-1.480656,-0.809262,1.618955,-0.01693,0.505575,...,0.0,1.605104,-0.387635,-0.376387,0.0,1.0,-0.011664,1.602664,-0.095076,0.0
2,-0.007762,-0.004919,0.0,-1.094207,-0.972455,-1.435162,-0.809262,-0.624871,-0.01693,1.649598,...,0.0,-0.631929,2.87441,2.753914,0.0,1.0,-0.011664,-0.637209,-0.095076,0.0
3,-0.007718,-0.004854,1.0,1.066401,1.258754,0.771283,1.235694,-0.624871,-0.349683,-0.559851,...,0.0,-0.631929,-0.387635,-0.376387,0.0,1.0,-0.011664,-0.637209,-0.095076,0.0
4,-0.007674,-0.004918,1.0,1.066401,1.258754,0.771283,-0.809262,-0.624871,-0.349683,1.658331,...,0.0,-0.631929,-0.387635,-0.376387,0.0,0.0,-0.011664,-0.637209,-0.095076,1.0


Numeric features: ['duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate']
Categorical features: ['protocol_type', 'service', 'flag']


In [10]:
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers, losses

# Define input and output dimensions
input_dim = X_train_selected.shape[1]  # Number of features
output_dim = 2  # Number of classes (0 = normal, 1 = attack)

# Reshape data to be suitable for Conv1D (3D input)
X_train_conv = X_train_selected.reshape((X_train_selected.shape[0], 1, X_train_selected.shape[1]))
X_val_conv = X_val_selected.reshape((X_val_selected.shape[0], 1, X_val_selected.shape[1]))
X_test_conv = X_test_selected.reshape((X_test_selected.shape[0], 1, X_test_selected.shape[1]))

# Define the Conv1D model
model = models.Sequential([
    layers.Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(1, input_dim), padding='same'),
    layers.BatchNormalization(),

    layers.Conv1D(filters=128, kernel_size=3, activation='relu', padding='same'),
    layers.BatchNormalization(),

    layers.Conv1D(filters=256, kernel_size=3, activation='relu', padding='same'),
    layers.BatchNormalization(),

    layers.Conv1D(filters=512, kernel_size=3, activation='relu', padding='same'),  # New layer
    layers.BatchNormalization(),

    # Flatten layer to flatten the features
    layers.Flatten(),

    # Fully Connected layers
    layers.Dense(256, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.3),

    # Output layer with softmax for multi-class classification
    layers.Dense(output_dim, activation='softmax')
])

# Compile the model with SparseCategoricalCrossentropy
model.compile(optimizer=optimizers.Adam(learning_rate=0.0005),
              loss=losses.SparseCategoricalCrossentropy(),
              metrics=['accuracy'])

# Load model weights
model.load_weights("cnn_nsl_98.keras")

# # Train the model
# num_epochs = 5  # Increase the number of epochs for a deeper model
# early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# model.fit(X_train_conv, y_train, validation_data=(X_val_conv, y_val),
#           epochs=num_epochs, batch_size=8, callbacks=[early_stopping])  # Increase batch_size

# Evaluate the model on the test data
test_loss, test_accuracy = model.evaluate(X_test_conv, y_test)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m394/394[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 0.9830 - loss: 0.0570
Test Accuracy: 98.46%


In [9]:
import tensorflow as tf
from tensorflow.keras.layers import LSTM, Dense, Input
from tensorflow.keras.models import Model

print(X_train_selected.shape)

X_train_seq = tf.expand_dims(X_train_selected, axis=1)
X_val_seq = tf.expand_dims(X_val_selected, axis=1)
X_test_seq = tf.expand_dims(X_test_selected, axis=1)

print(X_test_seq.shape)

# Settings
units = 64  # Number of units for each LSTM
input_dim = X_train_selected.shape[1]  # Number of input features

# ===== ENCODER =====
encoder_inputs = Input(shape=(1, input_dim))
encoder_lstm_1 = LSTM(units, return_sequences=True, return_state=True)
encoder_lstm_2 = LSTM(units, return_sequences=True, return_state=True)
encoder_lstm_3 = LSTM(units, return_sequences=False, return_state=True)

# Passing data through Encoder layers
_, state_h1, state_c1 = encoder_lstm_1(encoder_inputs)
_, state_h2, state_c2 = encoder_lstm_2(encoder_inputs)
encoder_outputs, state_h3, state_c3 = encoder_lstm_3(encoder_inputs)

# Storing the last layer's state (for Decoder)
encoder_states = [state_h3, state_c3]

# ===== DECODER =====
decoder_inputs = Input(shape=(1, input_dim))
decoder_lstm_1 = LSTM(units, return_sequences=True, return_state=True)
decoder_lstm_2 = LSTM(units, return_sequences=True, return_state=True)
decoder_lstm_3 = LSTM(units, return_sequences=True, return_state=True)

# Passing data through Decoder layers
decoder_lstm_1_out, _, _ = decoder_lstm_1(decoder_inputs, initial_state=[state_h1, state_c1])
decoder_lstm_2_out, _, _ = decoder_lstm_2(decoder_lstm_1_out, initial_state=[state_h2, state_c2])
decoder_outputs, _, _ = decoder_lstm_3(decoder_lstm_2_out, initial_state=encoder_states)

# Final output through Dense layer
decoder_dense = Dense(2, activation="softmax")  # Specify the number of classes instead of features
decoder_outputs = decoder_dense(decoder_outputs)

# ===== Building the Model =====
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model using sparse_categorical_crossentropy and accuracy
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])



# Load model weights
model.load_weights("LSTM_nSL_99.keras")



# # ===== Training the Model =====
# epochs = 3
# batch_size = 8

# history = model.fit(
#     [X_train_seq, X_train_seq],  # Input includes both encoder and decoder
#     y_train,                    # Target labels as numerical values
#     batch_size=batch_size,
#     epochs=epochs,
#     validation_data=([X_val_seq, X_val_seq], y_val),
# )

# # Display accuracy results
# print(f"Train Accuracy: {history.history['accuracy'][-1]:.4f}")
# print(f"Validation Accuracy: {history.history['val_accuracy'][-1]:.4f}")

# ===== Testing the Model =====
test_loss, test_accuracy = model.evaluate([X_test_seq, X_test_seq], y_test)  # Use y_test instead of X_test_seq
print(f"Test Accuracy: {test_accuracy:.4f}")

# Predicting on validation data
y_pred = model.predict([X_val_seq, X_val_seq])

# Display 5 sample predictions and actual labels
for i in range(5):
    print(f"Sample {i + 1} - True label: {y_val.iloc[i]}, Predicted label: {np.argmax(y_pred[i], axis=-1)}")


(100778, 30)
(12598, 1, 30)


  saveable.load_own_variables(weights_store.get(inner_path))


[1m394/394[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.9918 - loss: 0.0224
Test Accuracy: 0.9927
[1m394/394[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step
Sample 1 - True label: 1, Predicted label: [1]
Sample 2 - True label: 0, Predicted label: [0]
Sample 3 - True label: 0, Predicted label: [0]
Sample 4 - True label: 0, Predicted label: [0]
Sample 5 - True label: 0, Predicted label: [0]


In [8]:
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
import numpy as np

# Predict labels for validation data
y_val_pred_probs = model.predict([X_val_seq, X_val_seq])  # Predicted probabilities
y_val_pred = np.argmax(y_val_pred_probs, axis=-1)         # Predicted labels

# Predict labels for test data
y_test_pred_probs = model.predict([X_test_seq, X_test_seq])
y_test_pred = np.argmax(y_test_pred_probs, axis=-1)

# ===== Validation Metrics =====
precision_val = precision_score(y_val, y_val_pred, average="binary")
recall_val = recall_score(y_val, y_val_pred, average="binary")
f1_val = f1_score(y_val, y_val_pred, average="binary")

print("Validation Metrics:")
print(f"Precision: {precision_val:.4f}")
print(f"Recall: {recall_val:.4f}")
print(f"F1-Score: {f1_val:.4f}")

# ===== Test Metrics =====
precision_test = precision_score(y_test, y_test_pred, average="binary")
recall_test = recall_score(y_test, y_test_pred, average="binary")
f1_test = f1_score(y_test, y_test_pred, average="binary")

print("\nTest Metrics:")
print(f"Precision: {precision_test:.4f}")
print(f"Recall: {recall_test:.4f}")
print(f"F1-Score: {f1_test:.4f}")

# ===== Classification Report =====
print("\nClassification Report (Test Data):")
print(classification_report(y_test, y_test_pred, target_names=["Normal", "Anomaly"]))


[1m394/394[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
[1m394/394[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
Validation Metrics:
Precision: 0.9931
Recall: 0.9933
F1-Score: 0.9932

Test Metrics:
Precision: 0.9923
Recall: 0.9941
F1-Score: 0.9932

Classification Report (Test Data):
              precision    recall  f1-score   support

      Normal       0.99      0.99      0.99      5873
     Anomaly       0.99      0.99      0.99      6725

    accuracy                           0.99     12598
   macro avg       0.99      0.99      0.99     12598
weighted avg       0.99      0.99      0.99     12598

