## Importing librariesl

In [1]:
import numpy as np
import pandas as pd
from scipy.io import arff
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
# pd.set_option('max_columns', 200)

## Load training data

In [2]:
data = []
with open('/kaggle/input/nslkdd/KDDTrain+.arff', "r") as f:
    for line in f:
        line = line.replace('\n', '')
        data.append(line.split(','))


names = ['duration',  'protocol_type', 'service',  'flag',  'src_bytes' , 'dst_bytes' , 'land' , 'wrong_fragment' , 'urgent' , 'hot' , 'num_failed_logins' ,
 'logged_in' , 'num_compromised' , 'root_shell' , 'su_attempted' , 'num_root' , 'num_file_creations' , 'num_shells' , 'num_access_files' , 'num_outbound_cmds' ,
 'is_host_login' , 'is_guest_login' , 'count' , 'srv_count' , 'serror_rate' , 'srv_serror_rate' , 'rerror_rate' , 'srv_rerror_rate' , 'same_srv_rate' ,
 'diff_srv_rate' , 'srv_diff_host_rate' , 'dst_host_count' , 'dst_host_srv_count' , 'dst_host_same_srv_rate' , 'dst_host_diff_srv_rate' , 'dst_host_same_src_port_rate' ,
 'dst_host_srv_diff_host_rate' , 'dst_host_serror_rate', 'dst_host_srv_serror_rate' , 'dst_host_rerror_rate' , 'dst_host_srv_rerror_rate' , 'class' ,]
    
df = pd.DataFrame(data[44:], columns=names)

In [3]:
df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal
1,0,udp,other,SF,146,0,0,0,0,0,...,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal
2,0,tcp,private,S0,0,0,0,0,0,0,...,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,anomaly
3,0,tcp,http,SF,232,8153,0,0,0,0,...,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal
4,0,tcp,http,SF,199,420,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal


In [4]:
df.shape

(125973, 42)

In [5]:
df.columns

Index(['duration', 'protocol_type', 'service', 'flag', 'src_bytes',
       'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
       'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
       'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
       'num_access_files', 'num_outbound_cmds', 'is_host_login',
       'is_guest_login', 'count', 'srv_count', 'serror_rate',
       'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
       'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
       'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
       'dst_host_srv_rerror_rate', 'class'],
      dtype='object')

In [6]:
df.dtypes

duration                       object
protocol_type                  object
service                        object
flag                           object
src_bytes                      object
dst_bytes                      object
land                           object
wrong_fragment                 object
urgent                         object
hot                            object
num_failed_logins              object
logged_in                      object
num_compromised                object
root_shell                     object
su_attempted                   object
num_root                       object
num_file_creations             object
num_shells                     object
num_access_files               object
num_outbound_cmds              object
is_host_login                  object
is_guest_login                 object
count                          object
srv_count                      object
serror_rate                    object
srv_serror_rate                object
rerror_rate 

In [7]:
# Define the expected data types based on the research paper
expected_data_types = {
    'duration': 'float64',
    'protocol_type': 'category',
    'service': 'category',
    'flag': 'category',
    'src_bytes': 'float64',
    'dst_bytes': 'float64',
    'land': 'category',
    'wrong_fragment': 'float64',
    'urgent': 'float64',
    'hot': 'float64',
    'num_failed_logins': 'float64',
    'logged_in': 'category',
    'num_compromised': 'float64',
    'root_shell': 'float64',
    'su_attempted': 'float64',
    'num_root': 'float64',
    'num_file_creations': 'float64',
    'num_shells': 'float64',
    'num_access_files': 'float64',
    'num_outbound_cmds': 'float64',
    'is_host_login': 'category',
    'is_guest_login': 'category',
    'count': 'float64',
    'srv_count': 'float64',
    'serror_rate': 'float64',
    'srv_serror_rate': 'float64',
    'rerror_rate': 'float64',
    'srv_rerror_rate': 'float64',
    'same_srv_rate': 'float64',
    'diff_srv_rate': 'float64',
    'srv_diff_host_rate': 'float64',
    'dst_host_count': 'float64',
    'dst_host_srv_count': 'float64',
    'dst_host_same_srv_rate': 'float64',
    'dst_host_diff_srv_rate': 'float64',
    'dst_host_same_src_port_rate': 'float64',
    'dst_host_srv_diff_host_rate': 'float64',
    'dst_host_serror_rate': 'float64',
    'dst_host_srv_serror_rate': 'float64',
    'dst_host_rerror_rate': 'float64',
    'dst_host_srv_rerror_rate': 'float64',
    'class': 'category'
}

# Convert columns to appropriate data types
df = df.astype(expected_data_types)


In [8]:
df.dtypes

duration                        float64
protocol_type                  category
service                        category
flag                           category
src_bytes                       float64
dst_bytes                       float64
land                           category
wrong_fragment                  float64
urgent                          float64
hot                             float64
num_failed_logins               float64
logged_in                      category
num_compromised                 float64
root_shell                      float64
su_attempted                    float64
num_root                        float64
num_file_creations              float64
num_shells                      float64
num_access_files                float64
num_outbound_cmds               float64
is_host_login                  category
is_guest_login                 category
count                           float64
srv_count                       float64
serror_rate                     float64


In [9]:
X = df.drop(["class"],axis = 1)

In [10]:
X.columns

Index(['duration', 'protocol_type', 'service', 'flag', 'src_bytes',
       'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
       'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
       'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
       'num_access_files', 'num_outbound_cmds', 'is_host_login',
       'is_guest_login', 'count', 'srv_count', 'serror_rate',
       'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
       'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
       'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
       'dst_host_srv_rerror_rate'],
      dtype='object')

## Encoding Categorical Data

In [11]:
# Protocol type
X['protocol_type'] = df['protocol_type'].astype('category').cat.codes

# Land
X['land'] = pd.Categorical(df['land'], ['0', '1']).codes

# Is logged in
X['logged_in'] = pd.Categorical(df['logged_in'], ['0', '1']).codes

# Host logged in
X['is_host_login'] = pd.Categorical(df['is_host_login'], ['0', '1']).codes

# Guest logged in
X['is_guest_login'] = pd.Categorical(df['is_guest_login'], ['0', '1']).codes

# Class
Y = pd.Categorical(df['class'], ['normal', 'anomaly']).codes


### Count unique values.

In [12]:
df["class"].value_counts()

class
normal     67343
anomaly    58630
Name: count, dtype: int64

### view final trainig set.

In [13]:
X.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,0.0,1,ftp_data,SF,491.0,0.0,0,0.0,0.0,0.0,...,150.0,25.0,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0
1,0.0,2,other,SF,146.0,0.0,0,0.0,0.0,0.0,...,255.0,1.0,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0
2,0.0,1,private,S0,0.0,0.0,0,0.0,0.0,0.0,...,255.0,26.0,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0
3,0.0,1,http,SF,232.0,8153.0,0,0.0,0.0,0.0,...,30.0,255.0,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01
4,0.0,1,http,SF,199.0,420.0,0,0.0,0.0,0.0,...,255.0,255.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
from sklearn.preprocessing import LabelEncoder

le_service = LabelEncoder()
X['service'] = le_service.fit_transform(df['service'])

le_flag = LabelEncoder()
X['flag'] = le_flag.fit_transform(df['flag'])



In [15]:
X.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,0.0,1,20,9,491.0,0.0,0,0.0,0.0,0.0,...,150.0,25.0,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0
1,0.0,2,44,9,146.0,0.0,0,0.0,0.0,0.0,...,255.0,1.0,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0
2,0.0,1,49,5,0.0,0.0,0,0.0,0.0,0.0,...,255.0,26.0,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0
3,0.0,1,24,9,232.0,8153.0,0,0.0,0.0,0.0,...,30.0,255.0,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01
4,0.0,1,24,9,199.0,420.0,0,0.0,0.0,0.0,...,255.0,255.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Eliminate correlated features.

In [16]:
import pandas as pd
import numpy as np

# Assuming df_cleaned is your DataFrame after removing null-valued columns
# Compute correlation matrix

correlation_matrix = X.corr().abs()

# Select upper triangle of correlation matrix
upper_triangle = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape), k=1).astype(np.bool_))

# Find features with correlation greater than threshold (e.g., 0.95)
threshold = 0.95
correlated_features = [column for column in upper_triangle.columns if any(upper_triangle[column] > threshold)]

# Drop correlated features
df_uncorrelated = X.drop(columns=correlated_features)

# Check the shape of the DataFrame after dropping correlated features
print("Shape of DataFrame after dropping correlated features:", df_uncorrelated.shape)

Shape of DataFrame after dropping correlated features: (125973, 35)


### uncorrelated collumns

In [17]:
df_uncorrelated.columns

Index(['duration', 'protocol_type', 'service', 'flag', 'src_bytes',
       'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
       'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
       'su_attempted', 'num_file_creations', 'num_shells', 'num_access_files',
       'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count',
       'srv_count', 'serror_rate', 'rerror_rate', 'same_srv_rate',
       'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
       'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_rerror_rate'],
      dtype='object')

In [18]:

# Specify the columns to be standardized
columns_to_standardize = df_uncorrelated.columns

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit and transform the specified columns
X = scaler.fit_transform(X[columns_to_standardize])


In [19]:
print(f"X : {type(X)} Y : {type(Y)}")

X : <class 'numpy.ndarray'> Y : <class 'numpy.ndarray'>


## Validation Data Preparation

In [20]:
data_validate = []
with open('/kaggle/input/nslkdd/KDDTrain+_20Percent.arff', "r") as f:
    for line in f:
        line = line.replace('\n', '')
        data_validate.append(line.split(','))


names = ['duration',  'protocol_type', 'service',  'flag',  'src_bytes' , 'dst_bytes' , 'land' , 'wrong_fragment' , 'urgent' , 'hot' , 'num_failed_logins' ,
 'logged_in' , 'num_compromised' , 'root_shell' , 'su_attempted' , 'num_root' , 'num_file_creations' , 'num_shells' , 'num_access_files' , 'num_outbound_cmds' ,
 'is_host_login' , 'is_guest_login' , 'count' , 'srv_count' , 'serror_rate' , 'srv_serror_rate' , 'rerror_rate' , 'srv_rerror_rate' , 'same_srv_rate' ,
 'diff_srv_rate' , 'srv_diff_host_rate' , 'dst_host_count' , 'dst_host_srv_count' , 'dst_host_same_srv_rate' , 'dst_host_diff_srv_rate' , 'dst_host_same_src_port_rate' ,
 'dst_host_srv_diff_host_rate' , 'dst_host_serror_rate', 'dst_host_srv_serror_rate' , 'dst_host_rerror_rate' , 'dst_host_srv_rerror_rate' , 'class' ,]
    
df_validate = pd.DataFrame(data[44:], columns=names)

In [21]:
# Convert columns to appropriate data types
df_validate = df_validate.astype(expected_data_types)

# Preparing validation data

In [22]:
X_val = df.drop(["class"],axis = 1)

### Encodign categoricals

In [23]:
# Protocol type
X_val['protocol_type'] = df_validate['protocol_type'].astype('category').cat.codes

# Land
X_val['land'] = pd.Categorical(df_validate['land'], ['0', '1']).codes

# Is logged in
X_val['logged_in'] = pd.Categorical(df_validate['logged_in'], ['0', '1']).codes

# Host logged in
X_val['is_host_login'] = pd.Categorical(df_validate['is_host_login'], ['0', '1']).codes

# Guest logged in
X_val['is_guest_login'] = pd.Categorical(df_validate['is_guest_login'], ['0', '1']).codes

# Class
Y_val = pd.Categorical(df_validate['class'], ['normal', 'anomaly']).codes

In [24]:
from sklearn.preprocessing import LabelEncoder

le_service = LabelEncoder()
X_val['service'] = le_service.fit_transform(df_validate['service'])

le_flag = LabelEncoder()
X_val['flag'] = le_flag.fit_transform(df_validate['flag'])

### Keeping only uncorrelateds

In [25]:
# Specify the columns to be standardized
columns_to_standardize = df_uncorrelated.columns

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit and transform the specified columns
X_val = scaler.fit_transform(X_val[columns_to_standardize])

In [26]:
type(X_val)

numpy.ndarray

### Building Model with AutoEncoder

In [27]:
from keras.models import Model, load_model
from keras.layers import Input, LSTM, Dense, Dropout, Conv1D, MaxPooling1D, Flatten, Concatenate
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np

def create_bilstm_cnn_model(input_shape):
    # Input layer
    input_layer = Input(shape=input_shape)

    # Bi-LSTM branch
    lstm_output = LSTM(64, return_sequences=True)(input_layer)
    lstm_output = Dropout(0.5)(lstm_output)
    lstm_output = LSTM(32)(lstm_output)
    lstm_output = Dropout(0.5)(lstm_output)

    # CNN branch
    cnn_output = Conv1D(filters=64, kernel_size=3, activation='relu')(input_layer)
    cnn_output = MaxPooling1D(pool_size=2)(cnn_output)
    cnn_output = Flatten()(cnn_output)
    cnn_output = Dense(32, activation='relu')(cnn_output)
    cnn_output = Dropout(0.5)(cnn_output)

    # Combine LSTM and CNN branches
    combined = Concatenate()([lstm_output, cnn_output])
    combined = Dense(64, activation='relu')(combined)
    combined = Dropout(0.3)(combined)
    output = Dense(1, activation='sigmoid')(combined)

    model = Model(inputs=input_layer, outputs=output)
    return model

# Reshape training data
# X_resampled_np = X_resampled.to_numpy()
X_train = np.expand_dims(X, axis=2)  # Reshape to (samples, timesteps, features)
n_samples, n_timesteps, n_features = X_train.shape


# Create and compile the model
model = create_bilstm_cnn_model((n_timesteps, n_features))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# Define the checkpoint callback to save the best model during training
checkpoint_path = '/kaggle/working/best_model.keras'
checkpoint = ModelCheckpoint(checkpoint_path, monitor='val_accuracy', save_best_only=True, mode='max', save_weights_only=False)

# Training the model
history = model.fit(
    X_train, Y,
    epochs=20,
    batch_size=32,
    validation_data=(X_val, Y_val),
    callbacks=[EarlyStopping(patience=5), checkpoint]
)

2024-06-04 17:13:43.866528: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-04 17:13:43.866648: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-04 17:13:44.010646: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Epoch 1/20
[1m3937/3937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 34ms/step - accuracy: 0.9567 - loss: 0.1271 - val_accuracy: 0.9874 - val_loss: 0.0393
Epoch 2/20
[1m3937/3937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 34ms/step - accuracy: 0.9812 - loss: 0.0515 - val_accuracy: 0.9898 - val_loss: 0.0291
Epoch 3/20
[1m3937/3937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 34ms/step - accuracy: 0.9840 - loss: 0.0427 - val_accuracy: 0.9908 - val_loss: 0.0291
Epoch 4/20
[1m3937/3937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 34ms/step - accuracy: 0.9868 - loss: 0.0386 - val_accuracy: 0.9916 - val_loss: 0.0252
Epoch 5/20
[1m3937/3937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 35ms/step - accuracy: 0.9875 - loss: 0.0368 - val_accuracy: 0.9925 - val_loss: 0.0231
Epoch 6/20
[1m3937/3937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 35ms/step - accuracy: 0.9884 - loss: 0.0333 - val_accuracy: 0.9908 - val_loss: 0.024

### Evaluation