<a href="https://colab.research.google.com/github/sivasakthi16/Network-Intrusion-Detection/blob/main/IntrustionHybrid_LSTMCNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Import Essential Modules

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report,confusion_matrix
%matplotlib inline

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### About the Dataset

In [None]:
df = pd.read_csv('/content/drive/MyDrive/dataset_sdn.csv')
df.head(10)

### Data Preprocessing

#### Dataset Dimensions

In [None]:
print("This Dataset has {} rows and {} columns".format(df.shape[0], df.shape[1]))

#### Concise summary of dataset

In [None]:
df.info()

#### Descriptive statistics of dataset

In [None]:
df.describe()

#### heatmap of missing values

In [None]:
msno.matrix(df)

#### Count of null values in each feature

In [None]:
df.isnull().sum()

In [None]:
(df.isnull().sum()/df.isnull().count())*100

#### Drop rows with null values

In [None]:
df.dropna(inplace=True)

#### Info after handling Null Values

In [None]:
print(df.isnull().sum())
print("This Dataframe has {} rows and {} columns after removing null values".format(df.shape[0], df.shape[1]))

#### Distribution of Target Class

In [None]:
# Split the rows by target class: label 1 marks rows where a DDoS attack occurred
# (malign traffic), label 0 marks rows where it did not (benign traffic).
malign = df[df['label'] == 1]
benign = df[df['label'] == 0]

# The printed figures are shares of all rows (percentages), not absolute counts.
print('Percentage of rows where a DDOS attack has occurred :',round((len(malign)/df.shape[0])*100,2),'%')
print('Percentage of rows where a DDOS attack has not occurred :',round((len(benign)/df.shape[0])*100,2),'%')

#### Barplot of Target Class

In [None]:
# Plot the share of each target class as a percentage of all rows.
labels = ['benign','malign']
# sort_index() orders the bars by label value (0, then 1) so that they line up
# with `labels`. value_counts() on its own orders by frequency, which would put
# the 'benign' tick under the malign bar whenever class 1 is the larger class.
classes = df['label'].value_counts(normalize=True).sort_index() * 100
classes.plot(kind = 'bar')
plt.title("Label class distribution")
plt.xticks(range(2), labels)
plt.xlabel("Label")
plt.ylabel("Frequency %")
plt.show()

#### Pairplot of select features

In [None]:
sns.pairplot(df,hue="label",vars=['pktcount','flows','bytecount'])

#### Columns in the dataset

In [None]:
df.columns

#### Unique values in each column

In [None]:
print(df.apply(lambda col: col.unique()))

#### Numerical Features

In [None]:
numerical_features = [feature for feature in df.columns if df[feature].dtypes != 'O']
print("The number of numerical features is",len(numerical_features),"and they are : \n",numerical_features)

#### Categorical Features

In [None]:
categorical_features = [feature for feature in df.columns if df[feature].dtypes == 'O']
print("The number of categorical features is",len(categorical_features),"and they are : \n",categorical_features)

#### Number of Unique values in the numerical features

In [None]:
# number of unique values in each numerical variable
df[numerical_features].nunique(axis=0)

#### Discrete numerical features

In [None]:
#discrete numerical features
discrete_feature = [feature for feature in numerical_features if df[feature].nunique()<=15 and feature != 'label']
print("The number of discrete features is",len(discrete_feature),"and they are : \n",discrete_feature)

In [None]:
df[discrete_feature].head(10)

#### Continuous features

In [None]:
continuous_feature=[feature for feature in numerical_features if feature not in discrete_feature + ['label']]
print("The number of continuous_feature features is",len(continuous_feature),"and they are : \n",continuous_feature)

### Exploratory Data Analysis

#### Plotting function definition

In [None]:
def countplot_distribution(col):
    """Draw a horizontal count plot of column ``col`` of the notebook-level ``df``.

    Applies the seaborn "darkgrid" theme and titles the plot
    'Distribution of <col>'. Nothing is returned.
    """
    sns.set_theme(style="darkgrid")
    count_axes = sns.countplot(data=df, y=col)
    count_axes.set(title='Distribution of ' + col)

def histplot_distribution(col):
    """Draw a red histogram with a KDE curve for column ``col`` of the notebook-level ``df``.

    Applies the seaborn "darkgrid" theme and titles the plot
    'Distribution of <col>'. Nothing is returned.
    """
    sns.set_theme(style="darkgrid")
    hist_axes = sns.histplot(data=df, x=col, color="red", kde=True)
    hist_axes.set(title='Distribution of ' + col)

#### Visualize the distribution of Categorical features

In [None]:
## One count plot per categorical feature, stacked vertically in a single figure
f = plt.figure(figsize=(8,20))
n_categorical = len(categorical_features)
for position, feature_name in enumerate(categorical_features, start=1):
    # Subplot positions are 1-based, hence start=1.
    f.add_subplot(n_categorical, 1, position)
    countplot_distribution(feature_name)
plt.show()

In [None]:
# Boxen plot of the `tot_dur` column against each categorical feature.
for feature_name in categorical_features:
    g = sns.catplot(data=df, x="tot_dur", y=feature_name, kind="boxen")
    g = g.set(title=feature_name)
    # 15 inches wide by 7 inches tall.
    g.fig.set_size_inches(15, 7)

#### Visualize the distribution of continuous features

In [None]:
## Histograms of the continuous features, two per row
n_hist_cols = 2
# Ceiling division: just enough rows to hold every feature at two plots per row.
# A grid with one row per feature *and* two columns would leave half of the
# figure empty.
n_hist_rows = (len(continuous_feature) + n_hist_cols - 1) // n_hist_cols
f = plt.figure(figsize=(20,90))
for position, feature_name in enumerate(continuous_feature, start=1):
    # Subplot positions are 1-based and fill the grid row by row.
    f.add_subplot(n_hist_rows, n_hist_cols, position)
    histplot_distribution(feature_name)
plt.show()

#### Visualize the distribution of continuous features wrt packet count, protocol and type of attack

In [None]:
## Relplot of log(variable)
import warnings
warnings.filterwarnings("ignore")

# pktcount is the y axis of every plot, so its logarithm is taken exactly once,
# outside the loop. Non-positive counts have no logarithm; they are masked to NaN
# (and therefore not drawn) instead of becoming -inf.
log_pktcount = np.log(df['pktcount'].where(df['pktcount'] > 0))

for feature in continuous_feature:
    # log is undefined at 0, so features containing a zero are skipped.
    if 0 in df[feature].unique():
        continue
    data = df[['Protocol', 'label']].copy()
    data['pktcount'] = log_pktcount
    # When the looped feature is pktcount itself it is already log-scaled above;
    # transforming it again would plot log(log(pktcount)).
    if feature != 'pktcount':
        data[feature] = np.log(df[feature])
    # relplot creates its own figure, so no plt.figure() call is needed here.
    sns.relplot(data=data, x=feature, y='pktcount', hue="Protocol", style="Protocol",
                col="label", kind="scatter").set(title="logarithmic Relplot of feature : " + feature)

#### Visualize the distribution of numerical discrete features

In [None]:
# One bar chart per discrete feature showing how often each value occurs.
for feature_name in discrete_feature:
    plt.figure(figsize=(8, 4))
    value_frequencies = df[feature_name].value_counts()
    bar_axes = sns.barplot(x=value_frequencies.index, y=value_frequencies)
    bar_axes.set(title="Graph for " + feature_name, ylabel="Frequency")
    plt.show()

In [None]:
def get_percentage_malign_protocols(data=None, protocols=('UDP', 'TCP', 'ICMP')):
    """Return the share of malign rows (label == 1) that use each protocol.

    Args:
        data: DataFrame with 'Protocol' and 'label' columns. Defaults to the
            notebook-level ``df`` when omitted.
        protocols: protocol names to report, in output order.

    Returns:
        A list of floats, one percentage (0-100) per entry of ``protocols``.
        When there are no malign rows every percentage is 0.0, rather than
        the call failing with a division by zero.
    """
    if data is None:
        data = df
    malign_protocols = data.loc[data['label'] == 1, 'Protocol']
    total_malign = len(malign_protocols)
    if total_malign == 0:
        return [0.0 for _ in protocols]
    # Count every protocol in one pass instead of rescanning the rows per protocol.
    protocol_counts = malign_protocols.value_counts()
    return [float(protocol_counts.get(name, 0)) / total_malign * 100 for name in protocols]

#### Distribution of protocols for malign attacks

In [None]:
fig1, ax1 = plt.subplots(figsize=[7,7])
ax1.pie(get_percentage_malign_protocols(), explode=(0.1, 0, 0), autopct='%1.1f%%',
        shadow=True, startangle=90)
ax1.axis('equal')
ax1.legend(['UDP', 'TCP', 'ICMP'],loc="best")
plt.title('Distribution of protocols for malign attacks',fontsize = 14)
plt.show()

#### Checking for outliers in Packet count feature

In [None]:
fig, ax = plt.subplots(figsize=[10, 10])
sns.boxplot(
    data=df,
    x='pktcount',
    y='Protocol'
)
ax.set_title('Boxplot, Packet count for different protocols')

#### Categorical features that need to be encoded

In [None]:
print("Features which need to be encoded are : \n" ,categorical_features)

### Encoding categorical features





In [None]:
df = pd.get_dummies(df, columns=categorical_features,drop_first=True)
print("This Dataframe has {} rows and {} columns after encoding".format(df.shape[0], df.shape[1]))

In [None]:
#dataframe after encoding
df.head(10)

In [None]:
df.dtypes

### Split into Independent and dependent variables

In [None]:
#separating input and output attributes
x = df.drop(['label'], axis=1)
y = df['label']

### Normalizing features

In [None]:
# Rescale every feature column with a MinMaxScaler; `x` becomes a NumPy array.
# NOTE(review): the scaler is fitted on the full dataset here, before the
# train/test split in the next section, so the test rows influence the scaling
# applied to the training rows. Consider fitting on the training rows only.
ms = MinMaxScaler()
x = ms.fit_transform(x)

### Train-Test-Split [70-30]

In [None]:
# Hold out 30% of the rows for testing. A fixed random_state makes the split,
# and every metric computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)
print(X_train.shape, X_test.shape)

#### Deep Neural Network-LSTM

In [None]:
Model_accuracy = []

## Defining the Deep Neural Network-long short term memory

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Reshape
from tensorflow.keras.optimizers import Adam

model = Sequential()

# Read the feature count from the data instead of hard-coding it: the number of
# columns depends on how many dummy columns the one-hot encoding produced.
n_input_features = X_train.shape[1]

# Reshaping input (None, n_input_features) to (None, 1, n_input_features) to add a time dimension
model.add(Reshape((1, n_input_features), input_shape=(n_input_features,)))

# Adding an LSTM layer as the first layer
model.add(LSTM(64, activation='tanh', return_sequences=False, name="LSTM_Layer"))

# Adding Dense layers
model.add(Dense(28, activation="relu", name="Hidden_Layer_1"))
model.add(Dense(10, activation="relu", name="Hidden_Layer_2"))
model.add(Dense(1, activation="sigmoid", name="Output_Layer"))

# Compile the model with Adam optimizer and binary crossentropy loss
opt = Adam(learning_rate=0.01)
model.compile(optimizer=opt, loss="binary_crossentropy", metrics=['accuracy'])

# Model summary
model.summary()


## Model fitting

In [None]:
# Train the LSTM model for 20 epochs with mini-batches of 32 samples, logging one
# line per epoch. Every other fit() option is left at its Keras default.
# NOTE(review): the test split doubles as validation data here, so the validation
# curves and the final test score are computed on the same rows.
history_org = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=20,
    batch_size=32,
    verbose=2,
)

## Plotting Loss v/s Epochs

In [None]:
loss = history_org.history['loss']
val_loss = history_org.history['val_loss']
epochs = range(1, len(loss) + 1)
plt.plot(epochs, loss, 'g', label = 'Training Loss')
plt.plot(epochs, val_loss, 'r', label = 'Validation Loss')
plt.title('Loss v/s No. of epochs')
plt.xlabel('Number of Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

## Plotting Accuracy v/s Epochs

In [None]:
# Accuracy curves get their own names (not `loss`/`val_loss`) and their own epoch
# axis, so this cell runs correctly without the loss-plot cell having run first.
train_accuracy = history_org.history['accuracy']
val_accuracy = history_org.history['val_accuracy']
accuracy_epochs = range(1, len(train_accuracy) + 1)
plt.plot(accuracy_epochs, train_accuracy, 'g', label = 'Training accuracy')
plt.plot(accuracy_epochs, val_accuracy, 'r', label = 'Validation accuracy')
plt.title('Accuracy Scores v/s Number of Epochs')
plt.xlabel('No. of Epochs')
plt.ylabel('Accuracy Score')
plt.legend()
plt.show()

## Model Evaluation

In [None]:
loss, accuracy = model.evaluate(X_test, y_test)
print('Accuracy of Long short term memory : %.2f' % (accuracy*100))
Model_accuracy.append(accuracy*100)

In [None]:
!pip install keras-tuner

In [None]:
import keras_tuner as kt

# Define a function to tune the LSTM architecture
def build_tuned_lstm_model(hp):
    """Build a compiled LSTM binary classifier for Keras Tuner.

    Args:
        hp: hyperparameter object supplied by the tuner; used to sample the
            number of LSTM units (32 to 128 in steps of 32).

    Returns:
        The compiled Keras model (Adam optimizer, binary cross-entropy loss,
        accuracy metric).
    """
    model = Sequential()
    units = hp.Int('units', min_value=32, max_value=128, step=32)
    # One timestep of 56 features, matching the (1, 56) input used by the other
    # LSTM models in this notebook.
    model.add(LSTM(units, input_shape=(1, 56), activation='tanh', return_sequences=False))
    model.add(Dense(32, activation='relu'))
    # binary_crossentropy against 0/1 labels needs a single sigmoid probability
    # per sample; a 32-unit ReLU layer cannot serve as the output.
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Create a tuner to search for the best LSTM model hyperparameters
tuner = kt.RandomSearch(
    build_tuned_lstm_model,
    objective='val_accuracy',
    max_trials=5,
    executions_per_trial=3,
    directory='lstm_tuning',
    project_name='lstm_knn_hybrid'
)


In [None]:
def model_builder(hp):
    """Build a compiled dense binary classifier with a tunable learning rate.

    Args:
        hp: hyperparameter object supplied by Keras Tuner; used to choose the
            Adam learning rate from 1e-2, 1e-3 and 1e-4.

    Returns:
        The compiled Keras model. A tuner's model-building function must return
        the model itself.
    """
    model = keras.Sequential()

    model.add(Dense(28 , input_shape=(56,) , activation="relu" , name="Hidden_Layer_1"))
    model.add(Dense(10 , activation="relu" , name="Hidden_Layer_2"))
    model.add(Dense(1 , activation="sigmoid" , name="Output_Layer"))

    learning_rate = hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
                  loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [None]:
classes = model.predict(X_test)
print(classes)

In [None]:
# Threshold the predicted probabilities at 0.5: above it -> class 1, otherwise class 0.
y_pred = [1 if probability > 0.5 else 0 for probability in classes]

In [None]:
y_pred[:20]

In [None]:
y_test[:20]

## Classification Report

In [None]:
print(classification_report(y_test, y_pred, target_names = labels))

## Plotting Confusion Matrix

In [None]:
from itertools import product
def plot_confusion_matrix(cm, classes, normalize=True, title='Confusion matrix', cmap=plt.cm.Blues):
    """Plot a confusion matrix as an annotated heat map on a new 10x10 figure.

    Args:
        cm: 2-D array of raw counts, rows = true class, columns = predicted class.
        classes: tick labels, one per class, in matrix order.
        normalize: when True each cell shows "count (row share %)"; when False
            only the count is shown.
        title: figure title.
        cmap: matplotlib colour map used for the heat map.
    """
    counts = np.asarray(cm)
    plt.figure(figsize=(10,10))
    plt.grid(False)
    plt.imshow(counts, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    row_share = None
    if normalize:
        row_totals = counts.sum(axis=1, keepdims=True)
        # A class with no true samples has a zero row total; report 0% for it
        # instead of NaN.
        with np.errstate(divide='ignore', invalid='ignore'):
            row_share = counts.astype('float') / row_totals
        row_share = np.nan_to_num(np.around(row_share, decimals=2), nan=0.0)

    # The heat map is coloured by the raw counts, so text contrast is decided
    # from the raw counts too. Computed unconditionally so normalize=False works.
    thresh = counts.max() / 2. if counts.size else 0
    for i, j in product(range(counts.shape[0]), range(counts.shape[1])):
        if row_share is not None:
            cell_text = f"{counts[i, j]} ({row_share[i, j] * 100:.2f}%)"
        else:
            cell_text = str(counts[i, j])
        plt.text(j, i, cell_text,
                 horizontalalignment="center",
                 color="white" if counts[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
confusion_mtx = confusion_matrix(y_test, y_pred)
plot_confusion_matrix(confusion_mtx, classes = labels)

## LSTM-CNN HYBRID MODEL

>



In [None]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import numpy as np
import tensorflow as tf

# Set random seed for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

# Now proceed with model definition and training


# CNN + LSTM hybrid: Conv1D front end, two stacked LSTMs, L2-regularised dense head
def create_cnn_lstm_model(input_shape):
    """Build the (uncompiled) CNN + LSTM binary classifier.

    Args:
        input_shape: shape of one sample, passed to the first Conv1D layer.

    Returns:
        A Sequential model ending in a single sigmoid unit.
    """
    # Convolutional feature extraction with moderate dropout.
    feature_extractor = [
        Conv1D(filters=64, kernel_size=1, activation='relu', input_shape=input_shape),
        Dropout(0.3),
        BatchNormalization(),
    ]

    # Stacked LSTMs: the first returns the full sequence for the second to consume.
    sequence_layers = [
        LSTM(128, return_sequences=True),
        Dropout(0.3),
        LSTM(64, return_sequences=False),
        BatchNormalization(),
    ]

    # Dense head with L2 weight penalties; one sigmoid unit for binary classification.
    classifier_head = [
        Dense(64, activation='relu', kernel_regularizer=l2(0.001)),
        Dropout(0.3),
        Dense(1, activation='sigmoid', kernel_regularizer=l2(0.001)),
    ]

    return Sequential(feature_extractor + sequence_layers + classifier_head)



# Define input shape with timesteps=1; the feature count is read from the data
# because it depends on how many dummy columns the one-hot encoding produced.
timesteps = 1
n_features = X_train.shape[1]
input_shape = (timesteps, n_features)

# Reshape data to add a time dimension (timesteps=1)
X_train_reshaped = X_train.reshape((X_train.shape[0], timesteps, n_features))
X_test_reshaped = X_test.reshape((X_test.shape[0], timesteps, n_features))


# Compile and train the CNN + LSTM hybrid model
cnn_lstm_model = create_cnn_lstm_model(input_shape)
cnn_lstm_model.compile(optimizer=Adam(learning_rate=0.0005), loss='binary_crossentropy', metrics=['accuracy'])


# Print model summary to verify CNN + LSTM layers
print("CNN + LSTM Model Architecture:")
cnn_lstm_model.summary()


# Early stopping and learning rate reduction
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-5)

# Train model with adjusted batch size and callbacks
history_org = cnn_lstm_model.fit(
    X_train_reshaped, y_train,
    epochs=100,
    batch_size=64,  # Increased batch size back to 64
    validation_split=0.2,
    verbose=1,
    callbacks=[early_stopping, reduce_lr]
)

# Step 7: Evaluate the CNN + LSTM hybrid model
y_test_pred = cnn_lstm_model.predict(X_test_reshaped)
y_test_pred = (y_test_pred > 0.5).astype(int)  # Convert probabilities to binary output

# Evaluate metrics
cnn_lstm_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Accuracy of CNN + LSTM Hybrid Model: {cnn_lstm_accuracy * 100:.2f}%")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_test_pred))
print("Classification Report:")
print(classification_report(y_test, y_test_pred))

## Plotting Loss vs Epoch & Accuracy vs Epoch

In [None]:
# Extract loss and accuracy values from history
loss = history_org.history['loss']
val_loss = history_org.history['val_loss']
accuracy = history_org.history['accuracy']
val_accuracy = history_org.history['val_accuracy']
epochs = range(1, len(loss) + 1)

# Plot Loss vs Epoch
plt.figure(figsize=(10, 5))
plt.plot(epochs, loss, 'g', label='Training Loss')
plt.plot(epochs, val_loss, 'r', label='Validation Loss')
plt.title('Loss vs Number of Epochs')
plt.xlabel('Number of Epochs')
plt.ylabel('Loss')
plt.legend()
plt.grid(True)
plt.show()

# Plot Accuracy vs Epoch
plt.figure(figsize=(10, 5))
plt.plot(epochs, accuracy, 'g', label='Training Accuracy')
plt.plot(epochs, val_accuracy, 'r', label='Validation Accuracy')
plt.title('Accuracy vs Number of Epochs')
plt.xlabel('Number of Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
import numpy as np
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from itertools import product

# Confusion-matrix plotting helper (redefines the earlier function of the same name)
def plot_confusion_matrix(cm, classes, normalize=True, title='Confusion matrix', cmap=plt.cm.Blues):
    """Plot a confusion matrix as an annotated heat map on a new 10x10 figure.

    Args:
        cm: 2-D array of raw counts, rows = true class, columns = predicted class.
        classes: tick labels, one per class, in matrix order.
        normalize: when True each cell shows "count (row share %)"; when False
            only the count is shown.
        title: figure title.
        cmap: matplotlib colour map used for the heat map.
    """
    counts = np.asarray(cm)
    plt.figure(figsize=(10, 10))
    plt.grid(False)
    plt.imshow(counts, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    row_share = None
    if normalize:
        row_totals = counts.sum(axis=1, keepdims=True)
        # A class with no true samples has a zero row total; report 0% for it
        # instead of NaN.
        with np.errstate(divide='ignore', invalid='ignore'):
            row_share = counts.astype('float') / row_totals
        row_share = np.nan_to_num(np.around(row_share, decimals=2), nan=0.0)

    # The heat map is coloured by the raw counts, so text contrast is decided
    # from the raw counts too. Computed unconditionally so normalize=False works.
    thresh = counts.max() / 2. if counts.size else 0
    for i, j in product(range(counts.shape[0]), range(counts.shape[1])):
        if row_share is not None:
            cell_text = f"{counts[i, j]} ({row_share[i, j] * 100:.2f}%)"
        else:
            cell_text = str(counts[i, j])
        plt.text(j, i, cell_text,
                 horizontalalignment="center",
                 color="white" if counts[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

# Predict with the CNN + LSTM hybrid and threshold the probabilities at 0.5
y_test_pred = cnn_lstm_model.predict(X_test_reshaped)
y_test_pred = (y_test_pred > 0.5).astype(int)  # Convert probabilities to binary output

# Generate the confusion matrix
cm = confusion_matrix(y_test, y_test_pred)
# Same class names as the rest of the notebook: label 0 = benign, label 1 = malign
classes = ['benign', 'malign']

# Plot the confusion matrix
plot_confusion_matrix(cm, classes=classes, normalize=True, title="Confusion Matrix for LSTM+CNN Model")
plt.show()


In [None]:
# Keep only the first entry of Model_accuracy and drop everything after it, so
# that re-running the append in the next cell does not pile up duplicate entries.
# NOTE(review): this assumes index 0 holds the DNN-LSTM accuracy, i.e. that the
# LSTM evaluation cell appended exactly once; confirm after out-of-order runs.
del Model_accuracy[1:]


In [None]:

Model_accuracy.append(cnn_lstm_accuracy*100)
print("Model_accuracy",Model_accuracy)

In [None]:
Model_names = ["DNN-LSTM", "LSTM-CNN(HYBRID)"]

In [None]:
print(f"Length of Model_names: {len(Model_names)}")
print(f"Length of Model_accuracy: {len(Model_accuracy)}")


In [None]:
# Tabulate the model names with their accuracies, best model first.
df_clf = pd.DataFrame({'name': Model_names, 'Accuracy': Model_accuracy})
df_clf = df_clf.sort_values(by='Accuracy', ascending=False)
df_clf.head(10)

In [None]:
import matplotlib.pyplot as plt

# Define model names and accuracies
Model_names = ["DNN-LSTM", "LSTM-CNN(HYBRID)"]
# Assuming Model_accuracy has two elements with the accuracies
Model_accuracy = Model_accuracy[:2]  # Ensure it only contains two entries

# Plotting the bar chart
plt.figure(figsize=(8, 6))
plt.bar(Model_names, Model_accuracy, color=['blue', 'orange'])
plt.xlabel("Model")
plt.ylabel("Accuracy (%)")
plt.title("Model Accuracy Comparison")
plt.ylim(0, 100)  # Set y-axis limit to 100 for easier comparison
for i, v in enumerate(Model_accuracy):
    plt.text(i, v + 1, f"{v:.2f}%", ha='center', fontweight='bold')  # Label each bar with its accuracy

plt.show()


In [None]:
# Position of the highest accuracy (the first one if several models tie)
best_index = max(range(len(Model_accuracy)), key=Model_accuracy.__getitem__)

# Look up the winner's name and score by that position
best_model_name, best_accuracy = Model_names[best_index], Model_accuracy[best_index]

# Report the winner
print(f"The model with the highest accuracy is {best_model_name} with an accuracy of {best_accuracy:.2f}%")
