In [40]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [41]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Activation, Input, LSTM
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

In [42]:
df = pd.read_csv("/content/drive/My Drive/projet/dataset_APT.csv")

df1 = df.drop(['Flow ID', 'Src IP', 'Dst IP', 'Timestamp','Activity'], axis=1)

print("Avant modification", df1['Stage'].value_counts(), np.unique(df1['Stage']))

df1['Stage'] = df1['Stage'].replace('BENIGN', 'Benign')

df1 = df1[df1['Stage'] != 'Data Exfiltration']

print("Apres modification", df1['Stage'].value_counts(), np.unique(df1['Stage']))

categorical_cols = ['Protocol','Stage']
label_encoder = LabelEncoder()
df1[categorical_cols] = df1[categorical_cols].apply(label_encoder.fit_transform)

null_counts = df1.isnull().sum()
columns_with_null = null_counts[null_counts > 0].index

#print("Columns with null values:")
#print(null_counts[null_counts > 0])
#print("Dataset shape after dropping null values:", df1.shape)

#Handel missing value
df1.dropna(inplace=True)


Avant modification Benign                44258
BENIGN                19454
Reconnaissance        11909
Establish Foothold     8604
Lateral Movement       2451
Data Exfiltration        15
Name: Stage, dtype: int64 ['BENIGN' 'Benign' 'Data Exfiltration' 'Establish Foothold'
 'Lateral Movement' 'Reconnaissance']
Apres modification Benign                63712
Reconnaissance        11909
Establish Foothold     8604
Lateral Movement       2451
Name: Stage, dtype: int64 ['Benign' 'Establish Foothold' 'Lateral Movement' 'Reconnaissance']


In [43]:
df1

Unnamed: 0,Src Port,Dst Port,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Stage
0,0,0,0,119998944,242,1,0.0,0.0,0.0,0.0,...,0,0.000000e+00,0.000000,0.0,0.0,0.000000e+00,0.000000e+00,0.0,0.0,0
1,53569,8662,2,109235816,21,1,1072.0,52.0,56.0,48.0,...,0,8.195355e+05,78517.844090,875056.0,764015.0,1.517532e+07,6.349189e+06,20019201.0,5202524.0,0
2,68,67,2,119764062,88,1,25515.0,289.0,296.0,288.0,...,0,0.000000e+00,0.000000,0.0,0.0,0.000000e+00,0.000000e+00,0.0,0.0,0
3,40504,9200,1,117030424,18,17,23499.0,3736.0,4096.0,0.0,...,0,1.923293e+05,436593.123269,1083374.0,7236.0,1.931131e+07,1.177830e+07,34978598.0,5147962.0,0
4,0,0,0,119999703,2,1,0.0,0.0,0.0,0.0,...,0,0.000000e+00,0.000000,0.0,0.0,5.999985e+07,8.478210e+02,60000451.0,59999252.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86686,53452,58425,1,15048093,4,1,0.0,0.0,0.0,0.0,...,0,7.016021e+06,0.000000,7016021.0,7016021.0,8.032072e+06,0.000000e+00,8032072.0,8032072.0,0
86687,54174,48532,1,15058240,4,1,0.0,0.0,0.0,0.0,...,0,7.028311e+06,0.000000,7028311.0,7028311.0,8.029929e+06,0.000000e+00,8029929.0,8029929.0,0
86688,35168,44591,1,15039436,4,1,0.0,0.0,0.0,0.0,...,0,7.019672e+06,0.000000,7019672.0,7019672.0,8.019764e+06,0.000000e+00,8019764.0,8019764.0,0
86689,51298,22351,1,15054694,4,1,0.0,0.0,0.0,0.0,...,0,7.031059e+06,0.000000,7031059.0,7031059.0,8.023635e+06,0.000000e+00,8023635.0,8023635.0,0


In [44]:
df1.dtypes

Src Port              int64
Dst Port              int64
Protocol              int64
Flow Duration         int64
Total Fwd Packet      int64
                     ...   
Idle Mean           float64
Idle Std            float64
Idle Max            float64
Idle Min            float64
Stage                 int64
Length: 80, dtype: object

In [45]:
df1

Unnamed: 0,Src Port,Dst Port,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Stage
0,0,0,0,119998944,242,1,0.0,0.0,0.0,0.0,...,0,0.000000e+00,0.000000,0.0,0.0,0.000000e+00,0.000000e+00,0.0,0.0,0
1,53569,8662,2,109235816,21,1,1072.0,52.0,56.0,48.0,...,0,8.195355e+05,78517.844090,875056.0,764015.0,1.517532e+07,6.349189e+06,20019201.0,5202524.0,0
2,68,67,2,119764062,88,1,25515.0,289.0,296.0,288.0,...,0,0.000000e+00,0.000000,0.0,0.0,0.000000e+00,0.000000e+00,0.0,0.0,0
3,40504,9200,1,117030424,18,17,23499.0,3736.0,4096.0,0.0,...,0,1.923293e+05,436593.123269,1083374.0,7236.0,1.931131e+07,1.177830e+07,34978598.0,5147962.0,0
4,0,0,0,119999703,2,1,0.0,0.0,0.0,0.0,...,0,0.000000e+00,0.000000,0.0,0.0,5.999985e+07,8.478210e+02,60000451.0,59999252.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86686,53452,58425,1,15048093,4,1,0.0,0.0,0.0,0.0,...,0,7.016021e+06,0.000000,7016021.0,7016021.0,8.032072e+06,0.000000e+00,8032072.0,8032072.0,0
86687,54174,48532,1,15058240,4,1,0.0,0.0,0.0,0.0,...,0,7.028311e+06,0.000000,7028311.0,7028311.0,8.029929e+06,0.000000e+00,8029929.0,8029929.0,0
86688,35168,44591,1,15039436,4,1,0.0,0.0,0.0,0.0,...,0,7.019672e+06,0.000000,7019672.0,7019672.0,8.019764e+06,0.000000e+00,8019764.0,8019764.0,0
86689,51298,22351,1,15054694,4,1,0.0,0.0,0.0,0.0,...,0,7.031059e+06,0.000000,7031059.0,7031059.0,8.023635e+06,0.000000e+00,8023635.0,8023635.0,0


In [46]:
df["Timestamp"]

0        15/07/2019 01:55:21 PM
1        15/07/2019 01:55:22 PM
2        15/07/2019 01:55:22 PM
3        15/07/2019 01:55:23 PM
4        15/07/2019 01:55:58 PM
                  ...          
86686    19/07/2019 05:07:59 PM
86687    19/07/2019 07:04:08 PM
86688    19/07/2019 08:32:15 PM
86689    19/07/2019 07:12:49 PM
86690    19/07/2019 05:49:41 PM
Name: Timestamp, Length: 86691, dtype: object

In [47]:

# Convert the "Timestamp" column to a proper datetime format
df1["Timestamp"] = pd.to_datetime(df["Timestamp"], format="%d/%m/%Y %I:%M:%S %p", dayfirst=True)

# Sort the DataFrame by timestamp in ascending order
df1 = df1.sort_values(by='Timestamp')

# Define the target variable (assuming the target column name is 'Stage')
target = df1["Stage"]

# Select only numeric columns for correlation analysis
numeric_columns = df1.select_dtypes(include=['float64', 'int64', 'int32']).columns

# Calculate the correlation matrix
correlation_matrix = df1[numeric_columns].corr()

# Find the absolute correlation values with the target variable
correlation_with_target = correlation_matrix['Stage'].abs().sort_values(ascending=False)

# Set a threshold for correlation significance (e.g., 0.2)
threshold = 0.1

# Select relevant columns based on the correlation threshold
relevant_columns = correlation_with_target[correlation_with_target >= threshold].index


# Create a new DataFrame with only the relevant columns and the target
selected_data = df1[relevant_columns.union(['Stage'])]
df1 = selected_data

# Standardize numerical columns
numerical_cols = df1.columns.drop('Stage')
scaler = StandardScaler()
df1[numerical_cols] = scaler.fit_transform(df1[numerical_cols])

# Define X train and Y
Y = df1['Stage']
X = df1[df1.columns.drop('Stage')]

# Convert the DataFrame to a numpy array
X = X.to_numpy()
Y = Y.to_numpy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1[numerical_cols] = scaler.fit_transform(df1[numerical_cols])


In [48]:
df

Unnamed: 0,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Timestamp,Flow Duration,Total Fwd Packet,Total Bwd packets,...,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Activity,Stage
0,8.0.6.4-8.6.0.1-0-0-0,8.6.0.1,0,8.0.6.4,0,0,15/07/2019 01:55:21 PM,119998944,242,1,...,0.000000e+00,0.000000,0.0,0.0,0.000000e+00,0.000000e+00,0.0,0.0,Normal,Benign
1,192.168.3.10-239.2.11.71-53569-8662-17,192.168.3.10,53569,239.2.11.71,8662,17,15/07/2019 01:55:22 PM,109235816,21,1,...,8.195355e+05,78517.844090,875056.0,764015.0,1.517532e+07,6.349189e+06,20019201.0,5202524.0,Normal,Benign
2,255.255.255.255-0.0.0.0-67-68-17,0.0.0.0,68,255.255.255.255,67,17,15/07/2019 01:55:22 PM,119764062,88,1,...,0.000000e+00,0.000000,0.0,0.0,0.000000e+00,0.000000e+00,0.0,0.0,Normal,Benign
3,192.168.3.30-192.168.3.31-40504-9200-6,192.168.3.30,40504,192.168.3.31,9200,6,15/07/2019 01:55:23 PM,117030424,18,17,...,1.923293e+05,436593.123269,1083374.0,7236.0,1.931131e+07,1.177830e+07,34978598.0,5147962.0,Normal,Benign
4,0.87.248.248-3.0.0.0-0-0-0,0.87.248.248,0,3.0.0.0,0,0,15/07/2019 01:55:58 PM,119999703,2,1,...,0.000000e+00,0.000000,0.0,0.0,5.999985e+07,8.478210e+02,60000451.0,59999252.0,Normal,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86686,192.168.3.30-192.168.101.125-53452-58425-6,192.168.3.30,53452,192.168.101.125,58425,6,19/07/2019 05:07:59 PM,15048093,4,1,...,7.016021e+06,0.000000,7016021.0,7016021.0,8.032072e+06,0.000000e+00,8032072.0,8032072.0,BENIGN,BENIGN
86687,192.168.3.30-192.168.101.125-54174-48532-6,192.168.3.30,54174,192.168.101.125,48532,6,19/07/2019 07:04:08 PM,15058240,4,1,...,7.028311e+06,0.000000,7028311.0,7028311.0,8.029929e+06,0.000000e+00,8029929.0,8029929.0,BENIGN,BENIGN
86688,192.168.3.30-192.168.101.125-35168-44591-6,192.168.3.30,35168,192.168.101.125,44591,6,19/07/2019 08:32:15 PM,15039436,4,1,...,7.019672e+06,0.000000,7019672.0,7019672.0,8.019764e+06,0.000000e+00,8019764.0,8019764.0,BENIGN,BENIGN
86689,192.168.3.30-192.168.101.125-51298-22351-6,192.168.3.30,51298,192.168.101.125,22351,6,19/07/2019 07:12:49 PM,15054694,4,1,...,7.031059e+06,0.000000,7031059.0,7031059.0,8.023635e+06,0.000000e+00,8023635.0,8023635.0,BENIGN,BENIGN


In [49]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 86676 entries, 3404 to 86182
Data columns (total 32 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   ACK Flag Count         86676 non-null  float64
 1   Active Max             86676 non-null  float64
 2   Active Mean            86676 non-null  float64
 3   Active Std             86676 non-null  float64
 4   Bwd IAT Max            86676 non-null  float64
 5   Bwd IAT Mean           86676 non-null  float64
 6   Bwd IAT Std            86676 non-null  float64
 7   Bwd IAT Total          86676 non-null  float64
 8   Bwd PSH Flags          86676 non-null  float64
 9   Bwd Packet Length Min  86676 non-null  float64
 10  Bwd Packets/s          86676 non-null  float64
 11  Flow Duration          86676 non-null  float64
 12  Flow IAT Max           86676 non-null  float64
 13  Flow IAT Mean          86676 non-null  float64
 14  Flow IAT Std           86676 non-null  float64
 15 

In [50]:
# Reshape the input data to create sequences for each sample
def create_sequences(data, target, time_steps=1):
    X_sequences, y_sequences = [], []
    for i in range(len(data) - time_steps + 1):
        X_sequences.append(data[i:i + time_steps])
        y_sequences.append(target[i + time_steps - 1])
    return np.array(X_sequences), np.array(y_sequences)

time_steps = 20  # Adjust the value of time_steps based on the sequence length you want to consider
X_seq, Y_seq = create_sequences(X, Y, time_steps)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_seq, Y_seq, test_size=0.2, random_state=42)

# Number of classes
nbr_class = len(np.unique(Y))  # unique labels
feat = X.shape[1]
feat, nbr_class

(31, 4)

In [51]:
# Define the LSTM-based model
class LSTMModel():
    def __init__(self):
        self.model = Sequential()

    def build(self, shape, labels):
        self.model.add(LSTM(64, input_shape=(shape[1], shape[2])))
        self.model.add(Dense(32, activation='relu'))
        self.model.add(Dense(labels, activation='softmax'))
        return self.model

lstm_model = LSTMModel()
global_model = lstm_model.build(X_train.shape, nbr_class)

global_model.compile(optimizer= 'adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
global_model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_3 (LSTM)               (None, 64)                24576     
                                                                 
 dense_6 (Dense)             (None, 32)                2080      
                                                                 
 dense_7 (Dense)             (None, 4)                 132       
                                                                 
Total params: 26,788
Trainable params: 26,788
Non-trainable params: 0
_________________________________________________________________


In [52]:
# Train the model
#global_model.fit(X_train, y_train, batch_size=128, epochs=50, validation_data=(X_test, y_test), callbacks=[EarlyStopping(patience=5)])
global_model.fit(X_train, y_train, batch_size=32, epochs=50)

# Evaluate the model
loss, accuracy = global_model.evaluate(X_test, y_test)
print("Test accuracy:", accuracy)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Test accuracy: 0.9871336221694946


In [53]:
def save_model(model):
    model.save('/content/drive/My Drive/projet/Global_mod_SEA_LSTM_FED_ADV.h5')
    print('Model saved x')

In [54]:
save_model(global_model)

Model saved x
