### Importing necessary libraries

In [14]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, classification_report


import warnings
warnings.filterwarnings('ignore')


### Loading  the data

In [15]:
# Load the dataset
data = pd.read_excel('data/network_intrusion_data.xlsx')

# Display the first few rows of the dataset
print("First 5 rows of the dataset:")
display(data.head())

# Check for missing values
print("\nMissing values in the dataset:")
print(data.isnull().sum())

First 5 rows of the dataset:


Unnamed: 0,source_ip,destination_ip,packet_count,packet_size,protocol,timestamp,attack_type
0,164.57.12.140,178.210.174.75,112,1093,TCP,2024-01-01 00:00:00,DDoS
1,63.114.71.52,57.227.236.231,445,742,UDP,2024-01-01 00:00:01,normal
2,174.44.216.16,34.220.204.165,870,299,TCP,2024-01-01 00:00:02,normal
3,8.47.111.119,105.96.99.239,280,1268,UDP,2024-01-01 00:00:03,DDoS
4,130.13.101.214,1.57.142.31,116,1122,UDP,2024-01-01 00:00:04,normal



Missing values in the dataset:
source_ip         0
destination_ip    0
packet_count      0
packet_size       0
protocol          0
timestamp         0
attack_type       0
dtype: int64


##### Function to convert IP addresses to integers

In [16]:
# Function to convert IP addresses to integers
def ip_to_int(ip):
    return sum(int(octet) * (256 ** i) for i, octet in enumerate(ip.split('.')[::-1]))

# Apply the function to source and destination IP columns
data["source_ip"] = data["source_ip"].apply(ip_to_int)
data["destination_ip"] = data["destination_ip"].apply(ip_to_int)

# Drop the timestamp column as it's not needed for the model
data.drop('timestamp', axis=1, inplace=True)

# Display the updated dataset
print("\nDataset after converting IP addresses to integers and droping the timestamp:")
display(data.head())


Dataset after converting IP addresses to integers and droping the timestamp:


Unnamed: 0,source_ip,destination_ip,packet_count,packet_size,protocol,attack_type
0,2755202188,3000151627,112,1093,TCP,DDoS
1,1064453940,971238631,445,742,UDP,normal
2,2922174480,584895653,870,299,TCP,normal
3,137326455,1767924719,280,1268,UDP,DDoS
4,2181916118,20549151,116,1122,UDP,normal


#### Encode the 'protocol' column using OneHotEncoder

In [17]:
# Encode the 'protocol' column using OneHotEncoder
preprocessor = ColumnTransformer(
    transformers=[
        ('protocol', OneHotEncoder(handle_unknown='ignore'), ['protocol'])
    ],
    remainder='passthrough'
)

# Encode the target column 'attack_type' using LabelEncoder
le = LabelEncoder()
data["attack_type"] = le.fit_transform(data["attack_type"])

In [18]:
# Split features (X) and target (y)
y = data["attack_type"]
X = data.drop("attack_type", axis=1)

# Preprocess the features
X_preprocessed = preprocessor.fit_transform(X)

# Convert X_preprocessed to a Pandas DataFrame
if hasattr(preprocessor, 'get_feature_names_out'):
    # If using a ColumnTransformer with get_feature_names_out (scikit-learn >= 1.0)
    feature_names = preprocessor.get_feature_names_out()
else:
    # Fallback for older versions of scikit-learn
    feature_names = [f'feature_{i}' for i in range(X_preprocessed.shape[1])]

X_preprocessed_df = pd.DataFrame(X_preprocessed, columns=feature_names)


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed_df, y, test_size=0.3, random_state=42)

# Ensure y_train and y_test are DataFrames
y_train = pd.DataFrame(y_train, columns=['attack_type'])
y_test = pd.DataFrame(y_test, columns=['attack_type'])

# Display the structure of X_train and y_train
print("\nStructure of X_train:")
display(X_train.head())
print("\nStructure of y_train:")
display(y_train.head())


Structure of X_train:


Unnamed: 0,protocol__protocol_ICMP,protocol__protocol_TCP,protocol__protocol_UDP,remainder__source_ip,remainder__destination_ip,remainder__packet_count,remainder__packet_size
1840,0.0,1.0,0.0,4209796000.0,2758020000.0,410.0,1260.0
2115,0.0,1.0,0.0,2402962000.0,2993340000.0,515.0,817.0
4437,1.0,0.0,0.0,2343426000.0,202178400.0,766.0,1006.0
1146,0.0,0.0,1.0,2810283000.0,611096800.0,204.0,292.0
2486,0.0,1.0,0.0,4178065000.0,503356000.0,125.0,468.0



Structure of y_train:


Unnamed: 0,attack_type
1840,2
2115,0
4437,1
1146,2
2486,2


### Initialize the Random Forest Classifier

In [19]:
# Initialize the Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
clf.fit(X_train, y_train.values.ravel())

print("\nRandom Forest model trained successfully.")


Random Forest model trained successfully.


### Evaluate model 

In [20]:
# Evaluate model
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")  # Print accuracy


print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Model Accuracy: 0.75

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       155
           1       0.00      0.00      0.00       148
           2       0.75      0.99      0.85      1126
           3       0.00      0.00      0.00        71

    accuracy                           0.75      1500
   macro avg       0.19      0.25      0.21      1500
weighted avg       0.56      0.75      0.64      1500



### Set the detection threshold (e.g., 0.7).

In [21]:
threshold = 0.7

# Predict probabilities on test data
y_prob = clf.predict_proba(X_test)

# Apply the threshold: predict "attack" only if the maximum probability exceeds the threshold
y_pred_threshold = [1 if max(probs) >= threshold else 0 for probs in y_prob]

# Evaluate accuracy with the threshold
accuracy = accuracy_score(y_test, y_pred_threshold)
print(f"Accuracy with threshold {threshold} : {accuracy:.2f}")

# Display a classification report
print("\nClassification report with threshold :")
print(classification_report(y_test, y_pred_threshold))


Accuracy with threshold 0.7 : 0.10

Classification report with threshold :
              precision    recall  f1-score   support

           0       0.10      0.34      0.15       155
           1       0.10      0.62      0.17       148
           2       0.00      0.00      0.00      1126
           3       0.00      0.00      0.00        71

    accuracy                           0.10      1500
   macro avg       0.05      0.24      0.08      1500
weighted avg       0.02      0.10      0.03      1500



#### Load new data for prediction

In [22]:
# Load new data for prediction
test_data = pd.read_csv('data/new_connections.csv')

# Display the first few rows of the new data
# print("\nFirst 5 rows of the new data:")
# display(test_data.head())

# Preprocess the new data
test_data["source_ip"] = test_data["source_ip"].apply(ip_to_int)
test_data["destination_ip"] = test_data["destination_ip"].apply(ip_to_int)

# Preprocess the features using the same preprocessor
X_test_new = preprocessor.transform(test_data.drop("attack_type", axis=1, errors='ignore'))

# Convert X_test_new to a DataFrame
X_test_new_df = pd.DataFrame(X_test_new, columns=feature_names)

#### Make predictions on the new data

In [23]:
# Make predictions on the new data
predictions = clf.predict(X_test_new_df)

# Add predictions to the new data
test_data["predicted_attack_type"] = le.inverse_transform(predictions)

# Display the results
print("\nPredictions on new data:")
display(test_data.head())


Predictions on new data:


Unnamed: 0,source_ip,destination_ip,packet_count,packet_size,protocol,timestamp,predicted_attack_type
0,1852891731,348684290,697,820,ICMP,2024-02-11 00:00:00,normal
1,4060636899,2561575641,732,523,ICMP,2024-02-11 00:00:01,normal
2,983007346,3858594585,552,423,UDP,2024-02-11 00:00:02,normal
3,4257266135,3266698047,756,209,ICMP,2024-02-11 00:00:03,DDoS
4,2080434995,257075564,251,1060,TCP,2024-02-11 00:00:04,normal


#### Set the detection threshold on the new data


In [24]:
# Définir le seuil de détection
threshold = 0.7

# Prédire les probabilités sur les nouvelles données
y_prob = clf.predict_proba(X_test_new_df)

# Appliquer le seuil : prédire "attaque" (1) si la probabilité max dépasse le seuil, sinon "normal" (0)
y_pred_threshold = [1 if max(probs) >= threshold else 0 for probs in y_prob]
# Sinon, afficher les prédictions avec seuil
test_data["predicted_attack_type_threshold"] = y_pred_threshold
print("\nPredictions with threshold on the new data: :")
test_data.head()


Predictions with threshold on the new data: :


Unnamed: 0,source_ip,destination_ip,packet_count,packet_size,protocol,timestamp,predicted_attack_type,predicted_attack_type_threshold
0,1852891731,348684290,697,820,ICMP,2024-02-11 00:00:00,normal,0
1,4060636899,2561575641,732,523,ICMP,2024-02-11 00:00:01,normal,0
2,983007346,3858594585,552,423,UDP,2024-02-11 00:00:02,normal,0
3,4257266135,3266698047,756,209,ICMP,2024-02-11 00:00:03,DDoS,0
4,2080434995,257075564,251,1060,TCP,2024-02-11 00:00:04,normal,0


###  Déploiement et Surveillance

1. Pour le déploiement et la surveillance du modèle, plusieurs aspects sont à considérer. Dans un SOC (Security Operations Center), le modèle peut être intégré pour analyser les données réseau en temps réel. Par exemple, il pourrait être connecté à des outils qui capturent le trafic, comme des sondes réseau, afin de détecter automatiquement les menaces et d’aider les analystes à réagir rapidement.

2. Ensuite, pour limiter les faux positifs qui risquent de submerger l’équipe, il est utile de définir un seuil de détection. Avec le modèle Random Forest, on peut utiliser les probabilités qu’il génère pour chaque prédiction : si la probabilité d’une attaque dépasse, disons, **0.7**, une alerte est déclenchée. Cela permet de filtrer les événements moins probables et de se concentrer sur les plus suspects.

3. Le modèle doit aussi être testé dans des conditions réelles pour s’assurer qu’il fonctionne bien. On peut simuler des attaques comme une attaque DDoS, avec un gros volume de paquets, une reconnaissance réseau, plus discrète avec des scans, ou une exfiltration de données, avec des transferts inhabituels. Ces tests montrent si le modèle repère bien ces scénarios.

4. Enfin, une intégration avec un SIEM, comme Splunk ou ELK, rendrait le système encore plus efficace. Le modèle enverrait ses alertes directement à ces outils, qui les afficheraient dans des journaux ou des tableaux de bord. Ainsi, les analystes recevraient des notifications automatiques et pourraient agir sans perdre de temps.

### Analyse et Conclusion
1. Les facteurs clés sont le protocole (*protocol*) et le nombre de paquets (*packet_count*). Le protocole révèle le type d’attaque, et un packet_count anormal signale un trafic suspect, comme un DDoS ou une reconnaissance.

2. Le supervisé détecte les attaques connues grâce à des données étiquetées, tandis que le non supervisé repère les anomalies nouvelles (zero-day) sans étiquettes. Les combiner renforce la détection.

3. Ajuster un seuil élevé (ex. : 0.7) réduit les alertes inutiles. Corréler les signaux suspects et filtrer les répétitions allège la charge des analystes.
