In [1]:
pip install dask[dataframe]

Collecting dask-expr<1.2,>=1.1 (from dask[dataframe])
  Downloading dask_expr-1.1.21-py3-none-any.whl.metadata (2.6 kB)
INFO: pip is looking at multiple versions of dask-expr to determine which version is compatible with other requirements. This could take a while.
  Downloading dask_expr-1.1.20-py3-none-any.whl.metadata (2.6 kB)
  Downloading dask_expr-1.1.19-py3-none-any.whl.metadata (2.6 kB)
  Downloading dask_expr-1.1.18-py3-none-any.whl.metadata (2.6 kB)
  Downloading dask_expr-1.1.16-py3-none-any.whl.metadata (2.5 kB)
Downloading dask_expr-1.1.16-py3-none-any.whl (243 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m243.2/243.2 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dask-expr
Successfully installed dask-expr-1.1.16


In [2]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import warnings
import lightgbm as lgb
import time
from xgboost import XGBClassifier
warnings.filterwarnings("ignore")

# Load CSV files from Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Define file paths for training and testing datasets
train_file_path = '/content/drive/My Drive/IDS Dataset 2/NF-ToN-IoT-V2_15000 samples_minmax_normalized_train.csv'
test_file_path = '/content/drive/My Drive/IDS Dataset 2/NF-ToN-IoT-V2_15000 samples_minmax_normalized_test.csv'

# Load the datasets into DataFrames
train_data = pd.read_csv(train_file_path)
test_data = pd.read_csv(test_file_path)

# Separate features and labels for training and testing datasets
X_train = train_data.iloc[:, :-2]  # All columns except the last two (labels)
y_train = train_data.iloc[:, -1]    # Last column (label names)

X_test = test_data.iloc[:, :-2]     # All columns except the last two (labels)
y_test = test_data.iloc[:, -1]       # Last column (label names)

# Define parameter grid for LightGBM
param_grid_lgb = {
    'num_leaves': [31, 50, 100],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 500],
    'max_depth': [-1, 10, 20]
}

# Initialize LightGBM model
lgb_model = lgb.LGBMClassifier(random_state=42)

# Set up GridSearchCV for LightGBM
grid_search_lgb = GridSearchCV(estimator=lgb_model, param_grid=param_grid_lgb, scoring='f1_weighted', cv=5, n_jobs=-1)

# Fit GridSearchCV on the training data for LightGBM
grid_search_lgb.fit(X_train, y_train)

# Get the best parameters and the best LightGBM model
best_params_lgb = grid_search_lgb.best_params_
best_model_lgb = grid_search_lgb.best_estimator_

# Print best parameters and score for LightGBM
print("Best Parameters for LightGBM:", best_params_lgb)
print("Best Cross-Validation F1 Score for LightGBM:", grid_search_lgb.best_score_)

# Fit the best LightGBM model on the entire training set
best_model_lgb.fit(X_train, y_train)

# Start time
start_time = time.time()

# Make predictions on the test set using LightGBM
y_pred_lgb = best_model_lgb.predict(X_test)

# End time
end_time = time.time()

# Calculate execution time
execution_time = end_time - start_time
print(f"Execution time for lightGBM classifier: {execution_time:.5f} seconds")

# Calculate evaluation metrics on the test set for LightGBM
accuracy_lgb = accuracy_score(y_test, y_pred_lgb)
precision_lgb = precision_score(y_test, y_pred_lgb, average='weighted')
recall_lgb = recall_score(y_test, y_pred_lgb, average='weighted')
f1_lgb = f1_score(y_test, y_pred_lgb, average='weighted')

# Print evaluation metrics for LightGBM
print(f'LightGBM Accuracy: {accuracy_lgb:.2f}')
print(f'LightGBM Precision: {precision_lgb:.2f}')
print(f'LightGBM Recall: {recall_lgb:.2f}')
print(f'LightGBM F1 Score: {f1_lgb:.2f}')

# Save the best LightGBM model to a file
joblib.dump(best_model_lgb, 'best_lightgbm_model.pkl')




Mounted at /content/drive
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002834 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4337
[LightGBM] [Info] Number of data points in the train set: 12010, number of used features: 41
[LightGBM] [Info] Start training from score -1.300950
[LightGBM] [Info] Start training from score -6.828546
[LightGBM] [Info] Start training from score -1.987999
[LightGBM] [Info] Start training from score -3.021883
[LightGBM] [Info] Start training from score -2.994900
[LightGBM] [Info] Start training from score -7.784057
[LightGBM] [Info] Start training from score -2.558310
[LightGBM] [Info] Start training from score -8.007201
[LightGBM] [Info] Start training from score -1.484373
[LightGBM] [Info] Start training from score -1.681498
Best Parameters for LightGBM: {'learning_rate': 0.01, 'max_depth': -1, 'n_estimat

['best_lightgbm_model.pkl']

In [3]:
# Import necessary libraries
import os
import joblib
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Define the path for the model and the folder name
model_folder = '/content/drive/My Drive/IDS models'
model_filename = 'best_model_lgb'
model_path = os.path.join(model_folder, model_filename)

# Create the folder if it doesn't exist
if not os.path.exists(model_folder):
    os.makedirs(model_folder)

# Save the model to the specified path
joblib.dump(best_model_lgb, model_path)

print(f'Model saved successfully at: {model_path}')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Model saved successfully at: /content/drive/My Drive/IDS models/best_model_lgb


In [4]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(best_model_lgb, X_train, y_train ,cv = 10, scoring = 'recall_weighted')
avg_scores = scores.mean()
avg_scores



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003926 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4299
[LightGBM] [Info] Number of data points in the train set: 10809, number of used features: 41
[LightGBM] [Info] Start training from score -1.300950
[LightGBM] [Info] Start training from score -6.803228
[LightGBM] [Info] Start training from score -1.988337
[LightGBM] [Info] Start training from score -3.020934
[LightGBM] [Info] Start training from score -2.994715
[LightGBM] [Info] Start training from score -7.678696
[LightGBM] [Info] Start training from score -2.558310
[LightGBM] [Info] Start training from score -8.189522
[LightGBM] [Info] Start training from score -1.484291
[LightGBM] [Info] Start training from score -1.681747
[LightGBM] [Info] Auto-choosing row-wise m

0.9461282264779352

In [5]:
from sklearn.model_selection import cross_val_score
scores_test = cross_val_score(best_model_lgb, X_test, y_test ,cv = 10, scoring = 'recall_weighted')
avg_scores_test = scores_test.mean()
avg_scores_test

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000836 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3097
[LightGBM] [Info] Number of data points in the train set: 2703, number of used features: 40
[LightGBM] [Info] Start training from score -1.292768
[LightGBM] [Info] Start training from score -6.515823
[LightGBM] [Info] Start training from score -2.041331
[LightGBM] [Info] Start training from score -2.960475
[LightGBM] [Info] Start training from score -3.183619
[LightGBM] [Info] Start training from score -7.902118
[LightGBM] [Info] Start training from score -2.618914
[LightGBM] [Info] Start training from score -7.902118
[LightGBM] [Info] Start training from score -1.434419
[LightGBM] [Info] Start training from score -1.665748
[LightGBM] [Info] Auto-choosing row-wise mu

0.9267386489479513

In [6]:
print("Train score is:",format(avg_scores))
print("Test score is:",format(avg_scores_test))

Train score is: 0.9461282264779352
Test score is: 0.9267386489479513
