In [2]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import shap
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [3]:
# ✅ Step 1: Reduce Memory Usage
def optimize_memory(df):
    for col in df.select_dtypes(include=["int", "float"]).columns:
        df[col] = pd.to_numeric(df[col], downcast="float")
    return df

# ✅ Step 2: Load CICIDS2017 in Chunks
req_cols = [ ' Packet Length Std', ' Total Length of Bwd Packets', ' Subflow Bwd Bytes',
' Destination Port', ' Packet Length Variance', ' Bwd Packet Length Mean',' Avg Bwd Segment Size',
'Bwd Packet Length Max', ' Init_Win_bytes_backward','Total Length of Fwd Packets',
' Subflow Fwd Bytes', 'Init_Win_bytes_forward', ' Average Packet Size', ' Packet Length Mean',
' Max Packet Length',' Label']

# ✅ Step 2: Load Dataset in Chunks to Prevent MemoryError
chunksize = 100000  
df_list = []

for file in [
    "cicids_db/Wednesday-workingHours.pcap_ISCX.csv",
    "cicids_db/Tuesday-WorkingHours.pcap_ISCX.csv",
    "cicids_db/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv",
    "cicids_db/Monday-WorkingHours.pcap_ISCX.csv",
    "cicids_db/Friday-WorkingHours-Morning.pcap_ISCX.csv",
    "cicids_db/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv",
    "cicids_db/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv",
]:
    for chunk in pd.read_csv(file, usecols=req_cols, chunksize=chunksize, low_memory=False):
        chunk = chunk.dropna()
        df_list.append(chunk)
        if len(df_list) >= 3:  # Limit dataset to avoid memory crash
            break

df = pd.concat(df_list, ignore_index=True)

In [5]:
# ✅ Step 3: Data Preprocessing
X = df.drop(columns=[" Label"])  
y = df[" Label"].apply(lambda x: 1 if x != "BENIGN" else 0)  

scaler = StandardScaler()
X = scaler.fit_transform(X)
joblib.dump(scaler, "scaler.pkl")  # Save scaler

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# ✅ Step 4: Train LightGBM Model (Much Faster than Deep Learning)
model = lgb.LGBMClassifier(n_estimators=100, learning_rate=0.1, max_depth=10)
model.fit(X_train, y_train)

# ✅ Save Model
joblib.dump(model, "intrusion_lgbm.pkl")
print("✅ Model Training Complete & Saved as 'intrusion_lgbm.pkl'")



[LightGBM] [Info] Number of positive: 243197, number of negative: 476803
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.047418 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3816
[LightGBM] [Info] Number of data points in the train set: 720000, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.337774 -> initscore=-0.673232
[LightGBM] [Info] Start training from score -0.673232
✅ Model Training Complete & Saved as 'intrusion_lgbm.pkl'
