In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Never truncate wide frames: show every column when a DataFrame is displayed.
pd.options.display.max_columns = None

# Read the cleaned dataset from disk and report its (rows, columns) shape.
df = pd.read_csv("../data/cleaned_cicids2017.csv")
print("Dataset loaded. Shape:", df.shape)


Dataset loaded. Shape: (2827876, 80)


In [2]:
# Target vector: the encoded label column.
y = df["LabelEncoded"]

# Feature matrix: everything that is left once both label columns
# ('Label' and 'LabelEncoded') are removed.
X = df.drop(["Label", "LabelEncoded"], axis=1)

print("Features shape:", X.shape)
print("Target shape:", y.shape)


Features shape: (2827876, 78)
Target shape: (2827876,)


In [3]:
# Split BEFORE scaling. Fitting the scaler on the full matrix would let the
# held-out rows contribute to the mean/std used to transform the training
# data (train/test leakage), which biases any later evaluation on X_test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Work on explicit copies so the column assignments below write to
# independent frames; this leaves X unscaled and makes the cell safe to
# re-run. (Assigning into the frames returned by train_test_split can
# otherwise emit pandas' SettingWithCopyWarning.)
X_train = X_train.copy()
X_test = X_test.copy()

# Select numeric columns
numeric_cols = X.select_dtypes(include=[np.number]).columns

# Learn the scaling statistics from the training partition only, then apply
# the same fitted transform to the test partition.
scaler = StandardScaler()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

print("Numeric features scaled ✅")
print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)


Train shape: (2262300, 78)
Test shape: (565576, 78)
