In [None]:
!unzip -q MachineLearningCSV.zip -d cic-ids2017
!ls cic-ids2017

In [None]:
import pandas as pd
import os

data_dir = 'cic-ids2017/MachineLearningCVE'

# Keep only the CSV exports: os.listdir() also returns anything else living in
# the folder (e.g. a `.ipynb_checkpoints` directory), which would make
# pd.read_csv fail or pull unrelated data into the frame.
files = sorted(f for f in os.listdir(data_dir) if f.lower().endswith('.csv'))
if not files:
    # Fail with a clear message instead of pd.concat's generic error on an empty list.
    raise FileNotFoundError(f"No CSV files found in '{data_dir}'")

df_list = []
for fname in files:
    path = os.path.join(data_dir, fname)
    df = pd.read_csv(path)
    # Down sample data here if running into kernel crash issues:
    # frac=1 keeps every row and only shuffles them (seeded with 42).
    df_sample = df.sample(frac=1, random_state=42)
    df_list.append(df_sample)

# Stack the per-file frames into one table with a fresh 0..n-1 index.
full_df = pd.concat(df_list, ignore_index=True)
print('Shape after no down-sampling', full_df.shape)

In [None]:
import pandas as pd 
import numpy as np

# Normalise the frame in place: strip whitespace around column names and turn
# +/-inf into NaN so that infinities are imputed together with missing values.
full_df.columns = full_df.columns.str.strip()
full_df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Separate the target column from the feature columns.
y = full_df["Label"]
X = full_df.drop(columns="Label")

# Downcast 64-bit columns to reduce memory: float64 columns first, then int64.
for wide_dtype, downcast_kind in (("float64", "float"), ("int64", "integer")):
    for name in X.select_dtypes(include=[wide_dtype]).columns:
        X[name] = pd.to_numeric(X[name], downcast=downcast_kind)

# Replace each column's NaNs with that column's own median.
# NOTE(review): the medians are taken over all rows, i.e. before the train/test
# split performed in a later cell, so test rows influence the fill values —
# confirm this is acceptable.
for name in X.columns:
    X[name] = X[name].fillna(X[name].median())

# Rebuild full_df from the cleaned features and re-attach the labels.
full_df = X.copy()
full_df["Label"] = y

print("Post‐impute label distribution:")
print(full_df["Label"].value_counts())


In [None]:
import numpy as np

# Vectorised cleaning pass: strip column names, map +/-inf to NaN, then fill
# NaNs with per-column medians in a single DataFrame-wide call.
# NOTE(review): this repeats the steps of the preceding cell; when both cells
# run in order this one looks redundant — confirm and consider keeping one.
full_df.columns = full_df.columns.str.strip()
full_df.replace([np.inf, -np.inf], np.nan, inplace=True)

y = full_df['Label']
X = full_df.drop(columns='Label')

# Compute the medians once, then use them as the per-column fill values.
col_medians = X.median()
X_imputed = X.fillna(col_medians)

# New frame: imputed features plus the label column appended last.
full_df = X_imputed.assign(Label=y)

print('Post-impute label distribution:')
print(full_df['Label'].value_counts())

In [None]:
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# Map the string labels to integer ids, then expand the ids to one-hot rows.
le = LabelEncoder()
y_int = le.fit_transform(full_df['Label'])
y_cat = to_categorical(y_int)

# Persist the fitted encoder to 'label_encoder.pkl'.
import joblib
joblib.dump(le, 'label_encoder.pkl')

# Feature matrix: every column except the label, cast to float32.
X = full_df.drop(columns='Label').astype('float32')

# One stratified 80/20 split applied to the features, the one-hot labels and
# the integer labels together, so all three stay row-aligned.
split_parts = train_test_split(
    X, y_cat, y_int,
    test_size=0.2,
    stratify=y_int,
    random_state=42,
)
X_train, X_test, y_train_cat, y_test_cat, y_train_int, y_test_int = split_parts

print("Train:", X_train.shape, "Test:", X_test.shape)
print("Class counts (train):")
train_class_counts = pd.Series(y_train_int).value_counts()
print(train_class_counts)


In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the features: the scaler is fitted on the training split and the
# same fitted scaler is then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # fit + transform on train
X_test_scaled = scaler.transform(X_test)        # transform only on test

# Persist the fitted scaler to 'scaler.pkl'.
import joblib
joblib.dump(scaler, 'scaler.pkl')

print('Scaling complete.')

In [None]:
import tensorflow as tf
from keras import layers, models, callbacks

# Seed the Python, NumPy and TensorFlow RNGs before any layer is created so
# weight initialisation is repeatable across runs, in line with the fixed
# random_state=42 already used for sampling and the train/test split.
tf.keras.utils.set_random_seed(42)

# Network width is derived from the data: one input per scaled feature and
# one output unit per one-hot label column.
input_dim = X_train_scaled.shape[1]
num_classes = y_train_cat.shape[1]

# Fully connected classifier: three ReLU hidden layers (128 -> 64 -> 32), each
# followed by dropout, ending in a softmax over the classes.
model = models.Sequential([
    layers.Input(shape=(input_dim,)),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(32, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(num_classes, activation='softmax')
])

In [None]:
# Training callbacks, built up front since they do not depend on compilation.
# early_stop: watches val_loss with a patience of 3 and restore_best_weights=True.
early_stop = callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)
# ckpt: writes 'best_ids_mc_model.h5' with save_best_only=True.
# NOTE(review): the checkpoint uses an .h5 file while the last cell saves the
# final model as .keras — confirm the mix of formats is intended.
ckpt = callbacks.ModelCheckpoint('best_ids_mc_model.h5', save_best_only=True)

# Adam optimiser with categorical cross-entropy on the one-hot labels;
# accuracy is tracked as the reported metric.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train for up to 25 epochs in batches of 64, with validation_split=0.2 of the
# training arrays used for validation.
history = model.fit(
    x=X_train_scaled,
    y=y_train_cat,
    batch_size=64,
    epochs=25,
    validation_split=0.2,
    callbacks=[early_stop, ckpt],
)


In [None]:
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt

# Test-set metrics as a {name: value} dict. Asking Keras for the dict directly
# avoids zipping against `model.metrics_names`, which on Keras 3 is
# ['loss', 'compile_metrics'] and would therefore mislabel the accuracy value.
test_metrics = model.evaluate(X_test_scaled, y_test_cat, return_dict=True)
print(test_metrics)

# Class probabilities -> predicted class id (arg-max per row).
y_pred_prob = model.predict(X_test_scaled)
y_pred_int  = y_pred_prob.argmax(axis=1)
y_true_int  = y_test_int

# Reload the encoder saved earlier to map class ids back to label names.
le = joblib.load('label_encoder.pkl')
y_pred_labels = le.inverse_transform(y_pred_int)
y_true_labels = le.inverse_transform(y_true_int)

# Pin the label set to every class the encoder knows. Without `labels=`,
# scikit-learn uses only the ids that occur in y_true/y_pred, so a class missing
# from both would shrink the matrix (misaligning the tick labels) and make
# classification_report raise because target_names no longer matches.
class_ids = list(range(len(le.classes_)))

cm = confusion_matrix(y_true_int, y_pred_int, labels=class_ids)
plt.figure(figsize=(8,6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=le.classes_, yticklabels=le.classes_)
plt.xlabel('Predicted'); plt.ylabel('True')
plt.title('Confusion Matrix (Multi-class)')
plt.show()

# Per-class precision / recall / F1, labelled with the original class names.
print(classification_report(y_true_int, y_pred_int,
      labels=class_ids, target_names=le.classes_))


In [None]:
# Save `model` to 'trainMC100-25.keras' in the current working directory.
model.save('trainMC100-25.keras')