IMPORTANT NOTE: The following code and explanations were generated with the ChatGPT o1-preview model.

Step 2: Import Necessary Libraries

In your Jupyter Notebook, start by importing the libraries we'll need.

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
# Add any other imports you may have


Explanation:
    pandas: For data manipulation and analysis.
    numpy: For numerical computations.
    os: For interacting with the operating system.
    matplotlib.pyplot and seaborn: For data visualization.
    sklearn: For machine learning algorithms and evaluation metrics.

Step 3: Load the CSV Files into DataFrames
3.1 Get a List of CSV Files


We'll use the glob and os libraries to get all CSV files in the dataset folder.

Explanation:
    glob.glob() retrieves all files matching the pattern.
    This code lists all CSV files we will load.

3.2 Load the CSV Files

Now, we'll load each CSV file into a DataFrame and store them in a list.

Explanation:
    We read each CSV file and append it to the dataframes list.
    encoding='utf-8' is used when reading; adjust it (for example to 'latin1') if a file contains bytes that are not valid UTF-8.

Step 4: Combine the DataFrames

We'll concatenate all DataFrames into one large DataFrame.

Explanation:
    pd.concat() combines all DataFrames vertically.
    ignore_index=True resets the index.

In [None]:
import glob
import os
import pandas as pd

# Path to your dataset folder
data_path = 'dataset/'

# Get a list of all CSV files in the dataset folder.
# glob.glob() returns paths in a filesystem-dependent order, so sort them:
# the row order of the combined DataFrame (and therefore any seeded
# train/test split made from it later) is then the same on every machine.
csv_files = sorted(glob.glob(os.path.join(data_path, '*.csv')))

# Fail early with a clear message instead of letting pd.concat() raise
# an opaque error on an empty list.
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in {data_path!r}")

dataframes = []

for file in csv_files:
    df = pd.read_csv(file, encoding='utf-8')  # Adjust encoding if necessary
    # Strip whitespace from column names so the same column matches across files
    df.columns = df.columns.str.strip()
    dataframes.append(df)
    print(f"Loaded {file} with shape {df.shape}")

# Combine all DataFrames vertically; ignore_index=True renumbers the rows 0..n-1
data = pd.concat(dataframes, ignore_index=True)
print(f"Combined DataFrame shape: {data.shape}")

In [3]:
# Clean the 'Label' column in one chained pass:
#   1. swap the unidentified character '�' for a hyphen '-' (literal match, no regex)
#   2. trim any leading/trailing whitespace
data['Label'] = (
    data['Label']
    .str.replace('�', '-', regex=False)
    .str.strip()
)


Step 5: Explore the Data
    5.1 View the First Few Rows

Explanation:
    Displays the first five rows of the DataFrame.

In [None]:
# Preview the first rows of the combined DataFrame; as the last expression
# in the cell, the result is rendered as the cell's output.
data.head()

5.2 Get DataFrame Information
Explanation:
    Shows data types and counts of non-null values.

In [None]:
# Print a structural summary of the combined DataFrame (columns and dtypes).
data.info()

5.3 Check for Missing Values
Explanation:
    Identifies columns with missing values.

In [None]:
# Count missing (NaN/None) entries per column and show the totals
missing_values = data.isna().sum()
print(missing_values)

Step 6: Preprocess the Data
6.1 Handle Missing Values
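    The dataset is largely complete, so no columns are dropped and no values are imputed at this stage; infinite and missing values are handled after the train/test split (see "Step 2: Handling Infinite and Missing Values" below).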
    Skip Step 6.1

6.2 Convert Non-Numeric Columns to Numeric

Identify any non-numeric columns that need to be converted.
Possible non-numeric columns could be 'Label' or others.

In [None]:
# Collect the names of all object-dtype (non-numeric) columns in the raw data
non_numeric_cols = data.select_dtypes(include='object').columns
print("Non-numeric columns:", non_numeric_cols)

6.3 Encode Categorical Variables
    6.3.1 Encode the Target Variable

Assuming the target variable is 'Label'.

Explanation:
    This shows the different classes in the target variable.

In [None]:
# List the distinct class names present in the Label column
print(pd.unique(data['Label']))

1.1 Encode Labels for Binary Classification

In this step, we'll map all attack types to `Attack` and benign traffic to `Benign`. We'll then encode these labels into numerical values.

The Label Values are as follows: 
    ['BENIGN' 'Infiltration' 'Bot' 'PortScan' 'DDoS' 'FTP-Patator',
     'SSH-Patator' 'DoS slowloris' 'DoS Slowhttptest' 'DoS Hulk', 
     'DoS GoldenEye' 'Heartbleed' 'Web Attack - Brute Force',
     'Web Attack - XSS' 'Web Attack - Sql Injection']

Explanation:
    We create a new column `Label_binary` where all attack types are mapped to `Attack` and benign traffic to `Benign`.
    `LabelEncoder` is used to convert these categorical labels into numerical values (0 and 1).
    We print the label mapping to verify that `Attack` and `Benign` are correctly encoded.

In [9]:
# Collapse the multi-class label into two classes: rows labelled exactly
# 'BENIGN' become 'Benign', every other value becomes 'Attack'.
data['Label_binary'] = np.where(data['Label'] == 'BENIGN', 'Benign', 'Attack')

from sklearn.preprocessing import LabelEncoder

# Turn the two class names into integer codes
le = LabelEncoder()
data['Label_encoded'] = le.fit_transform(data['Label_binary'])

# Show which integer code was assigned to which class name
label_mapping = {
    class_name: code
    for class_name, code in zip(le.classes_, le.transform(le.classes_))
}
print("Label Encoding Mapping:")
for label in label_mapping:
    print(f"{label}: {label_mapping[label]}")

1.2 Prepare Features (X) and Target (y)

We'll separate our dataset into features and target variables.

Explanation:
    `X` contains all the features used for training, excluding the original and encoded labels.
    `y` is our target variable containing the encoded labels (`0` for **Attack**, `1` for **Benign**).

In [11]:
# Target variable: the integer-encoded binary label
y = data['Label_encoded']

# Feature matrix: everything except the original, binary and encoded label
# columns (errors='ignore' tolerates any of them being absent)
X = data.drop(
    columns=['Label', 'Label_binary', 'Label_encoded'],
    errors='ignore',
)

1.3 Handle Non-Numeric Features

We need to ensure all features are numeric.

Explanation:
    We check for any non-numeric columns in `X`.
    If non-numeric columns are found, we apply one-hot encoding to convert them into numeric format.

In [None]:
# Find any object-dtype (non-numeric) feature columns
non_numeric_cols = X.select_dtypes(include='object').columns
print("Non-numeric columns:", list(non_numeric_cols))

# One-hot encode them, but only when there is something to encode
if not non_numeric_cols.empty:
    X = pd.get_dummies(X, columns=non_numeric_cols)

1.4 Split Data into Training and Testing Sets

We'll split our data into training and testing sets to evaluate model performance.

Explanation:
    We use `train_test_split` to split the data.
    `test_size=0.2` reserves 20% of the data for testing.
    `stratify=y` ensures the class distribution is consistent in both training and testing sets.
    `random_state=42` ensures reproducibility.

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

Step 2: Handling Infinite and Missing Values

2.1 Replace Infinite Values and Handle Missing Data

We need to replace infinite values and handle any missing data in our features.

Explanation:
    Infinite values are replaced with `NaN` to handle them appropriately.
    We check for missing values in `X_train` and `X_test`.
    `SimpleImputer` is used to fill missing values with the mean of each feature, calculated from the training set.

In [None]:
import numpy as np
from sklearn.impute import SimpleImputer

# Replace infinite values with NaN so the imputer can treat them as missing.
# Assign the result instead of using inplace=True: the split outputs are
# derived from X, and mutating them in place is what pandas'
# SettingWithCopyWarning guards against.
X_train = X_train.replace([np.inf, -np.inf], np.nan)
X_test = X_test.replace([np.inf, -np.inf], np.nan)

# Check for NaN values
print("Checking for NaN values in X_train and X_test:")
print(f"X_train contains NaN values: {X_train.isnull().values.any()}")
print(f"X_test contains NaN values: {X_test.isnull().values.any()}")

# Impute missing values with the per-column mean
imputer = SimpleImputer(strategy='mean')

# Fit on X_train only, then transform both sets, so no test-set statistics
# leak into training. The imputer returns a bare NumPy array; pass the
# original index back in so X_train/X_test stay row-aligned with
# y_train/y_test (which keep their original index labels).
X_train = pd.DataFrame(
    imputer.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test = pd.DataFrame(
    imputer.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

Step 3: Feature Scaling
3.1 Apply StandardScaler

We scale the features to normalize the data.

Explanation:
    `StandardScaler` standardizes features by removing the mean and scaling to unit variance.
    We fit the scaler on `X_train` and transform both `X_train` and `X_test` to prevent data leakage.

In [15]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean and standard deviation from the training data only
scaler = StandardScaler().fit(X_train)

# Apply the same fitted transformation to both the training and the test set
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

Step 4: Addressing Class Imbalance
4.1 Oversample Minority Class using SMOTE

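We address class imbalance by oversampling the minority class ('Attack') using **SMOTE** (Synthetic Minority Oversampling Technique). For comparison, the cells below first train a baseline Random Forest and a Random Forest with `class_weight='balanced'` before the SMOTE model.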

Explanation:
    `SMOTE` generates synthetic samples of the minority class to balance the dataset.
    `sampling_strategy='auto'` balances all classes to the number of samples in the majority class.
    We apply `SMOTE` only to the training data to avoid data leakage.
    We print class distributions before and after resampling to verify the changes.

In [None]:
# Baseline Model: RandomForestClassifier without class weights
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Fit a plain Random Forest (no class weighting) on the scaled training data
baseline_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_scaled, y_train
)

# Predict labels for the held-out test set
y_pred_baseline = baseline_model.predict(X_test_scaled)

# Report accuracy, the per-class report and the confusion matrix
accuracy = accuracy_score(y_test, y_pred_baseline)
print(f"Baseline Model Accuracy: {accuracy:.4f}")

print(
    "Baseline Model Classification Report:",
    classification_report(y_test, y_pred_baseline, target_names=['Attack', 'Benign']),
    sep="\n",
)

print(
    "Baseline Model Confusion Matrix:",
    confusion_matrix(y_test, y_pred_baseline),
    sep="\n",
)

In [None]:
# Random Forest with class_weight='balanced'
model_class_weight = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
).fit(X_train_scaled, y_train)

# Predict labels for the held-out test set
y_pred_class_weight = model_class_weight.predict(X_test_scaled)

# Report accuracy, the per-class report and the confusion matrix
accuracy = accuracy_score(y_test, y_pred_class_weight)
print(f"Model with Class Weight Accuracy: {accuracy:.4f}")

print(
    "Model with Class Weight Classification Report:",
    classification_report(y_test, y_pred_class_weight, target_names=['Attack', 'Benign']),
    sep="\n",
)

print(
    "Model with Class Weight Confusion Matrix:",
    confusion_matrix(y_test, y_pred_class_weight),
    sep="\n",
)

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample with SMOTE on the (scaled) training data only; the test set is
# left untouched.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

# Show the class counts before and after resampling
print("After SMOTE oversampling:")
print(f"Original y_train distribution: {np.bincount(y_train)}")
print(f"Resampled y_train distribution: {np.bincount(y_train_resampled)}")

# Fit a Random Forest on the resampled training data
model_smote = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_resampled, y_train_resampled
)

# Predict labels for the original test set
y_pred_smote = model_smote.predict(X_test_scaled)

# Report accuracy, the per-class report and the confusion matrix
accuracy = accuracy_score(y_test, y_pred_smote)
print(f"Model with SMOTE Accuracy: {accuracy:.4f}")

print(
    "Model with SMOTE Classification Report:",
    classification_report(y_test, y_pred_smote, target_names=['Attack', 'Benign']),
    sep="\n",
)

print(
    "Model with SMOTE Confusion Matrix:",
    confusion_matrix(y_test, y_pred_smote),
    sep="\n",
)

In [None]:
from sklearn.linear_model import LogisticRegression

# Class-weighted logistic regression, fitted on the scaled training data
model_logreg = LogisticRegression(
    max_iter=1000,
    class_weight='balanced',
    random_state=42,
).fit(X_train_scaled, y_train)

# Predict labels for the held-out test set
y_pred_logreg = model_logreg.predict(X_test_scaled)

# Report accuracy, the per-class report and the confusion matrix
accuracy = accuracy_score(y_test, y_pred_logreg)
print(f"Logistic Regression Accuracy: {accuracy:.4f}")

print(
    "Logistic Regression Classification Report:",
    classification_report(y_test, y_pred_logreg, target_names=['Attack', 'Benign']),
    sep="\n",
)

print(
    "Logistic Regression Confusion Matrix:",
    confusion_matrix(y_test, y_pred_logreg),
    sep="\n",
)

In [None]:
# Import the XGBoost classifier
from xgboost import XGBClassifier

# Calculate the scale_pos_weight parameter
from collections import Counter
counter = Counter(y_train)
# XGBoost's recommended value is sum(negative instances) / sum(positive instances),
# where the positive class is label 1. In this notebook label 0 is 'Attack'
# and label 1 is 'Benign', so the ratio is count(0) / count(1). The previous
# count(1) / count(0) was inverted: it multiplied the weight of the 'Benign'
# rows by however many times they already outnumber the 'Attack' rows.
ratio = counter[0] / counter[1]
print(f"Scale_pos_weight ratio: {ratio}")

# Initialize and train the model
model_xgb = XGBClassifier(scale_pos_weight=ratio, use_label_encoder=False, eval_metric='logloss', random_state=42)
model_xgb.fit(X_train_scaled, y_train)

# Make predictions
y_pred_xgb = model_xgb.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred_xgb)
print(f"XGBoost Classifier Accuracy: {accuracy:.4f}")

print("XGBoost Classifier Classification Report:")
print(classification_report(y_test, y_pred_xgb, target_names=['Attack', 'Benign']))

print("XGBoost Classifier Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_xgb))


In [None]:
# Baseline model's predicted probability for label 1 ('Benign') on the test set
y_probs = baseline_model.predict_proba(X_test_scaled)[:, 1]

# Decision thresholds to compare
thresholds = [0.3, 0.5, 0.7]

for thresh in thresholds:
    # A row is predicted as label 1 when its probability reaches the threshold
    y_pred_thresh = (y_probs >= thresh).astype(int)

    # Report accuracy, the per-class report and the confusion matrix
    accuracy = accuracy_score(y_test, y_pred_thresh)
    print(f"Threshold {thresh} - Accuracy: {accuracy:.4f}")
    print(
        f"Threshold {thresh} - Classification Report:",
        classification_report(y_test, y_pred_thresh, target_names=['Attack', 'Benign']),
        sep="\n",
    )
    print(
        f"Threshold {thresh} - Confusion Matrix:",
        confusion_matrix(y_test, y_pred_thresh),
        sep="\n",
    )
    print("-" * 50)

In [None]:
from sklearn.model_selection import cross_val_score, StratifiedKFold

# Five shuffled, stratified folds with a fixed seed
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Score the baseline estimator with F1 on each fold of the scaled training set
cv_scores = cross_val_score(
    estimator=baseline_model,
    X=X_train_scaled,
    y=y_train,
    scoring='f1',
    cv=skf,
)

print(f"Cross-Validation F1 Scores: {cv_scores}")
print(f"Mean F1 Score: {cv_scores.mean():.4f}")

In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Get prediction probabilities
y_probs_baseline = baseline_model.predict_proba(X_test_scaled)[:, 1]  # Probability of 'Benign'

# Calculate ROC curve and AUC.
# The threshold array returned by roc_curve gets its own name so it does not
# overwrite the notebook-level `thresholds` list used by the threshold cell.
fpr, tpr, roc_thresholds = roc_curve(y_test, y_probs_baseline, pos_label=1)
roc_auc = auc(fpr, tpr)

# Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, color='blue', lw=2, label=f'Baseline Model ROC curve (area = {roc_auc:.4f})')
# Diagonal reference line (equal true-positive and false-positive rates)
plt.plot([0, 1], [0, 1], color='grey', lw=1, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - Baseline Model')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Create a DataFrame to store metrics
import pandas as pd

def _model_metrics(name, y_true, y_pred, y_prob):
    """Build one row of the model-comparison table.

    Args:
        name: Display name of the model.
        y_true: True encoded labels.
        y_pred: Predicted encoded labels.
        y_prob: Predicted probability of label 1, used for the AUC.

    Returns:
        dict with keys 'Model', 'Accuracy', 'Precision', 'Recall',
        'F1-Score' and 'AUC'.

    Precision, recall and F1 use scikit-learn's default positive label (1),
    which is 'Benign' under this notebook's label encoding.
    """
    return {
        'Model': name,
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred),
        'Recall': recall_score(y_true, y_pred),
        'F1-Score': f1_score(y_true, y_pred),
        'AUC': roc_auc_score(y_true, y_prob),
    }


# Probability of label 1 for every model. The baseline probabilities are
# computed here as well, so this cell does not depend on the ROC plotting
# cell having been run first.
y_probs_baseline = baseline_model.predict_proba(X_test_scaled)[:, 1]
y_probs_class_weight = model_class_weight.predict_proba(X_test_scaled)[:, 1]
y_probs_smote = model_smote.predict_proba(X_test_scaled)[:, 1]
y_probs_logreg = model_logreg.predict_proba(X_test_scaled)[:, 1]
y_probs_xgb = model_xgb.predict_proba(X_test_scaled)[:, 1]

# (display name, predicted labels, predicted probabilities) per model,
# in the order the rows appear in the table
model_outputs = [
    ('Baseline RandomForest', y_pred_baseline, y_probs_baseline),
    ('RandomForest with Class Weight', y_pred_class_weight, y_probs_class_weight),
    ('RandomForest with SMOTE', y_pred_smote, y_probs_smote),
    ('Logistic Regression', y_pred_logreg, y_probs_logreg),
    ('XGBoost Classifier', y_pred_xgb, y_probs_xgb),
]

metrics = [
    _model_metrics(model_name, y_test, predicted, probabilities)
    for model_name, predicted, probabilities in model_outputs
]

# Create DataFrame
metrics_df = pd.DataFrame(metrics)
print(metrics_df)

In [None]:
# Plotting the metrics
import seaborn as sns

# Reshape to long format: one row per (Model, Metric) pair with its Value
metrics_melted = pd.melt(
    metrics_df,
    id_vars='Model',
    var_name='Metric',
    value_name='Value',
)

# Grouped bar chart: one group per metric, one bar per model
plt.figure(figsize=(12, 6))
sns.barplot(x='Metric', y='Value', hue='Model', data=metrics_melted)
plt.ylabel('Score')
plt.title('Model Comparison')
plt.show()
