In [1]:
# Installing scikit-learn
!pip install scikit-learn



In [2]:
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd

data = pd.read_csv('UNSW_NB15.csv')
data = data.sample(n=10000, random_state=42)
# One hot encodin
# Assuming 'data' is your dataset containing both numerical and categorical columns
# Use the appropriate columns that contain categorical data
categorical_cols = ['proto', 'service','state']  # Replace with your categorical column names

# Perform one-hot encoding for categorical columns
data_encoded = pd.get_dummies(data, columns=categorical_cols)

# Split the encoded data into features and target
features_encoded = data_encoded.drop('attack_cat', axis=1)  # Replace 'target_column' with your target column name
target_encoded = data_encoded['attack_cat']  # Replace 'target_column' with your target column name

# Split the encoded data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features_encoded, target_encoded, test_size=0.2, random_state=42)


In [3]:
# Initialize and train the classifier on the encoded training data
# Initialize the Decision Tree Classifier
clf = DecisionTreeClassifier(random_state=42)

# Train the classifier on the training data
clf.fit(X_train, y_train)

In [4]:
# Predict on the test data
y_pred = clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.8425
Confusion Matrix:
[[  3   0   4  14   3   0   0   0   0   0]
 [  2   1   9   8   3   0   0   2   0   0]
 [  6   3  40  59   5   1   0   4   0   0]
 [  9   5  55 243  11   4   0  12   2   1]
 [  3   2   9  23 143   1   0   4   2   0]
 [  0   0   2   7   2 451   0   0   0   0]
 [  0   0   0   0   0   0 725   0   0   0]
 [  0   2   7  15   8   0   0  78   0   0]
 [  0   0   0   4   1   1   0   0   1   0]
 [  0   0   0   0   0   0   0   0   0   0]]
Classification Report:
                precision    recall  f1-score   support

      Analysis       0.13      0.12      0.13        24
      Backdoor       0.08      0.04      0.05        25
           DoS       0.32      0.34      0.33       118
      Exploits       0.65      0.71      0.68       342
       Fuzzers       0.81      0.76      0.79       187
       Generic       0.98      0.98      0.98       462
        Normal       1.00      1.00      1.00       725
Reconnaissance       0.78      0.71      0.74       110
     S

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
def pso(objective_function, lb, ub, swarmsize=100, maxiter=100, c1=1.5, c2=1.5, w=0.5):
    dimensions = len(lb)
    
    # Initialize particles
    particles = np.random.uniform(lb, ub, (swarmsize, dimensions))
    velocities = np.zeros((swarmsize, dimensions))
    
    # Initialize personal best positions and values
    pbest_positions = particles.copy()
    pbest_values = np.array([objective_function(p) for p in particles])
    
    # Initialize global best position and value
    gbest_index = np.argmin(pbest_values)
    gbest_position = pbest_positions[gbest_index].copy()
    gbest_value = pbest_values[gbest_index]
    
    for _ in range(maxiter):
        for i in range(swarmsize):
            # Update velocity
            r1, r2 = np.random.rand(dimensions), np.random.rand(dimensions)
            velocities[i] = w * velocities[i] + c1 * r1 * (pbest_positions[i] - particles[i]) + c2 * r2 * (gbest_position - particles[i])
            
            # Update particle position
            particles[i] += velocities[i]
            
            # Clip particle position to be within bounds
            particles[i] = np.clip(particles[i], lb, ub)
            
            # Evaluate the objective function
            current_value = objective_function(particles[i])
            
            # Update personal best if needed
            if current_value < pbest_values[i]:
                pbest_values[i] = current_value
                pbest_positions[i] = particles[i].copy()
                
                # Update global best if needed
                if current_value < gbest_value:
                    gbest_value = current_value
                    gbest_position = particles[i].copy()
        
    return gbest_position, gbest_value

In [6]:
from sklearn.model_selection import GridSearchCV

# Define the hyperparameter space for Decision Tree
param_grid = {
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 3, 5, 7, 10],
    'min_samples_leaf': [1, 2, 3, 5, 8],
    'criterion': ['gini', 'entropy']
}

# Initialize the Decision Tree Classifier
clf = DecisionTreeClassifier(random_state=42)

# Grid Search CV to perform the initial search
grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Extract the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

Best Hyperparameters: {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 3, 'min_samples_split': 7}


In [7]:
# Define the objective function for PSO
def objective_function(params):
    # Define hyperparameter names corresponding to their positions in the array
    hyperparameters = {
        'max_depth': int(params[0]),
        'min_samples_split': int(params[1]),
        'min_samples_leaf': int(params[2]),
        'criterion': ['gini', 'entropy'][int(params[3])]
    }
    
    # Create a DecisionTreeClassifier with the specified hyperparameters
    clf = DecisionTreeClassifier(random_state=42, **hyperparameters)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    return -accuracy_score(y_test, y_pred)  # Negative accuracy for minimization

# Define bounds for each hyperparameter
lower_bounds = [1, 2, 1, 0]  # Minimum values for max_depth, min_samples_split, min_samples_leaf, criterion index
upper_bounds = [30, 10, 4, 1]  # Maximum values for max_depth, min_samples_split, min_samples_leaf, criterion index

# Perform PSO for hyperparameter optimization
best_params_pso, _ = pso(objective_function, lower_bounds, upper_bounds, swarmsize=10, maxiter=50)

Stopping search: maximum iterations reached --> 50


In [8]:
# Print optimized parameters
optimal_params = {
    'max_depth': int(best_params_pso[0]),
    'min_samples_split': int(best_params_pso[1]),
    'min_samples_leaf': int(best_params_pso[2]),
    'criterion': ['gini', 'entropy'][int(best_params_pso[3])]
}
print("Optimized Hyperparameters:", optimal_params)

# Train a Decision Tree Classifier with optimized hyperparameters
clf_optimized = DecisionTreeClassifier(random_state=42, **optimal_params)
clf_optimized.fit(X_train, y_train)

# Predict and evaluate with optimized model
y_pred_optimized = clf_optimized.predict(X_test)
accuracy_optimized = accuracy_score(y_test, y_pred_optimized)
print(f"Accuracy with optimized hyperparameters: {accuracy_optimized}")

Optimized Hyperparameters: {'max_depth': 15, 'min_samples_split': 4, 'min_samples_leaf': 4, 'criterion': 'gini'}
Accuracy with optimized hyperparameters: 0.859
