In [None]:
!pip install scikit-learn pandas numpy



In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Load the dataset
url = "https://raw.githubusercontent.com/defcom17/NSL_KDD/master/KDDTrain%2B.csv"
column_names = [
    "duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes", "land", "wrong_fragment", "urgent",
    "hot", "num_failed_logins", "logged_in", "num_compromised", "root_shell", "su_attempted", "num_root", "num_file_creations",
    "num_shells", "num_access_files", "num_outbound_cmds", "is_host_login", "is_guest_login", "count", "srv_count", "serror_rate",
    "srv_serror_rate", "rerror_rate", "srv_rerror_rate", "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate", "dst_host_count",
    "dst_host_srv_count", "dst_host_same_srv_rate", "dst_host_diff_srv_rate", "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate", "dst_host_srv_serror_rate", "dst_host_rerror_rate", "dst_host_srv_rerror_rate", "label"
]

# Load dataset
data = pd.read_csv(url, header=None, names=column_names)

# Encode categorical features
label_encoder = LabelEncoder()
data['protocol_type'] = label_encoder.fit_transform(data['protocol_type'])
data['service'] = label_encoder.fit_transform(data['service'])
data['flag'] = label_encoder.fit_transform(data['flag'])
data['label'] = label_encoder.fit_transform(data['label'])

# Separate features and labels
X = data.drop(columns=['label'])
y = data['label']

# Ensure all columns are numeric
X = X.apply(pd.to_numeric, errors='coerce')  # Convert non-numeric values to NaN
X = X.dropna(axis=1)  # Drop columns with NaN values (if any)

# Normalize features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print("Preprocessing completed!")
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)

Preprocessing completed!
Shape of X_train: (88181, 39)
Shape of X_test: (37792, 39)


In [None]:
import random
import math

class SimulatedAnnealingFeatureSelection:
    def __init__(self, initial_temperature=1000, cooling_rate=0.95, max_iterations=100):
        self.initial_temperature = initial_temperature
        self.cooling_rate = cooling_rate
        self.max_iterations = max_iterations

    def fit(self, X, y):
        n_features = X.shape[1]
        current_solution = np.random.choice([0, 1], size=n_features)  # Binary vector (1 = selected, 0 = not selected)
        current_score = self._evaluate_solution(X, y, current_solution)

        best_solution = current_solution
        best_score = current_score

        temperature = self.initial_temperature

        for iteration in range(self.max_iterations):
            # Generate a neighbor solution
            neighbor_solution = self._get_neighbor(current_solution)
            neighbor_score = self._evaluate_solution(X, y, neighbor_solution)

            # Accept the neighbor solution with a probability
            if neighbor_score > current_score or random.random() < math.exp((neighbor_score - current_score) / temperature):
                current_solution = neighbor_solution
                current_score = neighbor_score

            # Update the best solution
            if current_score > best_score:
                best_solution = current_solution
                best_score = current_score

            # Cool down the temperature
            temperature *= self.cooling_rate

        self.best_solution = best_solution
        self.best_score = best_score

    def _get_neighbor(self, solution):
        # Flip a random bit in the solution to generate a neighbor
        neighbor = solution.copy()
        index = random.randint(0, len(neighbor) - 1)
        neighbor[index] = 1 - neighbor[index]  # Flip 0 to 1 or 1 to 0
        return neighbor

    def _evaluate_solution(self, X, y, solution):
        # Evaluate the solution using a classifier (e.g., Random Forest)
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.metrics import accuracy_score

        selected_features = np.where(solution == 1)[0]
        if len(selected_features) == 0:
            return 0  # No features selected

        model = RandomForestClassifier(n_estimators=10, random_state=42)
        model.fit(X[:, selected_features], y)
        y_pred = model.predict(X[:, selected_features])
        return accuracy_score(y, y_pred)

    def get_selected_features(self):
        return np.where(self.best_solution == 1)[0]

# Run Simulated Annealing
sa = SimulatedAnnealingFeatureSelection(initial_temperature=1000, cooling_rate=0.95, max_iterations=100)
sa.fit(X_train, y_train)
selected_features = sa.get_selected_features()
print("Selected Features:", selected_features)

Selected Features: [ 0  2  3  4  7  8 10 14 16 18 21 22 23 24 26 27 28 29 31 32 33 34 36 37]


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Train on selected features
X_train_selected = X_train[:, selected_features]
X_test_selected = X_test[:, selected_features]

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_selected, y_train)

# Evaluate
y_pred = model.predict(X_test_selected)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.8532758255715496
              precision    recall  f1-score   support

           0       0.90      0.53      0.67        17
           1       0.58      0.58      0.58        19
           2       0.62      0.47      0.53        17
           3       0.69      0.44      0.54        25
           4       0.58      0.54      0.56        28
           5       0.61      0.42      0.50        26
           6       0.34      0.44      0.39        27
           7       0.44      0.37      0.41        43
           8       0.65      0.35      0.46        37
           9       0.62      0.60      0.61        52
          10       0.47      0.47      0.47        79
          11       0.73      0.73      0.73       212
          12       0.64      0.67      0.65       209
          13       0.53      0.42      0.47       130
          14       0.73      0.64      0.68       225
          15       0.69      0.77      0.73      1184
          16       0.57      0.53      0.55       74