In [1]:
import pandas as pd

# Load the dataset. (Uploaded on my github repo)
df = pd.read_csv('https://raw.githubusercontent.com/shubhamhgnis91/nsl-kdd-ga/refs/heads/main/dataset.csv')

# display the first few rows
df.head()


Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack,level
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,20
1,0,udp,other,SF,146,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15
2,0,tcp,private,S0,0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19
3,0,tcp,http,SF,232,8153,0,0,0,0,...,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21
4,0,tcp,http,SF,199,420,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21


In [2]:
# make new column for attack. 0 for normal, 1 for attack
isAttack = df['attack'] != 'normal'
isAttack = isAttack.astype(int)
df['attackFlag'] = isAttack

df.head()


Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack,level,attackFlag
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,20,0
1,0,udp,other,SF,146,0,0,0,0,0,...,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15,0
2,0,tcp,private,S0,0,0,0,0,0,0,...,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19,1
3,0,tcp,http,SF,232,8153,0,0,0,0,...,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21,0
4,0,tcp,http,SF,199,420,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21,0


In [3]:
from sklearn.calibration import LabelEncoder

# encode categorical features using one-hot encoding
featuresToEncode = ['protocol_type', 'service', 'flag']
df = pd.get_dummies(df, columns=featuresToEncode)

# encode attack column into numerical values
labelEncoder = LabelEncoder()
attackEncoder = LabelEncoder()
df['attack'] = attackEncoder.fit_transform(df["attack"])


In [4]:
from sklearn.model_selection import train_test_split

# drop the columns which are related to the target variable
X = df.drop(['attackFlag','level','attack'], axis=1)
y = df['attackFlag']

# split data into train and test sets
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)


In [5]:
import numpy as np
from random import randint, random, sample

# parameters for the genetic algorithm
populationSize = 20
numGenerations = 50
crossoverRate = 0.8
mutationRate = 0.1
useElitism = True

In [6]:
# number of features to select for features
numFeatures = train_X.shape[1]

# initialize population with random feature selections (1 = selected, 0 = not selected)
population = np.random.randint(2, size=(populationSize, numFeatures))

In [7]:
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier

# fitness function to evaluate a subset of features. 
def fitnessFunction(individual):

    # use only the selected features
    selectedFeatures = []

    for i in range(numFeatures):
        if individual[i] == 1:
            selectedFeatures.append(i)

    if not selectedFeatures:  # if no features selected, assign 0 fitness
        return 0

    # train a DecisionTreeClassifier with the selected features
    model = DecisionTreeClassifier()
    model.fit(train_X.iloc[:, selectedFeatures], train_y)
    predictions = model.predict(test_X.iloc[:, selectedFeatures])

    # use F1 score as the fitness metric for the selected features
    return f1_score(test_y, predictions)

# GA operations: selection, crossover, mutation, elitism
def selectParents(population, fitness):

    # tournament selection with 3 individuals
    parents = []
    for _ in range(2):
        tournament = sample(range(len(population)), k=3)

        tournamentFitness = []
        for i in tournament:
            tournamentFitness.append(fitness[i])

        bestIndividual = tournament[np.argmax(tournamentFitness)]
        
        parents.append(population[bestIndividual])
    return parents[0], parents[1]

# single-point crossover
def crossover(parent1, parent2):
    if random() < crossoverRate:
        point = randint(1, numFeatures - 1)
        child1 = np.concatenate([parent1[:point], parent2[point:]])
        child2 = np.concatenate([parent2[:point], parent1[point:]])
        return child1, child2
    return parent1, parent2

# mutation by flipping a bit
def mutate(individual):
    for i in range(numFeatures):
        if random() < mutationRate:
            individual[i] = 1 - individual[i]  # Flip bit
    return individual


In [8]:
# run GA
for generation in range(numGenerations):

    # evaluate fitness of each individual in the population
    fitness = []
    for individual in population:
        fitnessValue = fitnessFunction(individual)
        fitness.append(fitnessValue)

    fitness = np.array(fitness)
    
    # elitism: keep the best individual
    if useElitism:
        best_index = np.argmax(fitness)
        best_individual = population[best_index].copy()
        best_fitness = fitness[best_index]
    
    # generate new population
    new_population = []
    while len(new_population) < populationSize:

        # parent selection
        parent1, parent2 = selectParents(population, fitness)
        
        # crossover
        child1, child2 = crossover(parent1, parent2)
        
        # mutation
        child1 = mutate(child1)
        child2 = mutate(child2)
        
        new_population.extend([child1, child2])
    
    # ensure population size remains constant
    new_population = new_population[:populationSize]
    population = np.array(new_population)
    
    # replace worst individual with the best (elitism)
    if useElitism:        
        fitnessValues = []
        for ind in population:
            fitnessValue = fitnessFunction(ind)
            fitnessValues.append(fitnessValue)

        worst_index = np.argmin(fitnessValues)

        population[worst_index] = best_individual

    # print best fitness in each generation
    print(f"Generation {generation + 1}, Best F1 score: {best_fitness}")

Generation 1, Best F1 score: 0.9940006315124723
Generation 2, Best F1 score: 0.9940645523829592
Generation 3, Best F1 score: 0.9946959851066072
Generation 4, Best F1 score: 0.9946264882520284


In [None]:
# After GA, select the best features
final_fitness = []
for individual in population:
    fitness_value = fitnessFunction(individual)
    final_fitness.append(fitness_value)

final_fitness = np.array(final_fitness)

best_solution = population[np.argmax(final_fitness)]

selected_features = []
for i, bit in enumerate(best_solution):
    if bit == 1:
        selected_features.append(i)


print("Selected Feature Indices:", selected_features)

Selected Feature Indices: [0, 1, 2, 3, 8, 9, 11, 12, 13, 17, 18, 21, 22, 24, 25, 26, 29, 30, 31, 32, 34, 38, 41, 42, 44, 45, 46, 47, 48, 49, 50, 53, 55, 56, 57, 61, 62, 63, 64, 67, 70, 73, 78, 79, 84, 85, 87, 89, 91, 92, 93, 94, 96, 99, 100, 101, 103, 107, 108, 112, 114, 117, 118]
