<a href="https://colab.research.google.com/github/shahdcode/Credit-Card-Approval-Prediction/blob/main/Genetic_Feature_Selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [28]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm
from xgboost import XGBClassifier

# Algorithms


## Particle Swarm Optimization (PSO)

In [4]:
# def fitness_function(features, X, y, classifier):
#     """
#     Evaluate the fitness of a particle.

#     Args:
#         features (array): Binary array representing selected features.
#         X (ndarray): Feature matrix.
#         y (ndarray): Target array.
#         classifier: A machine learning model (e.g., DecisionTreeClassifier).

#     Returns:
#         float: The fitness value (lower is better for minimization tasks).
#     """
#     # Select features based on the binary array
#     selected_features = X[:, features == 1]

#     # If no features are selected, return a high fitness value
#     if selected_features.shape[1] == 0:
#         return float('inf')

#     # Perform a simple train-test split (80%-20%)
#     X_train, X_test, y_train, y_test = train_test_split(selected_features, y, test_size=0.2, random_state=42)

#     # Train the classifier
#     classifier.fit(X_train, y_train)

#     # Predict and calculate accuracy
#     y_pred = classifier.predict(X_test)
#     accuracy = accuracy_score(y_test, y_pred)

#     # Return the negative accuracy as fitness (minimization task)
#     return -accuracy



def fitness_function(features, X, y, classifier):
    """
    Evaluate the fitness of a particle (a binary feature mask).

    Args:
        features (array-like): Binary mask; entries equal to 1 mark the
            columns of ``X`` that are kept.
        X (DataFrame or ndarray): Feature matrix with one column per mask entry.
        y (array-like): Target values, one per row of ``X``.
        classifier: Estimator exposing ``fit`` and ``predict``
            (e.g., DecisionTreeClassifier). It is re-fitted on every call.

    Returns:
        float: Negative hold-out accuracy (lower is better, so the swarm can
        minimise it), or ``inf`` when the mask selects no feature.
    """
    mask = np.asarray(features) == 1

    # A mask that keeps nothing cannot be scored; give it the worst fitness.
    if not mask.any():
        return float('inf')

    # Select columns by position so pandas objects and plain arrays both work.
    if hasattr(X, "iloc"):
        selected_features = X.iloc[:, mask]
    else:
        selected_features = np.asarray(X)[:, mask]

    # Fixed 80%-20% split: the same rows are held out for every mask.
    X_train, X_test, y_train, y_test = train_test_split(selected_features, y, test_size=0.2, random_state=42)

    # Train the classifier on the selected columns only
    classifier.fit(X_train, y_train)

    # Predict and calculate accuracy
    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # Return the negative accuracy as fitness (minimization task)
    return -accuracy





In [5]:
def initialize_swarm(n_particles, n_features, bounds, fitness_function, X, y, classifier):
    """
    Initialize the swarm for PSO.

    Args:
        n_particles (int): Number of particles.
        n_features (int): Number of features (length of each mask).
        bounds (tuple): ``(lower, upper)`` inclusive integer bounds for a mask
            entry; ``(0, 1)`` gives binary masks.
        fitness_function (callable): Called as
            ``fitness_function(mask, X, y, classifier)``; lower is better.
        X: Feature matrix, passed through to ``fitness_function``.
        y: Target array, passed through to ``fitness_function``.
        classifier: A machine learning model, passed through to ``fitness_function``.

    Returns:
        positions, velocities, pbest_positions, pbest_fitness, gbest_position, gbest_fitness
    """
    lower_bounds, upper_bounds = bounds

    # Randomly initialize positions (integers in [lower, upper]) and velocities
    positions = np.random.randint(lower_bounds, upper_bounds + 1, size=(n_particles, n_features))
    velocities = np.random.uniform(-1, 1, (n_particles, n_features))

    # Evaluate initial fitness; force float so later in-place updates of
    # pbest_fitness cannot be truncated by an integer dtype.
    fitness = np.array(
        [fitness_function(positions[i], X, y, classifier) for i in range(n_particles)],
        dtype=float,
    )

    # Set personal bests (pbest)
    pbest_positions = positions.copy()
    pbest_fitness = fitness.copy()

    # Set global best (gbest). Copy the row: a plain index would return a view
    # into pbest_positions, and in-place pbest updates would then silently
    # rewrite the global best as well.
    gbest_index = np.argmin(pbest_fitness)
    gbest_position = pbest_positions[gbest_index].copy()
    gbest_fitness = pbest_fitness[gbest_index]

    return positions, velocities, pbest_positions, pbest_fitness, gbest_position, gbest_fitness


In [6]:
# def return_best_solution(gbest_position, gbest_fitness, feature_names):
#     """
#     Return the best solution found by the swarm.

#     Args:
#         gbest_position (array): The best feature mask.
#         gbest_fitness (float): The best fitness value.
#         feature_names (list): List of feature names.

#     Returns:
#         dict: A dictionary containing selected features and their fitness.
#     """
#     selected_features = [feature_names[i] for i in range(len(gbest_position)) if gbest_position[i] == 1]
#     return {
#         "Selected Features": selected_features,
#         "Best Fitness": -gbest_fitness  # Accuracy is the positive value of fitness
#     }

def return_best_solution(gbest_position, gbest_fitness, feature_names):
    """
    Translate the swarm's best mask into a readable result.

    Args:
        gbest_position (array): Best feature mask; entries equal to 1 are kept.
        gbest_fitness (float): Fitness of that mask (negative accuracy).
        feature_names (list): Feature names, indexed like the mask.

    Returns:
        dict: ``"Selected Features"`` (names whose mask entry is 1) and
        ``"Best Fitness"`` (the fitness with its sign flipped back).
    """
    kept_names = []
    for idx, name in enumerate(feature_names):
        if gbest_position[idx] == 1:
            kept_names.append(name)

    # Fitness is stored negated for minimisation; undo that for reporting.
    accuracy = -gbest_fitness

    result = {}
    result["Selected Features"] = kept_names
    result["Best Fitness"] = accuracy
    return result


In [7]:
# Shahd
def update_swarm(positions, velocities, fitness_function, pbest_positions, pbest_fitness, gbest_position, w, c1, c2, bounds, X, y, classifier):
    """
    Advance the swarm by one PSO iteration.

    Args:
        positions (ndarray): Current integer masks, shape (n_particles, n_features).
        velocities (ndarray): Current velocities, same shape; updated in place.
        fitness_function (callable): Called as
            ``fitness_function(mask, X, y, classifier)``; lower is better.
        pbest_positions (ndarray): Personal-best masks; updated in place.
        pbest_fitness (ndarray): Personal-best fitness values; updated in place.
        gbest_position (ndarray): Best mask found so far by any particle.
        w (float): Inertia weight.
        c1 (float): Cognitive (personal-best) coefficient.
        c2 (float): Social (global-best) coefficient.
        bounds (tuple): ``(lower, upper)`` limits applied to every position entry.
        X, y, classifier: Passed through to ``fitness_function``.

    Returns:
        positions, velocities, pbest_positions, pbest_fitness, gbest_position

        The best fitness itself is not returned; callers can read it as
        ``np.min(pbest_fitness)``.
    """
    lower_bounds, upper_bounds = bounds
    n_particles, _ = positions.shape

    # Temporarily cast positions to float64 for update step
    positions = positions.astype(np.float64)

    for i in range(n_particles):
        # Update velocity (one random scalar per term, shared by all dimensions)
        inertia = w * velocities[i]
        cognitive = c1 * np.random.random() * (pbest_positions[i] - positions[i])
        social = c2 * np.random.random() * (gbest_position - positions[i])
        velocities[i] = inertia + cognitive + social

        # Move, then keep the position inside the bounds
        positions[i] = np.clip(positions[i] + velocities[i], lower_bounds, upper_bounds)

    # After update, round back to integer masks
    positions = np.round(positions).astype(np.int64)

    # Evaluate fitness
    fitness = np.array([fitness_function(positions[i], X, y, classifier) for i in range(n_particles)])

    # Remember the best score known BEFORE this step. Comparing against
    # np.min(pbest_fitness) after the pbest update can never succeed, because
    # by then the minimum already includes the new fitness values.
    previous_best = np.min(pbest_fitness)

    # Update pbest
    for i in range(n_particles):
        if fitness[i] < pbest_fitness[i]:
            pbest_fitness[i] = fitness[i]
            pbest_positions[i] = positions[i]

    # Update gbest only on a strict improvement; copy so the result does not
    # alias the positions array.
    gbest_index = np.argmin(fitness)
    if fitness[gbest_index] < previous_best:
        gbest_position = positions[gbest_index].copy()

    return positions, velocities, pbest_positions, pbest_fitness, gbest_position




## Genetic Algorithm Implementation

In [8]:
#Step 1 - Shahd
def encode_solution_space(num_features):
    """Draw a random 0/1 chromosome with one gene per feature."""
    gene_values = [0, 1]
    chromosome = np.random.choice(gene_values, size=num_features)
    return chromosome


In [9]:
#Step 2 - Habiba
def set_algorithm_parameters():
    """Return a fresh dict holding the Genetic Algorithm hyper-parameters."""
    params = dict(
        pop_size=100,
        num_generations=150,
        crossover_rate=0.9,
        mutation_rate=0.02,
    )
    return params

In [10]:
#Step 3 - Shahd
def create_initial_population(pop_size, num_features):
    """Build a list of ``pop_size`` random chromosomes, each ``num_features`` genes long."""
    population = []
    for _ in range(pop_size):
        population.append(encode_solution_space(num_features))
    return population

In [11]:
#Step 4 - Shahd

def measure_fitness(population, X_train, y_train, X_val, y_val, dynamic_k):
    """Score every chromosome by the validation accuracy of a k-NN model.

    Each chromosome is a 0/1 sequence; genes equal to 1 pick the columns (by
    position) used to fit ``KNeighborsClassifier(n_neighbors=dynamic_k)`` on the
    training frame and to predict on the validation frame. A chromosome that
    selects nothing scores 0. Returns one score per chromosome, in order.
    """
    scores = []
    for genes in population:
        active_columns = [position for position, gene in enumerate(genes) if gene == 1]

        if len(active_columns) == 0:
            # Nothing to train on; assign the lowest accuracy.
            scores.append(0)
        else:
            train_subset = X_train.iloc[:, active_columns]
            val_subset = X_val.iloc[:, active_columns]

            # A fresh model per chromosome, using the supplied neighbour count
            knn = KNeighborsClassifier(n_neighbors=dynamic_k)
            knn.fit(train_subset, y_train)
            val_predictions = knn.predict(val_subset)
            scores.append(accuracy_score(y_val, val_predictions))
    return scores


# def measure_fitness(population, X_train, y_train, X_val, y_val, best_k=5):
#     """Evaluates the fitness of each chromosome (feature selection) in the population."""
#     fitness_scores = []
#     for chromosome in population:
#         selected_features = [i for i, bit in enumerate(chromosome) if bit == 1]
#         if not selected_features:
#             fitness_scores.append(0)
#             continue
#         X_train_selected = X_train.iloc[:, selected_features]
#         X_val_selected = X_val.iloc[:, selected_features]

#         # Train the model with the default value for k (e.g., k=5)
#         model = KNeighborsClassifier(n_neighbors=best_k)
#         model.fit(X_train_selected, y_train)
#         predictions = model.predict(X_val_selected)
#         fitness_scores.append(accuracy_score(y_val, predictions))
#     return fitness_scores


In [12]:
#Step 5 - Habiba
# def select_parents(population, fitness_values):
#     """Select parents using Tournament Selection."""
#     parents = []
#     for _ in range(len(population)):
#         # Tournament selection: Randomly pick 3 and select the best
#         candidates_idx = np.random.choice(len(population), 3, replace=False)
#         best_candidate = max(candidates_idx, key=lambda idx: fitness_values[idx])
#         parents.append(population[best_candidate])
#     return np.array(parents)




def select_parents(population, fitness_values):
    """Select parents using Roulette Wheel Selection.

    Args:
        population: Sequence of chromosomes (e.g. a list of 1-D arrays).
        fitness_values: Non-negative fitness per chromosome, in the same order.

    Returns:
        ndarray: ``len(population)`` chromosomes drawn with replacement, each
        with probability proportional to its fitness. When the total fitness
        is not positive the draw is uniform.
    """
    pop_size = len(population)
    if pop_size == 0:
        return np.array([])

    weights = np.asarray(fitness_values, dtype=float)
    total_fitness = weights.sum()

    # With no positive fitness there is nothing to weight by; p=None makes
    # np.random.choice sample uniformly instead of dividing by zero.
    selection_probs = weights / total_fitness if total_fitness > 0 else None

    # Sample indices, not chromosomes: np.random.choice only accepts a 1-D
    # sequence, and the population is a sequence of arrays.
    chosen = np.random.choice(pop_size, size=pop_size, p=selection_probs)
    return np.array([population[i] for i in chosen])


In [13]:
#Step 6 - Habiba
def crossover(parents, crossover_rate):
    """Apply single-point crossover to consecutive pairs of parents.

    Args:
        parents: Sequence of equal-length chromosomes.
        crossover_rate (float): Probability that a pair is recombined; otherwise
            both parents pass through unchanged.

    Returns:
        ndarray: Exactly ``len(parents)`` offspring. With an odd parent count the
        last parent is paired with the first one and the surplus child is
        dropped so the population size stays constant.
    """
    n_parents = len(parents)
    offspring = []
    for i in range(0, n_parents, 2):
        parent1, parent2 = parents[i], parents[(i + 1) % n_parents]
        # A cut point needs at least two genes; randint(1, 1) would raise.
        if len(parent1) > 1 and np.random.rand() < crossover_rate:
            point = np.random.randint(1, len(parent1))
            child1 = np.concatenate([parent1[:point], parent2[point:]])
            child2 = np.concatenate([parent2[:point], parent1[point:]])
            offspring.extend([child1, child2])
        else:
            offspring.extend([parent1, parent2])
    # Pairs are emitted two at a time, so an odd count overshoots by one.
    return np.array(offspring[:n_parents])


In [14]:
#Step 7 - Habiba
def populate_next_generation(offspring, fitness_values, population=None):
    """Replace population with offspring and apply elitism.

    Args:
        offspring: Offspring chromosomes; slot 0 is overwritten in place.
        fitness_values: Fitness of the population the offspring were bred from.
        population: Optional. That previous population, ordered like
            ``fitness_values``. When given, its best chromosome is copied into
            slot 0, so the elite really survives. When omitted, the historical
            behaviour is kept: slot 0 receives the offspring that sits at the
            elite's index, which is generally not the elite itself.

    Returns:
        The same ``offspring`` object, with slot 0 replaced.
    """
    elite_idx = np.argmax(fitness_values)
    next_generation = offspring
    if population is not None:
        # Copy so later in-place mutation of the new generation cannot touch
        # the caller's previous population.
        next_generation[0] = np.copy(population[elite_idx])
    else:
        next_generation[0] = offspring[elite_idx]
    return next_generation

In [15]:
#Step 8 - Habiba
def mutate(offspring, mutation_rate):
    """Flip each gene independently with probability ``mutation_rate``.

    The chromosomes are modified in place and the same ``offspring`` object is
    returned. One random number is drawn per gene, row by row.
    """
    for row in range(len(offspring)):
        chromosome = offspring[row]
        for gene in range(len(chromosome)):
            should_flip = np.random.rand() < mutation_rate
            if should_flip:
                # 0 -> 1 and 1 -> 0
                chromosome[gene] = 1 - chromosome[gene]
    return offspring


In [16]:
#Step 9 - Shahd
# def check_stopping_condition(generation, max_generations, fitness_scores, threshold, no_change_limit, previous_scores):
#     """Checks if the algorithm should stop based on fitness, generations, or stagnation."""
#     if generation >= max_generations or max(fitness_scores) >= threshold:
#         return True
#     if len(previous_scores) >= no_change_limit and all(score == previous_scores[0] for score in previous_scores):
#         return True
#     return False


def check_stopping_condition(generation, max_generations, fitness_scores, threshold, no_change_limit, previous_scores):
    """Decide whether the GA loop should terminate.

    Stops when the generation budget is used up, when the best current fitness
    reaches ``threshold``, or when ``previous_scores`` holds at least
    ``no_change_limit`` entries that are all equal to its first entry.
    """
    budget_exhausted = generation >= max_generations
    if budget_exhausted or max(fitness_scores) >= threshold:
        return True

    # Not enough history yet to call it stagnation.
    if len(previous_scores) < no_change_limit:
        return False

    for past_score in previous_scores:
        if not (past_score == previous_scores[0]):
            return False
    return True


In [17]:
def genetic_algorithm(X_train, y_train, X_val, y_val, num_features):
    """Runs the genetic algorithm for feature selection.

    Args:
        X_train, y_train: Training features and labels.
        X_val, y_val: Validation features and labels used to score chromosomes.
        num_features (int): Chromosome length (number of candidate features).

    Returns:
        tuple: ``(best_solution, best_fitness)`` — the best chromosome that was
        actually evaluated during the run and its validation accuracy.
    """
    params = set_algorithm_parameters()
    params['no_change_limit'] = 10  # Define stagnation limit
    population = create_initial_population(params['pop_size'], num_features)
    previous_scores = []

    # The k search only sees (X_train, y_train), which never change between
    # generations, so run the grid search once instead of once per generation.
    dynamic_k, _ = find_optimal_k(X_train, y_train)

    # Best chromosome seen so far. Tracked explicitly because after the last
    # generation `population` holds offspring that were never scored, so
    # indexing it with the previous generation's scores would be wrong.
    best_solution, best_fitness = None, float('-inf')

    for generation in range(params['num_generations']):
        fitness_scores = measure_fitness(population, X_train, y_train, X_val, y_val, dynamic_k)

        elite_idx = int(np.argmax(fitness_scores))
        elite = np.copy(population[elite_idx])
        if fitness_scores[elite_idx] > best_fitness:
            best_fitness = fitness_scores[elite_idx]
            best_solution = elite.copy()

        if check_stopping_condition(generation, params['num_generations'], fitness_scores, 0.95, params['no_change_limit'], previous_scores):
            break

        # Sliding window of recent best scores for the stagnation check
        previous_scores.append(fitness_scores[elite_idx])
        if len(previous_scores) > params['no_change_limit']:
            previous_scores.pop(0)

        parents = select_parents(population, fitness_scores)
        offspring = crossover(parents, params['crossover_rate'])
        offspring = mutate(offspring, params['mutation_rate'])
        population = populate_next_generation(offspring, fitness_scores)

        # Elitism: carry the real best chromosome of this generation forward,
        # unmutated, so the best score cannot get worse.
        population[0] = elite

    return best_solution, best_fitness  # Return best solution and fitness score



## KNN

In [20]:
def find_optimal_k(X_train, y_train, max_k=30, cv=5):
    """Grid-search the number of neighbours for a k-NN classifier.

    Args:
        X_train: Training features.
        y_train: Training labels.
        max_k (int): Largest ``n_neighbors`` to try (default 30).
        cv (int): Number of cross-validation folds (default 5).

    Returns:
        tuple: ``(best_k, best_cv_score)`` from the fitted ``GridSearchCV``.
    """
    from sklearn.model_selection import GridSearchCV

    # k-NN cannot use more neighbours than there are rows in a training fold.
    # NOTE(review): assumes `cv` is an integer and folds are near-equal in size.
    smallest_train_fold = (len(X_train) * (cv - 1)) // cv
    largest_k = max(1, min(max_k, smallest_train_fold))

    param_grid = {'n_neighbors': list(range(1, largest_k + 1))}
    grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=cv)
    grid_search.fit(X_train, y_train)
    return grid_search.best_params_['n_neighbors'], grid_search.best_score_


In [21]:
def train_knn_and_display_accuracy(X_train, y_train, X_val, y_val, X_test, y_test, best_k):
    """Fit a k-NN classifier with ``best_k`` neighbours and print its accuracy.

    The model is trained on the training split, then scored on the validation
    and test splits; both accuracies are printed. Nothing is returned.
    """
    classifier = KNeighborsClassifier(n_neighbors=best_k)
    classifier.fit(X_train, y_train)

    # Predict both held-out splits (validation first, then test)
    val_predictions, test_predictions = (
        classifier.predict(split) for split in (X_val, X_test)
    )

    val_accuracy = accuracy_score(y_val, val_predictions)
    test_accuracy = accuracy_score(y_test, test_predictions)

    print(f"Validation Accuracy: {val_accuracy}")
    print(f"Test Accuracy: {test_accuracy}")

In [22]:
from sklearn.model_selection import train_test_split

def separate_features_targets(train_df, val_df, test_df):
    """
    Split three DataFrames into feature frames and target series.

    The target is the 'status_mapped' column; every other column is a feature.
    The input frames are not modified.

    Parameters:
    - train_df: DataFrame for training data
    - val_df: DataFrame for validation data
    - test_df: DataFrame for test data

    Returns:
    - X_train, X_val, X_test, y_train, y_val, y_test (in that order)
    """
    target_column = 'status_mapped'
    splits = (train_df, val_df, test_df)

    feature_frames = tuple(frame.drop(columns=[target_column]) for frame in splits)
    target_series = tuple(frame[target_column] for frame in splits)

    # Features first, then targets, each in train/val/test order
    return (*feature_frames, *target_series)



In [81]:
# Load the scaled train/validation/test splits from Google Drive, dropping the
# 'ID' column from each one as it is read.
train_scaled_df = pd.read_csv('/content/drive/MyDrive/Ai project/train_scaled.csv').drop(columns=['ID'])
val_scaled = pd.read_csv('/content/drive/MyDrive/Ai project/validation_scaled.csv').drop(columns=['ID'])
test_scaled = pd.read_csv('/content/drive/MyDrive/Ai project/test_scaled.csv').drop(columns=['ID'])

In [82]:
# Check the column names of the scaled datasets
# Show the column index of every split to confirm they share one schema
for _caption, _frame in (
    ("Train DataFrame Columns:", train_scaled_df),
    ("Validation DataFrame Columns:", val_scaled),
    ("Test DataFrame Columns:", test_scaled),
):
    print(_caption, _frame.columns)



Train DataFrame Columns: Index(['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN',
       'AMT_INCOME_TOTAL', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
       'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'FLAG_MOBIL',
       'FLAG_WORK_PHONE', 'FLAG_PHONE', 'FLAG_EMAIL', 'OCCUPATION_TYPE',
       'CNT_FAM_MEMBERS', 'AGE_YEARS', 'YEARS_EMPLOYED', 'status_mapped'],
      dtype='object')
Validation DataFrame Columns: Index(['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN',
       'AMT_INCOME_TOTAL', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
       'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'FLAG_MOBIL',
       'FLAG_WORK_PHONE', 'FLAG_PHONE', 'FLAG_EMAIL', 'OCCUPATION_TYPE',
       'CNT_FAM_MEMBERS', 'AGE_YEARS', 'YEARS_EMPLOYED', 'status_mapped'],
      dtype='object')
Test DataFrame Columns: Index(['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN',
       'AMT_INCOME_TOTAL', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
       'NAME_FAMILY_STAT

In [24]:
# Call the split_data function
# Split each scaled frame into features and target
X_train, X_val, X_test, y_train, y_val, y_test = separate_features_targets(train_scaled_df, val_scaled, test_scaled)

# Run PSO for feature selection
classifier = KNeighborsClassifier()
n_particles = 30
n_features = X_train.shape[1]
bounds = (0, 1)  # Binary feature selection

positions, velocities, pbest_positions, pbest_fitness, gbest_position, gbest_fitness = initialize_swarm(
    n_particles, n_features, bounds, fitness_function, X_train, y_train, classifier
)

# Track the PSO process using tqdm
for _ in tqdm(range(100), desc="PSO Iterations"):  # Max iterations
    positions, velocities, pbest_positions, pbest_fitness, gbest_position = update_swarm(
        positions, velocities, fitness_function, pbest_positions, pbest_fitness, gbest_position,
        w=0.5, c1=1.5, c2=1.5, bounds=bounds, X=X_train, y=y_train, classifier=classifier
    )

# update_swarm does not return the best fitness, so gbest_fitness would still
# hold the value from initialisation. Read the final best mask and its score
# from the personal-best arrays, which are kept up to date on every iteration.
best_particle = np.argmin(pbest_fitness)
gbest_position = pbest_positions[best_particle].copy()
gbest_fitness = pbest_fitness[best_particle]

pso_result = return_best_solution(gbest_position, gbest_fitness, X_train.columns)
selected_features_pso = pso_result["Selected Features"]
print("PSO Best Features:", selected_features_pso)
print("PSO Best Fitness (Accuracy):", pso_result["Best Fitness"])

# Filter training, validation, and test sets by PSO-selected features
X_train_selected_pso = X_train[selected_features_pso]
X_val_selected_pso = X_val[selected_features_pso]
X_test_selected_pso = X_test[selected_features_pso]

# Train and evaluate KNN with PSO-selected features
best_k, best_score = find_optimal_k(X_train_selected_pso, y_train)
print(f"Optimal k (PSO): {best_k} with cross-validation accuracy: {best_score}")

knn_model_pso = KNeighborsClassifier(n_neighbors=best_k)
knn_model_pso.fit(X_train_selected_pso, y_train)
y_pred_pso = knn_model_pso.predict(X_val_selected_pso)
accuracy_pso = accuracy_score(y_val, y_pred_pso)
print(f"Accuracy of KNN with PSO-selected features and k={best_k}: {accuracy_pso}")

NameError: name 'train_scaled_df' is not defined

In [None]:
# Separate features and targets
# Features / targets for each split
X_train, X_val, X_test, y_train, y_val, y_test = separate_features_targets(train_scaled_df, val_scaled, test_scaled)

# Genetic Algorithm feature selection (k is chosen inside the GA)
ga_best_solution, ga_best_fitness = genetic_algorithm(
    X_train, y_train, X_val, y_val, num_features=X_train.shape[1]
)
print("GA Best Solution:", ga_best_solution)

# Map the 1-bits of the winning chromosome back to column names
ga_selected_columns = [
    X_train.columns[position]
    for position, gene in enumerate(ga_best_solution)
    if gene == 1
]
print("GA Selected Columns:", ga_selected_columns)

# Keep only the GA-selected columns, as plain numpy arrays
X_train_selected_ga = X_train[ga_selected_columns].values
X_val_selected_ga = X_val[ga_selected_columns].values
X_test_selected_ga = X_test[ga_selected_columns].values

print("GA Selected Features:", ga_selected_columns)
print("GA Best Fitness (Accuracy):", ga_best_fitness)

# Re-tune k on the reduced feature set
best_k, best_score = find_optimal_k(X_train_selected_ga, y_train)
print(f"Optimal k (GA): {best_k} with cross-validation accuracy: {best_score}")

# Final fit plus validation/test accuracy report
train_knn_and_display_accuracy(
    X_train_selected_ga, y_train,
    X_val_selected_ga, y_val,
    X_test_selected_ga, y_test,
    best_k,
)

In [None]:
# Mount Google Drive at /content/drive so the '/content/drive/MyDrive/...'
# CSV paths read elsewhere in this notebook resolve.
# NOTE(review): on a fresh runtime this presumably has to run before any cell
# that reads those files — confirm the cell order.
from google.colab import drive
drive.mount('/content/drive')

## Random Forest

In [None]:
from tqdm import tqdm
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV

# Function to find the optimal number of estimators for Random Forest
def find_optimal_estimators(X_train, y_train):
    """Grid-search ``n_estimators`` (10, 20, ..., 200) for a Random Forest.

    Uses 5-fold cross-validation with ``random_state=42`` forests and returns
    ``(best_n_estimators, best_cv_score)``.
    """
    candidate_counts = list(range(10, 201, 10))
    forest = RandomForestClassifier(random_state=42)
    search = GridSearchCV(forest, {'n_estimators': candidate_counts}, cv=5)
    search.fit(X_train, y_train)
    best_count = search.best_params_['n_estimators']
    return best_count, search.best_score_

# Split each scaled frame into features and target
X_train, X_val, X_test, y_train, y_val, y_test = separate_features_targets(train_scaled_df,val_scaled,test_scaled)

# PSO for feature selection
classifier = RandomForestClassifier(random_state=42)
n_particles = 30
n_features = X_train.shape[1]
bounds = (0, 1)  # Binary feature selection

positions, velocities, pbest_positions, pbest_fitness, gbest_position, gbest_fitness = initialize_swarm(
    n_particles, n_features, bounds, fitness_function, X_train, y_train, classifier
)

# Track the PSO process using tqdm
for _ in tqdm(range(100), desc="PSO Iterations"):
    positions, velocities, pbest_positions, pbest_fitness, gbest_position = update_swarm(
        positions, velocities, fitness_function, pbest_positions, pbest_fitness, gbest_position,
        w=0.5, c1=1.5, c2=1.5, bounds=bounds, X=X_train, y=y_train, classifier=classifier
    )

# update_swarm does not return the best fitness, so gbest_fitness would still
# hold the value from initialisation. Read the final best mask and its score
# from the personal-best arrays, which are kept up to date on every iteration.
best_particle = np.argmin(pbest_fitness)
gbest_position = pbest_positions[best_particle].copy()
gbest_fitness = pbest_fitness[best_particle]

# The mask indexes X_train's columns, so take the names from X_train itself
# rather than from a separately loaded frame.
pso_result = return_best_solution(gbest_position, gbest_fitness, X_train.columns)
selected_features_pso = pso_result["Selected Features"]
print("PSO Best Features:", selected_features_pso)
print("PSO Best Fitness (Accuracy):", pso_result["Best Fitness"])

# Filter training, validation, and test sets by PSO-selected features.
# The splits are DataFrames, so select by column name; numpy-style
# `X[:, mask]` indexing is not valid on a DataFrame.
X_train_selected_pso = X_train[selected_features_pso]
X_val_selected_pso = X_val[selected_features_pso]
X_test_selected_pso = X_test[selected_features_pso]

# Train and evaluate Random Forest with PSO-selected features
best_estimators, best_score = find_optimal_estimators(X_train_selected_pso, y_train)
print(f"Optimal number of estimators (PSO): {best_estimators} with cross-validation accuracy: {best_score}")

rf_model_pso = RandomForestClassifier(n_estimators=best_estimators, random_state=42)
rf_model_pso.fit(X_train_selected_pso, y_train)
y_pred_pso = rf_model_pso.predict(X_val_selected_pso)
accuracy_pso = accuracy_score(y_val, y_pred_pso)
print(f"Accuracy of Random Forest with PSO-selected features: {accuracy_pso}")

# Genetic Algorithm (GA) for feature selection
# Placeholder genetic_algorithm function
# ga_best_solution, ga_best_fitness = genetic_algorithm(X_train, y_train, X_val, y_val, num_features=X_train.shape[1])
# ga_selected_features = [df.drop(columns=['status_mapped']).columns[i] for i, bit in enumerate(ga_best_solution) if bit == 1]
# print("GA Best Features:", ga_selected_features)
# print("GA Best Fitness (Accuracy):", ga_best_fitness)

# # Filter training, validation, and test sets by GA-selected features
# X_train_selected_ga = X_train[:, ga_best_solution.astype(bool)]
# X_val_selected_ga = X_val[:, ga_best_solution.astype(bool)]
# X_test_selected_ga = X_test[:, ga_best_solution.astype(bool)]

# # Train and evaluate Random Forest with GA-selected features
# best_estimators, best_score = find_optimal_estimators(X_train_selected_ga, y_train)
# print(f"Optimal number of estimators (GA): {best_estimators} with cross-validation accuracy: {best_score}")

# rf_model_ga = RandomForestClassifier(n_estimators=best_estimators, random_state=42)
# rf_model_ga.fit(X_train_selected_ga, y_train)
# y_pred_ga = rf_model_ga.predict(X_val_selected_ga)
# accuracy_ga = accuracy_score(y_val, y_pred_ga)
# print(f"Accuracy of Random Forest with GA-selected features: {accuracy_ga}")

# # Save the datasets as CSV
# train_df = pd.concat([pd.DataFrame(X_train), pd.Series(y_train, name='status_mapped')], axis=1)
# val_df = pd.concat([pd.DataFrame(X_val), pd.Series(y_val, name='status_mapped')], axis=1)
# test_df = pd.concat([pd.DataFrame(X_test), pd.Series(y_test, name='status_mapped')], axis=1)

# train_df.to_csv('/content/drive/MyDrive/Ai project/train_rf.csv', index=False)
# val_df.to_csv('/content/drive/MyDrive/Ai project/validation_rf.csv', index=False)
# test_df.to_csv('/content/drive/MyDrive/Ai project/test_rf.csv', index=False)

# print("Datasets saved successfully:")
# print(f"Train set: {train_df.shape}")
# print(f"Validation set: {val_df.shape}")
# print(f"Test set: {test_df.shape}")


In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from tqdm import tqdm

# Function to find the optimal C and kernel for SVM
def find_optimal_svm(X_train, y_train):
    """Grid-search ``C``, ``kernel`` and ``gamma`` for an SVC with 5-fold CV.

    Returns ``(best_params, best_cv_score)`` where ``best_params`` is the
    parameter dict chosen by ``GridSearchCV``.
    """
    search_space = dict(
        C=[0.1, 1, 10],
        kernel=['linear', 'rbf', 'poly'],
        gamma=['scale', 'auto'],
    )
    search = GridSearchCV(SVC(), search_space, cv=5)
    search.fit(X_train, y_train)
    return search.best_params_, search.best_score_

# Load the full engineered dataset.
# NOTE(review): df, X and y are not used by the PSO/SVM steps below, which work
# on the pre-split scaled frames; kept because other cells may read them.
df = pd.read_csv('/content/drive/MyDrive/Ai project/Feature_eng_updated.csv')

# Define features and target
X = df.drop(columns=['status_mapped']).values  # Features as numpy array
y = df['status_mapped'].values                # Target (label)

# Split each scaled frame into features and target
X_train, X_val, X_test, y_train, y_val, y_test = separate_features_targets(train_scaled_df,val_scaled,test_scaled)

# Run PSO for feature selection
classifier = SVC()
n_particles = 30
n_features = X_train.shape[1]
bounds = (0, 1)  # Binary feature selection

positions, velocities, pbest_positions, pbest_fitness, gbest_position, gbest_fitness = initialize_swarm(
    n_particles, n_features, bounds, fitness_function, X_train, y_train, classifier
)

# Track the PSO process using tqdm
for _ in tqdm(range(100), desc="PSO Iterations"):  # Max iterations
    positions, velocities, pbest_positions, pbest_fitness, gbest_position = update_swarm(
        positions, velocities, fitness_function, pbest_positions, pbest_fitness, gbest_position,
        w=0.5, c1=1.5, c2=1.5, bounds=bounds, X=X_train, y=y_train, classifier=classifier
    )

# update_swarm does not return the best fitness, so gbest_fitness would still
# hold the value from initialisation. Read the final best mask and its score
# from the personal-best arrays, which are kept up to date on every iteration.
best_particle = np.argmin(pbest_fitness)
gbest_position = pbest_positions[best_particle].copy()
gbest_fitness = pbest_fitness[best_particle]

# The mask indexes X_train's columns, so the names must come from X_train;
# df's column set is not guaranteed to match it.
pso_result = return_best_solution(gbest_position, gbest_fitness, X_train.columns)
selected_features_pso = pso_result["Selected Features"]
print("PSO Best Features:", selected_features_pso)
print("PSO Best Fitness (Accuracy):", pso_result["Best Fitness"])

# Filter training, validation, and test sets by PSO-selected features.
# The splits are DataFrames, so select by column name; numpy-style
# `X[:, mask]` indexing is not valid on a DataFrame.
X_train_selected_pso = X_train[selected_features_pso]
X_val_selected_pso = X_val[selected_features_pso]
X_test_selected_pso = X_test[selected_features_pso]

# Find optimal parameters for SVM after PSO feature selection
best_params, best_score = find_optimal_svm(X_train_selected_pso, y_train)
print(f"Optimal parameters (PSO): {best_params} with cross-validation accuracy: {best_score}")

# Train and evaluate SVM with PSO-selected features
svm_model_pso = SVC(**best_params)
svm_model_pso.fit(X_train_selected_pso, y_train)
y_pred_pso = svm_model_pso.predict(X_val_selected_pso)
accuracy_pso = accuracy_score(y_val, y_pred_pso)
print(f"Accuracy of SVM with PSO-selected features: {accuracy_pso}")

# # Run Genetic Algorithm for feature selection (without passing best_params)
# ga_best_solution, ga_best_fitness = genetic_algorithm(X_train, y_train, X_val, y_val, num_features=X_train.shape[1])
# ga_selected_features = [df.drop(columns=['status_mapped']).columns[i] for i, bit in enumerate(ga_best_solution) if bit == 1]
# print("GA Best Features:", ga_selected_features)
# print("GA Best Fitness (Accuracy):", ga_best_fitness)

# # Filter training, validation, and test sets by GA-selected features
# X_train_selected_ga = X_train[:, ga_best_solution.astype(bool)]
# X_val_selected_ga = X_val[:, ga_best_solution.astype(bool)]
# X_test_selected_ga = X_test[:, ga_best_solution.astype(bool)]

# # Find optimal parameters for SVM after GA feature selection
# best_params, best_score = find_optimal_svm(X_train_selected_ga, y_train)
# print(f"Optimal parameters (GA): {best_params} with cross-validation accuracy: {best_score}")

# # Train and evaluate SVM with GA-selected features
# svm_model_ga = SVC(**best_params)
# svm_model_ga.fit(X_train_selected_ga, y_train)
# y_pred_ga = svm_model_ga.predict(X_val_selected_ga)
# accuracy_ga = accuracy_score(y_val, y_pred_ga)
# print(f"Accuracy of SVM with GA-selected features: {accuracy_ga}")

# # Save the splits as CSV files (if needed, combine X and y for each split)
# train_df = pd.concat([X_train, y_train], axis=1)
# val_df = pd.concat([X_val, y_val], axis=1)
# test_df = pd.concat([X_test, y_test], axis=1)

# train_df.to_csv('/content/drive/MyDrive/Ai project/train_svm.csv', index=False)
# val_df.to_csv('/content/drive/MyDrive/Ai project/validation_svm.csv', index=False)
# test_df.to_csv('/content/drive/MyDrive/Ai project/test_svm.csv', index=False)

# print("Datasets saved successfully:")
# print(f"Train set: {train_df.shape}")
# print(f"Validation set: {val_df.shape}")
# print(f"Test set: {test_df.shape}")


# Decision Trees

In [30]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from tqdm import tqdm
# Mount Google Drive at /content/drive so the '/content/drive/MyDrive/...'
# CSV paths read by the following cells resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [33]:
# Load the scaled train/validation/test splits from Google Drive, dropping the
# 'ID' column from each one as it is read.
train_scaled_df = pd.read_csv('/content/drive/MyDrive/Ai project/train_scaled.csv').drop(columns=['ID'])
val_scaled = pd.read_csv('/content/drive/MyDrive/Ai project/validation_scaled.csv').drop(columns=['ID'])
test_scaled = pd.read_csv('/content/drive/MyDrive/Ai project/test_scaled.csv').drop(columns=['ID'])

In [31]:
# Decision Tree trainer: grid-searches hyperparameters on whatever feature subset
# the caller passes in (feature selection itself happens outside this function).

def train_decision_tree(X_train, y_train, X_val, y_val, X_test, y_test, param_grid):
    """
    Tune a DecisionTreeClassifier with a 5-fold grid search and report accuracy.

    The grid search is fitted on the training split only; the validation and
    test splits are used purely for reporting.

    Args:
        X_train, y_train: Training features and labels used for the grid search.
        X_val, y_val: Validation features and labels (scored, not fitted on).
        X_test, y_test: Test features and labels (scored, not fitted on).
        param_grid (dict): Hyperparameter grid forwarded to GridSearchCV.

    Returns:
        dict: ``best_model`` (the estimator GridSearchCV kept as
        ``best_estimator_``), ``best_params``, and the ``train_accuracy``,
        ``val_accuracy`` and ``test_accuracy`` floats, so callers can reuse the
        tuned model and compare runs instead of re-reading printed output.
    """
    dt_model = DecisionTreeClassifier(random_state=42)
    grid_search = GridSearchCV(estimator=dt_model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)

    best_params = grid_search.best_params_
    best_model = grid_search.best_estimator_

    # Score the single tuned model on every split
    train_score = accuracy_score(y_train, best_model.predict(X_train))
    val_score = accuracy_score(y_val, best_model.predict(X_val))
    test_score = accuracy_score(y_test, best_model.predict(X_test))

    print(f"Best Parameters (Decision Tree): {best_params}")
    print(f"Training Accuracy: {train_score}")
    print(f"Validation Accuracy: {val_score}")
    print(f"Testing Accuracy: {test_score}")

    # Previously nothing was returned, so the tuned model and its metrics were lost
    return {
        "best_model": best_model,
        "best_params": best_params,
        "train_accuracy": train_score,
        "val_accuracy": val_score,
        "test_accuracy": test_score,
    }

In [34]:
# Feature selection using Genetic Algorithm
# Split each frame into its feature matrix and target vector
(X_train, X_val, X_test,
 y_train, y_val, y_test) = separate_features_targets(train_scaled_df, val_scaled, test_scaled)

# Search for the best feature mask (one entry per feature column)
ga_best_solution, ga_best_fitness = genetic_algorithm(
    X_train, y_train, X_val, y_val, num_features=X_train.shape[1]
)
print("GA Best Solution:", ga_best_solution)

# Translate the mask into column names: an entry equal to 1 keeps the column
ga_selected_columns = [
    column for column, bit in zip(X_train.columns, ga_best_solution) if bit == 1
]
print("GA Selected Columns:", ga_selected_columns)

# Restrict every split to the chosen columns and hand them on as numpy arrays
X_train_selected_ga = X_train[ga_selected_columns].values
X_val_selected_ga = X_val[ga_selected_columns].values
X_test_selected_ga = X_test[ga_selected_columns].values

# Final summary of the GA run
print("GA Selected Features:", ga_selected_columns)
print("GA Best Fitness (Accuracy):", ga_best_fitness)



KeyboardInterrupt: 

In [101]:
# Feature selection using Particle Swarm Optimization
# NOTE(review): n_particles, n_features and bounds are not defined in this cell;
# they must already exist in the kernel — confirm an earlier cell sets them so
# the notebook survives Restart & Run All.

# Seed the tree (as train_decision_tree does) so scoring the same particle twice
# gives the same fitness; an unseeded tree can break split ties differently per fit.
classifier_dt = DecisionTreeClassifier(random_state=42)
positions, velocities, pbest_positions, pbest_fitness, gbest_position, gbest_fitness = initialize_swarm(
    n_particles, n_features, bounds, fitness_function, X_train, y_train, classifier_dt
)

for _ in tqdm(range(100), desc="PSO Iterations for Decision Tree"):
    positions, velocities, pbest_positions, pbest_fitness, gbest_position = update_swarm(
        positions, velocities, fitness_function, pbest_positions, pbest_fitness, gbest_position,
        w=0.5, c1=1.5, c2=1.5, bounds=bounds, X=X_train, y=y_train, classifier=classifier_dt
    )

# update_swarm hands back the global-best position but not its fitness, so
# gbest_fitness would still hold the value from initialization. Re-score the
# final position so the reported fitness belongs to the reported solution.
gbest_fitness = fitness_function(gbest_position, X_train, y_train, classifier_dt)

pso_result = return_best_solution(gbest_position, gbest_fitness, X_train.columns)
selected_features_pso = pso_result["Selected Features"]

# Keep only the PSO-selected columns in every split
X_train_selected_pso = X_train[selected_features_pso]
X_val_selected_pso = X_val[selected_features_pso]
X_test_selected_pso = X_test[selected_features_pso]

PSO Iterations for Decision Tree: 100%|██████████| 100/100 [01:27<00:00,  1.15it/s]


In [94]:
# Hyperparameter search space explored by the Decision Tree grid search
param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=[3, 5, 7, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 3, 5],
    max_features=[None, 'sqrt', 'log2'],
)

# Tune and score a tree on the GA-selected feature subset
print("Training Decision Tree with GA-selected features:")
train_decision_tree(
    X_train=X_train_selected_ga, y_train=y_train,
    X_val=X_val_selected_ga, y_val=y_val,
    X_test=X_test_selected_ga, y_test=y_test,
    param_grid=param_grid,
)

# Same procedure on the PSO-selected feature subset
print("\nTraining Decision Tree with PSO-selected features:")
train_decision_tree(
    X_train=X_train_selected_pso, y_train=y_train,
    X_val=X_val_selected_pso, y_val=y_val,
    X_test=X_test_selected_pso, y_test=y_test,
    param_grid=param_grid,
)

Training Decision Tree with GA-selected features:
Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Best Parameters (Decision Tree): {'criterion': 'entropy', 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 3, 'min_samples_split': 2}
Training Accuracy: 0.9499600390729065
Validation Accuracy: 0.9498549523414836
Testing Accuracy: 0.949658172778123

Training Decision Tree with PSO-selected features:
Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Best Parameters (Decision Tree): {'criterion': 'gini', 'max_depth': 3, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
Training Accuracy: 0.9498712370126987
Validation Accuracy: 0.9498549523414836
Testing Accuracy: 0.9498653407913819


# MLP

In [44]:
import tensorflow as tf

def train_mlp(X_train, y_train, X_val, y_val, X_test, y_test,
              epochs=10, batch_size=32, model_path='mlp.keras'):
    """
    Train a two-hidden-layer Keras MLP, save it, reload it and score it on the test split.

    Args:
        X_train, X_val, X_test: Feature matrices; each is passed through
            ``tf.keras.utils.normalize(..., axis=1)`` before use.
        y_train, y_val, y_test: Class labels. The output layer gets one unit per
            distinct value in ``y_train``; labels are assumed to be integer class
            ids compatible with sparse categorical cross-entropy — TODO confirm.
        epochs (int): Number of training epochs (default 10, the previous fixed value).
        batch_size (int): Mini-batch size (default 32, the previous fixed value).
        model_path (str): File the trained model is saved to and reloaded from.

    Returns:
        dict: ``model`` (the reloaded model), ``history`` (the object returned by
        ``model.fit``), ``test_loss`` and ``test_accuracy``.
    """
    # Normalize the data
    X_train = tf.keras.utils.normalize(X_train, axis=1)
    X_val = tf.keras.utils.normalize(X_val, axis=1)
    X_test = tf.keras.utils.normalize(X_test, axis=1)

    n_classes = len(np.unique(y_train))

    # Define the MLP model. The input shape is declared with an explicit Input
    # object; passing input_shape= to the first Dense layer makes Keras emit a
    # warning when the Sequential model is built.
    model = tf.keras.models.Sequential([
        tf.keras.Input(shape=(X_train.shape[1],)),
        tf.keras.layers.Dense(128, activation='sigmoid'),       # First hidden layer
        tf.keras.layers.Dense(64, activation='sigmoid'),        # Second hidden layer
        tf.keras.layers.Dense(n_classes, activation='softmax'), # Output layer (one unit per class)
    ])

    # Compile the model
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    # Train the model
    history = model.fit(X_train, y_train, validation_data=(X_val, y_val),
                        epochs=epochs, batch_size=batch_size)

    # Save the model, then reload it so the evaluation below runs on the saved artifact
    model.save(model_path)
    model = tf.keras.models.load_model(model_path)

    # Evaluate the model
    loss, accuracy = model.evaluate(X_test, y_test)

    print(f"Test Accuracy: {accuracy:.2f}")
    print(f"Test Loss: {loss:.2f}")

    # Previously nothing was returned, so the model and training history were lost
    return {
        "model": model,
        "history": history,
        "test_loss": loss,
        "test_accuracy": accuracy,
    }



In [39]:
# Feature selection using Genetic Algorithms for MLP

# Split each frame into its feature matrix and target vector
(X_train, X_val, X_test,
 y_train, y_val, y_test) = separate_features_targets(train_scaled_df, val_scaled, test_scaled)

# Search for the best feature mask (one entry per feature column)
ga_best_solution, ga_best_fitness = genetic_algorithm(
    X_train, y_train, X_val, y_val, num_features=X_train.shape[1]
)
print("GA Best Solution:", ga_best_solution)

# Translate the mask into column names: an entry equal to 1 keeps the column
ga_selected_columns = [
    column for column, bit in zip(X_train.columns, ga_best_solution) if bit == 1
]
print("GA Selected Columns:", ga_selected_columns)

# Restrict every split to the chosen columns and hand them on as numpy arrays
X_train_selected_ga = X_train[ga_selected_columns].values
X_val_selected_ga = X_val[ga_selected_columns].values
X_test_selected_ga = X_test[ga_selected_columns].values

# Final summary of the GA run
print("GA Selected Features:", ga_selected_columns)
print("GA Best Fitness (Accuracy):", ga_best_fitness)

GA Best Solution: [0 0 0 1 0 0 1 1 1 0 0 0 1 0 1 1 0]
GA Selected Columns: ['CNT_CHILDREN', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'FLAG_EMAIL', 'CNT_FAM_MEMBERS', 'AGE_YEARS']
GA Selected Features: ['CNT_CHILDREN', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'FLAG_EMAIL', 'CNT_FAM_MEMBERS', 'AGE_YEARS']
GA Best Fitness (Accuracy): 0.9500621632822213


In [47]:
# Import necessary libraries
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from tqdm import tqdm
import numpy as np

def initialize_swarm(n_particles, n_features, bounds, fitness_function, X, y, classifier):
    """
    Create a random swarm and score it once to seed the personal and global bests.

    The best particle is the one with the HIGHEST fitness (argmax).

    Args:
        n_particles (int): Number of particles; must be at least 1.
        n_features (int): Length of each particle's position vector.
        bounds (tuple): ``(low, high)`` range positions are drawn from uniformly.
        fitness_function (callable): Called as
            ``fitness_function(position, X, y, classifier)`` for every particle.
        X, y, classifier: Passed through to ``fitness_function`` unchanged.

    Returns:
        tuple: ``(positions, velocities, pbest_positions, pbest_fitness,
        gbest_position, gbest_fitness)``. ``pbest_positions`` and
        ``gbest_position`` are independent copies, so updating ``positions`` in
        place later cannot alter the recorded bests.

    Raises:
        ValueError: If ``n_particles`` is smaller than 1.
    """
    if n_particles < 1:
        raise ValueError("n_particles must be at least 1")

    # Initialize positions within the bounds and velocities in [-1, 1)
    positions = np.random.uniform(bounds[0], bounds[1], (n_particles, n_features))
    velocities = np.random.uniform(-1, 1, (n_particles, n_features))

    # Evaluate initial fitness, one call per particle
    fitness = np.array([fitness_function(particle, X, y, classifier) for particle in positions])

    # Set personal bests (pbest): every particle starts at its own best
    pbest_positions = positions.copy()
    pbest_fitness = fitness.copy()

    # Set global best (gbest). Integer indexing returns a view into `positions`,
    # so copy the row; otherwise an in-place position update would overwrite it.
    best_idx = int(np.argmax(fitness))
    gbest_position = positions[best_idx].copy()
    gbest_fitness = fitness[best_idx]

    return positions, velocities, pbest_positions, pbest_fitness, gbest_position, gbest_fitness

# Split each frame into features and targets via separate_features_targets
(X_train, X_val, X_test,
 y_train, y_val, y_test) = separate_features_targets(train_scaled_df, val_scaled, test_scaled)

# PSO search configuration
bounds = (0, 1)                # range each position component is drawn from
n_features = X_train.shape[1]  # one position component per feature column
n_particles = 10               # kept small to limit the number of fitness evaluations

# Classifier handed to the fitness function: one hidden layer of 30 units
classifier = MLPClassifier(
    max_iter=300,
    hidden_layer_sizes=(30,),
    random_state=42,
)

# def fitness_function(position, X, y, classifier):
#     # Select features based on the position
#     selected_features = [i for i, bit in enumerate(position) if bit > 0.5]
#     if len(selected_features) == 0:
#         return 0

#     X_selected = X.iloc[:, selected_features]

#     # Train the classifier
#     classifier.fit(X_selected, y)

#     # Perform cross-validation
#     scores = cross_val_score(classifier, X_selected, y, cv=3, n_jobs=-1)  # Enable parallel processing and reduce CV folds

#     # Return the mean accuracy
#     return scores.mean()

# Initialize swarm
positions, velocities, pbest_positions, pbest_fitness, gbest_position, gbest_fitness = initialize_swarm(
    n_particles, n_features, bounds, fitness_function, X_train, y_train, classifier
)

# Track the PSO process using tqdm
for _ in tqdm(range(30), desc="PSO Iterations"):  # Further reduce the number of iterations
    positions, velocities, pbest_positions, pbest_fitness, gbest_position = update_swarm(
        positions, velocities, fitness_function, pbest_positions, pbest_fitness, gbest_position,
        w=0.5, c1=1.5, c2=1.5, bounds=bounds, X=X_train, y=y_train, classifier=classifier
    )

# update_swarm hands back the global-best position but not its fitness, so
# gbest_fitness would still hold the value from initialization. Re-score the
# final position so the fitness printed below belongs to the reported solution.
gbest_fitness = fitness_function(gbest_position, X_train, y_train, classifier)

# Retrieve the best solution
pso_result = return_best_solution(gbest_position, gbest_fitness, X_train.columns)
selected_features_pso = pso_result["Selected Features"]
print("PSO Best Features:", selected_features_pso)
print("PSO Best Fitness (Accuracy):", pso_result["Best Fitness"])

# Filter training, validation, and test sets by PSO-selected features
X_train_selected_pso = X_train[selected_features_pso]
X_val_selected_pso = X_val[selected_features_pso]
X_test_selected_pso = X_test[selected_features_pso]

# Train and evaluate MLP with PSO-selected features
mlp_model_pso = MLPClassifier(hidden_layer_sizes=(30,), max_iter=300, random_state=42)
mlp_model_pso.fit(X_train_selected_pso, y_train)

# The cell previously ended in a dangling `y` expression and never evaluated
# the model; score it on the validation split.
y_pred_pso = mlp_model_pso.predict(X_val_selected_pso)
accuracy_pso = accuracy_score(y_val, y_pred_pso)
print(f"Accuracy of MLP with PSO-selected features: {accuracy_pso}")

PSO Iterations:   3%|▎         | 1/30 [04:16<2:04:03, 256.67s/it]


KeyboardInterrupt: 

In [45]:
# Fit and score the Keras MLP on the GA-selected feature subset
print("Training MLP with GA-selected features:")
train_mlp(
    X_train=X_train_selected_ga,
    y_train=y_train,
    X_val=X_val_selected_ga,
    y_val=y_val,
    X_test=X_test_selected_ga,
    y_test=y_test,
)

# # Train MLP with PSO-selected features
# print("\nTraining MLP with PSO-selected features:")
# train_mlp(X_train_selected_pso, y_train, X_val_selected_pso, y_val, X_test_selected_pso, y_test)


Training MLP with GA-selected features:
Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.9501 - loss: 0.2133 - val_accuracy: 0.9499 - val_loss: 0.1991
Epoch 2/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.9494 - loss: 0.2017 - val_accuracy: 0.9499 - val_loss: 0.1991
Epoch 3/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.9509 - loss: 0.1970 - val_accuracy: 0.9499 - val_loss: 0.1986
Epoch 4/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9511 - loss: 0.1965 - val_accuracy: 0.9499 - val_loss: 0.1989
Epoch 5/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.9505 - loss: 0.1976 - val_accuracy: 0.9499 - val_loss: 0.1986
Epoch 6/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.9495 - loss: 0.2010 - val_accuracy: 0.9499 - val_loss: 0.1985
Epoch 7/10
[1m704/704[0m [32m━━━━━━━