<a href="https://colab.research.google.com/github/shahdcode/Credit-Card-Approval-Prediction/blob/main/Genetic_Feature_Selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [91]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

# Algrotihms


## Particle Swarm Optimization (PSO)

In [101]:
# def fitness_function(features, X, y, classifier):
#     """
#     Evaluate the fitness of a particle.

#     Args:
#         features (array): Binary array representing selected features.
#         X (ndarray): Feature matrix.
#         y (ndarray): Target array.
#         classifier: A machine learning model (e.g., DecisionTreeClassifier).

#     Returns:
#         float: The fitness value (lower is better for minimization tasks).
#     """
#     # Select features based on the binary array
#     selected_features = X[:, features == 1]

#     # If no features are selected, return a high fitness value
#     if selected_features.shape[1] == 0:
#         return float('inf')

#     # Perform a simple train-test split (80%-20%)
#     X_train, X_test, y_train, y_test = train_test_split(selected_features, y, test_size=0.2, random_state=42)

#     # Train the classifier
#     classifier.fit(X_train, y_train)

#     # Predict and calculate accuracy
#     y_pred = classifier.predict(X_test)
#     accuracy = accuracy_score(y_test, y_pred)

#     # Return the negative accuracy as fitness (minimization task)
#     return -accuracy



def fitness_function(features, X, y, classifier):
    """
    Evaluate the fitness of a particle.

    Args:
        features (array): Binary array representing selected features.
        X (ndarray): Feature matrix.
        y (ndarray): Target array.
        classifier: A machine learning model (e.g., DecisionTreeClassifier).

    Returns:
        float: The fitness value (lower is better for minimization tasks).
    """
    # Select features based on the binary array
    selected_features = X.iloc[:, features == 1]

    # If no features are selected, return a high fitness value
    if selected_features.shape[1] == 0:
        return float('inf')

    # Perform a simple train-test split (80%-20%)
    X_train, X_test, y_train, y_test = train_test_split(selected_features, y, test_size=0.2, random_state=42)

    # Train the classifier
    classifier.fit(X_train, y_train)

    # Predict and calculate accuracy
    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # Return the negative accuracy as fitness (minimization task)
    return -accuracy





In [93]:
def initialize_swarm(n_particles, n_features, bounds, fitness_function, X, y, classifier):
    """
    Initialize the swarm for PSO.

    Args:
        n_particles (int): Number of particles.
        n_features (int): Number of features.
        bounds (tuple): Bounds for the feature mask (binary: 0 or 1).
        fitness_function (callable): The fitness function.
        X (ndarray): Feature matrix.
        y (ndarray): Target array.
        classifier: A machine learning model.

    Returns:
        positions, velocities, pbest_positions, pbest_fitness, gbest_position, gbest_fitness
    """
    lower_bounds, upper_bounds = bounds

    # Randomly initialize positions (binary: 0 or 1) and velocities
    positions = np.random.randint(lower_bounds, upper_bounds + 1, size=(n_particles, n_features))
    velocities = np.random.uniform(-1, 1, (n_particles, n_features))

    # Evaluate initial fitness
    fitness = np.array([fitness_function(positions[i], X, y, classifier) for i in range(n_particles)])

    # Set personal bests (pbest)
    pbest_positions = positions.copy()
    pbest_fitness = fitness.copy()

    # Set global best (gbest)
    gbest_index = np.argmin(pbest_fitness)
    gbest_position = pbest_positions[gbest_index]
    gbest_fitness = pbest_fitness[gbest_index]

    return positions, velocities, pbest_positions, pbest_fitness, gbest_position, gbest_fitness


In [104]:
# def return_best_solution(gbest_position, gbest_fitness, feature_names):
#     """
#     Return the best solution found by the swarm.

#     Args:
#         gbest_position (array): The best feature mask.
#         gbest_fitness (float): The best fitness value.
#         feature_names (list): List of feature names.

#     Returns:
#         dict: A dictionary containing selected features and their fitness.
#     """
#     selected_features = [feature_names[i] for i in range(len(gbest_position)) if gbest_position[i] == 1]
#     return {
#         "Selected Features": selected_features,
#         "Best Fitness": -gbest_fitness  # Accuracy is the positive value of fitness
#     }

def return_best_solution(gbest_position, gbest_fitness, feature_names):
    """
    Return the best solution found by the swarm.

    Args:
        gbest_position (array): The best feature mask.
        gbest_fitness (float): The best fitness value.
        feature_names (list): List of feature names.

    Returns:
        dict: A dictionary containing selected features and their fitness.
    """
    selected_features = [feature_names[i] for i in range(len(feature_names)) if gbest_position[i] == 1]
    return {
        "Selected Features": selected_features,
        "Best Fitness": -gbest_fitness  # Accuracy is the positive value of fitness
    }


In [95]:
# Shahd
def update_swarm(positions, velocities, fitness_function, pbest_positions, pbest_fitness, gbest_position, w, c1, c2, bounds, X, y, classifier):
    lower_bounds, upper_bounds = bounds
    n_particles, dimensions = positions.shape

    # Temporarily cast positions to float64 for update step
    positions = positions.astype(np.float64)

    for i in range(n_particles):
        # Update velocity
        inertia = w * velocities[i]
        cognitive = c1 * np.random.random() * (pbest_positions[i] - positions[i])
        social = c2 * np.random.random() * (gbest_position - positions[i])
        velocities[i] = inertia + cognitive + social

        # Update position
        positions[i] += velocities[i]

        # Ensure positions stay within bounds (binary: 0 or 1)
        positions[i] = np.clip(positions[i], lower_bounds, upper_bounds)

    # After update, cast positions back to int64 for binary (0, 1)
    positions = np.round(positions).astype(np.int64)

    # Evaluate fitness
    fitness = np.array([fitness_function(positions[i], X, y, classifier) for i in range(n_particles)])

    # Update pbest
    for i in range(n_particles):
        if fitness[i] < pbest_fitness[i]:
            pbest_fitness[i] = fitness[i]
            pbest_positions[i] = positions[i]

    # Update gbest
    gbest_index = np.argmin(fitness)
    if fitness[gbest_index] < np.min(pbest_fitness):
        gbest_position = positions[gbest_index]

    return positions, velocities, pbest_positions, pbest_fitness, gbest_position




## Genetic Algorithm Implementation

In [53]:
#Step 1 - Shahd
def encode_solution_space(num_features):
    """Creates binary encoding for feature selection."""
    return np.random.choice([0, 1], size=num_features)


In [54]:
#Step 2 - Habiba
def set_algorithm_parameters():
    """Set the parameters for the Genetic Algorithm."""
    return {
        "pop_size": 100,
        "num_generations": 150,
        "crossover_rate": 0.9,
        "mutation_rate": 0.02
    }

In [55]:
#Step 3 - Shahd
def create_initial_population(pop_size, num_features):
    """Generates the initial population of chromosomes."""
    return [encode_solution_space(num_features) for _ in range(pop_size)]

In [107]:
#Step 4 - Shahd

def measure_fitness(population, X_train, y_train, X_val, y_val, dynamic_k):
    """Evaluates the fitness of each chromosome with a dynamic k value."""
    fitness_scores = []
    for chromosome in population:
        selected_features = [i for i, bit in enumerate(chromosome) if bit == 1]
        if not selected_features:
            fitness_scores.append(0)
            continue
        X_train_selected = X_train.iloc[:, selected_features]
        X_val_selected = X_val.iloc[:, selected_features]

        # Train the model with the dynamic k value
        model = KNeighborsClassifier(n_neighbors=dynamic_k)
        model.fit(X_train_selected, y_train)
        predictions = model.predict(X_val_selected)
        fitness_scores.append(accuracy_score(y_val, predictions))
    return fitness_scores


# def measure_fitness(population, X_train, y_train, X_val, y_val, best_k=5):
#     """Evaluates the fitness of each chromosome (feature selection) in the population."""
#     fitness_scores = []
#     for chromosome in population:
#         selected_features = [i for i, bit in enumerate(chromosome) if bit == 1]
#         if not selected_features:
#             fitness_scores.append(0)
#             continue
#         X_train_selected = X_train.iloc[:, selected_features]
#         X_val_selected = X_val.iloc[:, selected_features]

#         # Train the model with the default value for k (e.g., k=5)
#         model = KNeighborsClassifier(n_neighbors=best_k)
#         model.fit(X_train_selected, y_train)
#         predictions = model.predict(X_val_selected)
#         fitness_scores.append(accuracy_score(y_val, predictions))
#     return fitness_scores


In [57]:
#Step 5 - Habiba
# def select_parents(population, fitness_values):
#     """Select parents using Tournament Selection."""
#     parents = []
#     for _ in range(len(population)):
#         # Tournament selection: Randomly pick 3 and select the best
#         candidates_idx = np.random.choice(len(population), 3, replace=False)
#         best_candidate = max(candidates_idx, key=lambda idx: fitness_values[idx])
#         parents.append(population[best_candidate])
#     return np.array(parents)




def select_parents(population, fitness_values):
    """Select parents using Roulette Wheel Selection."""
    total_fitness = sum(fitness_values)
    selection_probs = [f / total_fitness for f in fitness_values]
    parents = np.random.choice(population, size=len(population), p=selection_probs)
    return np.array(parents)


In [58]:
#Step 6 - Habiba
def crossover(parents, crossover_rate):
    """Apply single-point crossover."""
    offspring = []
    for i in range(0, len(parents), 2):
        parent1, parent2 = parents[i], parents[(i + 1) % len(parents)]
        if np.random.rand() < crossover_rate:
            point = np.random.randint(1, len(parent1))
            child1 = np.concatenate([parent1[:point], parent2[point:]])
            child2 = np.concatenate([parent2[:point], parent1[point:]])
            offspring.extend([child1, child2])
        else:
            offspring.extend([parent1, parent2])
    return np.array(offspring)


In [59]:
#Step 7 - Habiba
def populate_next_generation(offspring, fitness_values):
    """Replace population with offspring and apply elitism."""
    elite_idx = np.argmax(fitness_values)
    next_generation = offspring
    next_generation[0] = offspring[elite_idx]  # Ensure elite survives
    return next_generation

In [60]:
#Step 8 - Habiba
def mutate(offspring, mutation_rate):
    """Apply bit-flip mutation."""
    for i in range(len(offspring)):
        for j in range(len(offspring[i])):
            if np.random.rand() < mutation_rate:
                offspring[i][j] = 1 - offspring[i][j]  # Flip bit
    return offspring


In [61]:
#Step 9 - Shahd
# def check_stopping_condition(generation, max_generations, fitness_scores, threshold, no_change_limit, previous_scores):
#     """Checks if the algorithm should stop based on fitness, generations, or stagnation."""
#     if generation >= max_generations or max(fitness_scores) >= threshold:
#         return True
#     if len(previous_scores) >= no_change_limit and all(score == previous_scores[0] for score in previous_scores):
#         return True
#     return False


def check_stopping_condition(generation, max_generations, fitness_scores, threshold, no_change_limit, previous_scores):
    """Checks if the algorithm should stop based on fitness, generations, or stagnation."""
    if generation >= max_generations or max(fitness_scores) >= threshold:
        return True
    if len(previous_scores) >= no_change_limit and all(score == previous_scores[0] for score in previous_scores):
        return True
    return False


In [108]:
def genetic_algorithm(X_train, y_train, X_val, y_val, num_features):
    """Runs the genetic algorithm for feature selection with a dynamic k."""
    params = set_algorithm_parameters()
    params['no_change_limit'] = 10  # Define stagnation limit
    population = create_initial_population(params['pop_size'], num_features)
    previous_scores = []

    for generation in range(params['num_generations']):
        # Dynamically compute the best k for the current generation
        dynamic_k, _ = find_optimal_k(X_train, y_train)

        # Evaluate fitness with the dynamic k value
        fitness_scores = measure_fitness(population, X_train, y_train, X_val, y_val, dynamic_k)

        if check_stopping_condition(generation, params['num_generations'], fitness_scores, 0.95, params['no_change_limit'], previous_scores):
            break

        previous_scores.append(max(fitness_scores))
        if len(previous_scores) > params['no_change_limit']:
            previous_scores.pop(0)

        parents = select_parents(population, fitness_scores)
        offspring = crossover(parents, params['crossover_rate'])
        offspring = mutate(offspring, params['mutation_rate'])
        population = populate_next_generation(offspring, fitness_scores)

    best_solution = population[np.argmax(fitness_scores)]
    return best_solution, max(fitness_scores)  # Return best solution and fitness score



## Knn

In [96]:
def find_optimal_k(X_train, y_train):
    from sklearn.model_selection import GridSearchCV

    param_grid = {'n_neighbors': list(range(1, 31))}
    grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)
    grid_search.fit(X_train, y_train)
    return grid_search.best_params_['n_neighbors'], grid_search.best_score_


In [97]:
def train_knn_and_display_accuracy(X_train, y_train, X_val, y_val, X_test, y_test, best_k):
    # Initialize the KNN model with the optimal k
    knn_model = KNeighborsClassifier(n_neighbors=best_k)

    # Train the model on the training set
    knn_model.fit(X_train, y_train)

    # Predict on the validation set
    y_val_pred = knn_model.predict(X_val)

    # Predict on the test set
    y_test_pred = knn_model.predict(X_test)

    # Calculate accuracy on the validation set
    val_accuracy = accuracy_score(y_val, y_val_pred)

    # Calculate accuracy on the test set
    test_accuracy = accuracy_score(y_test, y_test_pred)

    # Display the accuracies
    print(f"Validation Accuracy: {val_accuracy}")
    print(f"Test Accuracy: {test_accuracy}")

In [98]:
from sklearn.model_selection import train_test_split

def separate_features_targets(train_df, val_df, test_df):
    """
    Function to separate features and target from three datasets (train, validation, and test).

    Parameters:
    - train_df: DataFrame for training data
    - val_df: DataFrame for validation data
    - test_df: DataFrame for test data

    Returns:
    - X_train, X_val, X_test: Features for train, validation, and test sets
    - y_train, y_val, y_test: Target labels for train, validation, and test sets
    """
    # Separate features and target for training set
    X_train = train_df.drop(columns=['status_mapped'])
    y_train = train_df['status_mapped']

    # Separate features and target for validation set
    X_val = val_df.drop(columns=['status_mapped'])
    y_val = val_df['status_mapped']

    # Separate features and target for test set
    X_test = test_df.drop(columns=['status_mapped'])
    y_test = test_df['status_mapped']

    # Return all the variables
    return X_train, X_val, X_test, y_train, y_val, y_test



In [111]:
train_scaled_df=pd.read_csv('/content/drive/MyDrive/Ai project/train_scaled.csv')
val_scaled=pd.read_csv('/content/drive/MyDrive/Ai project/validation_scaled.csv')
test_scaled=pd.read_csv('/content/drive/MyDrive/Ai project/test_scaled.csv')

# Load the data

# Drop the 'ID' column
train_scaled_df = train_scaled_df.drop(columns=['ID'])
val_scaled = val_scaled.drop(columns=['ID'])
test_scaled = test_scaled.drop(columns=['ID'])

In [112]:
# Check the column names of the scaled datasets
print("Train DataFrame Columns:", train_scaled_df.columns)
print("Validation DataFrame Columns:", val_scaled.columns)
print("Test DataFrame Columns:", test_scaled.columns)



Train DataFrame Columns: Index(['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN',
       'AMT_INCOME_TOTAL', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
       'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'FLAG_MOBIL',
       'FLAG_WORK_PHONE', 'FLAG_PHONE', 'FLAG_EMAIL', 'OCCUPATION_TYPE',
       'CNT_FAM_MEMBERS', 'AGE_YEARS', 'YEARS_EMPLOYED', 'status_mapped'],
      dtype='object')
Validation DataFrame Columns: Index(['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN',
       'AMT_INCOME_TOTAL', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
       'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'FLAG_MOBIL',
       'FLAG_WORK_PHONE', 'FLAG_PHONE', 'FLAG_EMAIL', 'OCCUPATION_TYPE',
       'CNT_FAM_MEMBERS', 'AGE_YEARS', 'YEARS_EMPLOYED', 'status_mapped'],
      dtype='object')
Test DataFrame Columns: Index(['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN',
       'AMT_INCOME_TOTAL', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
       'NAME_FAMILY_STAT

In [105]:
# Call the split_data function
X_train, X_val, X_test, y_train, y_val, y_test = separate_features_targets(train_scaled_df, val_scaled, test_scaled)

# Run PSO for feature selection (same as previous code)
classifier = KNeighborsClassifier()
n_particles = 30
n_features = X_train.shape[1]
bounds = (0, 1)  # Binary feature selection

positions, velocities, pbest_positions, pbest_fitness, gbest_position, gbest_fitness = initialize_swarm(
    n_particles, n_features, bounds, fitness_function, X_train, y_train, classifier
)

# Track the PSO process using tqdm
for _ in tqdm(range(100), desc="PSO Iterations"):  # Max iterations
    positions, velocities, pbest_positions, pbest_fitness, gbest_position = update_swarm(
        positions, velocities, fitness_function, pbest_positions, pbest_fitness, gbest_position,
        w=0.5, c1=1.5, c2=1.5, bounds=bounds, X=X_train, y=y_train, classifier=classifier
    )

pso_result = return_best_solution(gbest_position, gbest_fitness, X_train.columns)
selected_features_pso = pso_result["Selected Features"]
print("PSO Best Features:", selected_features_pso)
print("PSO Best Fitness (Accuracy):", pso_result["Best Fitness"])

# Filter training, validation, and test sets by PSO-selected features
X_train_selected_pso = X_train[selected_features_pso]
X_val_selected_pso = X_val[selected_features_pso]
X_test_selected_pso = X_test[selected_features_pso]

# Train and evaluate KNN with PSO-selected features
best_k, best_score = find_optimal_k(X_train_selected_pso, y_train)
print(f"Optimal k (PSO): {best_k} with cross-validation accuracy: {best_score}")

knn_model_pso = KNeighborsClassifier(n_neighbors=best_k)
knn_model_pso.fit(X_train_selected_pso, y_train)
y_pred_pso = knn_model_pso.predict(X_val_selected_pso)
accuracy_pso = accuracy_score(y_val, y_pred_pso)
print(f"Accuracy of KNN with PSO-selected features and k={best_k}: {accuracy_pso}")

PSO Iterations: 100%|██████████| 100/100 [27:01<00:00, 16.21s/it]


PSO Best Features: ['ID', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_FAMILY_STATUS', 'FLAG_WORK_PHONE', 'FLAG_PHONE', 'FLAG_EMAIL', 'AGE_YEARS']
PSO Best Fitness (Accuracy): 0.9500554938956715
Optimal k (PSO): 21 with cross-validation accuracy: 0.9498712403624945
Accuracy of KNN with PSO-selected features and k=21: 0.9498549523414836


In [113]:
# Separate features and targets
X_train, X_val, X_test, y_train, y_val, y_test = separate_features_targets(train_scaled_df, val_scaled, test_scaled)

# Run Genetic Algorithm for feature selection (without passing best_k)
ga_best_solution, ga_best_fitness = genetic_algorithm(X_train, y_train, X_val, y_val, num_features=X_train.shape[1])

# Print the best solution from the genetic algorithm
print("GA Best Solution:", ga_best_solution)

# Get the names of the selected features (those with a 1 in ga_best_solution)
ga_selected_columns = [X_train.columns[i] for i, bit in enumerate(ga_best_solution) if bit == 1]

# Print the selected columns
print("GA Selected Columns:", ga_selected_columns)

# Filter the training, validation, and test sets by GA-selected features using column names
X_train_selected_ga = X_train[ga_selected_columns]
X_val_selected_ga = X_val[ga_selected_columns]
X_test_selected_ga = X_test[ga_selected_columns]

# Convert to numpy arrays if needed
X_train_selected_ga = X_train_selected_ga.values
X_val_selected_ga = X_val_selected_ga.values
X_test_selected_ga = X_test_selected_ga.values

# Print the GA selected features
print("GA Selected Features:", ga_selected_columns)
print("GA Best Fitness (Accuracy):", ga_best_fitness)

# Find optimal k for KNN after feature selection
best_k, best_score = find_optimal_k(X_train_selected_ga, y_train)
print(f"Optimal k (GA): {best_k} with cross-validation accuracy: {best_score}")

# Train the KNN model and display the accuracy
train_knn_and_display_accuracy(X_train_selected_ga, y_train, X_val_selected_ga, y_val, X_test_selected_ga, y_test, best_k)

GA Best Solution: [1 1 0 1 1 0 0 1 1 0 0 1 1 0 1 0 1]
GA Selected Columns: ['CODE_GENDER', 'FLAG_OWN_CAR', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'FLAG_PHONE', 'FLAG_EMAIL', 'CNT_FAM_MEMBERS', 'YEARS_EMPLOYED']
GA Selected Features: ['CODE_GENDER', 'FLAG_OWN_CAR', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'FLAG_PHONE', 'FLAG_EMAIL', 'CNT_FAM_MEMBERS', 'YEARS_EMPLOYED']
GA Best Fitness (Accuracy): 0.950269374222959
Optimal k (GA): 27 with cross-validation accuracy: 0.9498712403624945
Validation Accuracy: 0.9498549523414836
Test Accuracy: 0.9498653407913819


In [18]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Random Forest

In [106]:
from tqdm import tqdm
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV

# Function to find the optimal number of estimators for Random Forest
def find_optimal_estimators(X_train, y_train):
    param_grid = {'n_estimators': list(range(10, 201, 10))}
    grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5)
    grid_search.fit(X_train, y_train)
    return grid_search.best_params_['n_estimators'], grid_search.best_score_

# Function to split the data
X_train, X_val, X_test, y_train, y_val, y_test = separate_features_targets(train_scaled_df,val_scaled,test_scaled)

# PSO for feature selection (placeholder functions for initialization and update)
classifier = RandomForestClassifier(random_state=42)
n_particles = 30
n_features = X_train.shape[1]
bounds = (0, 1)  # Binary feature selection

positions, velocities, pbest_positions, pbest_fitness, gbest_position, gbest_fitness = initialize_swarm(
    n_particles, n_features, bounds, fitness_function, X_train, y_train, classifier
)

# Track the PSO process using tqdm
for _ in tqdm(range(100), desc="PSO Iterations"):
    positions, velocities, pbest_positions, pbest_fitness, gbest_position = update_swarm(
        positions, velocities, fitness_function, pbest_positions, pbest_fitness, gbest_position,
        w=0.5, c1=1.5, c2=1.5, bounds=bounds, X=X_train, y=y_train, classifier=classifier
    )

pso_result = return_best_solution(gbest_position, gbest_fitness, df.drop(columns=['status_mapped']).columns)
selected_features_pso = pso_result["Selected Features"]
print("PSO Best Features:", selected_features_pso)
print("PSO Best Fitness (Accuracy):", pso_result["Best Fitness"])

# Filter training, validation, and test sets by PSO-selected features
X_train_selected_pso = X_train[:, gbest_position.astype(bool)]
X_val_selected_pso = X_val[:, gbest_position.astype(bool)]
X_test_selected_pso = X_test[:, gbest_position.astype(bool)]

# Train and evaluate Random Forest with PSO-selected features
best_estimators, best_score = find_optimal_estimators(X_train_selected_pso, y_train)
print(f"Optimal number of estimators (PSO): {best_estimators} with cross-validation accuracy: {best_score}")

rf_model_pso = RandomForestClassifier(n_estimators=best_estimators, random_state=42)
rf_model_pso.fit(X_train_selected_pso, y_train)
y_pred_pso = rf_model_pso.predict(X_val_selected_pso)
accuracy_pso = accuracy_score(y_val, y_pred_pso)
print(f"Accuracy of Random Forest with PSO-selected features: {accuracy_pso}")

# Genetic Algorithm (GA) for feature selection
# Placeholder genetic_algorithm function
# ga_best_solution, ga_best_fitness = genetic_algorithm(X_train, y_train, X_val, y_val, num_features=X_train.shape[1])
# ga_selected_features = [df.drop(columns=['status_mapped']).columns[i] for i, bit in enumerate(ga_best_solution) if bit == 1]
# print("GA Best Features:", ga_selected_features)
# print("GA Best Fitness (Accuracy):", ga_best_fitness)

# # Filter training, validation, and test sets by GA-selected features
# X_train_selected_ga = X_train[:, ga_best_solution.astype(bool)]
# X_val_selected_ga = X_val[:, ga_best_solution.astype(bool)]
# X_test_selected_ga = X_test[:, ga_best_solution.astype(bool)]

# # Train and evaluate Random Forest with GA-selected features
# best_estimators, best_score = find_optimal_estimators(X_train_selected_ga, y_train)
# print(f"Optimal number of estimators (GA): {best_estimators} with cross-validation accuracy: {best_score}")

# rf_model_ga = RandomForestClassifier(n_estimators=best_estimators, random_state=42)
# rf_model_ga.fit(X_train_selected_ga, y_train)
# y_pred_ga = rf_model_ga.predict(X_val_selected_ga)
# accuracy_ga = accuracy_score(y_val, y_pred_ga)
# print(f"Accuracy of Random Forest with GA-selected features: {accuracy_ga}")

# # Save the datasets as CSV
# train_df = pd.concat([pd.DataFrame(X_train), pd.Series(y_train, name='status_mapped')], axis=1)
# val_df = pd.concat([pd.DataFrame(X_val), pd.Series(y_val, name='status_mapped')], axis=1)
# test_df = pd.concat([pd.DataFrame(X_test), pd.Series(y_test, name='status_mapped')], axis=1)

# train_df.to_csv('/content/drive/MyDrive/Ai project/train_rf.csv', index=False)
# val_df.to_csv('/content/drive/MyDrive/Ai project/validation_rf.csv', index=False)
# test_df.to_csv('/content/drive/MyDrive/Ai project/test_rf.csv', index=False)

# print("Datasets saved successfully:")
# print(f"Train set: {train_df.shape}")
# print(f"Validation set: {val_df.shape}")
# print(f"Test set: {test_df.shape}")


PSO Iterations: 100%|██████████| 100/100 [40:03<00:00, 24.04s/it]

PSO Best Features: ['FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'CNT_FAM_MEMBERS', 'AGE_YEARS']
PSO Best Fitness (Accuracy): 0.9489456159822419





InvalidIndexError: (slice(None, None, None), array([False, False,  True,  True,  True, False, False, False, False,
       False, False, False, False,  True,  True, False, False, False]))

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from tqdm import tqdm

# Function to find the optimal C and kernel for SVM
def find_optimal_svm(X_train, y_train):
    param_grid = {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf', 'poly'],
        'gamma': ['scale', 'auto']
    }
    grid_search = GridSearchCV(SVC(), param_grid, cv=5)
    grid_search.fit(X_train, y_train)
    return grid_search.best_params_, grid_search.best_score_

# Load the data
df = pd.read_csv('/content/drive/MyDrive/Ai project/Feature_eng_updated.csv')

# Define features and target
X = df.drop(columns=['status_mapped']).values  # Features as numpy array
y = df['status_mapped'].values                # Target (label)

# Split data
X_train, X_val, X_test, y_train, y_val, y_test = separate_features_targets(train_scaled_df,val_scaled,test_scaled)

# Run PSO for feature selection
classifier = SVC()
n_particles = 30
n_features = X_train.shape[1]
bounds = (0, 1)  # Binary feature selection

positions, velocities, pbest_positions, pbest_fitness, gbest_position, gbest_fitness = initialize_swarm(
    n_particles, n_features, bounds, fitness_function, X_train, y_train, classifier
)

# Track the PSO process using tqdm
for _ in tqdm(range(100), desc="PSO Iterations"):  # Max iterations
    positions, velocities, pbest_positions, pbest_fitness, gbest_position = update_swarm(
        positions, velocities, fitness_function, pbest_positions, pbest_fitness, gbest_position,
        w=0.5, c1=1.5, c2=1.5, bounds=bounds, X=X_train, y=y_train, classifier=classifier
    )

pso_result = return_best_solution(gbest_position, gbest_fitness, df.drop(columns=['status_mapped']).columns)
selected_features_pso = pso_result["Selected Features"]
print("PSO Best Features:", selected_features_pso)
print("PSO Best Fitness (Accuracy):", pso_result["Best Fitness"])

# Filter training, validation, and test sets by PSO-selected features
X_train_selected_pso = X_train[:, gbest_position.astype(bool)]
X_val_selected_pso = X_val[:, gbest_position.astype(bool)]
X_test_selected_pso = X_test[:, gbest_position.astype(bool)]

# Find optimal parameters for SVM after PSO feature selection
best_params, best_score = find_optimal_svm(X_train_selected_pso, y_train)
print(f"Optimal parameters (PSO): {best_params} with cross-validation accuracy: {best_score}")

# Train and evaluate SVM with PSO-selected features
svm_model_pso = SVC(**best_params)
svm_model_pso.fit(X_train_selected_pso, y_train)
y_pred_pso = svm_model_pso.predict(X_val_selected_pso)
accuracy_pso = accuracy_score(y_val, y_pred_pso)
print(f"Accuracy of SVM with PSO-selected features: {accuracy_pso}")

# # Run Genetic Algorithm for feature selection (without passing best_params)
# ga_best_solution, ga_best_fitness = genetic_algorithm(X_train, y_train, X_val, y_val, num_features=X_train.shape[1])
# ga_selected_features = [df.drop(columns=['status_mapped']).columns[i] for i, bit in enumerate(ga_best_solution) if bit == 1]
# print("GA Best Features:", ga_selected_features)
# print("GA Best Fitness (Accuracy):", ga_best_fitness)

# # Filter training, validation, and test sets by GA-selected features
# X_train_selected_ga = X_train[:, ga_best_solution.astype(bool)]
# X_val_selected_ga = X_val[:, ga_best_solution.astype(bool)]
# X_test_selected_ga = X_test[:, ga_best_solution.astype(bool)]

# # Find optimal parameters for SVM after GA feature selection
# best_params, best_score = find_optimal_svm(X_train_selected_ga, y_train)
# print(f"Optimal parameters (GA): {best_params} with cross-validation accuracy: {best_score}")

# # Train and evaluate SVM with GA-selected features
# svm_model_ga = SVC(**best_params)
# svm_model_ga.fit(X_train_selected_ga, y_train)
# y_pred_ga = svm_model_ga.predict(X_val_selected_ga)
# accuracy_ga = accuracy_score(y_val, y_pred_ga)
# print(f"Accuracy of SVM with GA-selected features: {accuracy_ga}")

# # Save the splits as CSV files (if needed, combine X and y for each split)
# train_df = pd.concat([X_train, y_train], axis=1)
# val_df = pd.concat([X_val, y_val], axis=1)
# test_df = pd.concat([X_test, y_test], axis=1)

# train_df.to_csv('/content/drive/MyDrive/Ai project/train_svm.csv', index=False)
# val_df.to_csv('/content/drive/MyDrive/Ai project/validation_svm.csv', index=False)
# test_df.to_csv('/content/drive/MyDrive/Ai project/test_svm.csv', index=False)

# print("Datasets saved successfully:")
# print(f"Train set: {train_df.shape}")
# print(f"Validation set: {val_df.shape}")
# print(f"Test set: {test_df.shape}")


PSO Iterations: 100%|██████████| 100/100 [1:43:54<00:00, 62.35s/it]


PSO Best Features: ['CODE_GENDER', 'FLAG_OWN_CAR', 'AGE_YEARS', 'YEARS_EMPLOYED']
PSO Best Fitness (Accuracy): 0.948723640399556
