<a href="https://colab.research.google.com/github/shahdcode/Credit-Card-Approval-Prediction/blob/main/Genetic_Feature_Selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [25]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier


## Particle Swarm Optimization (PSO)

In [49]:
def fitness_function(features, X, y, classifier):
    """
    Evaluate the fitness of a particle.

    Args:
        features (array): Binary array representing selected features.
        X (ndarray): Feature matrix.
        y (ndarray): Target array.
        classifier: A machine learning model (e.g., DecisionTreeClassifier).

    Returns:
        float: The fitness value (lower is better for minimization tasks).
    """
    # Select features based on the binary array
    selected_features = X[:, features == 1]

    # If no features are selected, return a high fitness value
    if selected_features.shape[1] == 0:
        return float('inf')

    # Perform a simple train-test split (80%-20%)
    X_train, X_test, y_train, y_test = train_test_split(selected_features, y, test_size=0.2, random_state=42)

    # Train the classifier
    classifier.fit(X_train, y_train)

    # Predict and calculate accuracy
    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # Return the negative accuracy as fitness (minimization task)
    return -accuracy







In [50]:
def initialize_swarm(n_particles, n_features, bounds, fitness_function, X, y, classifier):
    """
    Initialize the swarm for PSO.

    Args:
        n_particles (int): Number of particles.
        n_features (int): Number of features.
        bounds (tuple): Bounds for the feature mask (binary: 0 or 1).
        fitness_function (callable): The fitness function.
        X (ndarray): Feature matrix.
        y (ndarray): Target array.
        classifier: A machine learning model.

    Returns:
        positions, velocities, pbest_positions, pbest_fitness, gbest_position, gbest_fitness
    """
    lower_bounds, upper_bounds = bounds

    # Randomly initialize positions (binary: 0 or 1) and velocities
    positions = np.random.randint(lower_bounds, upper_bounds + 1, size=(n_particles, n_features))
    velocities = np.random.uniform(-1, 1, (n_particles, n_features))

    # Evaluate initial fitness
    fitness = np.array([fitness_function(positions[i], X, y, classifier) for i in range(n_particles)])

    # Set personal bests (pbest)
    pbest_positions = positions.copy()
    pbest_fitness = fitness.copy()

    # Set global best (gbest)
    gbest_index = np.argmin(pbest_fitness)
    gbest_position = pbest_positions[gbest_index]
    gbest_fitness = pbest_fitness[gbest_index]

    return positions, velocities, pbest_positions, pbest_fitness, gbest_position, gbest_fitness


In [51]:
def return_best_solution(gbest_position, gbest_fitness, feature_names):
    """
    Return the best solution found by the swarm.

    Args:
        gbest_position (array): The best feature mask.
        gbest_fitness (float): The best fitness value.
        feature_names (list): List of feature names.

    Returns:
        dict: A dictionary containing selected features and their fitness.
    """
    selected_features = [feature_names[i] for i in range(len(gbest_position)) if gbest_position[i] == 1]
    return {
        "Selected Features": selected_features,
        "Best Fitness": -gbest_fitness  # Accuracy is the positive value of fitness
    }


In [52]:
# Shahd
def update_swarm(positions, velocities, fitness_function, pbest_positions, pbest_fitness, gbest_position, w, c1, c2, bounds, X, y, classifier):
    lower_bounds, upper_bounds = bounds
    n_particles, dimensions = positions.shape

    # Temporarily cast positions to float64 for update step
    positions = positions.astype(np.float64)

    for i in range(n_particles):
        # Update velocity
        inertia = w * velocities[i]
        cognitive = c1 * np.random.random() * (pbest_positions[i] - positions[i])
        social = c2 * np.random.random() * (gbest_position - positions[i])
        velocities[i] = inertia + cognitive + social

        # Update position
        positions[i] += velocities[i]

        # Ensure positions stay within bounds (binary: 0 or 1)
        positions[i] = np.clip(positions[i], lower_bounds, upper_bounds)

    # After update, cast positions back to int64 for binary (0, 1)
    positions = np.round(positions).astype(np.int64)

    # Evaluate fitness
    fitness = np.array([fitness_function(positions[i], X, y, classifier) for i in range(n_particles)])

    # Update pbest
    for i in range(n_particles):
        if fitness[i] < pbest_fitness[i]:
            pbest_fitness[i] = fitness[i]
            pbest_positions[i] = positions[i]

    # Update gbest
    gbest_index = np.argmin(fitness)
    if fitness[gbest_index] < np.min(pbest_fitness):
        gbest_position = positions[gbest_index]

    return positions, velocities, pbest_positions, pbest_fitness, gbest_position




## Genetic Algorithm Implementation

In [53]:
#Step 1 - Shahd
def encode_solution_space(num_features):
    """Creates binary encoding for feature selection."""
    return np.random.choice([0, 1], size=num_features)


In [54]:
#Step 2 - Habiba
def set_algorithm_parameters():
    """Set the parameters for the Genetic Algorithm."""
    return {
        "pop_size": 50,
        "num_generations": 100,
        "crossover_rate": 0.8,
        "mutation_rate": 0.01
    }

In [55]:
#Step 3 - Shahd
def create_initial_population(pop_size, num_features):
    """Generates the initial population of chromosomes."""
    return [encode_solution_space(num_features) for _ in range(pop_size)]

In [56]:
#Step 4 - Shahd
def measure_fitness(population, X_train, y_train, X_val, y_val, best_k=5):
    """Evaluates the fitness of each chromosome (feature selection) in the population."""
    fitness_scores = []
    for chromosome in population:
        selected_features = [i for i, bit in enumerate(chromosome) if bit == 1]
        if not selected_features:
            fitness_scores.append(0)
            continue
        X_train_selected = X_train[:, selected_features]
        X_val_selected = X_val[:, selected_features]

        # Train the model with the default value for k (e.g., k=5)
        model = KNeighborsClassifier(n_neighbors=best_k)
        model.fit(X_train_selected, y_train)
        predictions = model.predict(X_val_selected)
        fitness_scores.append(accuracy_score(y_val, predictions))
    return fitness_scores


In [57]:
#Step 5 - Habiba
def select_parents(population, fitness_values):
    """Select parents using Tournament Selection."""
    parents = []
    for _ in range(len(population)):
        # Tournament selection: Randomly pick 3 and select the best
        candidates_idx = np.random.choice(len(population), 3, replace=False)
        best_candidate = max(candidates_idx, key=lambda idx: fitness_values[idx])
        parents.append(population[best_candidate])
    return np.array(parents)


In [58]:
#Step 6 - Habiba
def crossover(parents, crossover_rate):
    """Apply single-point crossover."""
    offspring = []
    for i in range(0, len(parents), 2):
        parent1, parent2 = parents[i], parents[(i + 1) % len(parents)]
        if np.random.rand() < crossover_rate:
            point = np.random.randint(1, len(parent1))
            child1 = np.concatenate([parent1[:point], parent2[point:]])
            child2 = np.concatenate([parent2[:point], parent1[point:]])
            offspring.extend([child1, child2])
        else:
            offspring.extend([parent1, parent2])
    return np.array(offspring)


In [59]:
#Step 7 - Habiba
def populate_next_generation(offspring, fitness_values):
    """Replace population with offspring and apply elitism."""
    elite_idx = np.argmax(fitness_values)
    next_generation = offspring
    next_generation[0] = offspring[elite_idx]  # Ensure elite survives
    return next_generation

In [60]:
#Step 8 - Habiba
def mutate(offspring, mutation_rate):
    """Apply bit-flip mutation."""
    for i in range(len(offspring)):
        for j in range(len(offspring[i])):
            if np.random.rand() < mutation_rate:
                offspring[i][j] = 1 - offspring[i][j]  # Flip bit
    return offspring


In [61]:
#Step 9 - Shahd
def check_stopping_condition(generation, max_generations, fitness_scores, threshold, no_change_limit, previous_scores):
    """Checks if the algorithm should stop based on fitness, generations, or stagnation."""
    if generation >= max_generations or max(fitness_scores) >= threshold:
        return True
    if len(previous_scores) >= no_change_limit and all(score == previous_scores[0] for score in previous_scores):
        return True
    return False


In [62]:
def genetic_algorithm(X_train, y_train, X_val, y_val, num_features):
    """Runs the genetic algorithm for feature selection."""
    params = set_algorithm_parameters()
    params['no_change_limit'] = 10  # Define stagnation limit
    population = create_initial_population(params['pop_size'], num_features)
    previous_scores = []

    for generation in range(params['num_generations']):
        fitness_scores = measure_fitness(population, X_train, y_train, X_val, y_val)

        if check_stopping_condition(generation, params['num_generations'], fitness_scores, 0.95, params['no_change_limit'], previous_scores):
            break

        previous_scores.append(max(fitness_scores))
        if len(previous_scores) > params['no_change_limit']:
            previous_scores.pop(0)

        parents = select_parents(population, fitness_scores)
        offspring = crossover(parents, params['crossover_rate'])
        offspring = mutate(offspring, params['mutation_rate'])
        population = populate_next_generation(offspring, fitness_scores)

    best_solution = population[np.argmax(fitness_scores)]
    return best_solution, max(fitness_scores)  # Return best solution and fitness score


## Knn

In [63]:
def find_optimal_k(X_train, y_train):
    from sklearn.model_selection import GridSearchCV

    param_grid = {'n_neighbors': list(range(1, 31))}
    grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)
    grid_search.fit(X_train, y_train)
    return grid_search.best_params_['n_neighbors'], grid_search.best_score_


In [66]:
#dropping 7agat me4 mohema

dataset_path = '/content/drive/MyDrive/Ai project/Feature_eng.csv'
df = pd.read_csv(dataset_path)

# Drop specific columns
df = df.drop(columns=['ID', 'FLAG_MOBIL', 'STATUS'])

# Save the updated DataFrame back to CSV
df.to_csv('/content/drive/MyDrive/Ai project/Feature_eng_updated.csv', index=False)





In [64]:
from sklearn.model_selection import train_test_split

def split_data(X, y, test_size=0.3, val_size=0.5, random_state=42):
    """
    Function to split the dataset into training, validation, and test sets.

    Parameters:
    - X: Features
    - y: Target labels
    - test_size: Proportion of data to allocate to the test set
    - val_size: Proportion of data to allocate to the validation set (after splitting the test set)
    - random_state: Random seed for reproducibility

    Returns:
    - X_train, X_val, X_test, y_train, y_val, y_test: The split datasets
    """
    # Step 1: Split into 70% training and 30% temp (which will be further split into validation and test)
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=test_size, stratify=y, random_state=random_state)

    # Step 2: Split temp into validation and testing
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=val_size, stratify=y_temp, random_state=random_state)

    return X_train, X_val, X_test, y_train, y_val, y_test


In [67]:
from tqdm import tqdm
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score



# Load the data

# Drop unnecessary columns and define features/target
X = df.drop(columns=['status_mapped']).values  # Features as numpy array
y = df['status_mapped'].values                # Target (label)

# Call the split_data function
X_train, X_val, X_test, y_train, y_val, y_test = split_data(X, y)

# Run PSO for feature selection (same as previous code)
classifier = KNeighborsClassifier()
n_particles = 30
n_features = X_train.shape[1]
bounds = (0, 1)  # Binary feature selection

positions, velocities, pbest_positions, pbest_fitness, gbest_position, gbest_fitness = initialize_swarm(
    n_particles, n_features, bounds, fitness_function, X_train, y_train, classifier
)

# Track the PSO process using tqdm
for _ in tqdm(range(100), desc="PSO Iterations"):  # Max iterations
    positions, velocities, pbest_positions, pbest_fitness, gbest_position = update_swarm(
        positions, velocities, fitness_function, pbest_positions, pbest_fitness, gbest_position,
        w=0.5, c1=1.5, c2=1.5, bounds=bounds, X=X_train, y=y_train, classifier=classifier
    )

pso_result = return_best_solution(gbest_position, gbest_fitness, df.drop(columns=['status_mapped']).columns)
selected_features_pso = pso_result["Selected Features"]
print("PSO Best Features:", selected_features_pso)
print("PSO Best Fitness (Accuracy):", pso_result["Best Fitness"])

# Filter training, validation, and test sets by PSO-selected features
X_train_selected_pso = X_train[:, gbest_position.astype(bool)]
X_val_selected_pso = X_val[:, gbest_position.astype(bool)]
X_test_selected_pso = X_test[:, gbest_position.astype(bool)]

# Train and evaluate KNN with PSO-selected features
best_k, best_score = find_optimal_k(X_train_selected_pso, y_train)
print(f"Optimal k (PSO): {best_k} with cross-validation accuracy: {best_score}")

knn_model_pso = KNeighborsClassifier(n_neighbors=best_k)
knn_model_pso.fit(X_train_selected_pso, y_train)
y_pred_pso = knn_model_pso.predict(X_val_selected_pso)
accuracy_pso = accuracy_score(y_val, y_pred_pso)
print(f"Accuracy of KNN with PSO-selected features and k={best_k}: {accuracy_pso}")


PSO Iterations: 100%|██████████| 100/100 [19:32<00:00, 11.73s/it]


PSO Best Features: ['CODE_GENDER', 'FLAG_OWN_CAR', 'CNT_CHILDREN', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'FLAG_PHONE', 'CNT_FAM_MEMBERS', 'AGE_YEARS']
PSO Best Fitness (Accuracy): 0.9491675915649278
Optimal k (PSO): 15 with cross-validation accuracy: 0.9502264308652514
Accuracy of KNN with PSO-selected features and k=15: 0.9496477414007459


In [44]:
# Call the split_data function to split the dataset
X_train, X_val, X_test, y_train, y_val, y_test = split_data(X, y)

# Run Genetic Algorithm for feature selection (without passing best_k)
ga_best_solution, ga_best_fitness = genetic_algorithm(X_train, y_train, X_val, y_val, num_features=X_train.shape[1])
ga_selected_features = [df.drop(columns=['status_mapped']).columns[i] for i, bit in enumerate(ga_best_solution) if bit == 1]
print("GA Best Features:", ga_selected_features)
print("GA Best Fitness (Accuracy):", ga_best_fitness)

# Filter training, validation, and test sets by GA-selected features
X_train_selected_ga = X_train[:, ga_best_solution.astype(bool)]
X_val_selected_ga = X_val[:, ga_best_solution.astype(bool)]
X_test_selected_ga = X_test[:, ga_best_solution.astype(bool)]

# Find optimal k for KNN after feature selection
best_k, best_score = find_optimal_k(X_train_selected_ga, y_train)
print(f"Optimal k (GA): {best_k} with cross-validation accuracy: {best_score}")

# Train and evaluate KNN with GA-selected features
knn_model_ga = KNeighborsClassifier(n_neighbors=best_k)
knn_model_ga.fit(X_train_selected_ga, y_train)
y_pred_ga = knn_model_ga.predict(X_val_selected_ga)
accuracy_ga = accuracy_score(y_val, y_pred_ga)
print(f"Accuracy of KNN with GA-selected features and k={best_k}: {accuracy_ga}")


GA Best Features: ['CODE_GENDER', 'FLAG_OWN_REALTY', 'NAME_HOUSING_TYPE', 'FLAG_PHONE', 'FLAG_EMAIL', 'OCCUPATION_TYPE', 'YEARS_EMPLOYED']
GA Best Fitness (Accuracy): 0.9508910070451719
Optimal k (GA): 7 with cross-validation accuracy: 0.9503152112415059
Accuracy of KNN with GA-selected features and k=7: 0.9488188976377953


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# Load the data
df = pd.read_csv('/content/drive/MyDrive/Ai project/Feature_eng.csv')

df = df.drop('STATUS', axis=1)

# Define features and target
X = df.drop(columns=['status_mapped'])  # Features (all columns except target)
y = df['status_mapped']                # Target (label)

# Split the data into 70% training and 30% remaining (for validation + testing)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

# Split the remaining 30% into 15% validation and 15% testing
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

# Save the splits as CSV files (if needed, combine X and y for each split)
train_df = pd.concat([X_train, y_train], axis=1)
val_df = pd.concat([X_val, y_val], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)

train_df.to_csv('/content/drive/MyDrive/Ai project/train.csv', index=False)
val_df.to_csv('/content/drive/MyDrive/Ai project/validation.csv', index=False)
test_df.to_csv('/content/drive/MyDrive/Ai project/test.csv', index=False)

print("Datasets saved successfully:")
print(f"Train set: {train_df.shape}")
print(f"Validation set: {val_df.shape}")
print(f"Test set: {test_df.shape}")



Datasets saved successfully:
Train set: (22522, 19)
Validation set: (4826, 19)
Test set: (4827, 19)
