In [1]:
import pandas as pd
import numpy as np
import time
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor

# Dataset configuration
SEED = 42  # fixed seed so "Restart Kernel -> Run All" regenerates the same data
n_samples = 1000
n_features = 10
n_informative = 3

# Seed NumPy's global generator as well: later cells sample candidates with
# np.random.uniform, which draws from this global state.
np.random.seed(SEED)

# Generate a synthetic regression dataset (only n_informative features drive y)
X, y = make_regression(
    n_samples=n_samples,
    n_features=n_features,
    n_informative=n_informative,
    random_state=SEED,
)

# Convert the X array into a DataFrame (use this DataFrame to fit the model)
col_names = ["X{}".format(i+1) for i in range(n_features)]
df = pd.DataFrame(X, columns=col_names)
df["Y"] = y

# Display DataFrame (if needed for testing)
display(df)

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,Y
0,0.715019,-1.565014,-0.165615,0.900508,1.302427,-0.236386,-0.164163,0.604435,-0.360219,0.835613,32.136798
1,-1.783988,-1.978479,0.555404,-0.331240,-0.092773,-0.195862,-0.887923,-0.028809,0.107099,0.084620,-14.718383
2,1.875255,0.229588,-0.971852,-0.193137,-0.032040,-1.136523,-2.242222,-0.964312,-0.786889,0.665141,-93.833961
3,-0.439052,-1.108884,-1.937356,1.230193,1.146935,0.230574,0.173379,-0.189298,-0.315737,-1.354558,61.620051
4,-1.509617,0.767228,-0.345743,1.404475,0.071724,1.551719,1.749256,0.072788,0.682219,2.373161,174.790214
...,...,...,...,...,...,...,...,...,...,...,...
995,0.651686,-1.736566,-0.057523,0.857202,-1.596733,0.568446,-0.568879,0.629521,0.161868,-2.258053,80.699873
996,1.668564,0.298276,-0.532765,0.771725,-0.294683,-0.550490,-0.466688,-0.688026,-1.614010,0.352831,-102.578489
997,-0.192388,-1.091729,0.089581,0.039461,0.656008,0.321731,-0.617106,0.367078,0.254887,0.217939,28.127603
998,-1.628421,-0.974269,-0.387703,1.826842,-0.509658,1.839048,-0.446793,-0.550021,-0.714172,-2.412568,66.803988


In [2]:
# Initialize the RandomForestRegressor model.
# random_state is fixed so the fitted forest (and therefore every prediction
# used by the search below) is identical from one run to the next.
model = RandomForestRegressor(random_state=42)

# Train using the DataFrame (not the raw array) so the model is fitted with feature names
model.fit(df.drop("Y", axis=1), y)

In [3]:
def generate_min_max_population(df, constraints, generation_size):
    """Sample a random population inside the per-feature range of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Reference population. Its columns define the features of the new
        population, and its per-column min/max define the sampling bounds.
    constraints : pandas.DataFrame
        Indexed by feature name, with a ``constrained_feature_value`` column.
        Every feature present in the index is set to that fixed value instead
        of being sampled.
    generation_size : int
        Number of individuals (rows) to create.

    Returns
    -------
    pandas.DataFrame
        ``generation_size`` rows with the same columns, in the same order, as
        ``df``. Unconstrained features are drawn with ``np.random.uniform``
        (NumPy's global generator) between the column's min and max.
    """
    # Extract features information
    features_names = df.columns
    features_nb = len(features_names)

    # Initialize new population DataFrame with zeros
    new_population = pd.DataFrame(np.zeros((generation_size, features_nb)), columns=features_names)

    for column_name in features_names:
        if column_name in constraints.index:
            # Constrained feature: every individual gets the imposed value
            fixed_value = constraints["constrained_feature_value"].loc[column_name]
            new_population[column_name] = np.ones(generation_size) * fixed_value
        else:
            # Bounds are read straight from the column, and only for features
            # that are actually sampled. This avoids computing a full
            # describe() summary (mean, std, quartiles, ...) on every call.
            lower_bound = df[column_name].min()
            upper_bound = df[column_name].max()
            new_population[column_name] = np.random.uniform(lower_bound, upper_bound, generation_size)

    return new_population

In [4]:
def min_max_select(constraints, population_in, features_names, generation_size, population_out_size, target, model):
    """Breed one generation and keep the individuals closest to ``target``.

    A fresh random generation is drawn with ``generate_min_max_population``
    from ``population_in``, stacked on top of ``population_in`` itself, scored
    with ``model.predict`` and ranked by absolute distance to ``target``.

    ``features_names`` is part of the signature but is not used by the body.

    Returns the ``population_out_size`` best rows, sorted by ascending
    ``target_distance``, with the added ``Y`` (prediction) and
    ``target_distance`` columns. Row labels are those assigned by the
    concatenation; they are not reset after sorting.
    """
    # Fresh candidates sampled from the incoming population's characteristics
    candidates = generate_min_max_population(population_in, constraints, generation_size)

    # Pool = new candidates followed by the incoming individuals
    pool = pd.concat([candidates, population_in], ignore_index=True)

    # Score the pool before adding any non-feature column to it
    predictions = model.predict(pool)
    pool["Y"] = predictions
    pool["target_distance"] = (pool["Y"] - target).abs()

    # Closest to the target first, then truncate to the survivors
    ranked = pool.sort_values(by="target_distance")
    return ranked.head(population_out_size)

In [5]:
# Define constraints: each listed feature is pinned to a fixed value
constraints = pd.DataFrame({'constrained_feature': ["X1", "X3"], 'constrained_feature_value': [-1, 4]}).set_index("constrained_feature")

# Define generation settings
generation_size = 100       # random individuals created per generation
population_out_size = 10    # individuals kept after each generation
generation_nb = 5000        # number of generations to run
target = 42                 # value the model prediction should reach

# Initialize first population
starting_population = generate_min_max_population(df.drop("Y", axis=1), constraints, generation_size)
features_names = starting_population.columns

# Initialize variables to track progress
individuals_nb = 0
# Best distance seen so far. Starts at +inf so the first generation is always
# reported, however far from the target it is (a finite sentinel would hide
# every improvement above that value).
memory = float("inf")

# Create a DataFrame to record results (one row per generation)
results_min_max = pd.DataFrame(np.zeros((generation_nb, 3)), columns=["min_target_distance", "individuals_nb", "time_elapsed_min_max"])

# Start timing
start_timer = time.time()

# Run the generational loop
for i in range(generation_nb):
    # Use starting or previous population
    if i == 0:
        population_in = starting_population
    else:
        # Strip the scoring columns so only features are fed to the next generation
        population_in = population_out.drop(["Y", "target_distance"], axis=1)

    # Create a new generation and return the best individuals
    population_out = min_max_select(constraints, population_in, features_names, generation_size, population_out_size, target, model)

    # Record the minimum target distance.
    # population_out is sorted by ascending target_distance, so the first row
    # is the best; read it by column label rather than by column position.
    current_min = population_out["target_distance"].iloc[0]
    results_min_max.loc[i, "min_target_distance"] = current_min

    # Increment the total number of individuals created and record it
    individuals_nb += generation_size
    results_min_max.loc[i, "individuals_nb"] = individuals_nb

    # Record time elapsed
    results_min_max.loc[i, "time_elapsed_min_max"] = float(time.time() - start_timer)

    # Display improvement if found
    if current_min < memory:
        memory = current_min
        print(i, ":", memory)

0 : 0.6791328725774548
8 : 0.2590384692447145
22 : 0.002380984579687606
259 : 0.001575040804553396
264 : 0.0002368274625936806
817 : 0.0001576612728939608
3047 : 0.00012953314961805518
3464 : 4.288313548528322e-05
