In [None]:
import pandas as pd
import numpy as np
import time

from sklearn.datasets import make_regression

n_samples = 1000
n_features = 10
n_informative = 3

# Fixed seed so that "Restart Kernel -> Run All" regenerates the same dataset
random_seed = 42

X, y = make_regression(
    n_samples=n_samples,
    n_features=n_features,
    n_informative=n_informative,
    random_state=random_seed,
)

# We convert the X and y arrays into a DataFrame which will be useful later.
# Features are named X1..Xn and the regression target is stored in column "Y".
col_names = ["X{}".format(i+1) for i in range(n_features)]

df = pd.DataFrame(X, columns=col_names)
df["Y"] = y
display(df)

In [None]:
from sklearn.ensemble import RandomForestRegressor
# The forest is fitted on the named feature columns (not the bare numpy array)
# because every later prediction is made on a DataFrame carrying these column
# names; fitting without names makes scikit-learn warn on each predict call.
# A fixed random_state keeps the fitted model reproducible across re-runs.
RFR = RandomForestRegressor(random_state=42)
RFR.fit(df[col_names], y)

In [None]:
def generate_min_max_population(df, constraints, generation_size):
  """Draw a random population whose features stay within the ranges seen in `df`.

  Parameters
  ----------
  df : pandas.DataFrame
      Reference population; its columns define the features and their
      observed minimum and maximum.
  constraints : pandas.DataFrame
      Indexed by feature name, with a "constrained_feature_value" column
      holding the fixed value to use for that feature.
  generation_size : int
      Number of individuals (rows) to generate.

  Returns
  -------
  pandas.DataFrame
      `generation_size` rows with the same columns as `df`. Constrained
      features hold their fixed value; every other feature is sampled
      uniformly between the min and max of that column in `df`.
  """
  features_names = df.columns

  # Only the column-wise bounds are needed, so they are computed directly
  # instead of going through describe(), which also computes unused statistics.
  lower_bounds = df.min()
  upper_bounds = df.max()

  # Columns are generated in the order of df.columns so that the sequence of
  # random draws is stable for a given numpy seed.
  generated = {}
  for column_name in features_names:
    if column_name in constraints.index:
      fixed_value = constraints.loc[column_name, "constrained_feature_value"]
      generated[column_name] = np.full(generation_size, fixed_value, dtype=float)
    else:
      generated[column_name] = np.random.uniform(
          lower_bounds[column_name], upper_bounds[column_name], generation_size
      )

  return pd.DataFrame(generated, index=pd.RangeIndex(generation_size), columns=features_names)

In [None]:
def min_max_select(constraints, population_in, features_names, generation_size, population_out_size, target, model):
  """Create one new generation and keep the individuals closest to the target.

  Parameters
  ----------
  constraints : pandas.DataFrame
      Constraints forwarded to `generate_min_max_population`.
  population_in : pandas.DataFrame
      Current population (feature columns only). It defines the sampling
      ranges of the new generation and also competes with it.
  features_names : sequence
      Not used by this function; kept so existing callers keep working.
  generation_size : int
      Number of new individuals to generate.
  population_out_size : int
      Number of best individuals to return.
  target : float
      Desired value of the model output.
  model : object
      Fitted estimator exposing `predict`; it is called on the feature frame.

  Returns
  -------
  pandas.DataFrame
      The `population_out_size` individuals with the smallest distance to the
      target, sorted by increasing distance, with two extra columns:
      "Y" (model prediction) and "target_distance" (|Y - target|).
  """
  # We create a new generation, based on the input population characteristics
  new_generation = generate_min_max_population(population_in, constraints, generation_size)

  # We stack the original population under the new generation to keep the best
  # individuals of these two DataFrames. pd.concat is used because
  # DataFrame.append no longer exists in pandas 2.x.
  new_generation = pd.concat([new_generation, population_in], ignore_index=True)

  # We calculate Y thanks to the model and the distance from target
  new_generation["Y"] = model.predict(new_generation)
  new_generation["target_distance"] = (new_generation["Y"] - target).abs()

  # We sort individuals according to their distance from the target and
  # keep only the desired number of individuals
  return new_generation.sort_values(by="target_distance").head(population_out_size)

In [None]:
# We might use this array to set values on specific features
constraints = pd.DataFrame({'constrained_feature': ["X1", "X3"], 'constrained_feature_value': [-1, 4]}).set_index("constrained_feature")

# We define the number of individuals at each generation and the selected number
generation_size = 100
population_out_size = 10

# We initiate the 1st population, based on the original dataset features
starting_population = generate_min_max_population(df.drop("Y", axis=1), constraints, generation_size)
features_names = starting_population.columns

# Target is set
target = 42

# The model used to score individuals is the random forest fitted earlier.
# It is bound explicitly here so the cell does not rely on a leftover kernel variable.
model = RFR

# We set the number of successive generations
generation_nb = 5000

# We set a variable to record the total number of individuals reached at each stage
individuals_nb = 0

# Best (smallest) target distance seen so far; starting from infinity ensures
# the first generation is always reported, whatever its distance
memory = float("inf")

# We create a DataFrame to record the min distance from target at each iteration
results_min_max = pd.DataFrame(np.zeros((generation_nb,3)), columns=["min_target_distance","individuals_nb","time_elapsed_min_max"])

start_timer = time.time()

for i in range(generation_nb):

  # We either initiate the loop with the starting or previous population
  if i==0:
    population_in = starting_population
  else:
    population_in = population_out.drop(["Y","target_distance"], axis=1)

  # A new generation is created and only the best individuals are returned
  population_out = min_max_select(constraints, population_in, features_names, generation_size, population_out_size, target, model)

  # The current minimum distance from target is read from the first row,
  # since the returned population is sorted by increasing target distance
  current_min = population_out["target_distance"].iloc[0]
  results_min_max.loc[i,"min_target_distance"] = current_min

  # The incremental number of individuals created is calculated and recorded
  individuals_nb+=generation_size
  results_min_max.loc[i,"individuals_nb"] = individuals_nb

  # Seconds elapsed since the start of the search
  results_min_max.loc[i,"time_elapsed_min_max"] = float(time.time()-start_timer)

  # In case there is an improvement on the minimum distance, we display it
  if current_min<memory:
    memory = current_min
    print(i, ":", memory)