In [14]:
import pandas as pd
import numpy as np
import time

from sklearn.datasets import make_regression

# Size and structure of the synthetic regression problem
n_samples = 1000
n_features = 10
n_informative = 3

X, y = make_regression(
    n_samples=n_samples,
    n_features=n_features,
    n_informative=n_informative,
)

# Features are named X1 ... Xn so that they can be referred to by name later on
col_names = ["X{}".format(i+1) for i in range(n_features)]

# Features and target are gathered in a single DataFrame, with the target "Y" as last column
df = pd.DataFrame(X, columns=col_names).assign(Y=y)
display(df)

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,Y
0,2.166568,0.306600,1.164731,-0.118690,-0.598748,-0.915594,-0.697544,-0.602705,-0.448813,-0.749132,62.675302
1,0.339379,1.208587,-0.866668,0.266904,-0.290361,1.680114,-2.680724,0.515430,1.013445,0.091351,44.916188
2,-0.538543,-0.225674,0.852090,0.346572,-0.955672,-1.049862,-1.126194,1.085932,0.711177,-0.575684,-15.004899
3,-1.092683,-1.439832,0.558217,2.480020,1.345575,-0.486266,0.101746,-0.027774,1.082387,1.600534,-70.559332
4,0.494942,0.264762,0.318807,-1.477602,-0.181350,1.270263,-0.549402,0.678278,0.117029,0.359854,21.364566
...,...,...,...,...,...,...,...,...,...,...,...
995,-0.612218,-0.225366,-0.809277,0.194067,-0.664725,0.319342,-0.058003,1.041640,-0.874373,-0.539955,-24.971077
996,0.368984,0.117346,-1.551310,-0.411349,0.139196,-0.549840,-1.524363,0.711932,0.209960,-0.144607,4.085754
997,1.123653,1.916711,-1.428553,0.165174,-0.539322,-0.914377,0.531047,-0.164127,-0.215430,0.031630,83.417212
998,-1.348147,1.119521,-0.855203,-0.503862,0.069829,-0.044375,-1.908208,1.578624,1.330089,0.418221,5.962430


In [15]:
from sklearn.ensemble import RandomForestRegressor
# The forest is fitted on the named feature columns of the DataFrame (not on the raw
# X array) so that the model is trained with the feature names.
RFR = RandomForestRegressor()
RFR.fit(X=df.loc[:, col_names], y=df["Y"])

In [16]:
# One row per feature, ordered from the most to the least important
df_feature_importances = (
    pd.DataFrame({"Importance": RFR.feature_importances_}, index=col_names)
    .sort_values(by="Importance", ascending=False)
)
df_feature_importances

Unnamed: 0,Importance
X2,0.756202
X1,0.22313
X3,0.008289
X8,0.002147
X9,0.001789
X4,0.001764
X5,0.001755
X10,0.001688
X6,0.001659
X7,0.001577


In [17]:
# Summary statistics of the dataset; the "min" and "max" rows bound the random draws below
df_describe = df.describe()
display(df_describe)

# Parameters of the virtual population: every column of df except the last one ("Y")
population_size = 1000
features_names  = df.columns[:-1]
n_features = len(features_names)

# As an example, the third most important feature is frozen to a single random value
# drawn between its observed min and max (rounded to 3 decimals)
constraint_feature = df_feature_importances.index[2]
constraint_feature_value = round(
    np.random.uniform(
        df_describe.loc["min", constraint_feature],
        df_describe.loc["max", constraint_feature],
    ),
    3,
)
print("\n", constraint_feature_value, "is assigned to", constraint_feature,"\n")


# The population starts as zeros and each column is then filled in turn
population = pd.DataFrame(
    np.zeros((population_size, n_features)),
    columns=features_names,
)

for column_name in features_names:
  if column_name == constraint_feature:
    # Constrained feature: the same value for every individual
    population[column_name] = np.full(population_size, constraint_feature_value)
  else:
    # Free feature: uniform draw between the observed min and max
    population[column_name] = np.random.uniform(
        df_describe.loc["min", column_name],
        df_describe.loc["max", column_name],
        population_size,
    )

display(population)

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,Y
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,-0.059948,0.040264,-0.00029,0.032352,-0.061625,0.020143,-0.066585,-0.015021,0.029273,0.022272,0.124746
std,0.986698,1.034461,1.044135,0.995977,0.955584,0.997381,1.027705,1.02118,1.026719,1.014849,43.287834
min,-3.090441,-2.969079,-3.320933,-3.137206,-4.200805,-3.553054,-3.721732,-3.310568,-3.122734,-3.259018,-117.507564
25%,-0.73261,-0.648714,-0.740554,-0.632807,-0.722593,-0.675897,-0.730548,-0.680569,-0.680356,-0.685175,-30.483729
50%,-0.030904,0.05013,-0.003699,-0.000594,-0.042364,-0.001087,-0.079885,-0.007387,0.030977,-0.013746,-0.86997
75%,0.642672,0.748492,0.741829,0.682644,0.595969,0.661638,0.599105,0.65072,0.722969,0.713642,30.804678
max,2.831511,3.618743,3.612301,3.150826,2.828225,3.612931,3.918072,3.190098,3.439904,2.955383,143.623781



 2.205 is assigned to X3 



Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10
0,-1.628114,0.214818,2.205,0.267626,0.345327,1.692819,2.025580,-2.637219,-1.790029,-1.218535
1,-2.894331,-0.430101,2.205,2.069404,-1.607556,2.855902,-0.576625,-2.371190,-0.456381,1.832598
2,-2.751134,0.103683,2.205,0.758191,-4.055377,0.737091,2.717131,-1.008545,2.778488,-2.170113
3,-1.233659,0.455716,2.205,-2.937998,1.896922,-2.525832,-1.699869,2.086461,-2.779818,-2.067288
4,2.484953,-1.941905,2.205,1.204359,1.151090,-1.464077,-1.670803,0.399511,-2.030192,2.674985
...,...,...,...,...,...,...,...,...,...,...
995,-1.890082,-1.484431,2.205,-1.990987,2.729435,1.868659,0.147804,-3.045597,2.953575,-2.506421
996,-0.164971,0.583137,2.205,0.879697,2.250126,-2.424967,-2.538855,0.397889,-1.774842,2.397249
997,0.072315,2.703781,2.205,-0.753586,-2.706612,3.336742,-1.161077,-0.761985,-2.753538,0.205831
998,-2.749941,-0.026255,2.205,0.598604,-0.078856,-1.127984,2.571211,-1.017573,-1.986655,-0.626739


In [18]:
# Value we want the model's prediction to reach
target = 42

# Predict from the feature columns only. This cell adds "Y" and "target_distance" to
# `population`, so passing the whole frame would, on a second run, hand the model
# columns it was not fitted on. Selecting col_names keeps the cell re-runnable.
population["Y"] = RFR.predict(population[col_names])

# Absolute distance between each prediction and the target
population["target_distance"] = (population["Y"] - target).abs()

population

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,Y,target_distance
0,-1.628114,0.214818,2.205,0.267626,0.345327,1.692819,2.025580,-2.637219,-1.790029,-1.218535,-24.374881,66.374881
1,-2.894331,-0.430101,2.205,2.069404,-1.607556,2.855902,-0.576625,-2.371190,-0.456381,1.832598,-62.559839,104.559839
2,-2.751134,0.103683,2.205,0.758191,-4.055377,0.737091,2.717131,-1.008545,2.778488,-2.170113,-47.105431,89.105431
3,-1.233659,0.455716,2.205,-2.937998,1.896922,-2.525832,-1.699869,2.086461,-2.779818,-2.067288,-3.926449,45.926449
4,2.484953,-1.941905,2.205,1.204359,1.151090,-1.464077,-1.670803,0.399511,-2.030192,2.674985,-39.711971,81.711971
...,...,...,...,...,...,...,...,...,...,...,...,...
995,-1.890082,-1.484431,2.205,-1.990987,2.729435,1.868659,0.147804,-3.045597,2.953575,-2.506421,-83.701975,125.701975
996,-0.164971,0.583137,2.205,0.879697,2.250126,-2.424967,-2.538855,0.397889,-1.774842,2.397249,22.779680,19.220320
997,0.072315,2.703781,2.205,-0.753586,-2.706612,3.336742,-1.161077,-0.761985,-2.753538,0.205831,87.751569,45.751569
998,-2.749941,-0.026255,2.205,0.598604,-0.078856,-1.127984,2.571211,-1.017573,-1.986655,-0.626739,-51.930355,93.930355


In [19]:
# The ten individuals whose prediction is closest to the target
population.sort_values(by="target_distance").iloc[:10]

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,Y,target_distance
839,0.269496,0.909839,2.205,-2.213,1.682416,-1.67201,0.961502,-2.956416,-1.206826,-0.272499,42.272191,0.272191
962,0.215986,1.02603,2.205,-1.532169,-2.160893,2.69112,-2.152026,-0.732092,-1.414586,-0.330977,41.715584,0.284416
201,2.463675,-0.015391,2.205,1.88665,-1.239796,-0.406527,-3.590776,-2.167322,2.776707,2.92449,42.286428,0.286428
373,2.484096,0.04661,2.205,-0.612801,-3.363609,-1.215497,0.476461,-1.719768,-2.461214,0.961354,42.302135,0.302135
252,2.056797,0.056332,2.205,0.831216,-2.831777,-1.104942,-2.456445,-2.094134,1.247847,0.711295,41.604451,0.395549
958,2.46973,-0.046982,2.205,-1.48626,-1.297043,-0.160712,-2.550396,-0.921709,-0.674825,-2.750759,42.65837,0.65837
741,1.965697,-0.005422,2.205,-1.898572,-1.746507,-0.99436,1.103159,2.61745,-2.968558,0.949203,42.737029,0.737029
339,1.861652,0.145348,2.205,-2.658374,1.763791,-0.819339,1.82917,-0.243028,-2.032871,1.871788,41.170322,0.829678
708,-3.003303,1.736576,2.205,2.724165,-0.07811,3.169104,-3.010533,-2.510148,3.16253,-2.406085,40.94976,1.05024
843,-0.108102,1.052527,2.205,2.433419,0.25552,-1.045796,-0.943532,-2.084576,-2.704307,0.947706,40.903073,1.096927


Part (2)

In [20]:
def generate_min_max_population(df, constraints, generation_size):
  """Draw a random population whose features stay inside the ranges observed in ``df``.

  Parameters
  ----------
  df : DataFrame whose columns are the features; the "min" and "max" rows of
      ``df.describe()`` give the bounds of each feature.
  constraints : DataFrame indexed by feature name, with a
      "constrained_feature_value" column holding the value imposed on that feature.
  generation_size : number of individuals (rows) to generate.

  Returns
  -------
  DataFrame with ``generation_size`` rows and the same columns as ``df``.
  Constrained features hold their imposed value on every row; every other
  feature is drawn uniformly between its observed min and max.
  """
  bounds = df.describe().loc[["min","max"],:]

  def _column_values(name):
    # Constrained feature: the same imposed value for every individual
    if name in constraints.index:
      return np.ones(generation_size)*constraints["constrained_feature_value"].loc[name]
    # Free feature: uniform draw between the observed min and max
    return np.random.uniform(bounds.loc["min",name], bounds.loc["max",name], generation_size)

  # Columns are generated in the order of df.columns
  generated = {name: _column_values(name) for name in df.columns}

  return pd.DataFrame(generated, index=pd.RangeIndex(generation_size), columns=df.columns)

In [21]:
def min_max_select(constraints, population_in, features_names, generation_size, population_out_size, target, model):
  """Breed one generation from ``population_in`` and keep the individuals closest to ``target``.

  A fresh generation of ``generation_size`` individuals is drawn with
  ``generate_min_max_population`` from the ranges of ``population_in``, then
  ``population_in`` itself is appended so that its best individuals can survive.
  ``model.predict`` is evaluated on the combined frame and stored in "Y"; the
  absolute gap to ``target`` is stored in "target_distance".

  ``features_names`` is accepted but not used by this function.

  Returns the ``population_out_size`` rows with the smallest "target_distance",
  sorted in ascending order of distance.
  """
  # Newly drawn individuals, followed by the individuals they were derived from
  offspring = generate_min_max_population(population_in, constraints, generation_size)
  candidates = pd.concat([offspring, population_in], ignore_index=True)

  # Model prediction and absolute distance to the target
  candidates["Y"] = model.predict(candidates)
  candidates["target_distance"] = (candidates["Y"] - target).abs()

  # Closest individuals first; only the requested number is kept
  ranked = candidates.sort_values(by="target_distance")
  return ranked.head(population_out_size)

In [13]:
# We might use this array to set values on specific features
constraints = pd.DataFrame({'constrained_feature': ["X1", "X3"], 'constrained_feature_value': [-1, 4]}).set_index("constrained_feature")

# We define the number of individuals at each generation and the selected number
generation_size = 100
population_out_size = 10

# We initiate the 1st population, based on the original dataset features
starting_population = generate_min_max_population(df.drop("Y", axis=1), constraints, generation_size)
features_names = starting_population.columns

# Target is set
target = 42

# We set the number of successive generations
generation_nb = 5000

# We set a variable to record the total number of individuals reached at each stage
individuals_nb = 0

# Best (smallest) target distance seen so far. It starts at +inf so that the first
# generation is always reported, whatever its distance to the target.
memory = np.inf

# We create a DataFrame to record the min distance from target at each iteration
results_min_max = pd.DataFrame(np.zeros((generation_nb,3)), columns=["min_target_distance","individuals_nb","time_elapsed_min_max"])

start_timer = time.time()

for i in range(generation_nb):

  # We either initiate the loop with the starting or previous population
  if i==0:
    population_in = starting_population
  else:
    population_in = population_out.drop(["Y","target_distance"], axis=1)

  # A new generation is created and only the best individuals are returned
  population_out = min_max_select(constraints, population_in, features_names, generation_size, population_out_size, target, RFR)

  # population_out is sorted by distance, so its first row holds the current minimum.
  # The column is read by name rather than by position.
  current_min = population_out["target_distance"].iloc[0]
  results_min_max.loc[i,"min_target_distance"] = current_min

  # The incremental number of individuals created is calculated and recorded
  individuals_nb+=generation_size
  results_min_max.loc[i,"individuals_nb"] = individuals_nb

  # Seconds elapsed since the start of the loop
  results_min_max.loc[i,"time_elapsed_min_max"] = float(time.time()-start_timer)

  # In case there is an improvement on the minimum distance, we display it
  if current_min<memory:
    memory = current_min
    print(i, ":", memory)

0 : 1.847819156426283
1 : 0.025411508602410038
3 : 0.00839627020774003
4 : 0.000580106179967288
26 : 2.0817645989268385e-06
2252 : 1.047065161685623e-06
4979 : 4.353142699642376e-07


Part (3)

In [23]:
def std_dev_select(universe_constraints, constraints, population_in, features_names, generation_size, std_dev_factor, population_out_size, target, model):
  """Create a generation by perturbing ``population_in`` and keep the individuals closest to ``target``.

  Parameters
  ----------
  universe_constraints : DataFrame with rows "min", "max" and "std" and one column
      per feature; gives the absolute bounds and the perturbation scale.
  constraints : DataFrame indexed by feature name, with a
      "constrained_feature_value" column holding the value imposed on that feature.
  population_in : DataFrame of individuals (feature columns only) to derive from.
  features_names : kept for call compatibility; the feature list is always taken
      from ``population_in.columns``.
  generation_size : desired number of perturbed individuals. ``population_in`` is
      replicated ``generation_size // len(population_in)`` times, and at least once.
  std_dev_factor : half-width of the perturbation, in units of the feature's std.
  population_out_size : number of individuals returned.
  target : value the model prediction should get close to.
  model : object exposing ``predict(DataFrame)``.

  Returns
  -------
  DataFrame with the feature columns plus "Y" (model prediction) and
  "target_distance" (absolute gap to ``target``), limited to the
  ``population_out_size`` closest rows, sorted by ascending distance.
  """
  features_names = population_in.columns

  # Always make at least one copy: when population_in holds more rows than
  # generation_size the integer division gives 0 and there would be nothing to concatenate.
  replication_factor = max(1, generation_size // population_in.shape[0])

  # We replicate the population_in according to the replication factor
  new_generation = pd.concat([population_in]*replication_factor, ignore_index=True)

  # Each free feature is shifted by a uniform random offset of at most
  # std_dev_factor standard deviations; constrained features get their unique value
  for column_name in features_names:

    if column_name in constraints.index:
      new_generation[column_name] = np.ones(new_generation.shape[0])*constraints["constrained_feature_value"].loc[column_name]

    else:
      feature_min = universe_constraints.loc["min",column_name]
      feature_max = universe_constraints.loc["max",column_name]
      feature_std = universe_constraints.loc["std",column_name]

      # Updated values = Original Value - ( Std_dev_factor * Std_dev ) + ( 2 * Std_dev_factor * Std_dev * Random (0,1) )
      half_width = std_dev_factor*feature_std
      shifted = new_generation[column_name]-half_width+2*half_width*np.random.random(new_generation.shape[0])

      # If some of the generated values go beyond the original limits, the limits are applied
      new_generation[column_name] = shifted.clip(lower=feature_min, upper=feature_max)

  # We append the original population to the new generation to keep the best
  # individuals of these two DataFrames
  new_generation = pd.concat([new_generation, population_in], ignore_index=True)

  # We calculate Y thanks to the model and the distance from target
  new_generation["Y"] = model.predict(new_generation)
  new_generation["target_distance"] = (new_generation["Y"] - target).abs()

  # We sort individuals according to their distance from the target and
  # keep only the desired number of individuals
  new_generation = new_generation.sort_values(by="target_distance").head(population_out_size)

  return new_generation

In [25]:
# We might use this array to set absolute boundaries
universe_constraints = df.describe().loc[["min","max","std"],:]

# We might use this array to set values on specific features
constraints = pd.DataFrame({'constrained_feature': ["X1", "X3"], 'constrained_feature_value': [-1, 4]}).set_index("constrained_feature")

# We define the number of individuals at each generation and the selected number
generation_size = 100
population_out_size = 10

# We define the std_dev impact ratio on generated individuals
std_dev_factor = 0.5

# We initiate the 1st population, based on the original dataset features
starting_population = generate_min_max_population(pd.DataFrame(X, columns=col_names), constraints, generation_size)
features_names = starting_population.columns

# Target is set
target = 42

# We set the number of successive generations
generation_nb = 5000

# We set a variable to record the total number of individuals reached at each stage
individuals_nb = 0

# Best (smallest) target distance seen so far. It starts at +inf so that the first
# generation is always reported, whatever its distance to the target.
memory = np.inf

# We create a DataFrame to record the min distance from target at each iteration
results_std_dev = pd.DataFrame(np.zeros((generation_nb,3)), columns=["min_target_distance","individuals_nb","time_elapsed_std_dev"])

start_timer = time.time()

for i in range(generation_nb):

  # We either initiate the loop with the starting or previous population
  if i==0:
    population_in = starting_population
  else:
    population_in = population_out.drop(["Y","target_distance"], axis=1)

  # As the number of generations increases, we halve the standard deviation
  # multiplication factor at fixed milestones to help fine-tune solutions
  if i in (250, 500, 1000, 2500):
    std_dev_factor = std_dev_factor/2

  # A new generation is created and only the best individuals are returned
  population_out = std_dev_select(universe_constraints, constraints, population_in, features_names, generation_size, std_dev_factor, population_out_size, target, RFR)

  # population_out is sorted by distance, so its first row holds the current minimum.
  # The column is read by name rather than by position.
  current_min = population_out["target_distance"].iloc[0]

  results_std_dev.loc[i,"min_target_distance"] = current_min

  # The incremental number of individuals created is calculated and recorded
  individuals_nb+=generation_size
  results_std_dev.loc[i,"individuals_nb"] = individuals_nb

  # Seconds elapsed since the start of the loop
  results_std_dev.loc[i,"time_elapsed_std_dev"] = float(time.time()-start_timer)

  # In case there is an improvement on the minimum distance, we display it
  if current_min<memory:
    memory = current_min
    print(i, ":", memory)

AttributeError: 'DataFrame' object has no attribute 'append'