Hacky demo of TPOTEnsemble
rhiever committed Jun 2, 2017
1 parent 25f5f56 commit 21a56ec
Showing 2 changed files with 27 additions and 31 deletions.
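At a high level, the demo rewires TPOT's fit() so that, instead of keeping only the single best evolved pipeline, it collects the best pipeline from each generation and wraps them all in a scikit-learn VotingClassifier. A minimal sketch of the end result, using stand-in estimators and data rather than TPOT's actual evolved pipelines:

    from sklearn.datasets import load_iris
    from sklearn.ensemble import VotingClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import GaussianNB
    from sklearn.tree import DecisionTreeClassifier

    X, y = load_iris(return_X_y=True)

    # Stand-ins for the best pipeline found in each generation.
    pipeline_ensemble_list = [
        ('pipeline1', LogisticRegression()),
        ('pipeline2', DecisionTreeClassifier()),
        ('pipeline3', GaussianNB()),
    ]

    # The fitted model is then a majority-vote ensemble over those pipelines.
    ensemble = VotingClassifier(estimators=pipeline_ensemble_list)
    ensemble.fit(X, y)
    print(ensemble.predict(X[:5]))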
tpot/base.py: 37 changes (11 additions, 26 deletions)
@@ -42,6 +42,7 @@
 from sklearn.preprocessing import FunctionTransformer, Imputer
 from sklearn.model_selection import train_test_split
 from sklearn.metrics.scorer import make_scorer
+from sklearn.ensemble import VotingClassifier

 from update_checker import update_check
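For reference, the newly imported VotingClassifier takes a list of (name, estimator) tuples; it defaults to voting='hard' (majority vote over predicted labels), while voting='soft' averages predicted probabilities instead. A toy comparison under assumed data:

    from sklearn.datasets import load_iris
    from sklearn.ensemble import VotingClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import GaussianNB

    X, y = load_iris(return_X_y=True)
    estimators = [('lr', LogisticRegression()), ('nb', GaussianNB())]

    # 'hard' majority-votes predicted labels; 'soft' averages predict_proba.
    hard = VotingClassifier(estimators=estimators).fit(X, y)
    soft = VotingClassifier(estimators=estimators, voting='soft').fit(X, y)
    print(hard.score(X, y), soft.score(X, y))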

@@ -201,6 +202,7 @@ def __init__(self, generations=100, population_size=100, offspring_size=None,

         self._pareto_front = None
         self._optimized_pipeline = None
+        self._pipeline_ensemble_list = None
         self.fitted_pipeline_ = None
         self._fitted_imputer = None
         self._pop = None
@@ -514,7 +516,7 @@ def pareto_eq(ind1, ind2):
         try:
             with warnings.catch_warnings():
                 warnings.simplefilter('ignore')
-                pop, _ = eaMuPlusLambda(
+                pop, _, self._pipeline_ensemble_list = eaMuPlusLambda(
                     population=pop,
                     toolbox=self._toolbox,
                     mu=self.population_size,
@@ -549,19 +551,17 @@ def pareto_eq(ind1, ind2):

         # Store the pipeline with the highest internal testing score
         if self._pareto_front:
-            self._update_top_pipeline()
-
             # This won't raise an error in a small test such as a unit test, because a few
             # pipelines may occasionally fail when the training data do not meet an operator's requirements.
-            if not self._optimized_pipeline:
+            if not self._pipeline_ensemble_list:
                 print('There was an error in the TPOT optimization '
                       'process. This could be because the data was '
                       'not formatted properly, or because data for '
                       'a regression problem was provided to the '
                       'TPOTClassifier object. Please make sure you '
                       'passed the data to TPOT correctly.')
             else:
-                self.fitted_pipeline_ = self._toolbox.compile(expr=self._optimized_pipeline)
+                self.fitted_pipeline_ = VotingClassifier(estimators=self._pipeline_ensemble_list)

                 with warnings.catch_warnings():
                     warnings.simplefilter('ignore')
@@ -571,7 +571,7 @@ def pareto_eq(ind1, ind2):
                 # Add an extra line of spacing if the progress bar was used
                 if self.verbosity >= 2:
                     print('')
-                    print('Best pipeline: {}'.format(self._optimized_pipeline))
+                    print('Best pipeline: {}'.format(self._pipeline_ensemble_list))

             # Store and fit the entire Pareto front as fitted models for convenience
             self.pareto_front_fitted_pipelines_ = {}
@@ -589,15 +589,6 @@ def pareto_eq(ind1, ind2):
                 raise
         return self

-    def _update_top_pipeline(self):
-        """Helper function to update the _optimized_pipeline field."""
-        if self._pareto_front:
-            top_score = -float('inf')
-            for pipeline, pipeline_scores in zip(self._pareto_front.items, reversed(self._pareto_front.keys)):
-                if pipeline_scores.wvalues[1] > top_score:
-                    self._optimized_pipeline = pipeline
-                    top_score = pipeline_scores.wvalues[1]
-
     def predict(self, features):
         """Use the optimized pipeline to predict the target for a feature set.
@@ -808,7 +799,7 @@ def _set_param_recursive(self, pipeline_steps, parameter, value):
                 setattr(obj, parameter, value)


-    def _evaluate_individuals(self, individuals, features, target, sample_weight=None, groups=None):
+    def _evaluate_individuals(self, individuals, features, target, sample_weight=None, groups=None, pipeline_ensemble_list=None):
         """Determine the fit of the provided individuals.

         Parameters
Expand Down Expand Up @@ -856,21 +847,11 @@ def _evaluate_individuals(self, individuals, features, target, sample_weight=Non
                     fitnesses_dict[indidx] = (5000., -float('inf'))
                     if not self._pbar.disable:
                         self._pbar.update(1)
-                # Check if the individual was evaluated before
-                elif individual_str in self.evaluated_individuals_:
-                    # Get fitness score from previous evaluation
-                    fitnesses_dict[indidx] = self.evaluated_individuals_[individual_str]
-                    if self.verbosity > 2:
-                        self._pbar.write('Pipeline encountered that has previously been evaluated during the '
-                                         'optimization process. Using the score from the previous evaluation.')
-                    if not self._pbar.disable:
-                        self._pbar.update(1)
                 else:
                     try:
                         # Transform the tree expression into an sklearn pipeline
                         sklearn_pipeline = self._toolbox.compile(expr=individual)

-
                         # Fix random state when the operator allows and build sample weight dictionary
                         self._set_param_recursive(sklearn_pipeline.steps, 'random_state', 42)

@@ -882,6 +863,10 @@ def _evaluate_individuals(self, individuals, features, target, sample_weight=None, groups=None, pipeline_ensemble_list=None):
                         if not self._pbar.disable:
                             self._pbar.update(1)
                         continue
+
+                    if pipeline_ensemble_list is not None:
+                        sklearn_pipeline = VotingClassifier(estimators=pipeline_ensemble_list + [('evaluate', sklearn_pipeline)])
+
                     eval_individuals_str.append(individual_str)
                     operator_count_list.append(operator_count)
                     sklearn_pipeline_list.append(sklearn_pipeline)
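With this change, a candidate is no longer scored on its own: when pipeline_ensemble_list is non-empty, _evaluate_individuals cross-validates a VotingClassifier built from all previous generations' champions plus the candidate, so an individual's fitness reflects how much it helps the ensemble. A minimal sketch of that evaluation, with stand-in pipelines and data (the names here are illustrative, not TPOT's API):

    from sklearn.datasets import load_iris
    from sklearn.ensemble import VotingClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import cross_val_score
    from sklearn.tree import DecisionTreeClassifier

    X, y = load_iris(return_X_y=True)

    # Champions collected from earlier generations.
    pipeline_ensemble_list = [('pipeline1', LogisticRegression())]
    # Pipeline compiled from the individual under evaluation.
    candidate = DecisionTreeClassifier(max_depth=3)

    scoring_pipeline = VotingClassifier(
        estimators=pipeline_ensemble_list + [('evaluate', candidate)])
    print(cross_val_score(scoring_pipeline, X, y, cv=5).mean())

This also explains why the duplicate-pipeline cache check above was removed: the same pipeline's score now depends on the current ensemble, so a score memoized in an earlier generation would be stale.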
tpot/gp_deap.py: 21 changes (16 additions, 5 deletions)
@@ -161,6 +161,8 @@ def eaMuPlusLambda(population, toolbox, mu, lambda_, cxpb, mutpb, ngen, pbar,

     record = stats.compile(population) if stats is not None else {}
     logbook.record(gen=0, nevals=len(invalid_ind), **record)
+
+    pipeline_ensemble_list = []

     # Begin the generational process
     for gen in range(1, ngen + 1):
@@ -169,17 +171,26 @@ def eaMuPlusLambda(population, toolbox, mu, lambda_, cxpb, mutpb, ngen, pbar,
         offspring = varOr(population, toolbox, lambda_, cxpb, mutpb)

         # Evaluate the individuals with an invalid fitness
-        invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
+        invalid_ind = offspring

         # update pbar for valid_ind
         if not pbar.disable:
             pbar.update(len(offspring)-len(invalid_ind))
             if not (max_time_mins is None) and pbar.n >= pbar.total:
                 pbar.total += lambda_

-        fitnesses = toolbox.evaluate(invalid_ind)
+        fitnesses = toolbox.evaluate(invalid_ind, pipeline_ensemble_list=pipeline_ensemble_list)
+        best_gen_ind = None
+        best_gen_fitness = -float('inf')
         for ind, fit in zip(invalid_ind, fitnesses):
             ind.fitness.values = fit
+            if fit[1] > best_gen_fitness:
+                best_gen_fitness = fit[1]
+                best_gen_ind = ind
+
+        best_gen_ind_sklearn_pipeline = toolbox.compile(expr=best_gen_ind)
+        best_gen_ind_ensemble_entry = ('pipeline{}'.format(gen), best_gen_ind_sklearn_pipeline)
+        pipeline_ensemble_list.append(best_gen_ind_ensemble_entry)

         # Update the hall of fame with the generated individuals
         if halloffame is not None:
@@ -211,7 +222,7 @@ def eaMuPlusLambda(population, toolbox, mu, lambda_, cxpb, mutpb, ngen, pbar,
         record = stats.compile(population) if stats is not None else {}
         logbook.record(gen=gen, nevals=len(invalid_ind), **record)

-    return population, logbook
+    return population, logbook, pipeline_ensemble_list
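In the loop above, fit[1] is the internal CV score component of TPOT's two-objective fitness (fit[0] is the operator count, which TPOT minimizes), so each generation appends its highest-scoring individual to the ensemble; after ngen generations the returned pipeline_ensemble_list holds one (name, pipeline) entry per generation. A toy rendering of that selection with made-up fitness tuples:

    # Made-up (operator_count, cv_score) fitness tuples for one generation.
    individuals = ['ind_a', 'ind_b', 'ind_c']
    fitnesses = [(3, 0.91), (5, 0.94), (2, 0.89)]

    best_gen_ind, best_gen_fitness = None, -float('inf')
    for ind, fit in zip(individuals, fitnesses):
        if fit[1] > best_gen_fitness:
            best_gen_fitness, best_gen_ind = fit[1], ind

    print(best_gen_ind, best_gen_fitness)  # ind_b 0.94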


def cxOnePoint(ind1, ind2):
@@ -351,7 +362,7 @@ def _wrapped_cross_val_score(sklearn_pipeline, features, target,
                                  cv, scoring_function, sample_weight,
                                  max_eval_time_mins, groups):
     max_time_seconds = max(int(max_eval_time_mins * 60), 1)
-    sample_weight_dict = set_sample_weight(sklearn_pipeline.steps, sample_weight)
+    #sample_weight_dict = set_sample_weight(sklearn_pipeline.steps, sample_weight)
     # build a job for cross_val_score
     tmp_it = Interruptable_cross_val_score(
         clone(sklearn_pipeline),
@@ -361,7 +372,7 @@
         cv=cv,
         n_jobs=1,
         verbose=0,
-        fit_params=sample_weight_dict,
+        #fit_params=sample_weight_dict,
         groups=groups
     )
     tmp_it.start()
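The sample-weight plumbing is commented out here, presumably because fit parameters are routed by pipeline step name, and those names no longer resolve once the pipeline is nested inside a VotingClassifier. A hypothetical illustration of the mismatch (step names and data are assumptions, not TPOT code):

    import numpy as np
    from sklearn.ensemble import VotingClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler

    X = np.array([[0.], [1.], [2.], [3.]])
    y = np.array([0, 0, 1, 1])
    w = np.array([1., 1., 2., 2.])

    pipe = make_pipeline(StandardScaler(), LogisticRegression())
    # Works: the key routes to a step of the pipeline by name.
    pipe.fit(X, y, logisticregression__sample_weight=w)

    ensemble = VotingClassifier(estimators=[('evaluate', pipe)])
    try:
        # Fails: VotingClassifier.fit accepts no step-prefixed parameters.
        ensemble.fit(X, y, logisticregression__sample_weight=w)
    except TypeError as err:
        print(err)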
