Hacky demo of TPOTEnsemble
rhiever committed Jun 2, 2017
1 parent 25f5f56 commit 21a56ec
Showing 2 changed files with 27 additions and 31 deletions.
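At a high level, the demo rewires TPOT's fit() so that, instead of keeping only the single best evolved pipeline, it collects the best pipeline from each generation and wraps them all in a scikit-learn VotingClassifier. A minimal sketch of the end result, using stand-in estimators and data rather than TPOT's actual evolved pipelines:

    from sklearn.datasets import load_iris
    from sklearn.ensemble import VotingClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import GaussianNB
    from sklearn.tree import DecisionTreeClassifier

    X, y = load_iris(return_X_y=True)

    # Stand-ins for the best pipeline found in each generation.
    pipeline_ensemble_list = [
        ('pipeline1', LogisticRegression()),
        ('pipeline2', DecisionTreeClassifier()),
        ('pipeline3', GaussianNB()),
    ]

    # The fitted model is then a majority-vote ensemble over those pipelines.
    ensemble = VotingClassifier(estimators=pipeline_ensemble_list)
    ensemble.fit(X, y)
    print(ensemble.predict(X[:5]))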
tpot/base.py: 37 changes (11 additions, 26 deletions)
@@ -42,6 +42,7 @@
 from sklearn.preprocessing import FunctionTransformer, Imputer
 from sklearn.model_selection import train_test_split
 from sklearn.metrics.scorer import make_scorer
+from sklearn.ensemble import VotingClassifier

 from update_checker import update_check
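For reference, the newly imported VotingClassifier takes a list of (name, estimator) tuples; it defaults to voting='hard' (majority vote over predicted labels), while voting='soft' averages predicted probabilities instead. A toy comparison under assumed data:

    from sklearn.datasets import load_iris
    from sklearn.ensemble import VotingClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import GaussianNB

    X, y = load_iris(return_X_y=True)
    estimators = [('lr', LogisticRegression()), ('nb', GaussianNB())]

    # 'hard' majority-votes predicted labels; 'soft' averages predict_proba.
    hard = VotingClassifier(estimators=estimators).fit(X, y)
    soft = VotingClassifier(estimators=estimators, voting='soft').fit(X, y)
    print(hard.score(X, y), soft.score(X, y))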

@@ -201,6 +202,7 @@ def __init__(self, generations=100, population_size=100, offspring_size=None,

         self._pareto_front = None
         self._optimized_pipeline = None
+        self._pipeline_ensemble_list = None
         self.fitted_pipeline_ = None
         self._fitted_imputer = None
         self._pop = None
@@ -514,7 +516,7 @@ def pareto_eq(ind1, ind2):
         try:
             with warnings.catch_warnings():
                 warnings.simplefilter('ignore')
-                pop, _ = eaMuPlusLambda(
+                pop, _, self._pipeline_ensemble_list = eaMuPlusLambda(
                     population=pop,
                     toolbox=self._toolbox,
                     mu=self.population_size,
@@ -549,19 +551,17 @@ def pareto_eq(ind1, ind2):

         # Store the pipeline with the highest internal testing score
         if self._pareto_front:
-            self._update_top_pipeline()
-
             # This won't raise an error in a small test such as a unit test, because a few
             # pipelines may occasionally fail when the training data do not meet an operator's requirements.
-            if not self._optimized_pipeline:
+            if not self._pipeline_ensemble_list:
                 print('There was an error in the TPOT optimization '
                       'process. This could be because the data was '
                       'not formatted properly, or because data for '
                       'a regression problem was provided to the '
                       'TPOTClassifier object. Please make sure you '
                       'passed the data to TPOT correctly.')
             else:
-                self.fitted_pipeline_ = self._toolbox.compile(expr=self._optimized_pipeline)
+                self.fitted_pipeline_ = VotingClassifier(estimators=self._pipeline_ensemble_list)

                 with warnings.catch_warnings():
                     warnings.simplefilter('ignore')
@@ -571,7 +571,7 @@ def pareto_eq(ind1, ind2):
                 # Add an extra line of spacing if the progress bar was used
                 if self.verbosity >= 2:
                     print('')
-                    print('Best pipeline: {}'.format(self._optimized_pipeline))
+                    print('Best pipeline: {}'.format(self._pipeline_ensemble_list))

             # Store and fit the entire Pareto front as fitted models for convenience
             self.pareto_front_fitted_pipelines_ = {}
@@ -589,15 +589,6 @@ def pareto_eq(ind1, ind2):
                 raise
         return self

-    def _update_top_pipeline(self):
-        """Helper function to update the _optimized_pipeline field."""
-        if self._pareto_front:
-            top_score = -float('inf')
-            for pipeline, pipeline_scores in zip(self._pareto_front.items, reversed(self._pareto_front.keys)):
-                if pipeline_scores.wvalues[1] > top_score:
-                    self._optimized_pipeline = pipeline
-                    top_score = pipeline_scores.wvalues[1]
-
     def predict(self, features):
         """Use the optimized pipeline to predict the target for a feature set.
@@ -808,7 +799,7 @@ def _set_param_recursive(self, pipeline_steps, parameter, value):
                 setattr(obj, parameter, value)


-    def _evaluate_individuals(self, individuals, features, target, sample_weight=None, groups=None):
+    def _evaluate_individuals(self, individuals, features, target, sample_weight=None, groups=None, pipeline_ensemble_list=None):
         """Determine the fit of the provided individuals.

         Parameters
Expand Down Expand Up @@ -856,21 +847,11 @@ def _evaluate_individuals(self, individuals, features, target, sample_weight=Non
                     fitnesses_dict[indidx] = (5000., -float('inf'))
                     if not self._pbar.disable:
                         self._pbar.update(1)
-                # Check if the individual was evaluated before
-                elif individual_str in self.evaluated_individuals_:
-                    # Get fitness score from previous evaluation
-                    fitnesses_dict[indidx] = self.evaluated_individuals_[individual_str]
-                    if self.verbosity > 2:
-                        self._pbar.write('Pipeline encountered that has previously been evaluated during the '
-                                         'optimization process. Using the score from the previous evaluation.')
-                    if not self._pbar.disable:
-                        self._pbar.update(1)
                 else:
                     try:
                         # Transform the tree expression into an sklearn pipeline
                         sklearn_pipeline = self._toolbox.compile(expr=individual)

-
                         # Fix random state when the operator allows and build sample weight dictionary
                         self._set_param_recursive(sklearn_pipeline.steps, 'random_state', 42)

@@ -882,6 +863,10 @@ def _evaluate_individuals(self, individuals, features, target, sample_weight=None, groups=None, pipeline_ensemble_list=None):
                         if not self._pbar.disable:
                             self._pbar.update(1)
                         continue
+
+                    if pipeline_ensemble_list is not None:
+                        sklearn_pipeline = VotingClassifier(estimators=pipeline_ensemble_list + [('evaluate', sklearn_pipeline)])
+
                     eval_individuals_str.append(individual_str)
                     operator_count_list.append(operator_count)
                     sklearn_pipeline_list.append(sklearn_pipeline)
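With this change, a candidate is no longer scored on its own: when pipeline_ensemble_list is non-empty, _evaluate_individuals cross-validates a VotingClassifier built from all previous generations' champions plus the candidate, so an individual's fitness reflects how much it helps the ensemble. A minimal sketch of that evaluation, with stand-in pipelines and data (the names here are illustrative, not TPOT's API):

    from sklearn.datasets import load_iris
    from sklearn.ensemble import VotingClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import cross_val_score
    from sklearn.tree import DecisionTreeClassifier

    X, y = load_iris(return_X_y=True)

    # Champions collected from earlier generations.
    pipeline_ensemble_list = [('pipeline1', LogisticRegression())]
    # Pipeline compiled from the individual under evaluation.
    candidate = DecisionTreeClassifier(max_depth=3)

    scoring_pipeline = VotingClassifier(
        estimators=pipeline_ensemble_list + [('evaluate', candidate)])
    print(cross_val_score(scoring_pipeline, X, y, cv=5).mean())

This also explains why the duplicate-pipeline cache check above was removed: the same pipeline's score now depends on the current ensemble, so a score memoized in an earlier generation would be stale.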
tpot/gp_deap.py: 21 changes (16 additions, 5 deletions)
@@ -161,6 +161,8 @@ def eaMuPlusLambda(population, toolbox, mu, lambda_, cxpb, mutpb, ngen, pbar,

     record = stats.compile(population) if stats is not None else {}
     logbook.record(gen=0, nevals=len(invalid_ind), **record)
+
+    pipeline_ensemble_list = []

     # Begin the generational process
     for gen in range(1, ngen + 1):
@@ -169,17 +171,26 @@ def eaMuPlusLambda(population, toolbox, mu, lambda_, cxpb, mutpb, ngen, pbar,
         offspring = varOr(population, toolbox, lambda_, cxpb, mutpb)

         # Evaluate the individuals with an invalid fitness
-        invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
+        invalid_ind = offspring

         # update pbar for valid_ind
         if not pbar.disable:
             pbar.update(len(offspring)-len(invalid_ind))
             if not (max_time_mins is None) and pbar.n >= pbar.total:
                 pbar.total += lambda_

-        fitnesses = toolbox.evaluate(invalid_ind)
+        fitnesses = toolbox.evaluate(invalid_ind, pipeline_ensemble_list=pipeline_ensemble_list)
+        best_gen_ind = None
+        best_gen_fitness = -float('inf')
         for ind, fit in zip(invalid_ind, fitnesses):
             ind.fitness.values = fit
+            if fit[1] > best_gen_fitness:
+                best_gen_fitness = fit[1]
+                best_gen_ind = ind
+
+        best_gen_ind_sklearn_pipeline = toolbox.compile(expr=best_gen_ind)
+        best_gen_ind_ensemble_entry = ('pipeline{}'.format(gen), best_gen_ind_sklearn_pipeline)
+        pipeline_ensemble_list.append(best_gen_ind_ensemble_entry)

         # Update the hall of fame with the generated individuals
         if halloffame is not None:
@@ -211,7 +222,7 @@ def eaMuPlusLambda(population, toolbox, mu, lambda_, cxpb, mutpb, ngen, pbar,
         record = stats.compile(population) if stats is not None else {}
         logbook.record(gen=gen, nevals=len(invalid_ind), **record)

-    return population, logbook
+    return population, logbook, pipeline_ensemble_list
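In the loop above, fit[1] is the internal CV score component of TPOT's two-objective fitness (fit[0] is the operator count, which TPOT minimizes), so each generation appends its highest-scoring individual to the ensemble; after ngen generations the returned pipeline_ensemble_list holds one (name, pipeline) entry per generation. A toy rendering of that selection with made-up fitness tuples:

    # Made-up (operator_count, cv_score) fitness tuples for one generation.
    individuals = ['ind_a', 'ind_b', 'ind_c']
    fitnesses = [(3, 0.91), (5, 0.94), (2, 0.89)]

    best_gen_ind, best_gen_fitness = None, -float('inf')
    for ind, fit in zip(individuals, fitnesses):
        if fit[1] > best_gen_fitness:
            best_gen_fitness, best_gen_ind = fit[1], ind

    print(best_gen_ind, best_gen_fitness)  # ind_b 0.94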


def cxOnePoint(ind1, ind2):
@@ -351,7 +362,7 @@ def _wrapped_cross_val_score(sklearn_pipeline, features, target,
                                  cv, scoring_function, sample_weight,
                                  max_eval_time_mins, groups):
     max_time_seconds = max(int(max_eval_time_mins * 60), 1)
-    sample_weight_dict = set_sample_weight(sklearn_pipeline.steps, sample_weight)
+    #sample_weight_dict = set_sample_weight(sklearn_pipeline.steps, sample_weight)
     # build a job for cross_val_score
     tmp_it = Interruptable_cross_val_score(
         clone(sklearn_pipeline),
@@ -361,7 +372,7 @@
         cv=cv,
         n_jobs=1,
         verbose=0,
-        fit_params=sample_weight_dict,
+        #fit_params=sample_weight_dict,
         groups=groups
     )
     tmp_it.start()
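The sample-weight plumbing is commented out here, presumably because fit parameters are routed by pipeline step name, and those names no longer resolve once the pipeline is nested inside a VotingClassifier. A hypothetical illustration of the mismatch (step names and data are assumptions, not TPOT code):

    import numpy as np
    from sklearn.ensemble import VotingClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler

    X = np.array([[0.], [1.], [2.], [3.]])
    y = np.array([0, 0, 1, 1])
    w = np.array([1., 1., 2., 2.])

    pipe = make_pipeline(StandardScaler(), LogisticRegression())
    # Works: the key routes to a step of the pipeline by name.
    pipe.fit(X, y, logisticregression__sample_weight=w)

    ensemble = VotingClassifier(estimators=[('evaluate', pipe)])
    try:
        # Fails: VotingClassifier.fit accepts no step-prefixed parameters.
        ensemble.fit(X, y, logisticregression__sample_weight=w)
    except TypeError as err:
        print(err)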
