Merge remote-tracking branch 'origin/develop' into complex-example-1

yuanchi2807 · yuanchi2807 · commit 6fe833add190 · 2021-06-10T10:38:25.000-04:00
diff --git a/README.md b/README.md
@@ -13,6 +13,8 @@ This project is under active development. Keep an eye on this page for our first
 
 See the [design document](https://docs.google.com/document/d/1t1K8N07TcbBKBgrcI6jf9tPow00cOKE9whnEVxOd4-U/edit) for more information on our design goals.
 
+This project uses ZenHub for tracking of issues and roadmap.
+
 ## Example notebooks
 
 **TODO:** Add instructions for running the notebooks in the `notebooks` directory.
diff --git a/codeflare/pipelines/Datamodel.py b/codeflare/pipelines/Datamodel.py
@@ -200,6 +200,9 @@ class Node(ABC):
     A node class that is an abstract one, this is capturing basic info re the Node.
     The hash code of this node is the name of the node and equality is defined if the
     node name and the type of the node match.
+
+    When doing a grid search, a node can be parameterized with new params for the estimator and updated. This
+    is an internal method used by grid search.
     """
 
     def __init__(self, node_name, estimator: BaseEstimator, node_input_type: NodeInputType, node_firing_type: NodeFiringType, node_state_type: NodeStateType):
@@ -210,6 +213,11 @@ def __init__(self, node_name, estimator: BaseEstimator, node_input_type: NodeInp
         self.__node_state_type__ = node_state_type
 
     def __str__(self):
+        """
+        Returns a string representation of the node along with the parameters of the estimator of the node.
+
+        :return: String representation of the node
+        """
         estimator_params_str = str(self.get_estimator().get_params())
         retval = self.__node_name__ + estimator_params_str
         return retval
@@ -247,9 +255,22 @@ def get_node_state_type(self) -> NodeStateType:
         return self.__node_state_type__
 
     def get_estimator(self):
+        """
+        Return the estimator of the node
+
+        :return: The node's estimator
+        """
         return self.__estimator__
 
     def get_parameterized_node(self, node_name, **params):
+        """
+        Get a parameterized node, given kwargs **params, convert this node and update the estimator with the
+        new set of parameters. It will clone the node and its underlying estimator.
+
+        :param node_name: New node name
+        :param params: Updated parameters
+        :return:
+        """
         cloned_node = self.clone()
         cloned_node.__node_name__ = node_name
         estimator = cloned_node.get_estimator()
@@ -311,7 +332,6 @@ def __init__(self, node_name: str, estimator: BaseEstimator):
         """
         super().__init__(node_name, estimator, NodeInputType.OR, NodeFiringType.ANY, NodeStateType.IMMUTABLE)
 
-
     def clone(self):
         """
         Clones the given node and the underlying estimator as well, if it was initialized with
@@ -323,6 +343,17 @@ def clone(self):
 
 
 class AndEstimator(BaseEstimator):
+    """
+    An AND estimator is part of the AndNode. It is very similar to a standard estimator; however, the key
+    difference is that it takes a `xy_list` as input and outputs an `xy`, contrasting to the EstimatorNode,
+    which takes an input as `xy` and outputs `xy_t`.
+
+    In the pipeline execution, we expect three modes: (a) FIT: A regressor or classifier will call the fit
+    and then pass on the transform results downstream, a non-regressor/classifier will call the fit_transform
+    method, (b) PREDICT: A regressor or classifier will call the predict method, whereas a non-regressor/classifier
+    will call the transform method, and (c) SCORE: A regressor will call the score method, and a non-regressor/classifier
+    will call the transform method.
+    """
     @abstractmethod
     def transform(self, xy_list: list) -> Xy:
         raise NotImplementedError("And estimator needs to implement a transform method")
diff --git a/codeflare/pipelines/tests/test_Datamodel.py b/codeflare/pipelines/tests/test_Datamodel.py
@@ -7,32 +7,29 @@
 from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import StandardScaler, MinMaxScaler
 from sklearn.tree import DecisionTreeClassifier
+import sklearn.base as base
 import codeflare.pipelines.Datamodel as dm
 import codeflare.pipelines.Runtime as rt
 from codeflare.pipelines.Datamodel import Xy
 from codeflare.pipelines.Runtime import ExecutionType
 
-
 class FeatureUnion(dm.AndEstimator):
     def __init__(self):
         pass
-
-    def fit_transform(self, xy_list: list):
-        return self.transform(xy_list)
-
     def get_estimator_type(self):
         return 'transform'
-
+    def clone(self):
+        return base.clone(self)
+    def fit_transform(self, xy_list):
+        return self.transform(xy_list)
     def transform(self, xy_list):
         X_list = []
-        y_list = []
-
+        y_vec = None
         for xy in xy_list:
             X_list.append(xy.get_x())
-        X_concat = np.concatenate(X_list, axis=0)
-
-        return Xy(X_concat, None)
-
+            y_vec = xy.get_y()
+        X_concat = np.concatenate(X_list, axis=1)
+        return Xy(X_concat, y_vec)
 
 class MultibranchTestCase(unittest.TestCase):
 
diff --git a/codeflare/pipelines/tests/test_and.py b/codeflare/pipelines/tests/test_and.py
@@ -2,26 +2,30 @@
 import ray
 import pandas as pd
 import numpy as np
+import sklearn.base as base
 from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler
 import codeflare.pipelines.Datamodel as dm
 import codeflare.pipelines.Runtime as rt
 from codeflare.pipelines.Datamodel import Xy
 from codeflare.pipelines.Datamodel import XYRef
 from codeflare.pipelines.Runtime import ExecutionType
 
-class FeatureUnion(dm.AndTransform):
+class FeatureUnion(dm.AndEstimator):
     def __init__(self):
         pass
-
+    def get_estimator_type(self):
+        return 'transform'
+    def clone(self):
+        return base.clone(self)
+    def fit_transform(self, xy_list):
+        return self.transform(xy_list)
     def transform(self, xy_list):
         X_list = []
         y_vec = None
-
         for xy in xy_list:
             X_list.append(xy.get_x())
             y_vec = xy.get_y()
         X_concat = np.concatenate(X_list, axis=1)
-
         return Xy(X_concat, y_vec)
 
 def test_two_tier_and():
diff --git a/codeflare/pipelines/tests/test_multibranch.py b/codeflare/pipelines/tests/test_multibranch.py
@@ -8,26 +8,30 @@
 from sklearn.preprocessing import StandardScaler, MinMaxScaler
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.linear_model import LogisticRegression
+import sklearn.base as base
 import codeflare.pipelines.Datamodel as dm
 import codeflare.pipelines.Runtime as rt
 from codeflare.pipelines.Datamodel import Xy
 from codeflare.pipelines.Datamodel import XYRef
 from codeflare.pipelines.Runtime import ExecutionType
 
-class FeatureUnion(dm.AndTransform):
+class FeatureUnion(dm.AndEstimator):
     def __init__(self):
         pass
-
+    def get_estimator_type(self):
+        return 'transform'
+    def clone(self):
+        return base.clone(self)
+    def fit_transform(self, xy_list):
+        return self.transform(xy_list)
     def transform(self, xy_list):
         X_list = []
         y_vec = None
-
         for xy in xy_list:
             X_list.append(xy.get_x())
             y_vec = xy.get_y()
         X_concat = np.concatenate(X_list, axis=1)
-
-        return Xy(X_concat, y_vec.values.ravel())
+        return Xy(X_concat, y_vec)
 
 def test_multibranch_1():
 
diff --git a/codeflare/pipelines/tests/test_pipeline_predict.py b/codeflare/pipelines/tests/test_pipeline_predict.py
@@ -0,0 +1,95 @@
+import pytest
+import ray
+
+# Taking an example from sklearn pipeline to assert that
+# the classification report from a prediction from sklearn pipeline is
+# the same as that from the converted codeflare pipeline
+
+from sklearn import set_config
+set_config(display='diagram')
+from sklearn.datasets import make_classification
+from sklearn.model_selection import train_test_split
+from sklearn.feature_selection import SelectKBest, f_classif
+from sklearn.pipeline import make_pipeline
+from sklearn.svm import LinearSVC
+from sklearn.metrics import classification_report
+
+import codeflare.pipelines.Datamodel as dm
+import codeflare.pipelines.Runtime as rt
+from codeflare.pipelines.Datamodel import Xy
+from codeflare.pipelines.Datamodel import XYRef
+from codeflare.pipelines.Runtime import ExecutionType
+
+#
+# prediction from an sklearn pipeline
+#
+
+def test_pipeline_predict():
+
+	ray.shutdown()
+	ray.init()
+
+	#
+	# prediction from an sklearn pipeline
+	#
+	X, y = make_classification(
+    	n_features=20, n_informative=3, n_redundant=0, n_classes=2,
+    	n_clusters_per_class=2, random_state=42)
+	X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
+
+	anova_filter = SelectKBest(f_classif, k=3)
+	clf = LinearSVC()
+
+	anova_svm = make_pipeline(anova_filter, clf)
+	anova_svm.fit(X_train, y_train)
+
+	y_pred = anova_svm.predict(X_test)
+
+	report_sklearn = classification_report(y_test, y_pred)
+	print(report_sklearn)
+
+	#
+	# constructing a codeflare pipeline
+	#
+	pipeline = dm.Pipeline()
+	node_anova_filter = dm.EstimatorNode('anova_filter', anova_filter)
+	node_clf = dm.EstimatorNode('clf', clf)
+	pipeline.add_edge(node_anova_filter, node_clf)
+
+	pipeline_input = dm.PipelineInput()
+	xy = dm.Xy(X_train, y_train)
+
+	pipeline_input.add_xy_arg(node_anova_filter, xy)
+
+	pipeline_output = rt.execute_pipeline(pipeline, ExecutionType.FIT, pipeline_input)
+
+	node_clf_output = pipeline_output.get_xyrefs(node_clf)
+
+	Xout = ray.get(node_clf_output[0].get_Xref())
+	yout = ray.get(node_clf_output[0].get_yref())
+
+	selected_pipeline = rt.select_pipeline(pipeline_output, node_clf_output[0])
+
+	pipeline_input = dm.PipelineInput()
+	pipeline_input.add_xy_arg(node_anova_filter, dm.Xy(X_test, y_test))
+
+	predict_output = rt.execute_pipeline(selected_pipeline, ExecutionType.PREDICT, pipeline_input)
+
+	predict_clf_output = predict_output.get_xyrefs(node_clf)
+
+	#y_pred = ray.get(predict_clf_output[0].get_yref())
+	y_pred = ray.get(predict_clf_output[0].get_Xref())
+
+
+	report_codeflare = classification_report(y_test, y_pred)
+
+	print(report_codeflare)
+
+	assert(report_sklearn == report_codeflare)
+
+	ray.shutdown()
+
+
+if __name__ == "__main__":
+    sys.exit(pytest.main(["-v", __file__]))
+
diff --git a/codeflare/pipelines/utils.py b/codeflare/pipelines/utils.py
@@ -0,0 +1,20 @@
+import graphviz
+import codeflare.pipelines.Datamodel as dm
+
+
+def pipeline_to_graph(pipeline: dm.Pipeline) -> graphviz.Digraph:
+    """
+    Converts the given pipeline to a graphviz directed graph for visualization.
+
+    :param pipeline: Pipeline to convert to a graphviz Digraph
+    :return: A directed graph representing this pipeline
+    """
+    graph = graphviz.Digraph()
+    pipeline_nodes = pipeline.get_nodes()
+    for pre_node in pipeline_nodes.values():
+        post_nodes = pipeline.get_post_nodes(pre_node)
+        graph.node(pre_node.get_node_name())
+        for post_node in post_nodes:
+            graph.node(post_node.get_node_name())
+            graph.edge(pre_node.get_node_name(), post_node.get_node_name())
+    return graph
diff --git a/codeflare_pipelines.egg-info/SOURCES.txt b/codeflare_pipelines.egg-info/SOURCES.txt
@@ -5,6 +5,7 @@ codeflare/pipelines/Datamodel.py
 codeflare/pipelines/Exceptions.py
 codeflare/pipelines/Runtime.py
 codeflare/pipelines/__init__.py
+codeflare/pipelines/utils.py
 codeflare_pipelines.egg-info/PKG-INFO
 codeflare_pipelines.egg-info/SOURCES.txt
 codeflare_pipelines.egg-info/dependency_links.txt
diff --git a/docs/.DS_Store b/docs/.DS_Store
diff --git a/notebooks/Grid Search Sample.ipynb b/notebooks/Grid Search Sample.ipynb
diff --git a/requirements.txt b/requirements.txt