project-codeflare
diff --git a/‎codeflare/pipelines/Datamodel.py‎
Lines changed: 66 additions & 8 deletions b/‎codeflare/pipelines/Datamodel.py‎
Lines changed: 66 additions & 8 deletions
diff --git a/‎codeflare/pipelines/Exceptions.py‎
Lines changed: 10 additions & 1 deletion b/‎codeflare/pipelines/Exceptions.py‎
Lines changed: 10 additions & 1 deletion
diff --git a/‎codeflare/pipelines/Runtime.py‎
Lines changed: 97 additions & 19 deletions b/‎codeflare/pipelines/Runtime.py‎
Lines changed: 97 additions & 19 deletions
@@ -1,12 +1,13 @@
 from abc import ABC, abstractmethod
-import uuid
 from enum import Enum
 
-
 import sklearn.base as base
 from sklearn.base import TransformerMixin
 from sklearn.base import BaseEstimator
 
+import ray
+import codeflare.pipelines.Exceptions as pe
+
 
 class Xy:
     """
@@ -98,14 +99,10 @@ def __init__(self, node_name, node_input_type: NodeInputType, node_firing_type:
         self.__node_input_type__ = node_input_type
         self.__node_firing_type__ = node_firing_type
         self.__node_state_type__ = node_state_type
-        self.__id__ = uuid.uuid4()
 
     def __str__(self):
         return self.__node_name__
 
-    def get_id(self):
-        return self.__id__
-
     def get_node_input_type(self):
         return self.__node_input_type__
 
@@ -125,7 +122,7 @@ def __hash__(self):
 
         :return: Hash code
         """
-        return self.__id__.__hash__()
+        return self.__node_name__.__hash__()
 
     def __eq__(self, other):
         """
@@ -137,7 +134,6 @@ def __eq__(self, other):
         """
         return (
                 self.__class__ == other.__class__ and
-                self.__id__ == other.__id__ and
                 self.__node_name__ == other.__node_name__
         )
 
@@ -373,3 +369,65 @@ def get_post_edges(self, node: Node):
     def is_terminal(self, node: Node):
         node_post_edges = self.get_post_edges(node)
         return len(node_post_edges) == 0
+
+    def get_terminal_nodes(self):
+        """
+        Returns the nodes found at the deepest level of this pipeline.
+
+        :return: The entry of the level-to-nodes dict at the maximum level
+        """
+        nodes_per_level = self.get_nodes_by_level()
+        return nodes_per_level[self.compute_max_level()]
+
+
+class PipelineOutput:
+    """
+    Result of a pipeline execution. Keeps hold of the output args and the edge args
+    handed in at construction so that pipelines can be materialized from them.
+    """
+    def __init__(self, out_args, edge_args):
+        self.__edge_args__ = edge_args
+        self.__out_args__ = out_args
+
+    def get_xyrefs(self, node: Node):
+        """
+        Fetches what is stored for the given node, looking in the output args first
+        and in the edge args second. The stored value is resolved with ray.get, so
+        this call blocks until the value is available.
+
+        :param node: Node to look up
+        :return: Whatever ray.get returns for the stored value
+        :raises PipelineNodeNotFoundException: If the node is in neither mapping
+        """
+        for candidate_args in (self.__out_args__, self.__edge_args__):
+            if node in candidate_args:
+                return ray.get(candidate_args[node])
+
+        raise pe.PipelineNodeNotFoundException("Node " + str(node) + " not found")
+
+    def get_edge_args(self):
+        """
+        :return: The edge args given at construction
+        """
+        return self.__edge_args__
+
+
+class PipelineInput:
+    """
+    Collects the inputs of a pipeline. in_args is a dict from a node to the list of
+    entries added for that node; entries are either pointers supplied by the caller
+    or the results of ray.put on an XYRef.
+    """
+    def __init__(self):
+        self.__in_args__ = {}
+
+    def add_xyref_ptr_arg(self, node: Node, xyref_ptr):
+        """
+        Appends an already existing pointer to the inputs of the node.
+
+        :param node: Node receiving the input
+        :param xyref_ptr: Pointer to store as is
+        """
+        self.__in_args__.setdefault(node, []).append(xyref_ptr)
+
+    def add_xyref_arg(self, node: Node, xyref: XYRef):
+        """
+        Puts the XYRef into the Ray object store and appends the resulting pointer
+        to the inputs of the node.
+
+        :param node: Node receiving the input
+        :param xyref: XYRef to store
+        """
+        # the node entry is created before ray.put is attempted
+        node_args = self.__in_args__.setdefault(node, [])
+        node_args.append(ray.put(xyref))
+
+    def add_xy_arg(self, node: Node, xy: Xy):
+        """
+        Puts X and y of the given Xy into the Ray object store, wraps both references
+        in an XYRef and adds that XYRef to the inputs of the node.
+
+        :param node: Node receiving the input
+        :param xy: Holder of the X and y values
+        """
+        self.__in_args__.setdefault(node, [])
+
+        xyref = XYRef(ray.put(xy.get_x()), ray.put(xy.get_y()))
+        self.add_xyref_arg(node, xyref)
+
+    def get_in_args(self):
+        """
+        :return: The dict from node to its list of inputs
+        """
+        return self.__in_args__
@@ -1,8 +1,17 @@
-
 class BasePipelineException(Exception):
     pass
 
 
 class PipelineSaveException(BasePipelineException):
     def __init__(self, message):
         self.message = message
+
+
+class PipelineNodeNotFoundException(BasePipelineException):
+    """
+    Raised when a node cannot be found, the message describes the missing node.
+    """
+    def __init__(self, message):
+        # Forward to Exception so that args is populated even when the message is
+        # passed by keyword; str(exc) and pickling both rely on args
+        super().__init__(message)
+        self.message = message
+
+
+class PipelineException(BasePipelineException):
+    """
+    General pipeline error carrying a descriptive message.
+    """
+    def __init__(self, message):
+        # Forward to Exception so that args is populated even when the message is
+        # passed by keyword; str(exc) and pickling both rely on args
+        super().__init__(message)
+        self.message = message
@@ -4,6 +4,7 @@
 import codeflare.pipelines.Exceptions as pe
 
 import sklearn.base as base
+from sklearn.model_selection import BaseCrossValidator
 from enum import Enum
 
 from queue import SimpleQueue
@@ -16,14 +17,14 @@ class ExecutionType(Enum):
 
 
 @ray.remote
-def execute_or_node_remote(node: dm.EstimatorNode, train_mode: ExecutionType, xy_ref: dm.XYRef):
+def execute_or_node_remote(node: dm.EstimatorNode, mode: ExecutionType, xy_ref: dm.XYRef):
     estimator = node.get_estimator()
     # Blocking operation -- not avoidable
     X = ray.get(xy_ref.get_Xref())
     y = ray.get(xy_ref.get_yref())
 
     # TODO: Can optimize the node pointers without replicating them
-    if train_mode == ExecutionType.FIT:
+    if mode == ExecutionType.FIT:
         cloned_node = node.clone()
         prev_node_ptr = ray.put(node)
 
@@ -43,24 +44,17 @@ def execute_or_node_remote(node: dm.EstimatorNode, train_mode: ExecutionType, xy
             curr_node_ptr = ray.put(cloned_node)
             result = dm.XYRef(res_Xref, xy_ref.get_yref(), prev_node_ptr, curr_node_ptr, [xy_ref])
             return result
-    elif train_mode == ExecutionType.SCORE:
-        cloned_node = node.clone()
-        prev_node_ptr = ray.put(node)
-
+    elif mode == ExecutionType.SCORE:
         if base.is_classifier(estimator) or base.is_regressor(estimator):
-            cloned_estimator = cloned_node.get_estimator()
-            cloned_estimator.fit(X, y)
-            curr_node_ptr = ray.put(cloned_node)
-            res_Xref = ray.put(cloned_estimator.score(X, y))
-            result = dm.XYRef(res_Xref, xy_ref.get_yref(), prev_node_ptr, curr_node_ptr, [xy_ref])
+            estimator = node.get_estimator()
+            res_Xref = ray.put(estimator.score(X, y))
+            result = dm.XYRef(res_Xref, xy_ref.get_yref())
             return result
         else:
-            cloned_estimator = cloned_node.get_estimator()
-            res_Xref = ray.put(cloned_estimator.fit_transform(X, y))
-            curr_node_ptr = ray.put(cloned_node)
-            result = dm.XYRef(res_Xref, xy_ref.get_yref(), prev_node_ptr, curr_node_ptr, [xy_ref])
+            res_Xref = ray.put(estimator.transform(X))
+            result = dm.XYRef(res_Xref, xy_ref.get_yref())
             return result
-    elif train_mode == ExecutionType.PREDICT:
+    elif mode == ExecutionType.PREDICT:
         # Test mode does not clone as it is a simple predict or transform
         if base.is_classifier(estimator) or base.is_regressor(estimator):
             res_Xref = estimator.predict(X)
@@ -136,11 +130,12 @@ def execute_and_node(node, pre_edges, edge_args, post_edges):
             edge_args[post_edge].extend(exec_xyref_ptrs)
 
 
-def execute_pipeline(pipeline: dm.Pipeline, mode: ExecutionType, in_args: dict):
+def execute_pipeline(pipeline: dm.Pipeline, mode: ExecutionType, pipeline_input: dm.PipelineInput) -> dm.PipelineOutput:
     nodes_by_level = pipeline.get_nodes_by_level()
 
     # track args per edge
     edge_args = {}
+    in_args = pipeline_input.get_in_args()
     for node, node_in_args in in_args.items():
         pre_edges = pipeline.get_pre_edges(node)
         for pre_edge in pre_edges:
@@ -161,10 +156,10 @@ def execute_pipeline(pipeline: dm.Pipeline, mode: ExecutionType, in_args: dict):
         edge = dm.Edge(last_level_node, None)
         out_args[last_level_node] = edge_args[edge]
 
-    return out_args
+    return dm.PipelineOutput(out_args, edge_args)
 
 
-def select_pipeline(chosen_xyref: dm.XYRef):
+def select_pipeline(pipeline_output: dm.PipelineOutput, chosen_xyref: dm.XYRef):
     pipeline = dm.Pipeline()
     xyref_queue = SimpleQueue()
 
@@ -185,3 +180,86 @@ def select_pipeline(chosen_xyref: dm.XYRef):
             xyref_queue.put(prev_xyref)
 
     return pipeline
+
+
+@ray.remote(num_returns=2)
+def split(cross_validator: BaseCrossValidator, xy_ref):
+    """
+    Ray remote task that cuts the data behind one XYRef into folds.
+
+    X and y are fetched with ray.get (blocking) and indexed with every train/test
+    index pair yielded by cross_validator.split. Each slice is put back into the
+    object store and wrapped in an XYRef.
+    NOTE(review): assumes X and y can be indexed directly with the yielded index
+    arrays (e.g. numpy arrays) -- confirm for other containers.
+
+    :param cross_validator: Cross validator supplying the train/test indices
+    :param xy_ref: XYRef holding the references to X and y
+    :return: Two lists, the train XYRefs and the test XYRefs, in fold order
+    """
+    features = ray.get(xy_ref.get_Xref())
+    targets = ray.get(xy_ref.get_yref())
+
+    train_folds = []
+    test_folds = []
+
+    for train_idx, test_idx in cross_validator.split(features, targets):
+        # train slices go into the object store first, then the test slices
+        train_folds.append(dm.XYRef(ray.put(features[train_idx]), ray.put(targets[train_idx])))
+        test_folds.append(dm.XYRef(ray.put(features[test_idx]), ray.put(targets[test_idx])))
+
+    return train_folds, test_folds
+
+
+def cross_validate(cross_validator: BaseCrossValidator, pipeline: dm.Pipeline, pipeline_input: dm.PipelineInput):
+    """
+    Cross validates a pipeline: every input is split into folds, the pipeline is fit on
+    all training folds in one execution, and the pipeline selected for each fold is
+    scored on the matching test fold.
+
+    :param cross_validator: Cross validator, used for get_n_splits() and for splitting
+    :param pipeline: Pipeline to cross validate, needs exactly one terminal node
+    :param pipeline_input: Inputs per node, only the first input of each node is split
+    :return: List with one score per fold
+    :raises PipelineException: If the pipeline does not have exactly one terminal node, or
+        if the number of outputs from the fit differs from the number of folds
+    """
+    # Check the pipeline shape before any work is submitted; this also covers the case
+    # of no terminal node, which would otherwise fail on the index below
+    out_nodes = pipeline.get_terminal_nodes()
+    if len(out_nodes) != 1:
+        raise pe.PipelineException("Cannot cross validate as output is not a single node")
+    out_node = out_nodes[0]
+
+    k = cross_validator.get_n_splits()
+
+    pipeline_input_train = dm.PipelineInput()
+    # k pipeline inputs for testing, one per fold
+    pipeline_input_test = [dm.PipelineInput() for _ in range(k)]
+
+    # Submit all the splits first so that they can run concurrently, results are
+    # collected in the next loop
+    in_args = pipeline_input.get_in_args()
+    pending_splits = []
+    for node, xyref_ptrs in in_args.items():
+        # NOTE: Only the first input of a node is split, any further inputs are not used
+        xyref_ptr = xyref_ptrs[0]
+        pending_splits.append((node, split.remote(cross_validator, xyref_ptr)))
+
+    for node, (xy_train_refs_ptr, xy_test_refs_ptr) in pending_splits:
+        xy_train_refs = ray.get(xy_train_refs_ptr)
+        xy_test_refs = ray.get(xy_test_refs_ptr)
+
+        # for training, all the folds go to the same input
+        for xy_train_ref in xy_train_refs:
+            pipeline_input_train.add_xyref_arg(node, xy_train_ref)
+
+        # for testing, add only to the specific input
+        for i in range(k):
+            pipeline_input_test[i].add_xyref_arg(node, xy_test_refs[i])
+
+    # Ready for execution now that data has been prepared! This execution happens in parallel
+    # because of the underlying pipeline graph and multiple input objects
+    pipeline_output_train = execute_pipeline(pipeline, ExecutionType.FIT, pipeline_input_train)
+
+    # Now we can choose the pipeline and then score for each of the chosen pipelines
+    out_xyrefs = pipeline_output_train.get_xyrefs(out_node)
+    if len(out_xyrefs) != k:
+        raise pe.PipelineException("Number of outputs from pipeline fit is not equal to the folds from cross validator")
+
+    pipeline_score_outputs = []
+    # Below, jobs get submitted and then we can collect the results in the next loop
+    for i in range(k):
+        selected_pipeline = select_pipeline(pipeline_output_train, out_xyrefs[i])
+        selected_pipeline_output = execute_pipeline(selected_pipeline, ExecutionType.SCORE, pipeline_input_test[i])
+        pipeline_score_outputs.append(selected_pipeline_output)
+
+    result_scores = []
+    for pipeline_score_output in pipeline_score_outputs:
+        pipeline_out_xyrefs = pipeline_score_output.get_xyrefs(out_node)
+        # again, only single xyref to be gotten out
+        pipeline_out_xyref = pipeline_out_xyrefs[0]
+        result_scores.append(ray.get(pipeline_out_xyref.get_Xref()))
+
+    return result_scores