Docs for datamodel is now complete

raghukiran1224 · raghukiran1224 · commit 75182a4418aa · 2021-06-09T11:11:58.000-04:00
diff --git a/codeflare/pipelines/Datamodel.py b/codeflare/pipelines/Datamodel.py
@@ -353,37 +353,122 @@ class AndEstimator(BaseEstimator):
     method, (b) PREDICT: A regressor or classifier will call the predict method, whereas a non-regressor/classifier
     will call the transform method, and (c) SCORE: A regressor will call the score method, and a non-regressor/classifer
     will call the transform method.
+
+    Examples
+    --------
+    Here is a simple FeatureUnion as an AndEstimator:
+
+    .. code-block:: python
+
+        class FeatureUnion(dm.AndEstimator):
+            def __init__(self):
+                pass
+
+            def get_estimator_type(self):
+                return 'transform'
+
+            def clone(self):
+                return base.clone(self)
+
+            def fit_transform(self, xy_list):
+                return self.transform(xy_list)
+
+            def transform(self, xy_list):
+                X_list = []
+                y_vec = None
+                for xy in xy_list:
+                    X_list.append(xy.get_x())
+                    y_vec = xy.get_y()
+                X_concat = np.concatenate(X_list, axis=1)
+                return dm.Xy(X_concat, y_vec)
+
+    The above is doing a simple feature union by combining inputs from multiple edges and sending "back" a single
+    concatenated xy. As a simple transform, it needs to only implement the `transform`, `fit_transform`, `clone`,
+    and `get_estimator_type` methods.
     """
     @abstractmethod
     def transform(self, xy_list: list) -> Xy:
+        """
+        An abstract method that needs to be implemented if this estimator is a simple transformer
+
+        :param xy_list: List of xy
+        :return: A single xy
+        """
         raise NotImplementedError("And estimator needs to implement a transform method")
 
     @abstractmethod
     def fit(self, xy_list: list):
+        """
+        An abstract method that needs to be implemented if a regressor or a classifier
+
+        :param xy_list: List of xy
+        :return: A single xy
+        """
         raise NotImplementedError("And estimator needs to implement a fit method")
 
     @abstractmethod
     def fit_transform(self, xy_list: list):
+        """
+        An abstract method that needs to be implemented if this estimator is a simple transformer
+
+        :param xy_list: List of xy
+        :return: A single xy
+        """
         raise NotImplementedError("And estimator needs to implement a fit method")
 
     @abstractmethod
     def predict(self, xy_list: list) -> Xy:
+        """
+        An abstract method that needs to be implemented if a regressor or a classifier
+
+        :param xy_list: List of xy
+        :return: A single xy
+        """
         raise NotImplementedError("And classifier needs to implement the predict method")
 
     @abstractmethod
     def score(self, xy_list: list) -> Xy:
+        """
+        An abstract method that needs to be implemented if a regressor or a classifier
+
+        :param xy_list: List of xy
+        :return: A single xy
+        """
         raise NotImplementedError("And classifier needs to implement the score method")
 
     @abstractmethod
     def get_estimator_type(self):
+        """
+        Every AND estimator needs to implement this method. It returns 'transform' for a simple
+        transformer, 'classifier' for a classifier, and 'regressor' for a regressor.
+
+        :return: The type of the estimator
+        """
         raise NotImplementedError("And classifier needs to implement the get_estimator_type method")
 
     @abstractmethod
     def clone(self):
+        """
+        Abstract method that all estimators are expected to implement. It can be as simple as
+        returning ``base.clone(self)``.
+
+        :return: A cloned estimator
+        """
         raise NotImplementedError("And estimator needs to implement a clone method")
 
 
 class AndNode(Node):
+    """
+    Basic AND node that is capable of combining inputs from multiple edges. As such, it needs to be
+    given an AndEstimator implementation. The AndEstimator itself inherits from sklearn.BaseEstimator.
+
+    This estimator node is a typical AND node, with ANY firing semantics, and STATELESS state.
+
+    Examples
+    --------
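+    ``dm.AndNode('union', FeatureUnion())`` creates an AND node named 'union' around an
+    AndEstimator subclass, e.g. the FeatureUnion from the AndEstimator example.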
+    """
     def __init__(self, node_name: str, and_estimator: AndEstimator):
         super().__init__(node_name, and_estimator, NodeInputType.AND, NodeFiringType.ANY, NodeStateType.STATELESS)
 
@@ -965,25 +1050,59 @@ def get_out_args(self):
 
 class PipelineInput:
     """
-    in_args is a dict from a node -> [Xy]
+    This is a holder to capture the input to the pipeline in an appropriate manner. Internally, it holds
+    a mapping from each node to a list of pointers to XYRef, i.e. it only holds pointers. It does not hold
+    the entire data. This is key to distributing the data in the object store.
+
+    Examples
+    --------
+    The simplest way to add input to a node is by specifying the X and y args, the underlying platform will
+    take care of distributing it to the backend in-memory object storage.
+
+    .. code-block:: python
+
+        pipeline_input = dm.PipelineInput()
+        pipeline_input.add_xy_arg(node_a, dm.Xy(X_train, y_train))
     """
     def __init__(self):
         self.__in_args__ = {}
 
     def add_xyref_ptr_arg(self, node: Node, xyref_ptr):
+        """
+        A direct pointer input addition, this is typically used in internal methods, but enables the advanced
+        developer to have direct access to the pipeline internals.
+
+        :param node: Node to which input needs to be added
+        :param xyref_ptr: The pointer to XYRef
+        :return: None
+        """
         if node not in self.__in_args__:
             self.__in_args__[node] = []
 
         self.__in_args__[node].append(xyref_ptr)
 
     def add_xyref_arg(self, node: Node, xyref: XYRef):
+        """
+        A convenience method that adds an XYRef to the given node as input.
+
+        :param node: Node to which input needs to be added
+        :param xyref: The XYRef
+        :return: None
+        """
         if node not in self.__in_args__:
             self.__in_args__[node] = []
 
         xyref_ptr = ray.put(xyref)
         self.__in_args__[node].append(xyref_ptr)
 
     def add_xy_arg(self, node: Node, xy: Xy):
+        """
+        The most common way of adding input to a node, by providing a xy.
+
+        :param node: Node to which input needs to be added
+        :param xy: The xy to be added
+        :return: None
+        """
         if node not in self.__in_args__:
             self.__in_args__[node] = []
 
@@ -993,12 +1112,34 @@ def add_xy_arg(self, node: Node, xy: Xy):
         self.add_xyref_arg(node, xyref)
 
     def add_all(self, node, node_inargs):
+        """
+        Adds all the in args to a given node, this is very useful when cloning a pipeline or "morphing" it
+        for grid search, etc.
+
+        :param node: Node to which input needs to be added
+        :param node_inargs: All the in args, which will be added whole, replacing any existing in args of the node
+        :return: None
+        """
         self.__in_args__[node] = node_inargs
 
     def get_in_args(self):
+        """
+        Returns the dict with the node to in args mapping
+
+        :return: The internal structure holding the in args
+        """
         return self.__in_args__
 
     def get_parameterized_input(self, pipeline: Pipeline, parameterized_pipeline: Pipeline):
+        """
+        This is meant to create a parameterized input given a pipeline and the parameterized pipeline.
+        This method will explore the nodes from pipeline that are matching the parameterized_pipeline
+        and copy the input over to the appropriate nodes of the parameterized_pipeline.
+
+        :param pipeline: The original pipeline
+        :param parameterized_pipeline: The parameterized pipeline corresponding to the original pipeline
+        :return: The parameterized input for the given parameterized_pipeline
+        """
         input_nodes = parameterized_pipeline.get_input_nodes()
         parameterized_pipeline_input = PipelineInput()
         for input_node in input_nodes:
@@ -1015,11 +1156,41 @@ def get_parameterized_input(self, pipeline: Pipeline, parameterized_pipeline: Pi
 
 
 class PipelineParam:
+    """
+    This class captures the pipeline parameters, which can be changed for various forms of exploration.
+    It is a fairly simple holder class capturing, for each node, the corresponding estimator's parameters
+    as a dictionary.
+
+    It also supports creating a PipelineParam object from a parameter grid, typically used in
+    sklearn.GridSearchCV.
+
+    Examples
+    --------
+    A simple example to create a pipeline param from a parameter grid.
+
+    .. code-block:: python
+
+        param_grid = {
+            'pca__n_components': [5, 15, 30, 45, 64],
+            'logistic__C': np.logspace(-4, 4, 4),
+        }
+
+        pipeline_param = dm.PipelineParam.from_param_grid(param_grid)
+    """
     def __init__(self):
         self.__node_name_param_map__ = {}
 
     @staticmethod
     def from_param_grid(fit_params: dict):
+        """
+        A method to create a pipeline param object from a typical parameter grid with the standard
+        sklearn convention of __. For example, `pca__n_components` is a parameter for node name pca
+        and the parameter name is n_components. The parameter grid creates a full grid exploration of
+        the parameters.
+
+        :param fit_params: Dictionary of parameter name in the sklearn convention to the parameter list
+        :return: A pipeline param object
+        """
         pipeline_param = PipelineParam()
         fit_params_nodes = {}
         for pname, pval in fit_params.items():
@@ -1048,10 +1219,28 @@ def from_param_grid(fit_params: dict):
         return pipeline_param
 
     def add_param(self, node_name: str, params: dict):
+        """
+        Add a parameter to the given node name
+
+        :param node_name: Node name to add parameter to
+        :param params: Parameter as a dictionary
+        :return: None
+        """
         self.__node_name_param_map__[node_name] = params
 
     def get_param(self, node_name: str):
+        """
+        Returns the parameter dict for the given node name
+
+        :param node_name: Node name to retrieve parameters for
+        :return: Dict of parameters
+        """
         return self.__node_name_param_map__[node_name]
 
     def get_all_params(self):
+        """
+        Return all the parameters for the given pipeline param
+
+        :return: A dict from node name to the dictionary of parameters
+        """
         return self.__node_name_param_map__