Skip to content

Commit 75182a4

Browse files
Docs for datamodel is now complete
1 parent 5e3462f commit 75182a4

File tree

1 file changed

+190
-1
lines changed

1 file changed

+190
-1
lines changed

codeflare/pipelines/Datamodel.py

Lines changed: 190 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -353,37 +353,122 @@ class AndEstimator(BaseEstimator):
353353
method, (b) PREDICT: A regressor or classifier will call the predict method, whereas a non-regressor/classifier
354354
will call the transform method, and (c) SCORE: A regressor will call the score method, and a non-regressor/classifer
355355
will call the transform method.
356+
357+
Examples
358+
--------
359+
Here is a simple FeatureUnion as an AndEstimator:
360+
361+
.. code-block:: python
362+
363+
class FeatureUnion(dm.AndEstimator):
364+
def __init__(self):
365+
pass
366+
367+
def get_estimator_type(self):
368+
return 'transform'
369+
370+
def clone(self):
371+
return base.clone(self)
372+
373+
def fit_transform(self, xy_list):
374+
return self.transform(xy_list)
375+
376+
def transform(self, xy_list):
377+
X_list = []
378+
y_vec = None
379+
for xy in xy_list:
380+
X_list.append(xy.get_x())
381+
y_vec = xy.get_y()
382+
X_concat = np.concatenate(X_list, axis=1)
383+
return dm.Xy(X_concat, y_vec)
384+
385+
The above is doing a simple feature union by combining inputs from multiple edges and sending "back" a single
386+
concatenated xy. As a simple transform, it only needs to implement the `transform`, `fit_transform`, `clone`,
387+
and `get_estimator_type` methods.
356388
"""
357389
@abstractmethod
358390
def transform(self, xy_list: list) -> Xy:
391+
"""
392+
An abstract method that needs to be implemented if a simple estimator
393+
394+
:param xy_list: List of xy
395+
:return: A single xy
396+
"""
359397
raise NotImplementedError("And estimator needs to implement a transform method")
360398

361399
@abstractmethod
362400
def fit(self, xy_list: list):
401+
"""
402+
An abstract method that needs to be implemented if a regressor or a classifier
403+
404+
:param xy_list: List of xy
405+
:return: A single xy
406+
"""
363407
raise NotImplementedError("And estimator needs to implement a fit method")
364408

365409
@abstractmethod
366410
def fit_transform(self, xy_list: list):
411+
"""
412+
An abstract method that needs to be implemented if a simple estimator
413+
414+
:param xy_list: List of xy
415+
:return: A single xy
416+
"""
367417
raise NotImplementedError("And estimator needs to implement a fit method")
368418

369419
@abstractmethod
370420
def predict(self, xy_list: list) -> Xy:
421+
"""
422+
An abstract method that needs to be implemented if a regressor or a classifier
423+
424+
:param xy_list: List of xy
425+
:return: A single xy
426+
"""
371427
raise NotImplementedError("And classifier needs to implement the predict method")
372428

373429
@abstractmethod
374430
def score(self, xy_list: list) -> Xy:
431+
"""
432+
An abstract method that needs to be implemented if a regressor or a classifier
433+
434+
:param xy_list: List of xy
435+
:return: A single xy
436+
"""
375437
raise NotImplementedError("And classifier needs to implement the score method")
376438

377439
@abstractmethod
378440
def get_estimator_type(self):
441+
"""
442+
Every AndEstimator needs to implement this method. It returns 'transform' for a simple transformer,
443+
'classifier' for a classifier, and 'regressor' for a regressor.
444+
445+
:return: The type of the estimator
446+
"""
379447
raise NotImplementedError("And classifier needs to implement the get_estimator_type method")
380448

381449
@abstractmethod
382450
def clone(self):
451+
"""
452+
Abstract method that all estimators are supposed to implement. Can be as simple as returning
453+
`base.clone(self)`.
454+
455+
:return: A cloned estimator
456+
"""
383457
raise NotImplementedError("And estimator needs to implement a clone method")
384458

385459

386460
class AndNode(Node):
461+
"""
462+
Basic and node, that's capable of combining inputs from multiple edges. As such, it needs to have
463+
an AndEstimator implemented. The AndEstimator itself inherits from sklearn.BaseEstimator.
464+
465+
This estimator node is a typical AND node, with ANY firing semantics, and STATELESS state.
466+
467+
Examples
468+
--------
469+
470+
471+
"""
387472
def __init__(self, node_name: str, and_estimator: AndEstimator):
388473
super().__init__(node_name, and_estimator, NodeInputType.AND, NodeFiringType.ANY, NodeStateType.STATELESS)
389474

@@ -965,25 +1050,59 @@ def get_out_args(self):
9651050

9661051
class PipelineInput:
9671052
"""
968-
in_args is a dict from a node -> [Xy]
1053+
This is a holder to capture the input to the pipeline in an appropriate manner. Internally, it holds
1054+
a mapping from each node to a list of pointers to XYRef, i.e. it only holds pointers. It does not hold the entire
1055+
data. This is key to distributing the data in the object store.
1056+
1057+
Examples
1058+
--------
1059+
The simplest way to add input to a node is by specifying the X and y args, the underlying platform will
1060+
take care of distributing it to the backend in-memory object storage.
1061+
1062+
.. code-block:: python
1063+
1064+
pipeline_input = dm.PipelineInput()
1065+
pipeline_input.add_xy_arg(node_a, dm.Xy(X_train, y_train))
9691066
"""
9701067
def __init__(self):
9711068
self.__in_args__ = {}
9721069

9731070
def add_xyref_ptr_arg(self, node: Node, xyref_ptr):
1071+
"""
1072+
A direct pointer input addition, this is typically used in internal methods, but enables the advanced
1073+
developer to have direct access to the pipeline internals.
1074+
1075+
:param node: Node to which input needs to be added
1076+
:param xyref_ptr: The pointer to XYRef
1077+
:return: None
1078+
"""
9741079
if node not in self.__in_args__:
9751080
self.__in_args__[node] = []
9761081

9771082
self.__in_args__[node].append(xyref_ptr)
9781083

9791084
def add_xyref_arg(self, node: Node, xyref: XYRef):
1085+
"""
1086+
A convenience method that adds a XYRef to the given node as input.
1087+
1088+
:param node: Node to which input needs to be added
1089+
:param xyref: The XYRef
1090+
:return: None
1091+
"""
9801092
if node not in self.__in_args__:
9811093
self.__in_args__[node] = []
9821094

9831095
xyref_ptr = ray.put(xyref)
9841096
self.__in_args__[node].append(xyref_ptr)
9851097

9861098
def add_xy_arg(self, node: Node, xy: Xy):
1099+
"""
1100+
The most common way of adding input to a node, by providing a xy.
1101+
1102+
:param node: Node to which input needs to be added
1103+
:param xy: The xy to be added
1104+
:return: None
1105+
"""
9871106
if node not in self.__in_args__:
9881107
self.__in_args__[node] = []
9891108

@@ -993,12 +1112,34 @@ def add_xy_arg(self, node: Node, xy: Xy):
9931112
self.add_xyref_arg(node, xyref)
9941113

9951114
def add_all(self, node, node_inargs):
1115+
"""
1116+
Adds all the in args to a given node, this is very useful when cloning a pipeline or "morphing" it
1117+
for grid search, etc.
1118+
1119+
:param node: Node to which input needs to be added
1120+
:param node_inargs: All the in args, which will be added whole
1121+
:return: None
1122+
"""
9961123
self.__in_args__[node] = node_inargs
9971124

9981125
def get_in_args(self):
1126+
"""
1127+
Returns the dict with the node to in args mapping
1128+
1129+
:return: The internal structure holding the in args
1130+
"""
9991131
return self.__in_args__
10001132

10011133
def get_parameterized_input(self, pipeline: Pipeline, parameterized_pipeline: Pipeline):
1134+
"""
1135+
This is meant to create a parameterized input given a pipeline and the parameterized pipeline.
1136+
This method will explore the nodes from pipeline that are matching the parameterized_pipeline
1137+
and copy the input over to the appropriate nodes of the parameterized_pipeline.
1138+
1139+
:param pipeline: The original pipeline
1140+
:param parameterized_pipeline: The parameterized pipeline corresponding to the original pipeline
1141+
:return: The parameterized input for the given parameterized_pipeline
1142+
"""
10021143
input_nodes = parameterized_pipeline.get_input_nodes()
10031144
parameterized_pipeline_input = PipelineInput()
10041145
for input_node in input_nodes:
@@ -1015,11 +1156,41 @@ def get_parameterized_input(self, pipeline: Pipeline, parameterized_pipeline: Pi
10151156

10161157

10171158
class PipelineParam:
1159+
"""
1160+
This class captures the pipeline parameters, which can be changed for various forms of exploration.
1161+
It is a fairly simple holder class capturing for each node, the corresponding estimators parameters
1162+
as a dictionary.
1163+
1164+
It also supports creating a PipelineParam object from a parameter grid, typically used in
1165+
sklearn.GridSearchCV.
1166+
1167+
Examples
1168+
--------
1169+
A simple example to create a pipeline param from a parameter grid.
1170+
1171+
.. code-block:: python
1172+
1173+
param_grid = {
1174+
'pca__n_components': [5, 15, 30, 45, 64],
1175+
'logistic__C': np.logspace(-4, 4, 4),
1176+
}
1177+
1178+
pipeline_param = dm.PipelineParam.from_param_grid(param_grid)
1179+
"""
10181180
def __init__(self):
10191181
self.__node_name_param_map__ = {}
10201182

10211183
@staticmethod
10221184
def from_param_grid(fit_params: dict):
1185+
"""
1186+
A method to create a pipeline param object from a typical parameter grid with the standard
1187+
sklearn convention of __. For example, `pca__n_components` is a parameter for node name pca
1188+
and the parameter name is n_components. The parameter grid creates a full grid exploration of
1189+
the parameters.
1190+
1191+
:param fit_params: Dictionary of parameter name in the sklearn convention to the parameter list
1192+
:return: A pipeline param object
1193+
"""
10231194
pipeline_param = PipelineParam()
10241195
fit_params_nodes = {}
10251196
for pname, pval in fit_params.items():
@@ -1048,10 +1219,28 @@ def from_param_grid(fit_params: dict):
10481219
return pipeline_param
10491220

10501221
def add_param(self, node_name: str, params: dict):
1222+
"""
1223+
Set the parameter dictionary for the given node name, replacing any existing entry
1224+
1225+
:param node_name: Node name to add parameter to
1226+
:param params: Parameter as a dictionary
1227+
:return: None
1228+
"""
10511229
self.__node_name_param_map__[node_name] = params
10521230

10531231
def get_param(self, node_name: str):
1232+
"""
1233+
Returns the parameter dict for the given node name
1234+
1235+
:param node_name: Node name to retrieve parameters for
1236+
:return: Dict of parameters
1237+
"""
10541238
return self.__node_name_param_map__[node_name]
10551239

10561240
def get_all_params(self):
1241+
"""
1242+
Return all the parameters for the given pipeline param
1243+
1244+
:return: A dict from node name to the dictionary of parameters
1245+
"""
10571246
return self.__node_name_param_map__

0 commit comments

Comments
 (0)