From 0f0359b75280a2a7db91579d3c5fc1820d4a51a9 Mon Sep 17 00:00:00 2001
From: Raghu Ganti <rganti@us.ibm.com>
Date: Sat, 19 Jun 2021 21:09:38 -0400
Subject: [PATCH 1/2] 1. Fixing a minor error in Datamodel, where the node name
 map keys are used instead of values. The keys are strings and we need to use
 values, which are of node type. 2. Cleaning up notebooks, adding comments,
 removing files that are not useful.

---
 codeflare/pipelines/Datamodel.py      |   2 +-
 main.py                               |  55 ------
 notebooks/Cross Product.ipynb         | 111 -----------
 notebooks/Grid Search Sample.ipynb    | 100 ++++++----
 notebooks/Ray graph experiments.ipynb | 269 --------------------------
 notebooks/lale_cross_val_score.ipynb  | 243 ++++++++++-------------
 6 files changed, 166 insertions(+), 614 deletions(-)
 delete mode 100644 main.py
 delete mode 100644 notebooks/Cross Product.ipynb
 delete mode 100644 notebooks/Ray graph experiments.ipynb

diff --git a/codeflare/pipelines/Datamodel.py b/codeflare/pipelines/Datamodel.py
index 73a5d7c..9b9ee36 100644
--- a/codeflare/pipelines/Datamodel.py
+++ b/codeflare/pipelines/Datamodel.py
@@ -898,7 +898,7 @@ def has_single_estimator(self):
         if len(self.get_output_nodes()) > 1:
             return False
 
-        for node in self.__node_name_map__.keys():
+        for node in self.__node_name_map__.values():
             is_node_estimator = (node.get_node_input_type() == NodeInputType.OR)
             if is_node_estimator:
                 pre_nodes = self.get_pre_nodes(node)
diff --git a/main.py b/main.py
deleted file mode 100644
index 694ee23..0000000
--- a/main.py
+++ /dev/null
@@ -1,55 +0,0 @@
-# This is a sample Python script.
-
-# Press ⌃R to execute it or replace it with your code.
-# Press Double ⇧ to search everywhere for classes, files, tool windows, actions, and settings.
-
-from sklearn.base import ClassifierMixin
-from sklearn.ensemble import RandomForestClassifier
-
-
-class ScaleTestEstimator(ClassifierMixin):
-    __num_iters__ = 100
-    __randomforest_classifier__ : RandomForestClassifier = None
-
-    def __init__(self, num_iters, rf_classifier: RandomForestClassifier):
-        self.__num_iters__ = num_iters
-        self.__randomforest_classifier__ = rf_classifier
-
-    def fit(self, X, y):
-        for i in range(self.__num_iters__):
-            self.__randomforest_classifier__.fit(X, y)
-
-    def score(self, X, y, sample_weight=None):
-        return self.__randomforest_classifier__.score(X, y, sample_weight)
-
-
-def print_hi(name):
-    # Use a breakpoint in the code line below to debug your script.
-    print(f'Hi, {name}')  # Press ⌘F8 to toggle the breakpoint.
-
-
-# Press the green button in the gutter to run the script.
-if __name__ == '__main__':
-    print_hi('PyCharm')
-
-    import pandas as pd
-
-    train = pd.read_csv('resources/data/train_ctrUa4K.csv')
-    test = pd.read_csv('resources/data/test_lAUu6dG.csv')
-    train = train.drop('Loan_ID', axis=1)
-    train.dtypes
-
-    X = train.drop('Loan_Status', axis=1)
-    y = train['Loan_Status']
-    from sklearn.model_selection import train_test_split
-
-    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
-
-    from sklearn.tree import DecisionTreeClassifier
-    from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
-
-    c_a = ScaleTestEstimator(50, DecisionTreeClassifier())
-    c_b = ScaleTestEstimator(50, RandomForestClassifier())
-    c_c = ScaleTestEstimator(50, GradientBoostingClassifier())
-
-# See PyCharm help at https://www.jetbrains.com/help/pycharm/
diff --git a/notebooks/Cross Product.ipynb b/notebooks/Cross Product.ipynb
deleted file mode 100644
index b135a40..0000000
--- a/notebooks/Cross Product.ipynb	
+++ /dev/null
@@ -1,111 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "id": "weighted-money",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "l1 = [1, 2, 3]\n",
-    "l2 = [4, 5, 6]\n",
-    "l3 = [7, 8, 9]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "id": "relevant-conclusion",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "%config IPCompleter.use_jedi = False"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "id": "downtown-alignment",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import itertools"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "id": "raising-millennium",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "lists = [l1, l2, l3]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 20,
-   "id": "changed-residence",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "cp = itertools.product(*lists)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 21,
-   "id": "operating-cartoon",
-   "metadata": {},
-   "outputs": [
-    {
-     "ename": "TypeError",
-     "evalue": "type.__new__() argument 1 must be str, not int",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
-      "\u001b[0;32m<ipython-input-21-9bdb4cb8b88a>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mele\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mcp\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m     \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0mele\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
-      "\u001b[0;31mTypeError\u001b[0m: type.__new__() argument 1 must be str, not int"
-     ]
-    }
-   ],
-   "source": [
-    "for ele in cp:\n",
-    "    print(type(*ele))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "upper-kansas",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def myfun(x):\n",
-    "    print()"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.7.9"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/notebooks/Grid Search Sample.ipynb b/notebooks/Grid Search Sample.ipynb
index da0bac5..72d1f3f 100644
--- a/notebooks/Grid Search Sample.ipynb	
+++ b/notebooks/Grid Search Sample.ipynb	
@@ -1,5 +1,14 @@
 {
  "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "aff2abb3",
+   "metadata": {},
+   "source": [
+    "## Grid search sample\n",
+    "This notebook shows how CodeFlare pipelines can be used to perform grid search, where the standard parameters in a param grid can be input to the pipeline and one can call `grid_search_cv` to perform grid search."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 1,
@@ -35,11 +44,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 20,
    "id": "b3344db0",
    "metadata": {},
    "outputs": [],
    "source": [
+    "# we will use a standard pipeline that is explained in more detail in a separate notebook/blog\n",
     "X_digits, y_digits = datasets.load_digits(return_X_y=True)"
    ]
   },
@@ -66,21 +76,21 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "2021-06-09 17:11:12,675\tINFO services.py:1269 -- View the Ray dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265\u001b[39m\u001b[22m\n"
+      "2021-06-19 20:59:26,196\tINFO services.py:1269 -- View the Ray dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265\u001b[39m\u001b[22m\n"
      ]
     },
     {
      "data": {
       "text/plain": [
-       "{'node_ip_address': '192.168.1.37',\n",
-       " 'raylet_ip_address': '192.168.1.37',\n",
-       " 'redis_address': '192.168.1.37:6379',\n",
-       " 'object_store_address': '/tmp/ray/session_2021-06-09_17-11-11_289352_82102/sockets/plasma_store',\n",
-       " 'raylet_socket_name': '/tmp/ray/session_2021-06-09_17-11-11_289352_82102/sockets/raylet',\n",
+       "{'node_ip_address': '9.211.53.245',\n",
+       " 'raylet_ip_address': '9.211.53.245',\n",
+       " 'redis_address': '9.211.53.245:6379',\n",
+       " 'object_store_address': '/tmp/ray/session_2021-06-19_20-59-24_761561_86418/sockets/plasma_store',\n",
+       " 'raylet_socket_name': '/tmp/ray/session_2021-06-19_20-59-24_761561_86418/sockets/raylet',\n",
        " 'webui_url': '127.0.0.1:8265',\n",
-       " 'session_dir': '/tmp/ray/session_2021-06-09_17-11-11_289352_82102',\n",
-       " 'metrics_export_port': 59484,\n",
-       " 'node_id': '69baeb54516bd687adc2c1ebadb922fcf1d7d68f4ae9f1289bb9f879'}"
+       " 'session_dir': '/tmp/ray/session_2021-06-19_20-59-24_761561_86418',\n",
+       " 'metrics_export_port': 61314,\n",
+       " 'node_id': 'd6148d041e7ad9fe60b016d54e9d91b6a0af574445694f2c2048f14d'}"
       ]
      },
      "execution_count": 6,
@@ -119,7 +129,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 8,
    "id": "5bfebb1e",
    "metadata": {},
    "outputs": [],
@@ -129,7 +139,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 9,
    "id": "dfce8aa9",
    "metadata": {},
    "outputs": [
@@ -168,10 +178,10 @@
        "</svg>\n"
       ],
       "text/plain": [
-       "<graphviz.dot.Digraph at 0x7fc7191ec3d0>"
+       "<graphviz.dot.Digraph at 0x7fa2a9010150>"
       ]
      },
-     "execution_count": 16,
+     "execution_count": 9,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -183,7 +193,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 10,
    "id": "cf886598",
    "metadata": {},
    "outputs": [],
@@ -198,7 +208,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 11,
    "id": "414f1f58",
    "metadata": {},
    "outputs": [],
@@ -206,9 +216,18 @@
     "parameterized_pipeline = pipeline.get_parameterized_pipeline(pipeline_param=pipeline_param)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "4f1bae14",
+   "metadata": {},
+   "source": [
+    "### Expansion of parametric pipeline\n",
+    "Given the `param_grid`, the parameterized pipeline is \"expanded\" to reflect the parallelism. One can see that we now have 9 nodes reflecting the combinations in the parameter grid. Further, note that the k-fold cross validation also results in parallelism, which is data-level parallelism and is not reflected in the DAG itself."
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 12,
    "id": "05ce5b1a",
    "metadata": {},
    "outputs": [
@@ -403,10 +422,10 @@
        "</svg>\n"
       ],
       "text/plain": [
-       "<graphviz.dot.Digraph at 0x7fc708c53790>"
+       "<graphviz.dot.Digraph at 0x7fa2a90193d0>"
       ]
      },
-     "execution_count": 15,
+     "execution_count": 12,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -418,7 +437,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 13,
    "id": "0af813c2",
    "metadata": {},
    "outputs": [],
@@ -429,7 +448,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 14,
    "id": "4020342e",
    "metadata": {},
    "outputs": [
@@ -437,8 +456,8 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "CPU times: user 1.5 s, sys: 903 ms, total: 2.41 s\n",
-      "Wall time: 9.33 s\n"
+      "CPU times: user 1.41 s, sys: 1.21 s, total: 2.63 s\n",
+      "Wall time: 9.21 s\n"
      ]
     }
    ],
@@ -449,7 +468,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 15,
    "id": "bcf25c0d",
    "metadata": {},
    "outputs": [
@@ -488,7 +507,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 16,
    "id": "0aa813bc",
    "metadata": {},
    "outputs": [
@@ -498,7 +517,7 @@
        "\"pca__4{'copy': True, 'iterated_power': 'auto', 'n_components': 64, 'random_state': None, 'svd_solver': 'auto', 'tol': 0.0, 'whiten': False}=\\r\\nlogistic__1{'C': 0.046415888336127774, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 10000, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.1, 'verbose': 0, 'warm_start': False}=pca__4{'copy': True, 'iterated_power': 'auto', 'n_components': 64, 'random_state': None, 'svd_solver': 'auto', 'tol': 0.0, 'whiten': False} \\r\\n\""
       ]
      },
-     "execution_count": 20,
+     "execution_count": 16,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -509,7 +528,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 17,
    "id": "d1b0b7f5",
    "metadata": {},
    "outputs": [
@@ -519,7 +538,7 @@
        "0.9260058805323429"
       ]
      },
-     "execution_count": 21,
+     "execution_count": 17,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -530,11 +549,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 21,
    "id": "7db8d362",
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "# We are currently working to bring the CF pipeline grid search API closer to parity with the\n",
+    "# SKLearn grid search APIs, so that it provides similar methods."
+   ]
   },
   {
    "cell_type": "code",
@@ -546,7 +568,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 18,
    "id": "877e3f63",
    "metadata": {},
    "outputs": [],
@@ -580,7 +602,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 19,
    "id": "195c0d15",
    "metadata": {},
    "outputs": [
@@ -590,8 +612,8 @@
      "text": [
       "Best parameter (CV score=0.920):\n",
       "{'logistic__C': 0.046415888336127774, 'pca__n_components': 45}\n",
-      "CPU times: user 1min 40s, sys: 39.5 s, total: 2min 19s\n",
-      "Wall time: 20.4 s\n"
+      "CPU times: user 1min 31s, sys: 46.2 s, total: 2min 17s\n",
+      "Wall time: 21.1 s\n"
      ]
     }
    ],
@@ -601,6 +623,16 @@
     "print(\"Best parameter (CV score=%0.3f):\" % search.best_score_)\n",
     "print(search.best_params_)"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "c568665a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ray.shutdown()"
+   ]
   }
  ],
  "metadata": {
diff --git a/notebooks/Ray graph experiments.ipynb b/notebooks/Ray graph experiments.ipynb
deleted file mode 100644
index 8f8aa3a..0000000
--- a/notebooks/Ray graph experiments.ipynb	
+++ /dev/null
@@ -1,269 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "id": "israeli-batch",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import ray"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "id": "loose-projector",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "ray.shutdown()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "id": "renewable-western",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "2021-06-04 10:55:36,533\tINFO services.py:1269 -- View the Ray dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265\u001b[39m\u001b[22m\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "{'node_ip_address': '192.168.1.37',\n",
-       " 'raylet_ip_address': '192.168.1.37',\n",
-       " 'redis_address': '192.168.1.37:6379',\n",
-       " 'object_store_address': '/tmp/ray/session_2021-06-04_10-55-35_103935_66596/sockets/plasma_store',\n",
-       " 'raylet_socket_name': '/tmp/ray/session_2021-06-04_10-55-35_103935_66596/sockets/raylet',\n",
-       " 'webui_url': '127.0.0.1:8265',\n",
-       " 'session_dir': '/tmp/ray/session_2021-06-04_10-55-35_103935_66596',\n",
-       " 'metrics_export_port': 62153,\n",
-       " 'node_id': 'b55f2340f68d5439d9728c6605bd1ae90b31d893b63c92dea02643fd'}"
-      ]
-     },
-     "execution_count": 3,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "ray.init()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "id": "scheduled-miami",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "%config IPCompleter.use_jedi = False"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "id": "leading-sheriff",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import numpy as np\n",
-    "from sklearn.preprocessing import FunctionTransformer\n",
-    "from sklearn.preprocessing import MinMaxScaler\n",
-    "\n",
-    "transformer = FunctionTransformer(np.log1p)\n",
-    "minmax_scaler = MinMaxScaler()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "id": "9adf2a27",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import codeflare.pipelines.Datamodel as dm\n",
-    "\n",
-    "class FeatureUnion(dm.AndEstimator):\n",
-    "    def __init__(self):\n",
-    "        pass\n",
-    "    \n",
-    "    def fit_transform(self, xy_list: list):\n",
-    "        return self.transform(xy_list)\n",
-    "\n",
-    "    def get_estimator_type(self):\n",
-    "        return 'transform'\n",
-    "    \n",
-    "    def transform(self, xy_list):\n",
-    "        X_list = []\n",
-    "        y_list = []\n",
-    "        \n",
-    "        for xy in xy_list:\n",
-    "            X_list.append(xy.get_x())\n",
-    "        X_concat = np.concatenate(X_list, axis=0)\n",
-    "        \n",
-    "        return dm.Xy(X_concat, None)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "id": "olive-usage",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "pipeline = dm.Pipeline()\n",
-    "\n",
-    "node_a = dm.EstimatorNode('a', minmax_scaler)\n",
-    "node_b = dm.EstimatorNode('b', minmax_scaler)\n",
-    "node_c = dm.AndNode('c', FeatureUnion())\n",
-    "# node_d = dm.OrNode('d', transformer)\n",
-    "\n",
-    "pipeline.add_edge(node_a, node_c)\n",
-    "pipeline.add_edge(node_b, node_c)\n",
-    "# pipeline.add_edge(node_b, node_d)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "id": "behind-dairy",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "X = np.array([0, 1, 2, 3])\n",
-    "X = np.reshape(X, (4, 1))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "id": "uniform-coordinator",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import codeflare.pipelines.Datamodel as dm"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "id": "46de1312",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "pipeline_input = dm.PipelineInput()\n",
-    "xy = dm.Xy(X, None)\n",
-    "pipeline_input.add_xy_arg(node_a, xy)\n",
-    "pipeline_input.add_xy_arg(node_b, xy)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "id": "handy-offset",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import codeflare.pipelines.Runtime as rt"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 13,
-   "id": "julian-clerk",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from codeflare.pipelines.Runtime import ExecutionType"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 14,
-   "id": "specialized-health",
-   "metadata": {
-    "scrolled": true
-   },
-   "outputs": [],
-   "source": [
-    "pipeline_output = rt.execute_pipeline(pipeline, ExecutionType.FIT, pipeline_input)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 15,
-   "id": "e963bdd4",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "out_Xyrefs = pipeline_output.get_xyrefs(node_c)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 17,
-   "id": "living-destiny",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[[0.        ]\n",
-      " [0.33333333]\n",
-      " [0.66666667]\n",
-      " [1.        ]\n",
-      " [0.        ]\n",
-      " [0.33333333]\n",
-      " [0.66666667]\n",
-      " [1.        ]]\n",
-      "FeatureUnion()\n"
-     ]
-    }
-   ],
-   "source": [
-    "for out_xyref in out_Xyrefs:\n",
-    "    x = ray.get(out_xyref.get_Xref())\n",
-    "    and_func = ray.get(out_xyref.get_curr_node_state_ref()).get_estimator()\n",
-    "    print(x)\n",
-    "    print(and_func)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "b0188d6f",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.7.9"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/notebooks/lale_cross_val_score.ipynb b/notebooks/lale_cross_val_score.ipynb
index 6515ee3..b59cfd1 100644
--- a/notebooks/lale_cross_val_score.ipynb
+++ b/notebooks/lale_cross_val_score.ipynb
@@ -1,142 +1,150 @@
 {
  "cells": [
   {
-   "cell_type": "code",
-   "execution_count": 1,
+   "cell_type": "markdown",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "from lale.datasets.openml import fetch"
+    "## LALE cross validation example\n",
+    "This example, developed in collaboration with the [LALE team](https://github.com/IBM/lale), demonstrates how a LALE pipeline can be translated into a CodeFlare pipeline, targeting cross validation.\n",
+    "\n",
+    "It assumes that LALE is available and installed in your local environment.\n",
+    "\n",
+    "Running this notebook shows that LALE cross validation is single-threaded and takes ~10 minutes on a laptop (depending on the configuration), whereas with CodeFlare pipelines running on Ray this time is reduced to around 75 seconds (with 8x parallelism) for the 10-fold cross validation."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
-    "(X_train, y_train), (X_test, y_test) = fetch(\"jungle_chess_2pcs_raw_endgame_complete\", \"classification\")"
+    "# Uncomment below to install lale for running this notebook\n",
+    "\n",
+    "# !pip install lale\n",
+    "# !pip install 'liac-arff>=2.4.0'"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
-    "import ray\n",
-    "ray.shutdown()"
+    "from lale.datasets.openml import fetch"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# from lale.helpers import cross_val_score"
+    "(X_train, y_train), (X_test, y_test) = fetch(\"jungle_chess_2pcs_raw_endgame_complete\", \"classification\")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# sk_pipeline = pipeline.export_to_sklearn_pipeline()"
+    "# First, we will show how this data can be used to do cross validation using a simple pipeline with random forest\n",
+    "from lale.lib.sklearn import PCA, Nystroem, SelectKBest, RandomForestClassifier\n",
+    "from lale.lib.lale import ConcatFeatures\n",
+    "\n",
+    "pipeline = (PCA() & Nystroem() & SelectKBest(k=3)) >> ConcatFeatures() >> RandomForestClassifier(n_estimators=200)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 9,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 7min 18s, sys: 8.91 s, total: 7min 27s\n",
+      "Wall time: 6min 59s\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "[0.8161838161838162,\n",
+       " 0.8105228105228105,\n",
+       " 0.8148518148518149,\n",
+       " 0.8218448218448219,\n",
+       " 0.8208458208458208,\n",
+       " 0.8111888111888111,\n",
+       " 0.8105228105228105,\n",
+       " 0.8181818181818182,\n",
+       " 0.8011325782811459,\n",
+       " 0.8121252498334444]"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "# sk_pipeline"
+    "%%time\n",
+    "from lale.helpers import cross_val_score\n",
+    "cross_val_score(pipeline, X_train, y_train, cv=10)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "2021-05-26 11:34:55,429\tINFO services.py:1269 -- View the Ray dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8267\u001b[39m\u001b[22m\n"
+      "2021-06-19 21:02:55,145\tINFO services.py:1269 -- View the Ray dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8266\u001b[39m\u001b[22m\n"
      ]
     },
     {
      "data": {
       "text/plain": [
-       "{'node_ip_address': '9.211.114.84',\n",
-       " 'raylet_ip_address': '9.211.114.84',\n",
-       " 'redis_address': '9.211.114.84:20725',\n",
-       " 'object_store_address': '/tmp/ray/session_2021-05-26_11-34-53_711763_35035/sockets/plasma_store',\n",
-       " 'raylet_socket_name': '/tmp/ray/session_2021-05-26_11-34-53_711763_35035/sockets/raylet',\n",
-       " 'webui_url': '127.0.0.1:8267',\n",
-       " 'session_dir': '/tmp/ray/session_2021-05-26_11-34-53_711763_35035',\n",
-       " 'metrics_export_port': 60055,\n",
-       " 'node_id': '6f77fe1897704e0f2f8967852a1ccd077086efebfb0634ec3ca3d130'}"
+       "{'node_ip_address': '9.211.53.245',\n",
+       " 'raylet_ip_address': '9.211.53.245',\n",
+       " 'redis_address': '9.211.53.245:29680',\n",
+       " 'object_store_address': '/tmp/ray/session_2021-06-19_21-02-53_442708_86627/sockets/plasma_store',\n",
+       " 'raylet_socket_name': '/tmp/ray/session_2021-06-19_21-02-53_442708_86627/sockets/raylet',\n",
+       " 'webui_url': '127.0.0.1:8266',\n",
+       " 'session_dir': '/tmp/ray/session_2021-06-19_21-02-53_442708_86627',\n",
+       " 'metrics_export_port': 61863,\n",
+       " 'node_id': 'eff8f3d5558252aa2d18c57b62891d1a169a7404979a63b6c9882a14'}"
       ]
      },
-     "execution_count": 7,
+     "execution_count": 3,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
+    "# Start Ray and init\n",
+    "\n",
     "import ray\n",
     "ray.init(object_store_memory=16 * 1024 * 1024 * 1024)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import codeflare.pipelines.Datamodel as dm"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from sklearn.model_selection import KFold, StratifiedKFold"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
+    "from sklearn.model_selection import KFold, StratifiedKFold\n",
     "kf = StratifiedKFold(n_splits=10)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "pipeline = dm.Pipeline()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -149,7 +157,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -162,7 +170,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -171,26 +179,23 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [],
    "source": [
+    "import codeflare.pipelines.Datamodel as dm\n",
+    "\n",
+    "# Create the CF pipeline and the nodes, add the edge\n",
+    "pipeline = dm.Pipeline()\n",
     "node_fu = dm.EstimatorNode('feature_union', feature_union)\n",
-    "node_rf = dm.EstimatorNode('randomforest', random_forest)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 16,
-   "metadata": {},
-   "outputs": [],
-   "source": [
+    "node_rf = dm.EstimatorNode('randomforest', random_forest)\n",
+    "\n",
     "pipeline.add_edge(node_fu, node_rf)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -200,7 +205,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -211,15 +216,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "CPU times: user 3.39 s, sys: 1.64 s, total: 5.02 s\n",
-      "Wall time: 1min 22s\n"
+      "CPU times: user 3.06 s, sys: 2.11 s, total: 5.17 s\n",
+      "Wall time: 1min 15s\n"
      ]
     }
    ],
@@ -230,25 +235,25 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "[0.8185148185148186,\n",
+       "[0.8161838161838162,\n",
+       " 0.8118548118548119,\n",
+       " 0.8145188145188145,\n",
+       " 0.8198468198468198,\n",
        " 0.8175158175158175,\n",
-       " 0.8128538128538129,\n",
-       " 0.8195138195138195,\n",
-       " 0.8228438228438228,\n",
-       " 0.8168498168498168,\n",
-       " 0.8131868131868132,\n",
-       " 0.8161838161838162,\n",
-       " 0.7991339107261826,\n",
-       " 0.8077948034643571]"
+       " 0.8111888111888111,\n",
+       " 0.8158508158508159,\n",
+       " 0.8171828171828172,\n",
+       " 0.8037974683544303,\n",
+       " 0.8087941372418388]"
       ]
      },
-     "execution_count": 21,
+     "execution_count": 12,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -259,61 +264,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 13,
    "metadata": {},
    "outputs": [],
    "source": [
-    "from lale.lib.sklearn import PCA, Nystroem, SelectKBest, RandomForestClassifier\n",
-    "from lale.lib.lale import ConcatFeatures\n",
-    "\n",
-    "pipeline = (PCA() & Nystroem() & SelectKBest(k=3)) >> ConcatFeatures() >> RandomForestClassifier(n_estimators=200)\n",
-    "# pipeline.visualize()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 25,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "CPU times: user 7min 39s, sys: 13.1 s, total: 7min 52s\n",
-      "Wall time: 7min 13s\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "[0.8185148185148186,\n",
-       " 0.8101898101898102,\n",
-       " 0.8148518148518149,\n",
-       " 0.8201798201798202,\n",
-       " 0.8145188145188145,\n",
-       " 0.8185148185148186,\n",
-       " 0.8168498168498168,\n",
-       " 0.8125208125208125,\n",
-       " 0.7978014656895404,\n",
-       " 0.8114590273151232]"
-      ]
-     },
-     "execution_count": 25,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "%%time\n",
-    "from lale.helpers import cross_val_score\n",
-    "cross_val_score(pipeline, X_train, y_train, cv=10)"
+    "ray.shutdown()"
    ]
   },
   {

From 0674aec2ab46aeb6795a326f607cb177124626bc Mon Sep 17 00:00:00 2001
From: Raghu Ganti <rganti@us.ibm.com>
Date: Sun, 20 Jun 2021 11:58:11 -0400
Subject: [PATCH 2/2] Cleanup of sample pipeline

---
 notebooks/sample_pipeline.ipynb | 369 +++++++++++++++-----------------
 1 file changed, 176 insertions(+), 193 deletions(-)

diff --git a/notebooks/sample_pipeline.ipynb b/notebooks/sample_pipeline.ipynb
index 9b1928f..594b42f 100644
--- a/notebooks/sample_pipeline.ipynb
+++ b/notebooks/sample_pipeline.ipynb
@@ -1,8 +1,17 @@
 {
  "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "0c3cf197",
+   "metadata": {},
+   "source": [
+    "## Sample pipeline\n",
+    "This is a sample pipeline drawn from a competition posted on [Kaggle](https://www.kaggle.com/ragharamya/loanprediction). The notebook below demonstrates a preprocessor followed by multiple classifiers that are explored in parallel."
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "id": "described-lover",
    "metadata": {},
    "outputs": [],
@@ -12,10 +21,33 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "id": "simplified-summit",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Gender                object\n",
+       "Married               object\n",
+       "Dependents            object\n",
+       "Education             object\n",
+       "Self_Employed         object\n",
+       "ApplicantIncome        int64\n",
+       "CoapplicantIncome    float64\n",
+       "LoanAmount           float64\n",
+       "Loan_Amount_Term     float64\n",
+       "Credit_History       float64\n",
+       "Property_Area         object\n",
+       "Loan_Status           object\n",
+       "dtype: object"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "import pandas as pd\n",
     "train = pd.read_csv('../resources/data/train_ctrUa4K.csv')\n",
@@ -26,30 +58,31 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "id": "configured-clinton",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "train.shape"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "id": "confident-union",
    "metadata": {},
    "outputs": [],
    "source": [
+    "# prepare the dataset for training\n",
+    "\n",
     "X = train.drop('Loan_Status', axis=1)\n",
     "y = train['Loan_Status']\n",
     "from sklearn.model_selection import train_test_split\n",
     "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "5f1793c1",
+   "metadata": {},
+   "source": [
+    "## SKLearn pipeline\n",
+    "Below, we show how SKLearn is used to create one pipeline per classifier and then fit each of these pipelines to explore multiple models."
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
    "id": "visible-compact",
    "metadata": {},
    "outputs": [],
@@ -75,7 +108,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
    "id": "stupid-miracle",
    "metadata": {},
    "outputs": [],
@@ -91,22 +124,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "id": "accredited-japan",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import time\n",
-    "\n",
-    "start = time.time()\n",
-    "Xt = preprocessor.fit(X_train)\n",
-    "end = time.time()\n",
-    "print('Time taken: ' + str(end - start))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
    "id": "fundamental-builder",
    "metadata": {},
    "outputs": [],
@@ -118,47 +136,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "id": "ahead-narrow",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from sklearn.base import ClassifierMixin\n",
-    "from sklearn.base import BaseEstimator\n",
-    "\n",
-    "class ScaleTestEstimator(ClassifierMixin, BaseEstimator):\n",
-    "    num_iters = 100\n",
-    "    classifier : ClassifierMixin = None\n",
-    "\n",
-    "    def __init__(self, num_iters, classifier: ClassifierMixin):\n",
-    "        self.num_iters = num_iters\n",
-    "        self.classifier = classifier\n",
-    "\n",
-    "    def fit(self, X, y):\n",
-    "        for i in range(self.num_iters):\n",
-    "            self.classifier.fit(X, y)\n",
-    "        return self\n",
-    "            \n",
-    "    def predict(self, X):\n",
-    "        return self.classifier.predict(X)\n",
-    "\n",
-    "    def score(self, X, y, sample_weight=None):\n",
-    "        return self.classifier.score(X, y, sample_weight)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "specialized-provider",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "Xt = preprocessor.fit_transform(X_train)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 7,
    "id": "above-masters",
    "metadata": {},
    "outputs": [],
@@ -174,106 +152,38 @@
     "    KNeighborsClassifier(3),\n",
     "    SVC(kernel=\"rbf\", C=0.025, probability=True),\n",
     "    NuSVC(probability=True),\n",
-    "    DecisionTreeClassifier(),\n",
     "    RandomForestClassifier(),\n",
-    "    AdaBoostClassifier(),\n",
     "    GradientBoostingClassifier()\n",
     "    ]"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "id": "bacterial-morocco",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "classifiers[0]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "sudden-british",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "c_a = ScaleTestEstimator(50, DecisionTreeClassifier())\n",
-    "c_b = ScaleTestEstimator(50, RandomForestClassifier())\n",
-    "c_c = ScaleTestEstimator(50, GradientBoostingClassifier())\n",
-    "classifiers = [c_a, c_b, c_c]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "imposed-practice",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import sklearn.base as base"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "tired-breast",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "base.is_classifier(c_a)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "overall-review",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "base.clone(c_a)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 18,
    "id": "greatest-cancellation",
    "metadata": {},
    "outputs": [],
    "source": [
-    "import time\n",
-    "start = time.time()\n",
-    "\n",
-    "c_a = ScaleTestEstimator(50, DecisionTreeClassifier())\n",
-    "c_b = ScaleTestEstimator(50, RandomForestClassifier())\n",
-    "c_c = ScaleTestEstimator(50, GradientBoostingClassifier())\n",
-    "classifiers = [c_a, c_b, c_c]\n",
-    "\n",
     "classifier_results=[]\n",
     "for classifier in classifiers:\n",
     "    pipe = Pipeline(steps=[('preprocessor', preprocessor),\n",
     "                      ('classifier', classifier)])\n",
     "    pipe.fit(X_train, y_train)\n",
-    "    pipe.predict(X_train)\n",
-    "    \n",
-    "end = time.time()\n",
-    "tt = end - start\n",
-    "print('time taken: ' + str(tt))"
+    "    pipe.predict(X_train)"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "a1757296",
+   "cell_type": "markdown",
+   "id": "fef872c4",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "c_a.classifier.feature_importances_"
+    "## CodeFlare pipelines\n",
+    "Below, we show how the same exploration can be done with the CodeFlare pipelines approach."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 19,
    "id": "coordinate-gossip",
    "metadata": {},
    "outputs": [],
@@ -284,17 +194,43 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 20,
    "id": "bfe20bd8",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2021-06-20 11:53:04,117\tINFO services.py:1269 -- View the Ray dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265\u001b[39m\u001b[22m\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "{'node_ip_address': '9.211.53.245',\n",
+       " 'raylet_ip_address': '9.211.53.245',\n",
+       " 'redis_address': '9.211.53.245:6379',\n",
+       " 'object_store_address': '/tmp/ray/session_2021-06-20_11-53-02_767779_97928/sockets/plasma_store',\n",
+       " 'raylet_socket_name': '/tmp/ray/session_2021-06-20_11-53-02_767779_97928/sockets/raylet',\n",
+       " 'webui_url': '127.0.0.1:8265',\n",
+       " 'session_dir': '/tmp/ray/session_2021-06-20_11-53-02_767779_97928',\n",
+       " 'metrics_export_port': 64339,\n",
+       " 'node_id': '8c45750634c387b09b787fb7a0aa191df63f6d5274861bad27c00cb3'}"
+      ]
+     },
+     "execution_count": 20,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "ray.init()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 21,
    "id": "invisible-consensus",
    "metadata": {},
    "outputs": [],
@@ -304,7 +240,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 31,
    "id": "surface-recruitment",
    "metadata": {},
    "outputs": [],
@@ -314,32 +250,28 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 32,
    "id": "humanitarian-boards",
    "metadata": {},
    "outputs": [],
    "source": [
     "node_a = dm.EstimatorNode('preprocess', preprocessor)\n",
-    "node_b = dm.EstimatorNode('c_a', c_a)\n",
-    "node_c = dm.EstimatorNode('c_b', c_b)\n",
-    "node_d = dm.EstimatorNode('c_c', c_c)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "popular-bookmark",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "pipeline.add_edge(node_a, node_b)\n",
-    "pipeline.add_edge(node_a, node_c)\n",
-    "pipeline.add_edge(node_a, node_d)"
+    "node_0 = dm.EstimatorNode('node_0', classifiers[0])\n",
+    "node_1 = dm.EstimatorNode('node_1', classifiers[1])\n",
+    "node_2 = dm.EstimatorNode('node_2', classifiers[2])\n",
+    "node_3 = dm.EstimatorNode('node_3', classifiers[3])\n",
+    "node_4 = dm.EstimatorNode('node_4', classifiers[4])\n",
+    "\n",
+    "pipeline.add_edge(node_a, node_0)\n",
+    "pipeline.add_edge(node_a, node_1)\n",
+    "pipeline.add_edge(node_a, node_2)\n",
+    "pipeline.add_edge(node_a, node_3)\n",
+    "pipeline.add_edge(node_a, node_4)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 33,
    "id": "ef9ff37b",
    "metadata": {},
    "outputs": [],
@@ -352,7 +284,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 34,
    "id": "mineral-analyst",
    "metadata": {},
    "outputs": [],
@@ -362,7 +294,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 35,
    "id": "literary-consolidation",
    "metadata": {},
    "outputs": [],
@@ -372,51 +304,102 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 36,
    "id": "educated-basement",
    "metadata": {},
    "outputs": [],
    "source": [
-    "start = time.time()\n",
-    "\n",
     "pipeline_output = rt.execute_pipeline(pipeline, ExecutionType.FIT, pipeline_input)\n",
-    "\n",
-    "node_b_output = pipeline_output.get_xyrefs(node_b)\n",
-    "node_c_output = pipeline_output.get_xyrefs(node_c)\n",
-    "node_d_output = pipeline_output.get_xyrefs(node_d)\n",
-    "\n",
-    "end = time.time()\n",
-    "print ('Time taken: ' + str(end - start))"
+    "node_0_output = pipeline_output.get_xyrefs(node_0)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "id": "be9e1c3a",
+   "execution_count": 37,
+   "id": "4df7fb29",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[<codeflare.pipelines.Datamodel.XYRef at 0x7f97087200d0>]"
+      ]
+     },
+     "execution_count": 37,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "node_b_output"
+    "outputs[0]"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "id": "9e153862",
+   "execution_count": 39,
+   "id": "41955886",
    "metadata": {},
    "outputs": [],
    "source": [
-    "selected_pipeline = rt.select_pipeline(pipeline_output, node_b_output[0])"
+    "X_out = ray.get(outputs[0][0].get_Xref())"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "id": "c612e1b0",
+   "execution_count": 40,
+   "id": "7e05451f",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array(['Y', 'N', 'N', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'Y',\n",
+       "       'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y',\n",
+       "       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N',\n",
+       "       'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'N',\n",
+       "       'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y',\n",
+       "       'Y', 'Y', 'Y', 'N', 'Y', 'N', 'N', 'Y', 'N', 'Y', 'Y', 'N', 'Y',\n",
+       "       'Y', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'N', 'N', 'Y', 'N', 'N',\n",
+       "       'N', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'N', 'N',\n",
+       "       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'N', 'Y',\n",
+       "       'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y',\n",
+       "       'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'N', 'Y',\n",
+       "       'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'Y',\n",
+       "       'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',\n",
+       "       'N', 'N', 'Y', 'Y', 'N', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y',\n",
+       "       'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',\n",
+       "       'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'N', 'Y', 'Y', 'N', 'Y',\n",
+       "       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y',\n",
+       "       'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'N', 'Y',\n",
+       "       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',\n",
+       "       'N', 'N', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',\n",
+       "       'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'N',\n",
+       "       'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y',\n",
+       "       'N', 'Y', 'Y', 'N', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',\n",
+       "       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'Y',\n",
+       "       'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N',\n",
+       "       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y',\n",
+       "       'Y', 'N', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'Y',\n",
+       "       'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',\n",
+       "       'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'Y',\n",
+       "       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y',\n",
+       "       'Y', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y',\n",
+       "       'Y', 'N', 'Y', 'Y', 'Y', 'N', 'N', 'N', 'Y', 'Y', 'N', 'Y', 'Y',\n",
+       "       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'N',\n",
+       "       'N', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',\n",
+       "       'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'Y',\n",
+       "       'Y', 'Y', 'Y', 'N', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',\n",
+       "       'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y',\n",
+       "       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y'], dtype=object)"
+      ]
+     },
+     "execution_count": 40,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "print(selected_pipeline)"
+    "X_out"
    ]
   }
  ],
@@ -439,7 +422,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.6"
+   "version": "3.7.9"
   }
  },
  "nbformat": 4,