From 0f0359b75280a2a7db91579d3c5fc1820d4a51a9 Mon Sep 17 00:00:00 2001 From: Raghu Ganti Date: Sat, 19 Jun 2021 21:09:38 -0400 Subject: [PATCH 1/2] 1. Fixing a minor error in Datamodel, where the node name map keys are used instead of values. The keys are strings and we need to use values, which are of node type. 2. Cleaning up notebooks, adding comments, removing files that are not useful. --- codeflare/pipelines/Datamodel.py | 2 +- main.py | 55 ------ notebooks/Cross Product.ipynb | 111 ----------- notebooks/Grid Search Sample.ipynb | 100 ++++++---- notebooks/Ray graph experiments.ipynb | 269 -------------------------- notebooks/lale_cross_val_score.ipynb | 243 ++++++++++------------- 6 files changed, 166 insertions(+), 614 deletions(-) delete mode 100644 main.py delete mode 100644 notebooks/Cross Product.ipynb delete mode 100644 notebooks/Ray graph experiments.ipynb diff --git a/codeflare/pipelines/Datamodel.py b/codeflare/pipelines/Datamodel.py index 73a5d7c..9b9ee36 100644 --- a/codeflare/pipelines/Datamodel.py +++ b/codeflare/pipelines/Datamodel.py @@ -898,7 +898,7 @@ def has_single_estimator(self): if len(self.get_output_nodes()) > 1: return False - for node in self.__node_name_map__.keys(): + for node in self.__node_name_map__.values(): is_node_estimator = (node.get_node_input_type() == NodeInputType.OR) if is_node_estimator: pre_nodes = self.get_pre_nodes(node) diff --git a/main.py b/main.py deleted file mode 100644 index 694ee23..0000000 --- a/main.py +++ /dev/null @@ -1,55 +0,0 @@ -# This is a sample Python script. - -# Press ⌃R to execute it or replace it with your code. -# Press Double ⇧ to search everywhere for classes, files, tool windows, actions, and settings. 
- -from sklearn.base import ClassifierMixin -from sklearn.ensemble import RandomForestClassifier - - -class ScaleTestEstimator(ClassifierMixin): - __num_iters__ = 100 - __randomforest_classifier__ : RandomForestClassifier = None - - def __init__(self, num_iters, rf_classifier: RandomForestClassifier): - self.__num_iters__ = num_iters - self.__randomforest_classifier__ = rf_classifier - - def fit(self, X, y): - for i in range(self.__num_iters__): - self.__randomforest_classifier__.fit(X, y) - - def score(self, X, y, sample_weight=None): - return self.__randomforest_classifier__.score(X, y, sample_weight) - - -def print_hi(name): - # Use a breakpoint in the code line below to debug your script. - print(f'Hi, {name}') # Press ⌘F8 to toggle the breakpoint. - - -# Press the green button in the gutter to run the script. -if __name__ == '__main__': - print_hi('PyCharm') - - import pandas as pd - - train = pd.read_csv('resources/data/train_ctrUa4K.csv') - test = pd.read_csv('resources/data/test_lAUu6dG.csv') - train = train.drop('Loan_ID', axis=1) - train.dtypes - - X = train.drop('Loan_Status', axis=1) - y = train['Loan_Status'] - from sklearn.model_selection import train_test_split - - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) - - from sklearn.tree import DecisionTreeClassifier - from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier - - c_a = ScaleTestEstimator(50, DecisionTreeClassifier()) - c_b = ScaleTestEstimator(50, RandomForestClassifier()) - c_c = ScaleTestEstimator(50, GradientBoostingClassifier()) - -# See PyCharm help at https://www.jetbrains.com/help/pycharm/ diff --git a/notebooks/Cross Product.ipynb b/notebooks/Cross Product.ipynb deleted file mode 100644 index b135a40..0000000 --- a/notebooks/Cross Product.ipynb +++ /dev/null @@ -1,111 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 3, - "id": "weighted-money", - "metadata": {}, - "outputs": [], - "source": [ - "l1 = [1, 2, 
3]\n", - "l2 = [4, 5, 6]\n", - "l3 = [7, 8, 9]" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "relevant-conclusion", - "metadata": {}, - "outputs": [], - "source": [ - "%config IPCompleter.use_jedi = False" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "downtown-alignment", - "metadata": {}, - "outputs": [], - "source": [ - "import itertools" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "raising-millennium", - "metadata": {}, - "outputs": [], - "source": [ - "lists = [l1, l2, l3]" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "changed-residence", - "metadata": {}, - "outputs": [], - "source": [ - "cp = itertools.product(*lists)" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "operating-cartoon", - "metadata": {}, - "outputs": [ - { - "ename": "TypeError", - "evalue": "type.__new__() argument 1 must be str, not int", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mele\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mcp\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0mele\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;31mTypeError\u001b[0m: type.__new__() argument 1 must be str, not int" - ] - } - ], - "source": [ - "for ele in cp:\n", - " print(type(*ele))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "upper-kansas", - "metadata": {}, - "outputs": [], - "source": [ - "def myfun(x):\n", - " 
print()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.9" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/notebooks/Grid Search Sample.ipynb b/notebooks/Grid Search Sample.ipynb index da0bac5..72d1f3f 100644 --- a/notebooks/Grid Search Sample.ipynb +++ b/notebooks/Grid Search Sample.ipynb @@ -1,5 +1,14 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "aff2abb3", + "metadata": {}, + "source": [ + "## Grid search sample\n", + "This notebook shows how CodeFlare pipelines can be used to perform grid search, where the standard parameters in a param grid can be input to the pipeline and one can call `grid_search_cv` to perform grid search." + ] + }, { "cell_type": "code", "execution_count": 1, @@ -35,11 +44,12 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 20, "id": "b3344db0", "metadata": {}, "outputs": [], "source": [ + "# we will use a standard pipeline that is explained in more detail in a separate notebook/blog\n", "X_digits, y_digits = datasets.load_digits(return_X_y=True)" ] }, @@ -66,21 +76,21 @@ "name": "stderr", "output_type": "stream", "text": [ - "2021-06-09 17:11:12,675\tINFO services.py:1269 -- View the Ray dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265\u001b[39m\u001b[22m\n" + "2021-06-19 20:59:26,196\tINFO services.py:1269 -- View the Ray dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265\u001b[39m\u001b[22m\n" ] }, { "data": { "text/plain": [ - "{'node_ip_address': '192.168.1.37',\n", - " 'raylet_ip_address': '192.168.1.37',\n", - " 'redis_address': '192.168.1.37:6379',\n", - " 'object_store_address': 
'/tmp/ray/session_2021-06-09_17-11-11_289352_82102/sockets/plasma_store',\n", - " 'raylet_socket_name': '/tmp/ray/session_2021-06-09_17-11-11_289352_82102/sockets/raylet',\n", + "{'node_ip_address': '9.211.53.245',\n", + " 'raylet_ip_address': '9.211.53.245',\n", + " 'redis_address': '9.211.53.245:6379',\n", + " 'object_store_address': '/tmp/ray/session_2021-06-19_20-59-24_761561_86418/sockets/plasma_store',\n", + " 'raylet_socket_name': '/tmp/ray/session_2021-06-19_20-59-24_761561_86418/sockets/raylet',\n", " 'webui_url': '127.0.0.1:8265',\n", - " 'session_dir': '/tmp/ray/session_2021-06-09_17-11-11_289352_82102',\n", - " 'metrics_export_port': 59484,\n", - " 'node_id': '69baeb54516bd687adc2c1ebadb922fcf1d7d68f4ae9f1289bb9f879'}" + " 'session_dir': '/tmp/ray/session_2021-06-19_20-59-24_761561_86418',\n", + " 'metrics_export_port': 61314,\n", + " 'node_id': 'd6148d041e7ad9fe60b016d54e9d91b6a0af574445694f2c2048f14d'}" ] }, "execution_count": 6, @@ -119,7 +129,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "id": "5bfebb1e", "metadata": {}, "outputs": [], @@ -129,7 +139,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 9, "id": "dfce8aa9", "metadata": {}, "outputs": [ @@ -168,10 +178,10 @@ "\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 16, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -183,7 +193,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 10, "id": "cf886598", "metadata": {}, "outputs": [], @@ -198,7 +208,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 11, "id": "414f1f58", "metadata": {}, "outputs": [], @@ -206,9 +216,18 @@ "parameterized_pipeline = pipeline.get_parameterized_pipeline(pipeline_param=pipeline_param)" ] }, + { + "cell_type": "markdown", + "id": "4f1bae14", + "metadata": {}, + "source": [ + "### Expansion of parametric pipeline\n", + "Given the `param_grid`, the parameterized pipeline is 
\"expanded\" to reflect the parallelism. One can see that we now have 9 nodes reflecting the combination of the parameter grid. Further, note that the k-fold cross validation also results in parallelism, which is data level parallelism and not reflected in the DAG itself." + ] + }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 12, "id": "05ce5b1a", "metadata": {}, "outputs": [ @@ -403,10 +422,10 @@ "\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 15, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -418,7 +437,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 13, "id": "0af813c2", "metadata": {}, "outputs": [], @@ -429,7 +448,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 14, "id": "4020342e", "metadata": {}, "outputs": [ @@ -437,8 +456,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 1.5 s, sys: 903 ms, total: 2.41 s\n", - "Wall time: 9.33 s\n" + "CPU times: user 1.41 s, sys: 1.21 s, total: 2.63 s\n", + "Wall time: 9.21 s\n" ] } ], @@ -449,7 +468,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 15, "id": "bcf25c0d", "metadata": {}, "outputs": [ @@ -488,7 +507,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 16, "id": "0aa813bc", "metadata": {}, "outputs": [ @@ -498,7 +517,7 @@ "\"pca__4{'copy': True, 'iterated_power': 'auto', 'n_components': 64, 'random_state': None, 'svd_solver': 'auto', 'tol': 0.0, 'whiten': False}=\\r\\nlogistic__1{'C': 0.046415888336127774, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 10000, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.1, 'verbose': 0, 'warm_start': False}=pca__4{'copy': True, 'iterated_power': 'auto', 'n_components': 64, 'random_state': None, 'svd_solver': 'auto', 'tol': 0.0, 'whiten': False} 
\\r\\n\"" ] }, - "execution_count": 20, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -509,7 +528,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 17, "id": "d1b0b7f5", "metadata": {}, "outputs": [ @@ -519,7 +538,7 @@ "0.9260058805323429" ] }, - "execution_count": 21, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -530,11 +549,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "id": "7db8d362", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "# Currently, we are in process of making more parity of the CF pipeline grid search API to provide similar\n", + "# methods as that of SKLearn Gridsearch APIs." + ] }, { "cell_type": "code", @@ -546,7 +568,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 18, "id": "877e3f63", "metadata": {}, "outputs": [], @@ -580,7 +602,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 19, "id": "195c0d15", "metadata": {}, "outputs": [ @@ -590,8 +612,8 @@ "text": [ "Best parameter (CV score=0.920):\n", "{'logistic__C': 0.046415888336127774, 'pca__n_components': 45}\n", - "CPU times: user 1min 40s, sys: 39.5 s, total: 2min 19s\n", - "Wall time: 20.4 s\n" + "CPU times: user 1min 31s, sys: 46.2 s, total: 2min 17s\n", + "Wall time: 21.1 s\n" ] } ], @@ -601,6 +623,16 @@ "print(\"Best parameter (CV score=%0.3f):\" % search.best_score_)\n", "print(search.best_params_)" ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "c568665a", + "metadata": {}, + "outputs": [], + "source": [ + "ray.shutdown()" + ] } ], "metadata": { diff --git a/notebooks/Ray graph experiments.ipynb b/notebooks/Ray graph experiments.ipynb deleted file mode 100644 index 8f8aa3a..0000000 --- a/notebooks/Ray graph experiments.ipynb +++ /dev/null @@ -1,269 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "israeli-batch", - "metadata": {}, - 
"outputs": [], - "source": [ - "import ray" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "loose-projector", - "metadata": {}, - "outputs": [], - "source": [ - "ray.shutdown()" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "renewable-western", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2021-06-04 10:55:36,533\tINFO services.py:1269 -- View the Ray dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265\u001b[39m\u001b[22m\n" - ] - }, - { - "data": { - "text/plain": [ - "{'node_ip_address': '192.168.1.37',\n", - " 'raylet_ip_address': '192.168.1.37',\n", - " 'redis_address': '192.168.1.37:6379',\n", - " 'object_store_address': '/tmp/ray/session_2021-06-04_10-55-35_103935_66596/sockets/plasma_store',\n", - " 'raylet_socket_name': '/tmp/ray/session_2021-06-04_10-55-35_103935_66596/sockets/raylet',\n", - " 'webui_url': '127.0.0.1:8265',\n", - " 'session_dir': '/tmp/ray/session_2021-06-04_10-55-35_103935_66596',\n", - " 'metrics_export_port': 62153,\n", - " 'node_id': 'b55f2340f68d5439d9728c6605bd1ae90b31d893b63c92dea02643fd'}" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ray.init()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "scheduled-miami", - "metadata": {}, - "outputs": [], - "source": [ - "%config IPCompleter.use_jedi = False" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "leading-sheriff", - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "from sklearn.preprocessing import FunctionTransformer\n", - "from sklearn.preprocessing import MinMaxScaler\n", - "\n", - "transformer = FunctionTransformer(np.log1p)\n", - "minmax_scaler = MinMaxScaler()" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "9adf2a27", - "metadata": {}, - "outputs": [], - "source": [ - "import codeflare.pipelines.Datamodel as dm\n", - "\n", - 
"class FeatureUnion(dm.AndEstimator):\n", - " def __init__(self):\n", - " pass\n", - " \n", - " def fit_transform(self, xy_list: list):\n", - " return self.transform(xy_list)\n", - "\n", - " def get_estimator_type(self):\n", - " return 'transform'\n", - " \n", - " def transform(self, xy_list):\n", - " X_list = []\n", - " y_list = []\n", - " \n", - " for xy in xy_list:\n", - " X_list.append(xy.get_x())\n", - " X_concat = np.concatenate(X_list, axis=0)\n", - " \n", - " return dm.Xy(X_concat, None)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "olive-usage", - "metadata": {}, - "outputs": [], - "source": [ - "pipeline = dm.Pipeline()\n", - "\n", - "node_a = dm.EstimatorNode('a', minmax_scaler)\n", - "node_b = dm.EstimatorNode('b', minmax_scaler)\n", - "node_c = dm.AndNode('c', FeatureUnion())\n", - "# node_d = dm.OrNode('d', transformer)\n", - "\n", - "pipeline.add_edge(node_a, node_c)\n", - "pipeline.add_edge(node_b, node_c)\n", - "# pipeline.add_edge(node_b, node_d)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "behind-dairy", - "metadata": {}, - "outputs": [], - "source": [ - "X = np.array([0, 1, 2, 3])\n", - "X = np.reshape(X, (4, 1))" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "uniform-coordinator", - "metadata": {}, - "outputs": [], - "source": [ - "import codeflare.pipelines.Datamodel as dm" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "46de1312", - "metadata": {}, - "outputs": [], - "source": [ - "pipeline_input = dm.PipelineInput()\n", - "xy = dm.Xy(X, None)\n", - "pipeline_input.add_xy_arg(node_a, xy)\n", - "pipeline_input.add_xy_arg(node_b, xy)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "handy-offset", - "metadata": {}, - "outputs": [], - "source": [ - "import codeflare.pipelines.Runtime as rt" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "julian-clerk", - "metadata": {}, - "outputs": [], - "source": [ - "from 
codeflare.pipelines.Runtime import ExecutionType" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "specialized-health", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "pipeline_output = rt.execute_pipeline(pipeline, ExecutionType.FIT, pipeline_input)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "e963bdd4", - "metadata": {}, - "outputs": [], - "source": [ - "out_Xyrefs = pipeline_output.get_xyrefs(node_c)" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "living-destiny", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[[0. ]\n", - " [0.33333333]\n", - " [0.66666667]\n", - " [1. ]\n", - " [0. ]\n", - " [0.33333333]\n", - " [0.66666667]\n", - " [1. ]]\n", - "FeatureUnion()\n" - ] - } - ], - "source": [ - "for out_xyref in out_Xyrefs:\n", - " x = ray.get(out_xyref.get_Xref())\n", - " and_func = ray.get(out_xyref.get_curr_node_state_ref()).get_estimator()\n", - " print(x)\n", - " print(and_func)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b0188d6f", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.9" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/notebooks/lale_cross_val_score.ipynb b/notebooks/lale_cross_val_score.ipynb index 6515ee3..b59cfd1 100644 --- a/notebooks/lale_cross_val_score.ipynb +++ b/notebooks/lale_cross_val_score.ipynb @@ -1,142 +1,150 @@ { "cells": [ { - "cell_type": "code", - "execution_count": 1, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "from lale.datasets.openml import 
fetch" + "## LALE cross validation example\n", + "This example is in collaboration with the [LALE team](https://github.com/IBM/lale), which demonstrates how a LALE pipeline can be translated into a CodeFlare pipeline, targeting cross validation.\n", + "\n", + "It assumes that LALE is available and installed in your local environment.\n", + "\n", + "One can see from running this notebook that LALE cross validation is single threaded and takes ~10minutes on a laptop (depending on the configuration), whereas using CodeFlare pipelines running on Ray, this time is reduced to around 75 seconds (with 8x parallelism) for the 10 fold cross validation." ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ - "(X_train, y_train), (X_test, y_test) = fetch(\"jungle_chess_2pcs_raw_endgame_complete\", \"classification\")" + "# Uncomment below to install lale for running this notebook\n", + "\n", + "# !pip install lale\n", + "# !pip install 'liac-arff>=2.4.0'" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ - "import ray\n", - "ray.shutdown()" + "from lale.datasets.openml import fetch" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ - "# from lale.helpers import cross_val_score" + "(X_train, y_train), (X_test, y_test) = fetch(\"jungle_chess_2pcs_raw_endgame_complete\", \"classification\")" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 5, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ - "# sk_pipeline = pipeline.export_to_sklearn_pipeline()" + "# First, we will show how this data can be used to do cross validation using a simple pipeline with random forest\n", + "from lale.lib.sklearn import PCA, Nystroem, SelectKBest, RandomForestClassifier\n", + 
"from lale.lib.lale import ConcatFeatures\n", + "\n", + "pipeline = (PCA() & Nystroem() & SelectKBest(k=3)) >> ConcatFeatures() >> RandomForestClassifier(n_estimators=200)" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 9, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 7min 18s, sys: 8.91 s, total: 7min 27s\n", + "Wall time: 6min 59s\n" + ] + }, + { + "data": { + "text/plain": [ + "[0.8161838161838162,\n", + " 0.8105228105228105,\n", + " 0.8148518148518149,\n", + " 0.8218448218448219,\n", + " 0.8208458208458208,\n", + " 0.8111888111888111,\n", + " 0.8105228105228105,\n", + " 0.8181818181818182,\n", + " 0.8011325782811459,\n", + " 0.8121252498334444]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# sk_pipeline" + "%%time\n", + "from lale.helpers import cross_val_score\n", + "cross_val_score(pipeline, X_train, y_train, cv=10)" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "2021-05-26 11:34:55,429\tINFO services.py:1269 -- View the Ray dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8267\u001b[39m\u001b[22m\n" + "2021-06-19 21:02:55,145\tINFO services.py:1269 -- View the Ray dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8266\u001b[39m\u001b[22m\n" ] }, { "data": { "text/plain": [ - "{'node_ip_address': '9.211.114.84',\n", - " 'raylet_ip_address': '9.211.114.84',\n", - " 'redis_address': '9.211.114.84:20725',\n", - " 'object_store_address': '/tmp/ray/session_2021-05-26_11-34-53_711763_35035/sockets/plasma_store',\n", - " 'raylet_socket_name': '/tmp/ray/session_2021-05-26_11-34-53_711763_35035/sockets/raylet',\n", - " 'webui_url': '127.0.0.1:8267',\n", - " 'session_dir': '/tmp/ray/session_2021-05-26_11-34-53_711763_35035',\n", - " 'metrics_export_port': 60055,\n", - " 'node_id': 
'6f77fe1897704e0f2f8967852a1ccd077086efebfb0634ec3ca3d130'}" + "{'node_ip_address': '9.211.53.245',\n", + " 'raylet_ip_address': '9.211.53.245',\n", + " 'redis_address': '9.211.53.245:29680',\n", + " 'object_store_address': '/tmp/ray/session_2021-06-19_21-02-53_442708_86627/sockets/plasma_store',\n", + " 'raylet_socket_name': '/tmp/ray/session_2021-06-19_21-02-53_442708_86627/sockets/raylet',\n", + " 'webui_url': '127.0.0.1:8266',\n", + " 'session_dir': '/tmp/ray/session_2021-06-19_21-02-53_442708_86627',\n", + " 'metrics_export_port': 61863,\n", + " 'node_id': 'eff8f3d5558252aa2d18c57b62891d1a169a7404979a63b6c9882a14'}" ] }, - "execution_count": 7, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "# Start Ray and init\n", + "\n", "import ray\n", "ray.init(object_store_memory=16 * 1024 * 1024 * 1024)" ] }, { "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "import codeflare.pipelines.Datamodel as dm" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.model_selection import KFold, StratifiedKFold" - ] - }, - { - "cell_type": "code", - "execution_count": 10, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ + "from sklearn.model_selection import KFold, StratifiedKFold\n", "kf = StratifiedKFold(n_splits=10)" ] }, { "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "pipeline = dm.Pipeline()" - ] - }, - { - "cell_type": "code", - "execution_count": 12, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -149,7 +157,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -162,7 +170,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -171,26 +179,23 @@ }, { "cell_type": "code", - 
"execution_count": 15, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ + "import codeflare.pipelines.Datamodel as dm\n", + "\n", + "# Create the CF pipeline and the nodes, add the edge\n", + "pipeline = dm.Pipeline()\n", "node_fu = dm.EstimatorNode('feature_union', feature_union)\n", - "node_rf = dm.EstimatorNode('randomforest', random_forest)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ + "node_rf = dm.EstimatorNode('randomforest', random_forest)\n", + "\n", "pipeline.add_edge(node_fu, node_rf)" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -200,7 +205,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -211,15 +216,15 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 3.39 s, sys: 1.64 s, total: 5.02 s\n", - "Wall time: 1min 22s\n" + "CPU times: user 3.06 s, sys: 2.11 s, total: 5.17 s\n", + "Wall time: 1min 15s\n" ] } ], @@ -230,25 +235,25 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[0.8185148185148186,\n", + "[0.8161838161838162,\n", + " 0.8118548118548119,\n", + " 0.8145188145188145,\n", + " 0.8198468198468198,\n", " 0.8175158175158175,\n", - " 0.8128538128538129,\n", - " 0.8195138195138195,\n", - " 0.8228438228438228,\n", - " 0.8168498168498168,\n", - " 0.8131868131868132,\n", - " 0.8161838161838162,\n", - " 0.7991339107261826,\n", - " 0.8077948034643571]" + " 0.8111888111888111,\n", + " 0.8158508158508159,\n", + " 0.8171828171828172,\n", + " 0.8037974683544303,\n", + " 0.8087941372418388]" ] }, - "execution_count": 21, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -259,61 
+264,11 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 24, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ - "from lale.lib.sklearn import PCA, Nystroem, SelectKBest, RandomForestClassifier\n", - "from lale.lib.lale import ConcatFeatures\n", - "\n", - "pipeline = (PCA() & Nystroem() & SelectKBest(k=3)) >> ConcatFeatures() >> RandomForestClassifier(n_estimators=200)\n", - "# pipeline.visualize()" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 7min 39s, sys: 13.1 s, total: 7min 52s\n", - "Wall time: 7min 13s\n" - ] - }, - { - "data": { - "text/plain": [ - "[0.8185148185148186,\n", - " 0.8101898101898102,\n", - " 0.8148518148518149,\n", - " 0.8201798201798202,\n", - " 0.8145188145188145,\n", - " 0.8185148185148186,\n", - " 0.8168498168498168,\n", - " 0.8125208125208125,\n", - " 0.7978014656895404,\n", - " 0.8114590273151232]" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "%%time\n", - "from lale.helpers import cross_val_score\n", - "cross_val_score(pipeline, X_train, y_train, cv=10)" + "ray.shutdown()" ] }, { From 0674aec2ab46aeb6795a326f607cb177124626bc Mon Sep 17 00:00:00 2001 From: Raghu Ganti Date: Sun, 20 Jun 2021 11:58:11 -0400 Subject: [PATCH 2/2] Cleanup of sample pipeline --- notebooks/sample_pipeline.ipynb | 369 +++++++++++++++----------------- 1 file changed, 176 insertions(+), 193 deletions(-) diff --git a/notebooks/sample_pipeline.ipynb b/notebooks/sample_pipeline.ipynb index 9b1928f..594b42f 100644 --- a/notebooks/sample_pipeline.ipynb +++ b/notebooks/sample_pipeline.ipynb @@ -1,8 +1,17 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "0c3cf197", + "metadata": {}, + "source": [ + "## Sample pipeline\n", + "This is a sample 
pipeline drawn from a competition posted on [Kaggle](https://www.kaggle.com/ragharamya/loanprediction). A preprocessor followed by exploring multiple options in parallel is demonstrated below." + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "described-lover", "metadata": {}, "outputs": [], @@ -12,10 +21,33 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "simplified-summit", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "Gender object\n", + "Married object\n", + "Dependents object\n", + "Education object\n", + "Self_Employed object\n", + "ApplicantIncome int64\n", + "CoapplicantIncome float64\n", + "LoanAmount float64\n", + "Loan_Amount_Term float64\n", + "Credit_History float64\n", + "Property_Area object\n", + "Loan_Status object\n", + "dtype: object" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "import pandas as pd\n", "train = pd.read_csv('../resources/data/train_ctrUa4K.csv')\n", @@ -26,30 +58,31 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "configured-clinton", - "metadata": {}, - "outputs": [], - "source": [ - "train.shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "confident-union", "metadata": {}, "outputs": [], "source": [ + "# prepare the dataset for training\n", + "\n", "X = train.drop('Loan_Status', axis=1)\n", "y = train['Loan_Status']\n", "from sklearn.model_selection import train_test_split\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)" ] }, + { + "cell_type": "markdown", + "id": "5f1793c1", + "metadata": {}, + "source": [ + "## SKLearn pipeline\n", + "Below, we show how SKLearn is used to create a pipeline and then fit for each of the pipelines to explore multiple models." 
+ ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "visible-compact", "metadata": {}, "outputs": [], @@ -75,7 +108,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "stupid-miracle", "metadata": {}, "outputs": [], @@ -91,22 +124,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "accredited-japan", - "metadata": {}, - "outputs": [], - "source": [ - "import time\n", - "\n", - "start = time.time()\n", - "Xt = preprocessor.fit(X_train)\n", - "end = time.time()\n", - "print('Time taken: ' + str(end - start))" - ] - }, - { - "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "fundamental-builder", "metadata": {}, "outputs": [], @@ -118,47 +136,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "ahead-narrow", - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.base import ClassifierMixin\n", - "from sklearn.base import BaseEstimator\n", - "\n", - "class ScaleTestEstimator(ClassifierMixin, BaseEstimator):\n", - " num_iters = 100\n", - " classifier : ClassifierMixin = None\n", - "\n", - " def __init__(self, num_iters, classifier: ClassifierMixin):\n", - " self.num_iters = num_iters\n", - " self.classifier = classifier\n", - "\n", - " def fit(self, X, y):\n", - " for i in range(self.num_iters):\n", - " self.classifier.fit(X, y)\n", - " return self\n", - " \n", - " def predict(self, X):\n", - " return self.classifier.predict(X)\n", - "\n", - " def score(self, X, y, sample_weight=None):\n", - " return self.classifier.score(X, y, sample_weight)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "specialized-provider", - "metadata": {}, - "outputs": [], - "source": [ - "Xt = preprocessor.fit_transform(X_train)" - ] - }, - { - "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "above-masters", "metadata": {}, "outputs": [], @@ -174,106 +152,38 @@ " KNeighborsClassifier(3),\n", " 
SVC(kernel=\"rbf\", C=0.025, probability=True),\n", " NuSVC(probability=True),\n", - " DecisionTreeClassifier(),\n", " RandomForestClassifier(),\n", - " AdaBoostClassifier(),\n", " GradientBoostingClassifier()\n", " ]" ] }, { "cell_type": "code", - "execution_count": null, - "id": "bacterial-morocco", - "metadata": {}, - "outputs": [], - "source": [ - "classifiers[0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "sudden-british", - "metadata": {}, - "outputs": [], - "source": [ - "c_a = ScaleTestEstimator(50, DecisionTreeClassifier())\n", - "c_b = ScaleTestEstimator(50, RandomForestClassifier())\n", - "c_c = ScaleTestEstimator(50, GradientBoostingClassifier())\n", - "classifiers = [c_a, c_b, c_c]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "imposed-practice", - "metadata": {}, - "outputs": [], - "source": [ - "import sklearn.base as base" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "tired-breast", - "metadata": {}, - "outputs": [], - "source": [ - "base.is_classifier(c_a)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "overall-review", - "metadata": {}, - "outputs": [], - "source": [ - "base.clone(c_a)" - ] - }, - { - "cell_type": "code", - "execution_count": null, + "execution_count": 18, "id": "greatest-cancellation", "metadata": {}, "outputs": [], "source": [ - "import time\n", - "start = time.time()\n", - "\n", - "c_a = ScaleTestEstimator(50, DecisionTreeClassifier())\n", - "c_b = ScaleTestEstimator(50, RandomForestClassifier())\n", - "c_c = ScaleTestEstimator(50, GradientBoostingClassifier())\n", - "classifiers = [c_a, c_b, c_c]\n", - "\n", "classifier_results=[]\n", "for classifier in classifiers:\n", " pipe = Pipeline(steps=[('preprocessor', preprocessor),\n", " ('classifier', classifier)])\n", " pipe.fit(X_train, y_train)\n", - " pipe.predict(X_train)\n", - " \n", - "end = time.time()\n", - "tt = end - start\n", - "print('time taken: ' + str(tt))" + " 
pipe.predict(X_train)" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "a1757296", + "cell_type": "markdown", + "id": "fef872c4", "metadata": {}, - "outputs": [], "source": [ - "c_a.classifier.feature_importances_" + "## CodeFlare pipelines\n", + "Below, we show how the same exploration can be done with the CodeFlare pipelines approach." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "id": "coordinate-gossip", "metadata": {}, "outputs": [], @@ -284,17 +194,43 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "id": "bfe20bd8", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2021-06-20 11:53:04,117\tINFO services.py:1269 -- View the Ray dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265\u001b[39m\u001b[22m\n" + ] + }, + { + "data": { + "text/plain": [ + "{'node_ip_address': '9.211.53.245',\n", + " 'raylet_ip_address': '9.211.53.245',\n", + " 'redis_address': '9.211.53.245:6379',\n", + " 'object_store_address': '/tmp/ray/session_2021-06-20_11-53-02_767779_97928/sockets/plasma_store',\n", + " 'raylet_socket_name': '/tmp/ray/session_2021-06-20_11-53-02_767779_97928/sockets/raylet',\n", + " 'webui_url': '127.0.0.1:8265',\n", + " 'session_dir': '/tmp/ray/session_2021-06-20_11-53-02_767779_97928',\n", + " 'metrics_export_port': 64339,\n", + " 'node_id': '8c45750634c387b09b787fb7a0aa191df63f6d5274861bad27c00cb3'}" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "ray.init()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "id": "invisible-consensus", "metadata": {}, "outputs": [], @@ -304,7 +240,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 31, "id": "surface-recruitment", "metadata": {}, "outputs": [], @@ -314,32 +250,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 32, "id":
"humanitarian-boards", "metadata": {}, "outputs": [], "source": [ "node_a = dm.EstimatorNode('preprocess', preprocessor)\n", - "node_b = dm.EstimatorNode('c_a', c_a)\n", - "node_c = dm.EstimatorNode('c_b', c_b)\n", - "node_d = dm.EstimatorNode('c_c', c_c)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "popular-bookmark", - "metadata": {}, - "outputs": [], - "source": [ - "pipeline.add_edge(node_a, node_b)\n", - "pipeline.add_edge(node_a, node_c)\n", - "pipeline.add_edge(node_a, node_d)" + "node_0 = dm.EstimatorNode('node_0', classifiers[0])\n", + "node_1 = dm.EstimatorNode('node_1', classifiers[1])\n", + "node_2 = dm.EstimatorNode('node_2', classifiers[2])\n", + "node_3 = dm.EstimatorNode('node_3', classifiers[3])\n", + "node_4 = dm.EstimatorNode('node_4', classifiers[4])\n", + "\n", + "pipeline.add_edge(node_a, node_0)\n", + "pipeline.add_edge(node_a, node_1)\n", + "pipeline.add_edge(node_a, node_2)\n", + "pipeline.add_edge(node_a, node_3)\n", + "pipeline.add_edge(node_a, node_4)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 33, "id": "ef9ff37b", "metadata": {}, "outputs": [], @@ -352,7 +284,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 34, "id": "mineral-analyst", "metadata": {}, "outputs": [], @@ -362,7 +294,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 35, "id": "literary-consolidation", "metadata": {}, "outputs": [], @@ -372,51 +304,102 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 36, "id": "educated-basement", "metadata": {}, "outputs": [], "source": [ - "start = time.time()\n", - "\n", "pipeline_output = rt.execute_pipeline(pipeline, ExecutionType.FIT, pipeline_input)\n", - "\n", - "node_b_output = pipeline_output.get_xyrefs(node_b)\n", - "node_c_output = pipeline_output.get_xyrefs(node_c)\n", - "node_d_output = pipeline_output.get_xyrefs(node_d)\n", - "\n", - "end = time.time()\n", - "print ('Time taken: ' 
+ str(end - start))" + "node_0_output = pipeline_output.get_xyrefs(node_0)" ] }, { "cell_type": "code", - "execution_count": null, - "id": "be9e1c3a", + "execution_count": 37, + "id": "4df7fb29", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "node_b_output" + "outputs[0]" ] }, { "cell_type": "code", - "execution_count": null, - "id": "9e153862", + "execution_count": 39, + "id": "41955886", "metadata": {}, "outputs": [], "source": [ - "selected_pipeline = rt.select_pipeline(pipeline_output, node_b_output[0])" + "X_out = ray.get(outputs[0][0].get_Xref())" ] }, { "cell_type": "code", - "execution_count": null, - "id": "c612e1b0", + "execution_count": 40, + "id": "7e05451f", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "array(['Y', 'N', 'N', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'Y',\n", + " 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y',\n", + " 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N',\n", + " 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'N',\n", + " 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y',\n", + " 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'N', 'Y', 'N', 'Y', 'Y', 'N', 'Y',\n", + " 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'N', 'N', 'Y', 'N', 'N',\n", + " 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'N', 'N',\n", + " 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'N', 'Y',\n", + " 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y',\n", + " 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'N', 'Y',\n", + " 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'Y',\n", + " 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',\n", + " 'N', 'N', 'Y', 'Y', 'N', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y',\n", + " 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 
'Y', 'Y', 'Y',\n", + " 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'N', 'Y', 'Y', 'N', 'Y',\n", + " 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y',\n", + " 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'N', 'Y',\n", + " 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',\n", + " 'N', 'N', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',\n", + " 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'N',\n", + " 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y',\n", + " 'N', 'Y', 'Y', 'N', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',\n", + " 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'Y',\n", + " 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N',\n", + " 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y',\n", + " 'Y', 'N', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'Y',\n", + " 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',\n", + " 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'Y',\n", + " 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y',\n", + " 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y',\n", + " 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'N', 'N', 'Y', 'Y', 'N', 'Y', 'Y',\n", + " 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'N',\n", + " 'N', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',\n", + " 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'Y',\n", + " 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',\n", + " 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y',\n", + " 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y'], dtype=object)" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "print(selected_pipeline)" + "X_out" ] } ], @@ -439,7 +422,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.6" + "version": "3.7.9" } }, 
"nbformat": 4,