diff --git a/codeflare/pipelines/Datamodel.py b/codeflare/pipelines/Datamodel.py index 73a5d7c..9b9ee36 100644 --- a/codeflare/pipelines/Datamodel.py +++ b/codeflare/pipelines/Datamodel.py @@ -898,7 +898,7 @@ def has_single_estimator(self): if len(self.get_output_nodes()) > 1: return False - for node in self.__node_name_map__.keys(): + for node in self.__node_name_map__.values(): is_node_estimator = (node.get_node_input_type() == NodeInputType.OR) if is_node_estimator: pre_nodes = self.get_pre_nodes(node) diff --git a/main.py b/main.py deleted file mode 100644 index 694ee23..0000000 --- a/main.py +++ /dev/null @@ -1,55 +0,0 @@ -# This is a sample Python script. - -# Press ⌃R to execute it or replace it with your code. -# Press Double ⇧ to search everywhere for classes, files, tool windows, actions, and settings. - -from sklearn.base import ClassifierMixin -from sklearn.ensemble import RandomForestClassifier - - -class ScaleTestEstimator(ClassifierMixin): - __num_iters__ = 100 - __randomforest_classifier__ : RandomForestClassifier = None - - def __init__(self, num_iters, rf_classifier: RandomForestClassifier): - self.__num_iters__ = num_iters - self.__randomforest_classifier__ = rf_classifier - - def fit(self, X, y): - for i in range(self.__num_iters__): - self.__randomforest_classifier__.fit(X, y) - - def score(self, X, y, sample_weight=None): - return self.__randomforest_classifier__.score(X, y, sample_weight) - - -def print_hi(name): - # Use a breakpoint in the code line below to debug your script. - print(f'Hi, {name}') # Press ⌘F8 to toggle the breakpoint. - - -# Press the green button in the gutter to run the script. -if __name__ == '__main__': - print_hi('PyCharm') - - import pandas as pd - - train = pd.read_csv('resources/data/train_ctrUa4K.csv') - test = pd.read_csv('resources/data/test_lAUu6dG.csv') - train = train.drop('Loan_ID', axis=1) - train.dtypes - - X = train.drop('Loan_Status', axis=1) - y = train['Loan_Status'] - from sklearn.model_selection import train_test_split - - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) - - from sklearn.tree import DecisionTreeClassifier - from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier - - c_a = ScaleTestEstimator(50, DecisionTreeClassifier()) - c_b = ScaleTestEstimator(50, RandomForestClassifier()) - c_c = ScaleTestEstimator(50, GradientBoostingClassifier()) - -# See PyCharm help at https://www.jetbrains.com/help/pycharm/ diff --git a/notebooks/Cross Product.ipynb b/notebooks/Cross Product.ipynb deleted file mode 100644 index b135a40..0000000 --- a/notebooks/Cross Product.ipynb +++ /dev/null @@ -1,111 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 3, - "id": "weighted-money", - "metadata": {}, - "outputs": [], - "source": [ - "l1 = [1, 2, 3]\n", - "l2 = [4, 5, 6]\n", - "l3 = [7, 8, 9]" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "relevant-conclusion", - "metadata": {}, - "outputs": [], - "source": [ - "%config IPCompleter.use_jedi = False" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "downtown-alignment", - "metadata": {}, - "outputs": [], - "source": [ - "import itertools" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "raising-millennium", - "metadata": {}, - "outputs": [], - "source": [ - "lists = [l1, l2, l3]" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "changed-residence", - "metadata": {}, - "outputs": [], - "source": [ - "cp = itertools.product(*lists)" - ] - 
}, - { - "cell_type": "code", - "execution_count": 21, - "id": "operating-cartoon", - "metadata": {}, - "outputs": [ - { - "ename": "TypeError", - "evalue": "type.__new__() argument 1 must be str, not int", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mele\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mcp\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0mele\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;31mTypeError\u001b[0m: type.__new__() argument 1 must be str, not int" - ] - } - ], - "source": [ - "for ele in cp:\n", - " print(type(*ele))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "upper-kansas", - "metadata": {}, - "outputs": [], - "source": [ - "def myfun(x):\n", - " print()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.9" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/notebooks/Grid Search Sample.ipynb b/notebooks/Grid Search Sample.ipynb index da0bac5..72d1f3f 100644 --- a/notebooks/Grid Search Sample.ipynb +++ b/notebooks/Grid Search Sample.ipynb @@ -1,5 +1,14 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "aff2abb3", + "metadata": {}, + "source": [ + "## Grid search sample\n", + "This notebook shows how CodeFlare pipelines can be used to perform a grid search: the standard parameters in a param grid are input to the pipeline, and a single call to `grid_search_cv` runs the search.",
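+ "\n", + "A minimal sketch of the flow (the `PipelineParam.from_param_grid` helper is an assumption here; `get_parameterized_pipeline` appears in the cells below):\n", + "\n", + "```python\n", + "import codeflare.pipelines.Datamodel as dm\n", + "\n", + "# an sklearn-style grid, e.g. {'pca__n_components': [5, 45, 64], 'logistic__C': [0.05, 0.5, 5.0]}\n", + "pipeline_param = dm.PipelineParam.from_param_grid(param_grid)\n", + "\n", + "# expand the base pipeline, one pipeline instance per parameter combination\n", + "parameterized_pipeline = pipeline.get_parameterized_pipeline(pipeline_param=pipeline_param)\n", + "```"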
+ ] + }, { "cell_type": "code", "execution_count": 1, @@ -35,11 +44,12 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 20, "id": "b3344db0", "metadata": {}, "outputs": [], "source": [ + "# we will use a standard pipeline that is explained in more detail in a separate notebook/blog\n", "X_digits, y_digits = datasets.load_digits(return_X_y=True)" ] }, @@ -66,21 +76,21 @@ "name": "stderr", "output_type": "stream", "text": [ - "2021-06-09 17:11:12,675\tINFO services.py:1269 -- View the Ray dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265\u001b[39m\u001b[22m\n" + "2021-06-19 20:59:26,196\tINFO services.py:1269 -- View the Ray dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265\u001b[39m\u001b[22m\n" ] }, { "data": { "text/plain": [ - "{'node_ip_address': '192.168.1.37',\n", - " 'raylet_ip_address': '192.168.1.37',\n", - " 'redis_address': '192.168.1.37:6379',\n", - " 'object_store_address': '/tmp/ray/session_2021-06-09_17-11-11_289352_82102/sockets/plasma_store',\n", - " 'raylet_socket_name': '/tmp/ray/session_2021-06-09_17-11-11_289352_82102/sockets/raylet',\n", + "{'node_ip_address': '9.211.53.245',\n", + " 'raylet_ip_address': '9.211.53.245',\n", + " 'redis_address': '9.211.53.245:6379',\n", + " 'object_store_address': '/tmp/ray/session_2021-06-19_20-59-24_761561_86418/sockets/plasma_store',\n", + " 'raylet_socket_name': '/tmp/ray/session_2021-06-19_20-59-24_761561_86418/sockets/raylet',\n", " 'webui_url': '127.0.0.1:8265',\n", - " 'session_dir': '/tmp/ray/session_2021-06-09_17-11-11_289352_82102',\n", - " 'metrics_export_port': 59484,\n", - " 'node_id': '69baeb54516bd687adc2c1ebadb922fcf1d7d68f4ae9f1289bb9f879'}" + " 'session_dir': '/tmp/ray/session_2021-06-19_20-59-24_761561_86418',\n", + " 'metrics_export_port': 61314,\n", + " 'node_id': 'd6148d041e7ad9fe60b016d54e9d91b6a0af574445694f2c2048f14d'}" ] }, "execution_count": 6, @@ -119,7 +129,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "id": "5bfebb1e", "metadata": {}, "outputs": [], @@ -129,7 +139,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 9, "id": "dfce8aa9", "metadata": {}, "outputs": [ @@ -168,10 +178,10 @@ "\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 16, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -183,7 +193,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 10, "id": "cf886598", "metadata": {}, "outputs": [], @@ -198,7 +208,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 11, "id": "414f1f58", "metadata": {}, "outputs": [], @@ -206,9 +216,18 @@ "parameterized_pipeline = pipeline.get_parameterized_pipeline(pipeline_param=pipeline_param)" ] }, + { + "cell_type": "markdown", + "id": "4f1bae14", + "metadata": {}, + "source": [ + "### Expansion of parametric pipeline\n", + "Given the `param_grid`, the parameterized pipeline is \"expanded\" to expose the available parallelism. One can see that we now have 9 nodes, one for each combination in the parameter grid. Note that the k-fold cross validation adds further, data-level parallelism, which is not reflected in the DAG itself.",
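+ "\n", + "For example, with a grid of 3 values for `pca__n_components` and 3 values for `logistic__C` (illustrative values, not the exact grid used here), the expansion yields 3 x 3 = 9 pipeline instances:\n", + "\n", + "```python\n", + "import itertools\n", + "\n", + "param_grid = {'pca__n_components': [5, 45, 64], 'logistic__C': [0.05, 0.5, 5.0]}\n", + "combos = list(itertools.product(*param_grid.values()))\n", + "len(combos)  # 9 -- one expanded pipeline node per combination\n", + "```"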
+ ] + }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 12, "id": "05ce5b1a", "metadata": {}, "outputs": [ @@ -403,10 +422,10 @@ "\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 15, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -418,7 +437,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 13, "id": "0af813c2", "metadata": {}, "outputs": [], @@ -429,7 +448,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 14, "id": "4020342e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 1.5 s, sys: 903 ms, total: 2.41 s\n", - "Wall time: 9.33 s\n" + "CPU times: user 1.41 s, sys: 1.21 s, total: 2.63 s\n", + "Wall time: 9.21 s\n" ] } ], @@ -449,7 +468,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 15, "id": "bcf25c0d", "metadata": {}, "outputs": [ @@ -488,7 +507,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 16, "id": "0aa813bc", "metadata": {}, "outputs": [ @@ -498,7 +517,7 @@ "\"pca__4{'copy': True, 'iterated_power': 'auto', 'n_components': 64, 'random_state': None, 'svd_solver': 'auto', 'tol': 0.0, 'whiten': False}=\\r\\nlogistic__1{'C': 0.046415888336127774, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 10000, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.1, 'verbose': 0, 'warm_start': False}=pca__4{'copy': True, 'iterated_power': 'auto', 'n_components': 64, 'random_state': None, 'svd_solver': 'auto', 'tol': 0.0, 'whiten': False} \\r\\n\"" ] }, - "execution_count": 20, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -509,7 +528,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 17, "id": "d1b0b7f5", "metadata": {}, "outputs": [ @@ -519,7 +538,7 @@ "0.9260058805323429" ] }, - "execution_count": 21, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -530,11 +549,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "id": "7db8d362", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "# We are currently working to bring the CodeFlare (CF) pipeline grid search API to parity with\n", + "# scikit-learn's GridSearchCV, providing similar methods.",
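+ "\n", + "# A hypothetical sketch of the target API (names are illustrative, not yet available):\n", + "# result = rt.grid_search_cv(kf, pipeline, pipeline_input)\n", + "# result.best_params_, result.best_score_  # mirroring sklearn's GridSearchCV"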
+ ] }, { "cell_type": "code", @@ -546,7 +568,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 18, "id": "877e3f63", "metadata": {}, "outputs": [], @@ -580,7 +602,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 19, "id": "195c0d15", "metadata": {}, "outputs": [ @@ -590,8 +612,8 @@ "text": [ "Best parameter (CV score=0.920):\n", "{'logistic__C': 0.046415888336127774, 'pca__n_components': 45}\n", - "CPU times: user 1min 40s, sys: 39.5 s, total: 2min 19s\n", - "Wall time: 20.4 s\n" + "CPU times: user 1min 31s, sys: 46.2 s, total: 2min 17s\n", + "Wall time: 21.1 s\n" ] } ], @@ -601,6 +623,16 @@ "print(\"Best parameter (CV score=%0.3f):\" % search.best_score_)\n", "print(search.best_params_)" ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "c568665a", + "metadata": {}, + "outputs": [], + "source": [ + "ray.shutdown()" + ] } ], "metadata": { diff --git a/notebooks/Ray graph experiments.ipynb b/notebooks/Ray graph experiments.ipynb deleted file mode 100644 index 8f8aa3a..0000000 --- a/notebooks/Ray graph experiments.ipynb +++ /dev/null @@ -1,269 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "israeli-batch", - "metadata": {}, - "outputs": [], - "source": [ - "import ray" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "loose-projector", - "metadata": {}, - "outputs": [], - "source": [ - "ray.shutdown()" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "renewable-western", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2021-06-04 10:55:36,533\tINFO services.py:1269 -- View the Ray dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265\u001b[39m\u001b[22m\n" - ] - }, - { - "data": { - "text/plain": [ - "{'node_ip_address': '192.168.1.37',\n", - " 'raylet_ip_address': '192.168.1.37',\n", - " 'redis_address': '192.168.1.37:6379',\n", - " 'object_store_address': '/tmp/ray/session_2021-06-04_10-55-35_103935_66596/sockets/plasma_store',\n", - " 'raylet_socket_name': '/tmp/ray/session_2021-06-04_10-55-35_103935_66596/sockets/raylet',\n", - " 'webui_url': '127.0.0.1:8265',\n", - " 'session_dir': '/tmp/ray/session_2021-06-04_10-55-35_103935_66596',\n", - " 'metrics_export_port': 62153,\n", - " 'node_id': 'b55f2340f68d5439d9728c6605bd1ae90b31d893b63c92dea02643fd'}" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ray.init()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "scheduled-miami", - "metadata": {}, - "outputs": [], - "source": [ - "%config IPCompleter.use_jedi = False" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "leading-sheriff", - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "from sklearn.preprocessing import FunctionTransformer\n", - "from sklearn.preprocessing import MinMaxScaler\n", - "\n", - "transformer = FunctionTransformer(np.log1p)\n", - "minmax_scaler = MinMaxScaler()" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "9adf2a27", - "metadata": {}, - "outputs": [], - "source": [ - "import codeflare.pipelines.Datamodel as dm\n", - "\n", - "class FeatureUnion(dm.AndEstimator):\n", - " def __init__(self):\n", - " pass\n", - " \n", - " def fit_transform(self, xy_list: list):\n", - " return self.transform(xy_list)\n", - "\n", - " def get_estimator_type(self):\n", - " return 'transform'\n", - " \n", - " def transform(self, xy_list):\n", - " 
X_list = []\n", - " y_list = []\n", - " \n", - " for xy in xy_list:\n", - " X_list.append(xy.get_x())\n", - " X_concat = np.concatenate(X_list, axis=0)\n", - " \n", - " return dm.Xy(X_concat, None)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "olive-usage", - "metadata": {}, - "outputs": [], - "source": [ - "pipeline = dm.Pipeline()\n", - "\n", - "node_a = dm.EstimatorNode('a', minmax_scaler)\n", - "node_b = dm.EstimatorNode('b', minmax_scaler)\n", - "node_c = dm.AndNode('c', FeatureUnion())\n", - "# node_d = dm.OrNode('d', transformer)\n", - "\n", - "pipeline.add_edge(node_a, node_c)\n", - "pipeline.add_edge(node_b, node_c)\n", - "# pipeline.add_edge(node_b, node_d)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "behind-dairy", - "metadata": {}, - "outputs": [], - "source": [ - "X = np.array([0, 1, 2, 3])\n", - "X = np.reshape(X, (4, 1))" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "uniform-coordinator", - "metadata": {}, - "outputs": [], - "source": [ - "import codeflare.pipelines.Datamodel as dm" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "46de1312", - "metadata": {}, - "outputs": [], - "source": [ - "pipeline_input = dm.PipelineInput()\n", - "xy = dm.Xy(X, None)\n", - "pipeline_input.add_xy_arg(node_a, xy)\n", - "pipeline_input.add_xy_arg(node_b, xy)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "handy-offset", - "metadata": {}, - "outputs": [], - "source": [ - "import codeflare.pipelines.Runtime as rt" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "julian-clerk", - "metadata": {}, - "outputs": [], - "source": [ - "from codeflare.pipelines.Runtime import ExecutionType" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "specialized-health", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "pipeline_output = rt.execute_pipeline(pipeline, ExecutionType.FIT, pipeline_input)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "e963bdd4", - "metadata": {}, - "outputs": [], - "source": [ - "out_Xyrefs = pipeline_output.get_xyrefs(node_c)" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "living-destiny", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[[0. ]\n", - " [0.33333333]\n", - " [0.66666667]\n", - " [1. ]\n", - " [0. ]\n", - " [0.33333333]\n", - " [0.66666667]\n", - " [1. 
]]\n", - "FeatureUnion()\n" - ] - } - ], - "source": [ - "for out_xyref in out_Xyrefs:\n", - " x = ray.get(out_xyref.get_Xref())\n", - " and_func = ray.get(out_xyref.get_curr_node_state_ref()).get_estimator()\n", - " print(x)\n", - " print(and_func)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b0188d6f", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.9" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/notebooks/lale_cross_val_score.ipynb b/notebooks/lale_cross_val_score.ipynb index 6515ee3..b59cfd1 100644 --- a/notebooks/lale_cross_val_score.ipynb +++ b/notebooks/lale_cross_val_score.ipynb @@ -1,142 +1,150 @@ { "cells": [ { - "cell_type": "code", - "execution_count": 1, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "from lale.datasets.openml import fetch" + "## LALE cross validation example\n", + "This example is in collaboration with the [LALE team](https://github.com/IBM/lale), which demonstrates how a LALE pipeline can be translated into a CodeFlare pipeline, targeting cross validation.\n", + "\n", + "It assumes that LALE is available and installed in your local environment.\n", + "\n", + "One can see from running this notebook that LALE cross validation is single threaded and takes ~10minutes on a laptop (depending on the configuration), whereas using CodeFlare pipelines running on Ray, this time is reduced to around 75 seconds (with 8x parallelism) for the 10 fold cross validation." 
] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ - "(X_train, y_train), (X_test, y_test) = fetch(\"jungle_chess_2pcs_raw_endgame_complete\", \"classification\")" + "# Uncomment below to install lale for running this notebook\n", + "\n", + "# !pip install lale\n", + "# !pip install 'liac-arff>=2.4.0'" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ - "import ray\n", - "ray.shutdown()" + "from lale.datasets.openml import fetch" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ - "# from lale.helpers import cross_val_score" + "(X_train, y_train), (X_test, y_test) = fetch(\"jungle_chess_2pcs_raw_endgame_complete\", \"classification\")" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 5, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ - "# sk_pipeline = pipeline.export_to_sklearn_pipeline()" + "# First, we will show how this data can be used to do cross validation using a simple pipeline with random forest\n", + "from lale.lib.sklearn import PCA, Nystroem, SelectKBest, RandomForestClassifier\n", + "from lale.lib.lale import ConcatFeatures\n", + "\n", + "pipeline = (PCA() & Nystroem() & SelectKBest(k=3)) >> ConcatFeatures() >> RandomForestClassifier(n_estimators=200)" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 9, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 7min 18s, sys: 8.91 s, total: 7min 27s\n", + "Wall time: 6min 59s\n" + ] + }, + { + "data": { + "text/plain": [ + "[0.8161838161838162,\n", + " 0.8105228105228105,\n", + " 0.8148518148518149,\n", + " 0.8218448218448219,\n", + " 0.8208458208458208,\n", + " 0.8111888111888111,\n", + " 0.8105228105228105,\n", + " 0.8181818181818182,\n", + " 0.8011325782811459,\n", + " 0.8121252498334444]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# sk_pipeline" + "%%time\n", + "from lale.helpers import cross_val_score\n", + "cross_val_score(pipeline, X_train, y_train, cv=10)" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "2021-05-26 11:34:55,429\tINFO services.py:1269 -- View the Ray dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8267\u001b[39m\u001b[22m\n" + "2021-06-19 21:02:55,145\tINFO services.py:1269 -- View the Ray dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8266\u001b[39m\u001b[22m\n" ] }, { "data": { "text/plain": [ - "{'node_ip_address': '9.211.114.84',\n", - " 'raylet_ip_address': '9.211.114.84',\n", - " 'redis_address': '9.211.114.84:20725',\n", - " 'object_store_address': '/tmp/ray/session_2021-05-26_11-34-53_711763_35035/sockets/plasma_store',\n", - " 'raylet_socket_name': '/tmp/ray/session_2021-05-26_11-34-53_711763_35035/sockets/raylet',\n", - " 'webui_url': '127.0.0.1:8267',\n", - " 'session_dir': '/tmp/ray/session_2021-05-26_11-34-53_711763_35035',\n", - " 'metrics_export_port': 60055,\n", - " 'node_id': '6f77fe1897704e0f2f8967852a1ccd077086efebfb0634ec3ca3d130'}" + "{'node_ip_address': '9.211.53.245',\n", + " 'raylet_ip_address': '9.211.53.245',\n", + " 'redis_address': '9.211.53.245:29680',\n", + " 
'object_store_address': '/tmp/ray/session_2021-06-19_21-02-53_442708_86627/sockets/plasma_store',\n", + " 'raylet_socket_name': '/tmp/ray/session_2021-06-19_21-02-53_442708_86627/sockets/raylet',\n", + " 'webui_url': '127.0.0.1:8266',\n", + " 'session_dir': '/tmp/ray/session_2021-06-19_21-02-53_442708_86627',\n", + " 'metrics_export_port': 61863,\n", + " 'node_id': 'eff8f3d5558252aa2d18c57b62891d1a169a7404979a63b6c9882a14'}" ] }, - "execution_count": 7, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "# Start Ray and init\n", + "\n", "import ray\n", "ray.init(object_store_memory=16 * 1024 * 1024 * 1024)" ] }, { "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "import codeflare.pipelines.Datamodel as dm" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.model_selection import KFold, StratifiedKFold" - ] - }, - { - "cell_type": "code", - "execution_count": 10, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ + "from sklearn.model_selection import KFold, StratifiedKFold\n", "kf = StratifiedKFold(n_splits=10)" ] }, { "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "pipeline = dm.Pipeline()" - ] - }, - { - "cell_type": "code", - "execution_count": 12, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -149,7 +157,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -162,7 +170,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -171,26 +179,23 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ + "import codeflare.pipelines.Datamodel as dm\n", + "\n", + "# Create the CF pipeline and the nodes, add the edge\n", + "pipeline = dm.Pipeline()\n", "node_fu = dm.EstimatorNode('feature_union', feature_union)\n", - "node_rf = dm.EstimatorNode('randomforest', random_forest)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ + "node_rf = dm.EstimatorNode('randomforest', random_forest)\n", + "\n", "pipeline.add_edge(node_fu, node_rf)" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -200,7 +205,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -211,15 +216,15 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 3.39 s, sys: 1.64 s, total: 5.02 s\n", - "Wall time: 1min 22s\n" + "CPU times: user 3.06 s, sys: 2.11 s, total: 5.17 s\n", + "Wall time: 1min 15s\n" ] } ], @@ -230,25 +235,25 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[0.8185148185148186,\n", + "[0.8161838161838162,\n", + " 0.8118548118548119,\n", + " 0.8145188145188145,\n", + " 0.8198468198468198,\n", " 0.8175158175158175,\n", - " 0.8128538128538129,\n", - " 0.8195138195138195,\n", - " 0.8228438228438228,\n", - " 0.8168498168498168,\n", - " 0.8131868131868132,\n", - " 0.8161838161838162,\n", - " 0.7991339107261826,\n", - " 0.8077948034643571]" + " 
0.8111888111888111,\n", + " 0.8158508158508159,\n", + " 0.8171828171828172,\n", + " 0.8037974683544303,\n", + " 0.8087941372418388]" ] }, - "execution_count": 21, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -259,61 +264,11 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 24, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ - "from lale.lib.sklearn import PCA, Nystroem, SelectKBest, RandomForestClassifier\n", - "from lale.lib.lale import ConcatFeatures\n", - "\n", - "pipeline = (PCA() & Nystroem() & SelectKBest(k=3)) >> ConcatFeatures() >> RandomForestClassifier(n_estimators=200)\n", - "# pipeline.visualize()" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 7min 39s, sys: 13.1 s, total: 7min 52s\n", - "Wall time: 7min 13s\n" - ] - }, - { - "data": { - "text/plain": [ - "[0.8185148185148186,\n", - " 0.8101898101898102,\n", - " 0.8148518148518149,\n", - " 0.8201798201798202,\n", - " 0.8145188145188145,\n", - " 0.8185148185148186,\n", - " 0.8168498168498168,\n", - " 0.8125208125208125,\n", - " 0.7978014656895404,\n", - " 0.8114590273151232]" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "%%time\n", - "from lale.helpers import cross_val_score\n", - "cross_val_score(pipeline, X_train, y_train, cv=10)" + "ray.shutdown()" ] }, { diff --git a/notebooks/sample_pipeline.ipynb b/notebooks/sample_pipeline.ipynb index 9b1928f..594b42f 100644 --- a/notebooks/sample_pipeline.ipynb +++ b/notebooks/sample_pipeline.ipynb @@ -1,8 +1,17 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "0c3cf197", + "metadata": {}, + "source": [ + "## Sample pipeline\n", + "This is a sample pipeline drawn from a competition posted on [Kaggle](https://www.kaggle.com/ragharamya/loanprediction). It demonstrates a preprocessor followed by multiple classifier options explored in parallel.",
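+ "\n", + "The DAG built below fans one preprocessor node out to several classifier nodes; a compact equivalent of those cells (`preprocessor` and `classifiers` are defined later in the notebook):\n", + "\n", + "```python\n", + "import codeflare.pipelines.Datamodel as dm\n", + "\n", + "node_a = dm.EstimatorNode('preprocess', preprocessor)\n", + "for i, clf in enumerate(classifiers):\n", + "    pipeline.add_edge(node_a, dm.EstimatorNode('node_%d' % i, clf))\n", + "```"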
+ ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "described-lover", "metadata": {}, "outputs": [], @@ -12,10 +21,33 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "simplified-summit", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "Gender object\n", + "Married object\n", + "Dependents object\n", + "Education object\n", + "Self_Employed object\n", + "ApplicantIncome int64\n", + "CoapplicantIncome float64\n", + "LoanAmount float64\n", + "Loan_Amount_Term float64\n", + "Credit_History float64\n", + "Property_Area object\n", + "Loan_Status object\n", + "dtype: object" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "import pandas as pd\n", "train = pd.read_csv('../resources/data/train_ctrUa4K.csv')\n", @@ -26,30 +58,31 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "configured-clinton", - "metadata": {}, - "outputs": [], - "source": [ - "train.shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "confident-union", "metadata": {}, "outputs": [], "source": [ + "# prepare the dataset for training\n", + "\n", "X = train.drop('Loan_Status', axis=1)\n", "y = train['Loan_Status']\n", "from sklearn.model_selection import train_test_split\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)" ] }, + { + "cell_type": "markdown", + "id": "5f1793c1", + "metadata": {}, + "source": [ + "## SKLearn pipeline\n", + "Below, we show how SKLearn is used to create a pipeline and then fit each pipeline to explore multiple models." + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "visible-compact", "metadata": {}, "outputs": [], @@ -75,7 +108,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "stupid-miracle", "metadata": {}, "outputs": [], @@ -91,22 +124,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "accredited-japan", - "metadata": {}, - "outputs": [], - "source": [ - "import time,\n", - "\n", - "start = time.time()\n", - "Xt = preprocessor.fit(X_train)\n", - "end = time.time()\n", - "print('Time taken: ' + str(end - start))" - ] - }, - { - "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "fundamental-builder", "metadata": {}, "outputs": [], @@ -118,47 +136,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "ahead-narrow", - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.base import ClassifierMixin\n", - "from sklearn.base import BaseEstimator\n", - "\n", - "class ScaleTestEstimator(ClassifierMixin, BaseEstimator):\n", - " num_iters = 100\n", - " classifier : ClassifierMixin = None\n", - "\n", - " def __init__(self, num_iters, classifier: ClassifierMixin):\n", - " self.num_iters = num_iters\n", - " self.classifier = classifier\n", - "\n", - " def fit(self, X, y):\n", - " for i in range(self.num_iters):\n", - " self.classifier.fit(X, y)\n", - " return self\n", - " \n", - " def predict(self, X):\n", - " return self.classifier.predict(X)\n", - "\n", - " def score(self, X, y, sample_weight=None):\n", - " return self.classifier.score(X, y, sample_weight)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "specialized-provider", - "metadata": {}, - "outputs": [], - "source": [ - "Xt = preprocessor.fit_transform(X_train)" - ] - }, - { - "cell_type": "code", - "execution_count": 
null, + "execution_count": 7, "id": "above-masters", "metadata": {}, "outputs": [], @@ -174,106 +152,38 @@ " KNeighborsClassifier(3),\n", " SVC(kernel=\"rbf\", C=0.025, probability=True),\n", " NuSVC(probability=True),\n", - " DecisionTreeClassifier(),\n", " RandomForestClassifier(),\n", - " AdaBoostClassifier(),\n", " GradientBoostingClassifier()\n", " ]" ] }, { "cell_type": "code", - "execution_count": null, - "id": "bacterial-morocco", - "metadata": {}, - "outputs": [], - "source": [ - "classifiers[0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "sudden-british", - "metadata": {}, - "outputs": [], - "source": [ - "c_a = ScaleTestEstimator(50, DecisionTreeClassifier())\n", - "c_b = ScaleTestEstimator(50, RandomForestClassifier())\n", - "c_c = ScaleTestEstimator(50, GradientBoostingClassifier())\n", - "classifiers = [c_a, c_b, c_c]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "imposed-practice", - "metadata": {}, - "outputs": [], - "source": [ - "import sklearn.base as base" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "tired-breast", - "metadata": {}, - "outputs": [], - "source": [ - "base.is_classifier(c_a)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "overall-review", - "metadata": {}, - "outputs": [], - "source": [ - "base.clone(c_a)" - ] - }, - { - "cell_type": "code", - "execution_count": null, + "execution_count": 18, "id": "greatest-cancellation", "metadata": {}, "outputs": [], "source": [ - "import time\n", - "start = time.time()\n", - "\n", - "c_a = ScaleTestEstimator(50, DecisionTreeClassifier())\n", - "c_b = ScaleTestEstimator(50, RandomForestClassifier())\n", - "c_c = ScaleTestEstimator(50, GradientBoostingClassifier())\n", - "classifiers = [c_a, c_b, c_c]\n", - "\n", "classifier_results=[]\n", "for classifier in classifiers:\n", " pipe = Pipeline(steps=[('preprocessor', preprocessor),\n", " ('classifier', classifier)])\n", " pipe.fit(X_train, y_train)\n", - " pipe.predict(X_train)\n", - " \n", - "end = time.time()\n", - "tt = end - start\n", - "print('time taken: ' + str(tt))" + " pipe.predict(X_train)" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "a1757296", + "cell_type": "markdown", + "id": "fef872c4", "metadata": {}, - "outputs": [], "source": [ - "c_a.classifier.feature_importances_" + "## CodeFlare pipelines\n", + "Below, we show how the same exploration can be done with the CodeFlare pipelines approach.",
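+ "\n", + "The core pattern, condensed from the cells below (FIT executes the whole DAG; each classifier node's output can then be fetched):\n", + "\n", + "```python\n", + "import codeflare.pipelines.Runtime as rt\n", + "from codeflare.pipelines.Runtime import ExecutionType\n", + "\n", + "pipeline_output = rt.execute_pipeline(pipeline, ExecutionType.FIT, pipeline_input)\n", + "node_0_output = pipeline_output.get_xyrefs(node_0)\n", + "```"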
] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "id": "coordinate-gossip", "metadata": {}, "outputs": [], @@ -284,17 +194,43 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "id": "bfe20bd8", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2021-06-20 11:53:04,117\tINFO services.py:1269 -- View the Ray dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265\u001b[39m\u001b[22m\n" + ] + }, + { + "data": { + "text/plain": [ + "{'node_ip_address': '9.211.53.245',\n", + " 'raylet_ip_address': '9.211.53.245',\n", + " 'redis_address': '9.211.53.245:6379',\n", + " 'object_store_address': '/tmp/ray/session_2021-06-20_11-53-02_767779_97928/sockets/plasma_store',\n", + " 'raylet_socket_name': '/tmp/ray/session_2021-06-20_11-53-02_767779_97928/sockets/raylet',\n", + " 'webui_url': '127.0.0.1:8265',\n", + " 'session_dir': '/tmp/ray/session_2021-06-20_11-53-02_767779_97928',\n", + " 'metrics_export_port': 64339,\n", + " 'node_id': '8c45750634c387b09b787fb7a0aa191df63f6d5274861bad27c00cb3'}" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "ray.init()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "id": "invisible-consensus", "metadata": {}, "outputs": [], @@ -304,7 +240,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 31, "id": "surface-recruitment", "metadata": {}, "outputs": [], @@ -314,32 +250,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 32, "id": "humanitarian-boards", "metadata": {}, "outputs": [], "source": [ "node_a = dm.EstimatorNode('preprocess', preprocessor)\n", - "node_b = dm.EstimatorNode('c_a', c_a)\n", - "node_c = dm.EstimatorNode('c_b', c_b)\n", - "node_d = dm.EstimatorNode('c_c', c_c)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "popular-bookmark", - "metadata": {}, - "outputs": [], - "source": [ - "pipeline.add_edge(node_a, node_b)\n", - "pipeline.add_edge(node_a, node_c)\n", - "pipeline.add_edge(node_a, node_d)" + "node_0 = dm.EstimatorNode('node_0', classifiers[0])\n", + "node_1 = dm.EstimatorNode('node_1', classifiers[1])\n", + "node_2 = dm.EstimatorNode('node_2', classifiers[2])\n", + "node_3 = dm.EstimatorNode('node_3', classifiers[3])\n", + "node_4 = dm.EstimatorNode('node_4', classifiers[4])\n", + "\n", + "pipeline.add_edge(node_a, node_0)\n", + "pipeline.add_edge(node_a, node_1)\n", + "pipeline.add_edge(node_a, node_2)\n", + "pipeline.add_edge(node_a, node_3)\n", + "pipeline.add_edge(node_a, node_4)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 33, "id": "ef9ff37b", "metadata": {}, "outputs": [], @@ -352,7 +284,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 34, "id": "mineral-analyst", "metadata": {}, "outputs": [], @@ -362,7 +294,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 35, "id": "literary-consolidation", "metadata": {}, "outputs": [], @@ -372,51 +304,102 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 36, "id": "educated-basement", "metadata": {}, "outputs": [], "source": [ - "start = time.time()\n", - "\n", "pipeline_output = rt.execute_pipeline(pipeline, ExecutionType.FIT, pipeline_input)\n", - "\n", - "node_b_output = pipeline_output.get_xyrefs(node_b)\n", - "node_c_output = pipeline_output.get_xyrefs(node_c)\n", - "node_d_output = 
pipeline_output.get_xyrefs(node_d)\n", - "\n", - "end = time.time()\n", - "print ('Time taken: ' + str(end - start))" + "node_0_output = pipeline_output.get_xyrefs(node_0)" ] }, { "cell_type": "code", - "execution_count": null, - "id": "be9e1c3a", + "execution_count": 37, + "id": "4df7fb29", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "node_b_output" + "outputs[0]" ] }, { "cell_type": "code", - "execution_count": null, - "id": "9e153862", + "execution_count": 39, + "id": "41955886", "metadata": {}, "outputs": [], "source": [ - "selected_pipeline = rt.select_pipeline(pipeline_output, node_b_output[0])" + "X_out = ray.get(outputs[0][0].get_Xref())" ] }, { "cell_type": "code", - "execution_count": null, - "id": "c612e1b0", + "execution_count": 40, + "id": "7e05451f", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "array(['Y', 'N', 'N', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'Y',\n", + " 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y',\n", + " 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N',\n", + " 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'N',\n", + " 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y',\n", + " 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'N', 'Y', 'N', 'Y', 'Y', 'N', 'Y',\n", + " 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'N', 'N', 'Y', 'N', 'N',\n", + " 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'N', 'N',\n", + " 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'N', 'Y',\n", + " 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y',\n", + " 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'N', 'Y',\n", + " 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'Y',\n", + " 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',\n", + " 'N', 'N', 'Y', 'Y', 'N', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y',\n", + " 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',\n", + " 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'N', 'Y', 'Y', 'N', 'Y',\n", + " 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y',\n", + " 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'N', 'Y',\n", + " 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',\n", + " 'N', 'N', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',\n", + " 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'N',\n", + " 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y',\n", + " 'N', 'Y', 'Y', 'N', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',\n", + " 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'Y',\n", + " 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N',\n", + " 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y',\n", + " 'Y', 'N', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'Y',\n", + " 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',\n", + " 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'Y',\n", + " 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y',\n", + " 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y',\n", + " 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'N', 'N', 'Y', 'Y', 'N', 'Y', 'Y',\n", + " 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'N',\n", + " 'N', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',\n", + " 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'N', 
'Y', 'Y', 'Y',\n", + " 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',\n", + " 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y',\n", + " 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y'], dtype=object)" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "print(selected_pipeline)" + "X_out" ] } ], @@ -439,7 +422,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.6" + "version": "3.7.9" } }, "nbformat": 4,