example of a PREDICT after FIT

Kun-Lung Wu · Kun-Lung Wu · commit f3e3e54928b4 · 2021-06-02T09:39:39.000-04:00
diff --git a/notebooks/plot_feature_selection_pipeline.ipynb b/notebooks/plot_feature_selection_pipeline.ipynb
@@ -0,0 +1,213 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%matplotlib inline"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "# Pipeline ANOVA SVM\n",
+    "\n",
+    "This example shows how feature selection can be easily integrated within\n",
+    "a machine learning pipeline.\n",
+    "\n",
+    "We also show that you can easily inspect parts of the fitted pipeline.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "              precision    recall  f1-score   support\n",
+      "\n",
+      "           0       0.92      0.80      0.86        15\n",
+      "           1       0.75      0.90      0.82        10\n",
+      "\n",
+      "    accuracy                           0.84        25\n",
+      "   macro avg       0.84      0.85      0.84        25\n",
+      "weighted avg       0.85      0.84      0.84        25\n",
+      "\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "array([[0.        , 0.        , 0.75791043, 0.        , 0.        ,\n",
+       "        0.        , 0.        , 0.        , 0.        , 0.27158921,\n",
+       "        0.        , 0.        , 0.        , 0.        , 0.        ,\n",
+       "        0.        , 0.        , 0.        , 0.        , 0.26109702]])"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Imports are grouped first so the cell's dependencies are visible up front.\n",
+    "from sklearn import set_config\n",
+    "from sklearn.datasets import make_classification\n",
+    "from sklearn.feature_selection import SelectKBest, f_classif\n",
+    "from sklearn.metrics import classification_report\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.pipeline import make_pipeline\n",
+    "from sklearn.svm import LinearSVC\n",
+    "\n",
+    "set_config(display='diagram')\n",
+    "\n",
+    "# Synthetic binary problem: 20 features, 3 of them informative.\n",
+    "X, y = make_classification(\n",
+    "    n_features=20, n_informative=3, n_redundant=0, n_classes=2,\n",
+    "    n_clusters_per_class=2, random_state=42)\n",
+    "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)\n",
+    "\n",
+    "# Keep the 3 best features by ANOVA F-score, then fit a linear SVM on them.\n",
+    "anova_filter = SelectKBest(f_classif, k=3)\n",
+    "clf = LinearSVC()\n",
+    "anova_svm = make_pipeline(anova_filter, clf)\n",
+    "anova_svm.fit(X_train, y_train)\n",
+    "\n",
+    "# Evaluate on the held-out split.\n",
+    "y_pred = anova_svm.predict(X_test)\n",
+    "print(classification_report(y_test, y_pred))\n",
+    "\n",
+    "# Map the 3 fitted coefficients back onto the original 20 feature slots;\n",
+    "# features dropped by the filter show up as 0.\n",
+    "anova_svm[:-1].inverse_transform(anova_svm[-1].coef_)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import ray\n",
+    "import codeflare.pipelines.Datamodel as dm\n",
+    "import codeflare.pipelines.Runtime as rt\n",
+    "from codeflare.pipelines.Datamodel import Xy\n",
+    "from codeflare.pipelines.Datamodel import XYRef\n",
+    "from codeflare.pipelines.Runtime import ExecutionType\n",
+    "\n",
+    "from sklearn import set_config\n",
+    "from sklearn.datasets import make_classification\n",
+    "from sklearn.feature_selection import SelectKBest, f_classif\n",
+    "from sklearn.metrics import classification_report\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.pipeline import make_pipeline\n",
+    "from sklearn.svm import LinearSVC\n",
+    "\n",
+    "# Restart Ray so the cell can be re-run within the same kernel.\n",
+    "ray.shutdown()\n",
+    "ray.init()\n",
+    "\n",
+    "set_config(display='diagram')\n",
+    "\n",
+    "# Same synthetic data and split as the plain scikit-learn cell.\n",
+    "X, y = make_classification(\n",
+    "    n_features=20, n_informative=3, n_redundant=0, n_classes=2,\n",
+    "    n_clusters_per_class=2, random_state=42)\n",
+    "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)\n",
+    "\n",
+    "anova_filter = SelectKBest(f_classif, k=3)\n",
+    "clf = LinearSVC()\n",
+    "\n",
+    "# Two-node pipeline graph: anova_filter -> clf.\n",
+    "pipeline = dm.Pipeline()\n",
+    "node_anova_filter = dm.EstimatorNode('anova_filter', anova_filter)\n",
+    "node_clf = dm.EstimatorNode('clf', clf)\n",
+    "pipeline.add_edge(node_anova_filter, node_clf)\n",
+    "\n",
+    "# FIT: feed the training split into the first node.\n",
+    "fit_input = dm.PipelineInput()\n",
+    "fit_input.add_xy_arg(node_anova_filter, dm.Xy(X_train, y_train))\n",
+    "pipeline_output = rt.execute_pipeline(pipeline, ExecutionType.FIT, fit_input)\n",
+    "\n",
+    "node_clf_output = pipeline_output.get_xyrefs(node_clf)\n",
+    "\n",
+    "# ray.get blocks until the referenced objects are available.\n",
+    "Xout = ray.get(node_clf_output[0].get_Xref())\n",
+    "yout = ray.get(node_clf_output[0].get_yref())\n",
+    "\n",
+    "# Select the pipeline that produced the first clf output from the FIT run.\n",
+    "selected_pipeline = rt.select_pipeline(pipeline_output, node_clf_output[0])\n",
+    "\n",
+    "# PREDICT: feed the held-out split through the selected pipeline.\n",
+    "predict_input = dm.PipelineInput()\n",
+    "predict_input.add_xy_arg(node_anova_filter, dm.Xy(X_test, y_test))\n",
+    "predict_output = rt.execute_pipeline(selected_pipeline, ExecutionType.PREDICT, predict_input)\n",
+    "\n",
+    "# NOTE(review): when last run against codeflare_pipelines 1.0.0, the\n",
+    "# get_xyrefs call below raised RayTaskError(ValueError) from\n",
+    "# execute_or_node_remote ('object_refs' must either be an object ref or a\n",
+    "# list of object refs). Confirm PREDICT works with the installed runtime.\n",
+    "predict_clf_output = predict_output.get_xyrefs(node_clf)\n",
+    "y_pred = ray.get(predict_clf_output[0].get_yref())\n",
+    "\n",
+    "print(classification_report(y_test, y_pred))\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}