adding plot_semi_supervised_newsgroups

yuanchi2807 · yuanchi2807 · commit 9e5fe263fb3c · 2021-06-01T10:08:07.000-04:00
diff --git a/notebooks/plot_semi_supervised_newsgroups.ipynb b/notebooks/plot_semi_supervised_newsgroups.ipynb
@@ -0,0 +1,297 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Expressing a scikit-learn pipeline as a CodeFlare pipeline\n",
+    "Reference: https://scikit-learn.org/stable/auto_examples/semi_supervised/plot_semi_supervised_newsgroups.html#sphx-glr-auto-examples-semi-supervised-plot-semi-supervised-newsgroups-py"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%matplotlib inline"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "# Semi-supervised Classification on a Text Dataset\n",
+    "\n",
+    "In this example, semi-supervised classifiers are trained on the 20 newsgroups\n",
+    "dataset (which will be automatically downloaded).\n",
+    "\n",
+    "You can adjust the number of categories by giving their names to the dataset\n",
+    "loader or setting them to `None` to get all 20 of them.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "11314 documents\n",
+      "20 categories\n",
+      "\n",
+      "Supervised SGDClassifier on 100% of the data:\n",
+      "Number of training samples: 8485\n",
+      "Unlabeled samples in training set: 0\n",
+      "Micro-averaged F1 score on test set: 0.901\n",
+      "----------\n",
+      "\n",
+      "Supervised SGDClassifier on 20% of the training data:\n",
+      "Number of training samples: 1692\n",
+      "Unlabeled samples in training set: 0\n",
+      "Micro-averaged F1 score on test set: 0.786\n",
+      "----------\n",
+      "\n",
+      "SelfTrainingClassifier on 20% of the training data (rest is unlabeled):\n",
+      "Number of training samples: 8485\n",
+      "Unlabeled samples in training set: 6793\n",
+      "End of iteration 1, added 2875 new labels.\n",
+      "End of iteration 2, added 681 new labels.\n",
+      "End of iteration 3, added 234 new labels.\n",
+      "End of iteration 4, added 84 new labels.\n",
+      "End of iteration 5, added 29 new labels.\n",
+      "End of iteration 6, added 11 new labels.\n",
+      "End of iteration 7, added 9 new labels.\n",
+      "End of iteration 8, added 2 new labels.\n",
+      "End of iteration 9, added 4 new labels.\n",
+      "End of iteration 10, added 7 new labels.\n",
+      "Micro-averaged F1 score on test set: 0.834\n",
+      "----------\n",
+      "\n",
+      "LabelSpreading on 20% of the data (rest is unlabeled):\n",
+      "Number of training samples: 8485\n",
+      "Unlabeled samples in training set: 6793\n",
+      "Micro-averaged F1 score on test set: 0.652\n",
+      "----------\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "import os\n",
+    "\n",
+    "import numpy as np\n",
+    "\n",
+    "from sklearn.datasets import fetch_20newsgroups\n",
+    "from sklearn.feature_extraction.text import CountVectorizer\n",
+    "from sklearn.feature_extraction.text import TfidfTransformer\n",
+    "from sklearn.preprocessing import FunctionTransformer\n",
+    "from sklearn.linear_model import SGDClassifier\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.pipeline import Pipeline\n",
+    "from sklearn.semi_supervised import SelfTrainingClassifier\n",
+    "from sklearn.semi_supervised import LabelSpreading\n",
+    "from sklearn.metrics import f1_score\n",
+    "\n",
+    "# Load the training split with every category (downloaded on first use)\n",
+    "data = fetch_20newsgroups(categories=None, subset=\"train\")\n",
+    "print(f\"{len(data.filenames)} documents\")\n",
+    "print(f\"{len(data.target_names)} categories\")\n",
+    "print()\n",
+    "\n",
+    "# Keyword arguments shared by the pipelines defined below\n",
+    "sdg_params = {\"alpha\": 1e-5, \"penalty\": \"l2\", \"loss\": \"log\"}\n",
+    "vectorizer_params = {\"ngram_range\": (1, 2), \"min_df\": 5, \"max_df\": 0.8}\n",
+    "\n",
+    "# Supervised baseline: n-gram counts -> tf-idf -> SGD classifier\n",
+    "pipeline = Pipeline(steps=[\n",
+    "    (\"vect\", CountVectorizer(**vectorizer_params)),\n",
+    "    (\"tfidf\", TfidfTransformer()),\n",
+    "    (\"clf\", SGDClassifier(**sdg_params)),\n",
+    "])\n",
+    "# Same features, with the SGD classifier wrapped for self-training\n",
+    "st_pipeline = Pipeline(steps=[\n",
+    "    (\"vect\", CountVectorizer(**vectorizer_params)),\n",
+    "    (\"tfidf\", TfidfTransformer()),\n",
+    "    (\"clf\", SelfTrainingClassifier(SGDClassifier(**sdg_params), verbose=True)),\n",
+    "])\n",
+    "# LabelSpreading Pipeline\n",
+    "ls_pipeline = Pipeline([\n",
+    "    ('vect', CountVectorizer(**vectorizer_params)),\n",
+    "    ('tfidf', TfidfTransformer()),\n",
+    "    # LabelSpreading does not support sparse matrices, so densify the\n",
+    "    # tf-idf output. toarray() returns a plain ndarray, whereas todense()\n",
+    "    # returns the legacy np.matrix type that newer scikit-learn rejects.\n",
+    "    ('todense', FunctionTransformer(lambda x: x.toarray())),\n",
+    "    ('clf', LabelSpreading()),\n",
+    "])\n",
+    "\n",
+    "\n",
+    "def eval_and_print_metrics(clf, X_train, y_train, X_test, y_test):\n",
+    "    \"\"\"Fit ``clf`` on the training data and print its micro-averaged F1.\n",
+    "\n",
+    "    Before fitting, prints the number of training samples and how many\n",
+    "    training labels equal -1 (reported as unlabeled samples).\n",
+    "    \"\"\"\n",
+    "    n_unlabeled = sum(1 for label in y_train if label == -1)\n",
+    "    print(\"Number of training samples:\", len(X_train))\n",
+    "    print(\"Unlabeled samples in training set:\", n_unlabeled)\n",
+    "\n",
+    "    clf.fit(X_train, y_train)\n",
+    "    micro_f1 = f1_score(y_test, clf.predict(X_test), average=\"micro\")\n",
+    "    print(f\"Micro-averaged F1 score on test set: {micro_f1:0.3f}\")\n",
+    "    print(\"-\" * 10)\n",
+    "    print()\n",
+    "\n",
+    "\n",
+    "if __name__ == \"__main__\":\n",
+    "    X, y = data.data, data.target\n",
+    "    X_train, X_test, y_train, y_test = train_test_split(X, y)\n",
+    "\n",
+    "    print(\"Supervised SGDClassifier on 100% of the data:\")\n",
+    "    eval_and_print_metrics(pipeline, X_train, y_train, X_test, y_test)\n",
+    "\n",
+    "    # Boolean mask that keeps each training sample with probability 0.2\n",
+    "    y_mask = np.random.rand(len(y_train)) < 0.2\n",
+    "\n",
+    "    # X_20 / y_20: the training documents and labels selected by the mask\n",
+    "    X_20 = [doc for doc, keep in zip(X_train, y_mask) if keep]\n",
+    "    y_20 = [label for label, keep in zip(y_train, y_mask) if keep]\n",
+    "    print(\"Supervised SGDClassifier on 20% of the training data:\")\n",
+    "    eval_and_print_metrics(pipeline, X_20, y_20, X_test, y_test)\n",
+    "\n",
+    "    # Mark every training sample outside the mask as unlabeled (-1)\n",
+    "    y_train[np.logical_not(y_mask)] = -1\n",
+    "    print(\"SelfTrainingClassifier on 20% of the training data \"\n",
+    "          \"(rest is unlabeled):\")\n",
+    "    eval_and_print_metrics(st_pipeline, X_train, y_train, X_test, y_test)\n",
+    "\n",
+    "    # LabelSpreading takes too long to run in the online documentation,\n",
+    "    # so it only runs when the CI environment variable is not set\n",
+    "    if os.environ.get(\"CI\") is None:\n",
+    "        print(\"LabelSpreading on 20% of the data (rest is unlabeled):\")\n",
+    "        eval_and_print_metrics(ls_pipeline, X_train, y_train, X_test, y_test)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import ray\n",
+    "import codeflare.pipelines.Datamodel as dm\n",
+    "import codeflare.pipelines.Runtime as rt\n",
+    "from codeflare.pipelines.Datamodel import Xy\n",
+    "from codeflare.pipelines.Datamodel import XYRef\n",
+    "from codeflare.pipelines.Runtime import ExecutionType\n",
+    "\n",
+    "import os\n",
+    "\n",
+    "import numpy as np\n",
+    "\n",
+    "from sklearn.datasets import fetch_20newsgroups\n",
+    "from sklearn.feature_extraction.text import CountVectorizer\n",
+    "from sklearn.feature_extraction.text import TfidfTransformer\n",
+    "from sklearn.preprocessing import FunctionTransformer\n",
+    "from sklearn.linear_model import SGDClassifier\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.pipeline import Pipeline\n",
+    "from sklearn.semi_supervised import SelfTrainingClassifier\n",
+    "from sklearn.semi_supervised import LabelSpreading\n",
+    "from sklearn.metrics import f1_score\n",
+    "\n",
+    "# Load the training split with every category (downloaded on first use)\n",
+    "data = fetch_20newsgroups(categories=None, subset=\"train\")\n",
+    "print(f\"{len(data.filenames)} documents\")\n",
+    "print(f\"{len(data.target_names)} categories\")\n",
+    "print()\n",
+    "\n",
+    "# Keyword arguments shared by the pipelines defined below\n",
+    "sdg_params = {\"alpha\": 1e-5, \"penalty\": \"l2\", \"loss\": \"log\"}\n",
+    "vectorizer_params = {\"ngram_range\": (1, 2), \"min_df\": 5, \"max_df\": 0.8}\n",
+    "\n",
+    "# Supervised baseline: n-gram counts -> tf-idf -> SGD classifier\n",
+    "pipeline = Pipeline(steps=[\n",
+    "    (\"vect\", CountVectorizer(**vectorizer_params)),\n",
+    "    (\"tfidf\", TfidfTransformer()),\n",
+    "    (\"clf\", SGDClassifier(**sdg_params)),\n",
+    "])\n",
+    "# Same features, with the SGD classifier wrapped for self-training\n",
+    "st_pipeline = Pipeline(steps=[\n",
+    "    (\"vect\", CountVectorizer(**vectorizer_params)),\n",
+    "    (\"tfidf\", TfidfTransformer()),\n",
+    "    (\"clf\", SelfTrainingClassifier(SGDClassifier(**sdg_params), verbose=True)),\n",
+    "])\n",
+    "# LabelSpreading Pipeline\n",
+    "ls_pipeline = Pipeline([\n",
+    "    ('vect', CountVectorizer(**vectorizer_params)),\n",
+    "    ('tfidf', TfidfTransformer()),\n",
+    "    # LabelSpreading does not support sparse matrices, so densify the\n",
+    "    # tf-idf output. toarray() returns a plain ndarray, whereas todense()\n",
+    "    # returns the legacy np.matrix type that newer scikit-learn rejects.\n",
+    "    ('todense', FunctionTransformer(lambda x: x.toarray())),\n",
+    "    ('clf', LabelSpreading()),\n",
+    "])\n",
+    "\n",
+    "\n",
+    "def eval_and_print_metrics(clf, X_train, y_train, X_test, y_test):\n",
+    "    \"\"\"Fit ``clf`` on the training data and print its micro-averaged F1.\n",
+    "\n",
+    "    Before fitting, prints the number of training samples and how many\n",
+    "    training labels equal -1 (reported as unlabeled samples).\n",
+    "    \"\"\"\n",
+    "    n_unlabeled = sum(1 for label in y_train if label == -1)\n",
+    "    print(\"Number of training samples:\", len(X_train))\n",
+    "    print(\"Unlabeled samples in training set:\", n_unlabeled)\n",
+    "\n",
+    "    clf.fit(X_train, y_train)\n",
+    "    micro_f1 = f1_score(y_test, clf.predict(X_test), average=\"micro\")\n",
+    "    print(f\"Micro-averaged F1 score on test set: {micro_f1:0.3f}\")\n",
+    "    print(\"-\" * 10)\n",
+    "    print()\n",
+    "\n",
+    "\n",
+    "if __name__ == \"__main__\":\n",
+    "\n",
+    "    # Shut down any Ray runtime left over from an earlier run of this cell\n",
+    "    # before initialising a fresh one, so the cell can be re-executed.\n",
+    "    ray.shutdown()\n",
+    "    ray.init()\n",
+    "\n",
+    "    try:\n",
+    "        X, y = data.data, data.target\n",
+    "        X_train, X_test, y_train, y_test = train_test_split(X, y)\n",
+    "\n",
+    "        print(\"Supervised SGDClassifier on 100% of the data:\")\n",
+    "        eval_and_print_metrics(pipeline, X_train, y_train, X_test, y_test)\n",
+    "\n",
+    "        # select a mask of roughly 20% of the train dataset\n",
+    "        y_mask = np.random.rand(len(y_train)) < 0.2\n",
+    "\n",
+    "        # X_20 and y_20 are the subset of the train dataset indicated by\n",
+    "        # the mask\n",
+    "        X_20, y_20 = map(list, zip(*((x, y)\n",
+    "                         for x, y, m in zip(X_train, y_train, y_mask) if m)))\n",
+    "        print(\"Supervised SGDClassifier on 20% of the training data:\")\n",
+    "        eval_and_print_metrics(pipeline, X_20, y_20, X_test, y_test)\n",
+    "\n",
+    "        # set the non-masked subset to be unlabeled (-1)\n",
+    "        y_train[~y_mask] = -1\n",
+    "        print(\"SelfTrainingClassifier on 20% of the training data (rest \"\n",
+    "              \"is unlabeled):\")\n",
+    "        eval_and_print_metrics(st_pipeline, X_train, y_train, X_test, y_test)\n",
+    "\n",
+    "        if 'CI' not in os.environ:\n",
+    "            # LabelSpreading takes too long to run in the online documentation\n",
+    "            print(\"LabelSpreading on 20% of the data (rest is unlabeled):\")\n",
+    "            eval_and_print_metrics(ls_pipeline, X_train, y_train, X_test, y_test)\n",
+    "    finally:\n",
+    "        # Release the Ray runtime even when one of the evaluations raises;\n",
+    "        # previously a failure skipped the shutdown and left Ray running.\n",
+    "        ray.shutdown()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}