Skip to content

Commit 9e5fe26

Browse files
committed
adding plot_semi_supervised_newsgroups
1 parent 3742748 commit 9e5fe26

File tree

1 file changed

+297
-0
lines changed

1 file changed

+297
-0
lines changed
Lines changed: 297 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,297 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"# Express a scikit-learn pipeline as a CodeFlare pipeline\n",
8+
"Reference: https://scikit-learn.org/stable/auto_examples/semi_supervised/plot_semi_supervised_newsgroups.html#sphx-glr-auto-examples-semi-supervised-plot-semi-supervised-newsgroups-py"
9+
]
10+
},
11+
{
12+
"cell_type": "code",
13+
"execution_count": 1,
14+
"metadata": {},
15+
"outputs": [],
16+
"source": [
17+
"%matplotlib inline"
18+
]
19+
},
20+
{
21+
"cell_type": "markdown",
22+
"metadata": {},
23+
"source": [
24+
"\n",
25+
"# Semi-supervised Classification on a Text Dataset\n",
26+
"\n",
27+
"In this example, semi-supervised classifiers are trained on the 20 newsgroups\n",
28+
"dataset (which will be automatically downloaded).\n",
29+
"\n",
30+
"You can adjust the number of categories by giving their names to the dataset\n",
31+
"loader or setting them to `None` to get all 20 of them.\n"
32+
]
33+
},
34+
{
35+
"cell_type": "code",
36+
"execution_count": 2,
37+
"metadata": {},
38+
"outputs": [
39+
{
40+
"name": "stdout",
41+
"output_type": "stream",
42+
"text": [
43+
"11314 documents\n",
44+
"20 categories\n",
45+
"\n",
46+
"Supervised SGDClassifier on 100% of the data:\n",
47+
"Number of training samples: 8485\n",
48+
"Unlabeled samples in training set: 0\n",
49+
"Micro-averaged F1 score on test set: 0.901\n",
50+
"----------\n",
51+
"\n",
52+
"Supervised SGDClassifier on 20% of the training data:\n",
53+
"Number of training samples: 1692\n",
54+
"Unlabeled samples in training set: 0\n",
55+
"Micro-averaged F1 score on test set: 0.786\n",
56+
"----------\n",
57+
"\n",
58+
"SelfTrainingClassifier on 20% of the training data (rest is unlabeled):\n",
59+
"Number of training samples: 8485\n",
60+
"Unlabeled samples in training set: 6793\n",
61+
"End of iteration 1, added 2875 new labels.\n",
62+
"End of iteration 2, added 681 new labels.\n",
63+
"End of iteration 3, added 234 new labels.\n",
64+
"End of iteration 4, added 84 new labels.\n",
65+
"End of iteration 5, added 29 new labels.\n",
66+
"End of iteration 6, added 11 new labels.\n",
67+
"End of iteration 7, added 9 new labels.\n",
68+
"End of iteration 8, added 2 new labels.\n",
69+
"End of iteration 9, added 4 new labels.\n",
70+
"End of iteration 10, added 7 new labels.\n",
71+
"Micro-averaged F1 score on test set: 0.834\n",
72+
"----------\n",
73+
"\n",
74+
"LabelSpreading on 20% of the data (rest is unlabeled):\n",
75+
"Number of training samples: 8485\n",
76+
"Unlabeled samples in training set: 6793\n",
77+
"Micro-averaged F1 score on test set: 0.652\n",
78+
"----------\n",
79+
"\n"
80+
]
81+
}
82+
],
83+
"source": [
84+
"import os\n",
85+
"\n",
86+
"import numpy as np\n",
87+
"\n",
88+
"from sklearn.datasets import fetch_20newsgroups\n",
89+
"from sklearn.feature_extraction.text import CountVectorizer\n",
90+
"from sklearn.feature_extraction.text import TfidfTransformer\n",
91+
"from sklearn.preprocessing import FunctionTransformer\n",
92+
"from sklearn.linear_model import SGDClassifier\n",
93+
"from sklearn.model_selection import train_test_split\n",
94+
"from sklearn.pipeline import Pipeline\n",
95+
"from sklearn.semi_supervised import SelfTrainingClassifier\n",
96+
"from sklearn.semi_supervised import LabelSpreading\n",
97+
"from sklearn.metrics import f1_score\n",
98+
"\n",
99+
"data = fetch_20newsgroups(subset='train', categories=None)\n",
100+
"print(\"%d documents\" % len(data.filenames))\n",
101+
"print(\"%d categories\" % len(data.target_names))\n",
102+
"print()\n",
103+
"\n",
104+
"# Parameters\n",
105+
"sdg_params = dict(alpha=1e-5, penalty='l2', loss='log')\n",
106+
"vectorizer_params = dict(ngram_range=(1, 2), min_df=5, max_df=0.8)\n",
107+
"\n",
108+
"# Supervised Pipeline\n",
109+
"pipeline = Pipeline([\n",
110+
" ('vect', CountVectorizer(**vectorizer_params)),\n",
111+
" ('tfidf', TfidfTransformer()),\n",
112+
" ('clf', SGDClassifier(**sdg_params)),\n",
113+
"])\n",
114+
"# SelfTraining Pipeline\n",
115+
"st_pipeline = Pipeline([\n",
116+
" ('vect', CountVectorizer(**vectorizer_params)),\n",
117+
" ('tfidf', TfidfTransformer()),\n",
118+
" ('clf', SelfTrainingClassifier(SGDClassifier(**sdg_params), verbose=True)),\n",
119+
"])\n",
120+
"# LabelSpreading Pipeline\n",
121+
"ls_pipeline = Pipeline([\n",
122+
" ('vect', CountVectorizer(**vectorizer_params)),\n",
123+
" ('tfidf', TfidfTransformer()),\n",
124+
" # LabelSpreading does not support sparse matrices, so convert to dense\n",
125+
" ('todense', FunctionTransformer(lambda x: x.todense())),\n",
126+
" ('clf', LabelSpreading()),\n",
127+
"])\n",
128+
"\n",
129+
"\n",
130+
"def eval_and_print_metrics(clf, X_train, y_train, X_test, y_test):\n",
131+
" print(\"Number of training samples:\", len(X_train))\n",
132+
" print(\"Unlabeled samples in training set:\",\n",
133+
" sum(1 for x in y_train if x == -1))\n",
134+
" clf.fit(X_train, y_train)\n",
135+
" y_pred = clf.predict(X_test)\n",
136+
" print(\"Micro-averaged F1 score on test set: \"\n",
137+
" \"%0.3f\" % f1_score(y_test, y_pred, average='micro'))\n",
138+
" print(\"-\" * 10)\n",
139+
" print()\n",
140+
"\n",
141+
"\n",
142+
"if __name__ == \"__main__\":\n",
143+
" X, y = data.data, data.target\n",
144+
" X_train, X_test, y_train, y_test = train_test_split(X, y)\n",
145+
"\n",
146+
" print(\"Supervised SGDClassifier on 100% of the data:\")\n",
147+
" eval_and_print_metrics(pipeline, X_train, y_train, X_test, y_test)\n",
148+
"\n",
149+
" # select a mask of 20% of the train dataset\n",
150+
" y_mask = np.random.rand(len(y_train)) < 0.2\n",
151+
"\n",
152+
" # X_20 and y_20 are the subset of the train dataset indicated by the mask\n",
153+
" X_20, y_20 = map(list, zip(*((x, y)\n",
154+
" for x, y, m in zip(X_train, y_train, y_mask) if m)))\n",
155+
" print(\"Supervised SGDClassifier on 20% of the training data:\")\n",
156+
" eval_and_print_metrics(pipeline, X_20, y_20, X_test, y_test)\n",
157+
"\n",
158+
" # set the non-masked subset to be unlabeled\n",
159+
" y_train[~y_mask] = -1\n",
160+
" print(\"SelfTrainingClassifier on 20% of the training data (rest \"\n",
161+
" \"is unlabeled):\")\n",
162+
" eval_and_print_metrics(st_pipeline, X_train, y_train, X_test, y_test)\n",
163+
"\n",
164+
" if 'CI' not in os.environ:\n",
165+
" # LabelSpreading takes too long to run in the online documentation\n",
166+
" print(\"LabelSpreading on 20% of the data (rest is unlabeled):\")\n",
167+
" eval_and_print_metrics(ls_pipeline, X_train, y_train, X_test, y_test)"
168+
]
169+
},
170+
{
171+
"cell_type": "code",
172+
"execution_count": null,
173+
"metadata": {},
174+
"outputs": [],
175+
"source": [
176+
"import ray\n",
177+
"import codeflare.pipelines.Datamodel as dm\n",
178+
"import codeflare.pipelines.Runtime as rt\n",
179+
"from codeflare.pipelines.Datamodel import Xy\n",
180+
"from codeflare.pipelines.Datamodel import XYRef\n",
181+
"from codeflare.pipelines.Runtime import ExecutionType\n",
182+
"\n",
183+
"import os\n",
184+
"\n",
185+
"import numpy as np\n",
186+
"\n",
187+
"from sklearn.datasets import fetch_20newsgroups\n",
188+
"from sklearn.feature_extraction.text import CountVectorizer\n",
189+
"from sklearn.feature_extraction.text import TfidfTransformer\n",
190+
"from sklearn.preprocessing import FunctionTransformer\n",
191+
"from sklearn.linear_model import SGDClassifier\n",
192+
"from sklearn.model_selection import train_test_split\n",
193+
"from sklearn.pipeline import Pipeline\n",
194+
"from sklearn.semi_supervised import SelfTrainingClassifier\n",
195+
"from sklearn.semi_supervised import LabelSpreading\n",
196+
"from sklearn.metrics import f1_score\n",
197+
"\n",
198+
"data = fetch_20newsgroups(subset='train', categories=None)\n",
199+
"print(\"%d documents\" % len(data.filenames))\n",
200+
"print(\"%d categories\" % len(data.target_names))\n",
201+
"print()\n",
202+
"\n",
203+
"# Parameters\n",
204+
"sdg_params = dict(alpha=1e-5, penalty='l2', loss='log')\n",
205+
"vectorizer_params = dict(ngram_range=(1, 2), min_df=5, max_df=0.8)\n",
206+
"\n",
207+
"# Supervised Pipeline\n",
208+
"pipeline = Pipeline([\n",
209+
" ('vect', CountVectorizer(**vectorizer_params)),\n",
210+
" ('tfidf', TfidfTransformer()),\n",
211+
" ('clf', SGDClassifier(**sdg_params)),\n",
212+
"])\n",
213+
"# SelfTraining Pipeline\n",
214+
"st_pipeline = Pipeline([\n",
215+
" ('vect', CountVectorizer(**vectorizer_params)),\n",
216+
" ('tfidf', TfidfTransformer()),\n",
217+
" ('clf', SelfTrainingClassifier(SGDClassifier(**sdg_params), verbose=True)),\n",
218+
"])\n",
219+
"# LabelSpreading Pipeline\n",
220+
"ls_pipeline = Pipeline([\n",
221+
" ('vect', CountVectorizer(**vectorizer_params)),\n",
222+
" ('tfidf', TfidfTransformer()),\n",
223+
" # LabelSpreading does not support sparse matrices, so convert to dense\n",
224+
" ('todense', FunctionTransformer(lambda x: x.todense())),\n",
225+
" ('clf', LabelSpreading()),\n",
226+
"])\n",
227+
"\n",
228+
"\n",
229+
"def eval_and_print_metrics(clf, X_train, y_train, X_test, y_test):\n",
230+
" print(\"Number of training samples:\", len(X_train))\n",
231+
" print(\"Unlabeled samples in training set:\",\n",
232+
" sum(1 for x in y_train if x == -1))\n",
233+
" clf.fit(X_train, y_train)\n",
234+
" y_pred = clf.predict(X_test)\n",
235+
" print(\"Micro-averaged F1 score on test set: \"\n",
236+
" \"%0.3f\" % f1_score(y_test, y_pred, average='micro'))\n",
237+
" print(\"-\" * 10)\n",
238+
" print()\n",
239+
"\n",
240+
"\n",
241+
"if __name__ == \"__main__\":\n",
242+
" \n",
243+
" ray.shutdown()\n",
244+
" ray.init()\n",
245+
" \n",
246+
" X, y = data.data, data.target\n",
247+
" X_train, X_test, y_train, y_test = train_test_split(X, y)\n",
248+
"\n",
249+
" print(\"Supervised SGDClassifier on 100% of the data:\")\n",
250+
" eval_and_print_metrics(pipeline, X_train, y_train, X_test, y_test)\n",
251+
"\n",
252+
" # select a mask of 20% of the train dataset\n",
253+
" y_mask = np.random.rand(len(y_train)) < 0.2\n",
254+
"\n",
255+
" # X_20 and y_20 are the subset of the train dataset indicated by the mask\n",
256+
" X_20, y_20 = map(list, zip(*((x, y)\n",
257+
" for x, y, m in zip(X_train, y_train, y_mask) if m)))\n",
258+
" print(\"Supervised SGDClassifier on 20% of the training data:\")\n",
259+
" eval_and_print_metrics(pipeline, X_20, y_20, X_test, y_test)\n",
260+
"\n",
261+
" # set the non-masked subset to be unlabeled\n",
262+
" y_train[~y_mask] = -1\n",
263+
" print(\"SelfTrainingClassifier on 20% of the training data (rest \"\n",
264+
" \"is unlabeled):\")\n",
265+
" eval_and_print_metrics(st_pipeline, X_train, y_train, X_test, y_test)\n",
266+
"\n",
267+
" if 'CI' not in os.environ:\n",
268+
" # LabelSpreading takes too long to run in the online documentation\n",
269+
" print(\"LabelSpreading on 20% of the data (rest is unlabeled):\")\n",
270+
" eval_and_print_metrics(ls_pipeline, X_train, y_train, X_test, y_test)\n",
271+
" \n",
272+
" ray.shutdown()"
273+
]
274+
}
275+
],
276+
"metadata": {
277+
"kernelspec": {
278+
"display_name": "Python 3",
279+
"language": "python",
280+
"name": "python3"
281+
},
282+
"language_info": {
283+
"codemirror_mode": {
284+
"name": "ipython",
285+
"version": 3
286+
},
287+
"file_extension": ".py",
288+
"mimetype": "text/x-python",
289+
"name": "python",
290+
"nbconvert_exporter": "python",
291+
"pygments_lexer": "ipython3",
292+
"version": "3.8.8"
293+
}
294+
},
295+
"nbformat": 4,
296+
"nbformat_minor": 1
297+
}

0 commit comments

Comments
 (0)