# Expressing a scikit-learn pipeline as a CodeFlare pipeline
Reference: https://scikit-learn.org/stable/auto_examples/semi_supervised/plot_semi_supervised_newsgroups.html#sphx-glr-auto-examples-semi-supervised-plot-semi-supervised-newsgroups-py

In [1]:
%matplotlib inline


# Semi-supervised Classification on a Text Dataset

In this example, semi-supervised classifiers are trained on the 20 newsgroups
dataset (which will be automatically downloaded).

You can adjust the number of categories by passing a list of category names
to the dataset loader's `categories` argument, or pass `None` to get all 20 of them.


In [2]:
import os

import numpy as np

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.semi_supervised import LabelSpreading
from sklearn.metrics import f1_score

# Load the training split of the 20 newsgroups corpus; categories=None keeps
# all of the newsgroups.
data = fetch_20newsgroups(subset='train', categories=None)
print("%d documents" % len(data.filenames))
print("%d categories" % len(data.target_names))
print()

# Parameters
# NOTE(review): newer scikit-learn releases spell this loss 'log_loss';
# confirm against the scikit-learn version this notebook targets.
sdg_params = dict(alpha=1e-5, penalty='l2', loss='log')
vectorizer_params = dict(ngram_range=(1, 2), min_df=5, max_df=0.8)

# Supervised Pipeline
pipeline = Pipeline([
    ('vect', CountVectorizer(**vectorizer_params)),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(**sdg_params)),
])
# SelfTraining Pipeline
st_pipeline = Pipeline([
    ('vect', CountVectorizer(**vectorizer_params)),
    ('tfidf', TfidfTransformer()),
    ('clf', SelfTrainingClassifier(SGDClassifier(**sdg_params), verbose=True)),
])
# LabelSpreading Pipeline
ls_pipeline = Pipeline([
    ('vect', CountVectorizer(**vectorizer_params)),
    ('tfidf', TfidfTransformer()),
    # LabelSpreading does not support sparse matrices, so densify first.
    # toarray() yields a plain ndarray, whereas todense() yields np.matrix.
    ('todense', FunctionTransformer(lambda x: x.toarray())),
    ('clf', LabelSpreading()),
])


def eval_and_print_metrics(clf, X_train, y_train, X_test, y_test):
    """Fit ``clf`` on the training data and report its test-set F1 score.

    Prints the number of training samples, how many training labels equal
    ``-1`` (the marker used for unlabeled samples), and the micro-averaged
    F1 score of ``clf``'s predictions for ``X_test`` against ``y_test``.
    Nothing is returned.
    """
    print("Number of training samples:", len(X_train))
    n_unlabeled = len([label for label in y_train if label == -1])
    print("Unlabeled samples in training set:", n_unlabeled)

    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    score = f1_score(y_test, predictions, average='micro')

    print("Micro-averaged F1 score on test set: %0.3f" % score)
    print("-" * 10)
    print()


if __name__ == "__main__":
    # Seed the train/test split and the label mask so that re-running the
    # cell evaluates every model on the same documents. The classifiers
    # themselves are not seeded here.
    rng = np.random.RandomState(0)

    X, y = data.data, data.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    print("Supervised SGDClassifier on 100% of the data:")
    eval_and_print_metrics(pipeline, X_train, y_train, X_test, y_test)

    # select a mask of 20% of the train dataset
    y_mask = rng.rand(len(y_train)) < 0.2

    # X_20 and y_20 are the subset of the train dataset indicated by the mask
    X_20 = [doc for doc, keep in zip(X_train, y_mask) if keep]
    y_20 = [label for label, keep in zip(y_train, y_mask) if keep]
    print("Supervised SGDClassifier on 20% of the training data:")
    eval_and_print_metrics(pipeline, X_20, y_20, X_test, y_test)

    # set the non-masked subset to be unlabeled (-1 marks "no label");
    # this overwrites y_train in place, so it must come after the
    # supervised runs above
    y_train[~y_mask] = -1
    print("SelfTrainingClassifier on 20% of the training data (rest "
          "is unlabeled):")
    eval_and_print_metrics(st_pipeline, X_train, y_train, X_test, y_test)

    if 'CI' not in os.environ:
        # LabelSpreading takes too long to run in the online documentation
        print("LabelSpreading on 20% of the data (rest is unlabeled):")
        eval_and_print_metrics(ls_pipeline, X_train, y_train, X_test, y_test)

11314 documents
20 categories

Supervised SGDClassifier on 100% of the data:
Number of training samples: 8485
Unlabeled samples in training set: 0
Micro-averaged F1 score on test set: 0.901
----------

Supervised SGDClassifier on 20% of the training data:
Number of training samples: 1692
Unlabeled samples in training set: 0
Micro-averaged F1 score on test set: 0.786
----------

SelfTrainingClassifier on 20% of the training data (rest is unlabeled):
Number of training samples: 8485
Unlabeled samples in training set: 6793
End of iteration 1, added 2875 new labels.
End of iteration 2, added 681 new labels.
End of iteration 3, added 234 new labels.
End of iteration 4, added 84 new labels.
End of iteration 5, added 29 new labels.
End of iteration 6, added 11 new labels.
End of iteration 7, added 9 new labels.
End of iteration 8, added 2 new labels.
End of iteration 9, added 4 new labels.
End of iteration 10, added 7 new labels.
Micro-averaged F1 score on test set: 0.834
----------

LabelSpr

In [5]:
import ray
import codeflare.pipelines.Datamodel as dm
import codeflare.pipelines.Runtime as rt
from codeflare.pipelines.Datamodel import Xy
from codeflare.pipelines.Datamodel import XYRef
from codeflare.pipelines.Runtime import ExecutionType

import os

import numpy as np

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.semi_supervised import LabelSpreading
from sklearn.metrics import f1_score

# Load the training split of the 20 newsgroups corpus; categories=None keeps
# all of the newsgroups.
data = fetch_20newsgroups(subset='train', categories=None)
print("%d documents" % len(data.filenames))
print("%d categories" % len(data.target_names))
print()

# Parameters
sdg_params = dict(alpha=1e-5, penalty='l2', loss='log')
vectorizer_params = dict(ngram_range=(1, 2), min_df=5, max_df=0.8)

# Estimators used by the three model variants (each is created once and
# handed to its node below)
vect = CountVectorizer(**vectorizer_params)
tfidf = TfidfTransformer()
clf1 = SGDClassifier(**sdg_params)
clf2 = SelfTrainingClassifier(SGDClassifier(**sdg_params), verbose=True)
# LabelSpreading does not support sparse matrices, so densify first.
# toarray() yields a plain ndarray, whereas todense() yields np.matrix.
todense = FunctionTransformer(lambda x: x.toarray())
clf3 = LabelSpreading()

# Wrap every estimator in a named pipeline node
node_vect = dm.EstimatorNode('vect', vect)
node_tfidf = dm.EstimatorNode('tfidf', tfidf)
node_clf1 = dm.EstimatorNode('clf1', clf1)
node_clf2 = dm.EstimatorNode('clf2', clf2)
node_todense = dm.EstimatorNode('todense', todense)
node_clf3 = dm.EstimatorNode('clf3', clf3)

# A single pipeline graph holds all three variants; they share the
# vect -> tfidf prefix and branch after it.
pipeline = dm.Pipeline()

# NOTE(review): the output saved with this cell ends in
# "ValueError: WRITEBACKIFCOPY base is read-only", raised from a scikit-learn
# text-feature transform running inside a Ray task. Presumably the sparse
# matrix handed from one node to the next arrives read-only -- TODO confirm
# and fix.
pipeline.add_edge(node_vect, node_tfidf)
# Supervised Pipeline
pipeline.add_edge(node_tfidf, node_clf1)
# SelfTraining Pipeline
pipeline.add_edge(node_tfidf, node_clf2)
# LabelSpreading Pipeline
pipeline.add_edge(node_tfidf, node_todense)
pipeline.add_edge(node_todense, node_clf3)


def eval_and_print_metrics(clf, X_train, y_train, X_test, y_test):
    """Fit the pipeline graph, then score the branch that ends at ``clf``.

    ``clf`` is the pipeline node whose branch should be evaluated. The
    module-level ``pipeline`` is executed in FIT mode with the training data
    fed in at ``node_vect``; the pipeline that produced ``clf``'s fit output
    is then selected and executed in PREDICT mode on the test data.

    Prints the number of training samples, how many training labels equal
    ``-1`` (the marker used for unlabeled samples), and the micro-averaged
    F1 score on the test set. Nothing is returned.
    """
    print("Number of training samples:", len(X_train))
    print("Unlabeled samples in training set:",
          sum(1 for x in y_train if x == -1))

    fit_input = dm.PipelineInput()
    fit_input.add_xy_arg(node_vect, dm.Xy(X_train, y_train))

    # execute FIT
    fit_output = rt.execute_pipeline(pipeline, ExecutionType.FIT, fit_input)

    # select the pipeline that produced clf's output. select_pipeline is
    # given one of the node's output references rather than the node itself,
    # so look the references up first.
    fit_xyrefs = fit_output.get_xyrefs(clf)
    selected_pipeline = rt.select_pipeline(fit_output, fit_xyrefs[0])

    # execute PREDICT
    predict_input = dm.PipelineInput()
    predict_input.add_xy_arg(node_vect, dm.Xy(X_test, y_test))
    predict_output = rt.execute_pipeline(selected_pipeline, ExecutionType.PREDICT, predict_input)

    predict_clf_output = predict_output.get_xyrefs(clf)
    y_pred = ray.get(predict_clf_output[0].get_yref())

    print("Micro-averaged F1 score on test set: "
          "%0.3f" % f1_score(y_test, y_pred, average='micro'))
    print("-" * 10)
    print()


if __name__ == "__main__":

    # Restart Ray so the run starts from a fresh instance
    ray.shutdown()
    ray.init()

    try:
        # Seed the train/test split and the label mask so that re-running the
        # cell evaluates every model on the same documents. The classifiers
        # themselves are not seeded here.
        rng = np.random.RandomState(0)

        X, y = data.data, data.target
        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

        print("Supervised SGDClassifier on 100% of the data:")
        eval_and_print_metrics(node_clf1, X_train, y_train, X_test, y_test)

        # select a mask of 20% of the train dataset
        y_mask = rng.rand(len(y_train)) < 0.2

        # X_20 and y_20 are the subset of the train dataset indicated by the mask
        X_20 = [doc for doc, keep in zip(X_train, y_mask) if keep]
        y_20 = [label for label, keep in zip(y_train, y_mask) if keep]
        print("Supervised SGDClassifier on 20% of the training data:")
        eval_and_print_metrics(node_clf1, X_20, y_20, X_test, y_test)

        # set the non-masked subset to be unlabeled (-1 marks "no label");
        # this overwrites y_train in place, so it must come after the
        # supervised runs above
        y_train[~y_mask] = -1
        print("SelfTrainingClassifier on 20% of the training data (rest "
              "is unlabeled):")
        eval_and_print_metrics(node_clf2, X_train, y_train, X_test, y_test)

        if 'CI' not in os.environ:
            # LabelSpreading takes too long to run in the online documentation
            print("LabelSpreading on 20% of the data (rest is unlabeled):")
            eval_and_print_metrics(node_clf3, X_train, y_train, X_test, y_test)
    finally:
        # Shut Ray down even when one of the evaluations raises
        ray.shutdown()

11314 documents
20 categories



2021-06-01 10:55:34,397	INFO services.py:1267 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8266[39m[22m


Supervised SGDClassifier on 100% of the data:
Number of training samples: 8485
Unlabeled samples in training set: 0


RayTaskError(ValueError): [36mray::execute_or_node_remote()[39m (pid=5002, ip=192.168.1.230)
  File "python/ray/_raylet.pyx", line 505, in ray._raylet.execute_task
  File "/Users/yuanchi/anaconda3/lib/python3.8/site-packages/codeflare_pipelines-1.0.0-py3.8.egg/codeflare/pipelines/Runtime.py", line 43, in execute_or_node_remote
  File "/Users/yuanchi/anaconda3/lib/python3.8/site-packages/sklearn/base.py", line 702, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
  File "/Users/yuanchi/anaconda3/lib/python3.8/site-packages/sklearn/feature_extraction/text.py", line 1477, in transform
    X = check_array(X, accept_sparse='csr', dtype=FLOAT_DTYPES, copy=copy)
  File "/Users/yuanchi/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/Users/yuanchi/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py", line 593, in check_array
    array = _ensure_sparse_format(array, accept_sparse=accept_sparse,
  File "/Users/yuanchi/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py", line 381, in _ensure_sparse_format
    spmatrix = spmatrix.astype(dtype)
  File "/Users/yuanchi/anaconda3/lib/python3.8/site-packages/scipy/sparse/data.py", line 72, in astype
    self._deduped_data().astype(dtype, casting=casting, copy=copy),
  File "/Users/yuanchi/anaconda3/lib/python3.8/site-packages/scipy/sparse/data.py", line 32, in _deduped_data
    self.sum_duplicates()
  File "/Users/yuanchi/anaconda3/lib/python3.8/site-packages/scipy/sparse/compressed.py", line 1098, in sum_duplicates
    self.sort_indices()
  File "/Users/yuanchi/anaconda3/lib/python3.8/site-packages/scipy/sparse/compressed.py", line 1144, in sort_indices
    _sparsetools.csr_sort_indices(len(self.indptr) - 1, self.indptr,
ValueError: WRITEBACKIFCOPY base is read-only