"""Metrics to assess performance on classification task given class prediction.

Functions named as ``*_score`` return a scalar value to maximize: the higher
the better.

Functions named as ``*_error`` or ``*_loss`` return a scalar value to minimize:
the lower the better.
"""
| # Authors: Alexandre Gramfort <alexandre.gramfort@inria.fr> | |
| # Mathieu Blondel <mathieu@mblondel.org> | |
| # Olivier Grisel <olivier.grisel@ensta.org> | |
| # Arnaud Joly <a.joly@ulg.ac.be> | |
| # Jochen Wersdorfer <jochen@wersdoerfer.de> | |
| # Lars Buitinck | |
| # Joel Nothman <joel.nothman@gmail.com> | |
| # Noel Dawe <noel@dawe.me> | |
| # Jatin Shah <jatindshah@gmail.com> | |
| # Saurabh Jha <saurabh.jhaa@gmail.com> | |
| # Bernardo Stein <bernardovstein@gmail.com> | |
| # Shangwu Yao <shangwuyao@gmail.com> | |
| # License: BSD 3 clause | |
| import warnings | |
| import numpy as np | |
| from scipy.sparse import coo_matrix | |
| from scipy.sparse import csr_matrix | |
| from ..preprocessing import LabelBinarizer | |
| from ..preprocessing import LabelEncoder | |
| from ..utils import assert_all_finite | |
| from ..utils import check_array | |
| from ..utils import check_consistent_length | |
| from ..utils import column_or_1d | |
| from ..utils.multiclass import unique_labels | |
| from ..utils.multiclass import type_of_target | |
| from ..utils.validation import _num_samples | |
| from ..utils.sparsefuncs import count_nonzero | |
| from ..exceptions import UndefinedMetricWarning | |
| def _check_zero_division(zero_division): | |
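"""Validate ``zero_division``: it must be "warn", 0 or 1, else raise.

A quick, hand-checked sketch (private helper, shown only for illustration):

>>> _check_zero_division("warn")
>>> _check_zero_division(0)
>>> _check_zero_division(0.5)
Traceback (most recent call last):
    ...
ValueError: Got zero_division=0.5. Must be one of ["warn", 0, 1]
"""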
| if isinstance(zero_division, str) and zero_division == "warn": | |
| return | |
| elif isinstance(zero_division, (int, float)) and zero_division in [0, 1]: | |
| return | |
| raise ValueError('Got zero_division={0}.' | |
| ' Must be one of ["warn", 0, 1]'.format(zero_division)) | |
| def _check_targets(y_true, y_pred): | |
| """Check that y_true and y_pred belong to the same classification task | |
| This converts multiclass or binary types to a common shape, and raises a | |
| ValueError for a mix of multilabel and multiclass targets, a mix of | |
| multilabel formats, for the presence of continuous-valued or multioutput | |
| targets, or for targets of different lengths. | |
| Column vectors are squeezed to 1d, while multilabel formats are returned | |
| as CSR sparse label indicators. | |
| Parameters | |
| ---------- | |
| y_true : array-like | |
| y_pred : array-like | |
| Returns | |
| ------- | |
| type_true : one of {'multilabel-indicator', 'multiclass', 'binary'} | |
| The type of the true target data, as output by | |
| ``utils.multiclass.type_of_target`` | |
| y_true : array or indicator matrix | |
| y_pred : array or indicator matrix | |
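Examples
--------
A minimal, hand-checked sketch (private helper; only the returned type
string is shown, to avoid depending on array formatting):

>>> _check_targets([0, 1, 1], [1, 1, 1])[0]
'binary'
>>> _check_targets([0, 1, 2], [1, 1, 1])[0]
'multiclass'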
| """ | |
| check_consistent_length(y_true, y_pred) | |
| type_true = type_of_target(y_true) | |
| type_pred = type_of_target(y_pred) | |
| y_type = {type_true, type_pred} | |
| if y_type == {"binary", "multiclass"}: | |
| y_type = {"multiclass"} | |
| if len(y_type) > 1: | |
| raise ValueError("Classification metrics can't handle a mix of {0} " | |
| "and {1} targets".format(type_true, type_pred)) | |
# We can't have more than one value on y_type => the set is no longer needed
| y_type = y_type.pop() | |
| # No metrics support "multiclass-multioutput" format | |
| if (y_type not in ["binary", "multiclass", "multilabel-indicator"]): | |
| raise ValueError("{0} is not supported".format(y_type)) | |
| if y_type in ["binary", "multiclass"]: | |
| y_true = column_or_1d(y_true) | |
| y_pred = column_or_1d(y_pred) | |
| if y_type == "binary": | |
| unique_values = np.union1d(y_true, y_pred) | |
| if len(unique_values) > 2: | |
| y_type = "multiclass" | |
| if y_type.startswith('multilabel'): | |
| y_true = csr_matrix(y_true) | |
| y_pred = csr_matrix(y_pred) | |
| y_type = 'multilabel-indicator' | |
| return y_type, y_true, y_pred | |
| def _weighted_sum(sample_score, sample_weight, normalize=False): | |
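"""Sum, weighted-sum or weighted-average the per-sample scores.

A rough, hand-checked sketch (private helper; ``float()`` keeps the output
independent of the NumPy scalar repr):

>>> import numpy as np
>>> float(_weighted_sum(np.array([1., 0., 1.]), None))
2.0
>>> float(_weighted_sum(np.array([1., 0., 1.]), None, normalize=True))
0.666...
"""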
| if normalize: | |
| return np.average(sample_score, weights=sample_weight) | |
| elif sample_weight is not None: | |
| return np.dot(sample_score, sample_weight) | |
| else: | |
| return sample_score.sum() | |
| def accuracy_score(y_true, y_pred, normalize=True, sample_weight=None): | |
| """Accuracy classification score. | |
| In multilabel classification, this function computes subset accuracy: | |
| the set of labels predicted for a sample must *exactly* match the | |
| corresponding set of labels in y_true. | |
| Read more in the :ref:`User Guide <accuracy_score>`. | |
| Parameters | |
| ---------- | |
| y_true : 1d array-like, or label indicator array / sparse matrix | |
| Ground truth (correct) labels. | |
| y_pred : 1d array-like, or label indicator array / sparse matrix | |
| Predicted labels, as returned by a classifier. | |
| normalize : bool, optional (default=True) | |
| If ``False``, return the number of correctly classified samples. | |
| Otherwise, return the fraction of correctly classified samples. | |
| sample_weight : array-like of shape (n_samples,), default=None | |
| Sample weights. | |
| Returns | |
| ------- | |
| score : float | |
| If ``normalize == True``, return the fraction of correctly | |
| classified samples (float), else returns the number of correctly | |
| classified samples (int). | |
| The best performance is 1 with ``normalize == True`` and the number | |
| of samples with ``normalize == False``. | |
| See also | |
| -------- | |
| jaccard_score, hamming_loss, zero_one_loss | |
| Notes | |
| ----- | |
In binary and multiclass classification, this function is equal
to the (now deprecated) ``jaccard_similarity_score`` function.
| Examples | |
| -------- | |
| >>> from sklearn.metrics import accuracy_score | |
| >>> y_pred = [0, 2, 1, 3] | |
| >>> y_true = [0, 1, 2, 3] | |
| >>> accuracy_score(y_true, y_pred) | |
| 0.5 | |
| >>> accuracy_score(y_true, y_pred, normalize=False) | |
| 2 | |
| In the multilabel case with binary label indicators: | |
| >>> import numpy as np | |
| >>> accuracy_score(np.array([[0, 1], [1, 1]]), np.ones((2, 2))) | |
| 0.5 | |
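A hedged sketch of ``sample_weight`` (reusing ``y_true`` and ``y_pred``
from above; hand-checked: correct samples carry weights 1 and 2 out of a
total weight of 6):

>>> accuracy_score(y_true, y_pred, sample_weight=[1, 1, 2, 2])
0.5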
| """ | |
| # Compute accuracy for each possible representation | |
| y_type, y_true, y_pred = _check_targets(y_true, y_pred) | |
| check_consistent_length(y_true, y_pred, sample_weight) | |
| if y_type.startswith('multilabel'): | |
| differing_labels = count_nonzero(y_true - y_pred, axis=1) | |
| score = differing_labels == 0 | |
| else: | |
| score = y_true == y_pred | |
| return _weighted_sum(score, sample_weight, normalize) | |
| def confusion_matrix(y_true, y_pred, labels=None, sample_weight=None, | |
| normalize=None): | |
| """Compute confusion matrix to evaluate the accuracy of a classification. | |
| By definition a confusion matrix :math:`C` is such that :math:`C_{i, j}` | |
| is equal to the number of observations known to be in group :math:`i` and | |
| predicted to be in group :math:`j`. | |
| Thus in binary classification, the count of true negatives is | |
| :math:`C_{0,0}`, false negatives is :math:`C_{1,0}`, true positives is | |
| :math:`C_{1,1}` and false positives is :math:`C_{0,1}`. | |
| Read more in the :ref:`User Guide <confusion_matrix>`. | |
| Parameters | |
| ---------- | |
| y_true : array-like of shape (n_samples,) | |
| Ground truth (correct) target values. | |
| y_pred : array-like of shape (n_samples,) | |
| Estimated targets as returned by a classifier. | |
| labels : array-like of shape (n_classes), default=None | |
| List of labels to index the matrix. This may be used to reorder | |
| or select a subset of labels. | |
| If ``None`` is given, those that appear at least once | |
| in ``y_true`` or ``y_pred`` are used in sorted order. | |
| sample_weight : array-like of shape (n_samples,), default=None | |
| Sample weights. | |
| normalize : {'true', 'pred', 'all'}, default=None | |
| Normalizes confusion matrix over the true (rows), predicted (columns) | |
| conditions or all the population. If None, confusion matrix will not be | |
| normalized. | |
| Returns | |
| ------- | |
| C : ndarray of shape (n_classes, n_classes) | |
| Confusion matrix. | |
| References | |
| ---------- | |
| .. [1] `Wikipedia entry for the Confusion matrix | |
| <https://en.wikipedia.org/wiki/Confusion_matrix>`_ | |
| (Wikipedia and other references may use a different | |
| convention for axes) | |
| Examples | |
| -------- | |
| >>> from sklearn.metrics import confusion_matrix | |
| >>> y_true = [2, 0, 2, 2, 0, 1] | |
| >>> y_pred = [0, 0, 2, 2, 0, 2] | |
| >>> confusion_matrix(y_true, y_pred) | |
| array([[2, 0, 0], | |
| [0, 0, 1], | |
| [1, 0, 2]]) | |
| >>> y_true = ["cat", "ant", "cat", "cat", "ant", "bird"] | |
| >>> y_pred = ["ant", "ant", "cat", "cat", "ant", "cat"] | |
| >>> confusion_matrix(y_true, y_pred, labels=["ant", "bird", "cat"]) | |
| array([[2, 0, 0], | |
| [0, 0, 1], | |
| [1, 0, 2]]) | |
| In the binary case, we can extract true positives, etc as follows: | |
| >>> tn, fp, fn, tp = confusion_matrix([0, 1, 0, 1], [1, 1, 1, 0]).ravel() | |
| >>> (tn, fp, fn, tp) | |
| (0, 2, 1, 1) | |
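With ``normalize='all'`` each count is divided by the total number of
samples, so the entries sum to one (a hand-checked sketch; ``tolist()`` is
used to avoid depending on the exact NumPy float formatting):

>>> confusion_matrix([0, 1, 0, 1], [1, 1, 1, 0], normalize='all').tolist()
[[0.0, 0.5], [0.25, 0.25]]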
| """ | |
| y_type, y_true, y_pred = _check_targets(y_true, y_pred) | |
| if y_type not in ("binary", "multiclass"): | |
| raise ValueError("%s is not supported" % y_type) | |
| if labels is None: | |
| labels = unique_labels(y_true, y_pred) | |
| else: | |
| labels = np.asarray(labels) | |
| if np.all([l not in y_true for l in labels]): | |
| raise ValueError("At least one label specified must be in y_true") | |
| if sample_weight is None: | |
| sample_weight = np.ones(y_true.shape[0], dtype=np.int64) | |
| else: | |
| sample_weight = np.asarray(sample_weight) | |
| check_consistent_length(y_true, y_pred, sample_weight) | |
| if normalize not in ['true', 'pred', 'all', None]: | |
| raise ValueError("normalize must be one of {'true', 'pred', " | |
| "'all', None}") | |
| n_labels = labels.size | |
| label_to_ind = {y: x for x, y in enumerate(labels)} | |
| # convert yt, yp into index | |
| y_pred = np.array([label_to_ind.get(x, n_labels + 1) for x in y_pred]) | |
| y_true = np.array([label_to_ind.get(x, n_labels + 1) for x in y_true]) | |
| # intersect y_pred, y_true with labels, eliminate items not in labels | |
| ind = np.logical_and(y_pred < n_labels, y_true < n_labels) | |
| y_pred = y_pred[ind] | |
| y_true = y_true[ind] | |
| # also eliminate weights of eliminated items | |
| sample_weight = sample_weight[ind] | |
| # Choose the accumulator dtype to always have high precision | |
| if sample_weight.dtype.kind in {'i', 'u', 'b'}: | |
| dtype = np.int64 | |
| else: | |
| dtype = np.float64 | |
| cm = coo_matrix((sample_weight, (y_true, y_pred)), | |
| shape=(n_labels, n_labels), dtype=dtype, | |
| ).toarray() | |
| with np.errstate(all='ignore'): | |
| if normalize == 'true': | |
| cm = cm / cm.sum(axis=1, keepdims=True) | |
| elif normalize == 'pred': | |
| cm = cm / cm.sum(axis=0, keepdims=True) | |
| elif normalize == 'all': | |
| cm = cm / cm.sum() | |
| cm = np.nan_to_num(cm) | |
| return cm | |
| def multilabel_confusion_matrix(y_true, y_pred, sample_weight=None, | |
| labels=None, samplewise=False): | |
| """Compute a confusion matrix for each class or sample | |
| .. versionadded:: 0.21 | |
| Compute class-wise (default) or sample-wise (samplewise=True) multilabel | |
| confusion matrix to evaluate the accuracy of a classification, and output | |
| confusion matrices for each class or sample. | |
| In multilabel confusion matrix :math:`MCM`, the count of true negatives | |
| is :math:`MCM_{:,0,0}`, false negatives is :math:`MCM_{:,1,0}`, | |
| true positives is :math:`MCM_{:,1,1}` and false positives is | |
| :math:`MCM_{:,0,1}`. | |
| Multiclass data will be treated as if binarized under a one-vs-rest | |
| transformation. Returned confusion matrices will be in the order of | |
| sorted unique labels in the union of (y_true, y_pred). | |
| Read more in the :ref:`User Guide <multilabel_confusion_matrix>`. | |
| Parameters | |
| ---------- | |
| y_true : 1d array-like, or label indicator array / sparse matrix | |
| of shape (n_samples, n_outputs) or (n_samples,) | |
| Ground truth (correct) target values. | |
| y_pred : 1d array-like, or label indicator array / sparse matrix | |
| of shape (n_samples, n_outputs) or (n_samples,) | |
| Estimated targets as returned by a classifier | |
| sample_weight : array-like of shape (n_samples,), default=None | |
| Sample weights | |
| labels : array-like | |
| A list of classes or column indices to select some (or to force | |
| inclusion of classes absent from the data) | |
| samplewise : bool, default=False | |
| In the multilabel case, this calculates a confusion matrix per sample | |
| Returns | |
| ------- | |
| multi_confusion : array, shape (n_outputs, 2, 2) | |
| A 2x2 confusion matrix corresponding to each output in the input. | |
| When calculating class-wise multi_confusion (default), then | |
| n_outputs = n_labels; when calculating sample-wise multi_confusion | |
| (samplewise=True), n_outputs = n_samples. If ``labels`` is defined, | |
| the results will be returned in the order specified in ``labels``, | |
| otherwise the results will be returned in sorted order by default. | |
| See also | |
| -------- | |
| confusion_matrix | |
| Notes | |
| ----- | |
| The multilabel_confusion_matrix calculates class-wise or sample-wise | |
| multilabel confusion matrices, and in multiclass tasks, labels are | |
| binarized under a one-vs-rest way; while confusion_matrix calculates | |
| one confusion matrix for confusion between every two classes. | |
| Examples | |
| -------- | |
| Multilabel-indicator case: | |
| >>> import numpy as np | |
| >>> from sklearn.metrics import multilabel_confusion_matrix | |
| >>> y_true = np.array([[1, 0, 1], | |
| ... [0, 1, 0]]) | |
| >>> y_pred = np.array([[1, 0, 0], | |
| ... [0, 1, 1]]) | |
| >>> multilabel_confusion_matrix(y_true, y_pred) | |
| array([[[1, 0], | |
| [0, 1]], | |
| <BLANKLINE> | |
| [[1, 0], | |
| [0, 1]], | |
| <BLANKLINE> | |
| [[0, 1], | |
| [1, 0]]]) | |
| Multiclass case: | |
| >>> y_true = ["cat", "ant", "cat", "cat", "ant", "bird"] | |
| >>> y_pred = ["ant", "ant", "cat", "cat", "ant", "cat"] | |
| >>> multilabel_confusion_matrix(y_true, y_pred, | |
| ... labels=["ant", "bird", "cat"]) | |
| array([[[3, 1], | |
| [0, 2]], | |
| <BLANKLINE> | |
| [[5, 0], | |
| [1, 0]], | |
| <BLANKLINE> | |
| [[2, 1], | |
| [1, 2]]]) | |
| """ | |
| y_type, y_true, y_pred = _check_targets(y_true, y_pred) | |
| if sample_weight is not None: | |
| sample_weight = column_or_1d(sample_weight) | |
| check_consistent_length(y_true, y_pred, sample_weight) | |
| if y_type not in ("binary", "multiclass", "multilabel-indicator"): | |
| raise ValueError("%s is not supported" % y_type) | |
| present_labels = unique_labels(y_true, y_pred) | |
| if labels is None: | |
| labels = present_labels | |
| n_labels = None | |
| else: | |
| n_labels = len(labels) | |
| labels = np.hstack([labels, np.setdiff1d(present_labels, labels, | |
| assume_unique=True)]) | |
| if y_true.ndim == 1: | |
| if samplewise: | |
| raise ValueError("Samplewise metrics are not available outside of " | |
| "multilabel classification.") | |
| le = LabelEncoder() | |
| le.fit(labels) | |
| y_true = le.transform(y_true) | |
| y_pred = le.transform(y_pred) | |
| sorted_labels = le.classes_ | |
| # labels are now from 0 to len(labels) - 1 -> use bincount | |
| tp = y_true == y_pred | |
| tp_bins = y_true[tp] | |
| if sample_weight is not None: | |
| tp_bins_weights = np.asarray(sample_weight)[tp] | |
| else: | |
| tp_bins_weights = None | |
| if len(tp_bins): | |
| tp_sum = np.bincount(tp_bins, weights=tp_bins_weights, | |
| minlength=len(labels)) | |
| else: | |
| # Pathological case | |
| true_sum = pred_sum = tp_sum = np.zeros(len(labels)) | |
| if len(y_pred): | |
| pred_sum = np.bincount(y_pred, weights=sample_weight, | |
| minlength=len(labels)) | |
| if len(y_true): | |
| true_sum = np.bincount(y_true, weights=sample_weight, | |
| minlength=len(labels)) | |
| # Retain only selected labels | |
| indices = np.searchsorted(sorted_labels, labels[:n_labels]) | |
| tp_sum = tp_sum[indices] | |
| true_sum = true_sum[indices] | |
| pred_sum = pred_sum[indices] | |
| else: | |
| sum_axis = 1 if samplewise else 0 | |
| # All labels are index integers for multilabel. | |
| # Select labels: | |
| if not np.array_equal(labels, present_labels): | |
| if np.max(labels) > np.max(present_labels): | |
| raise ValueError('All labels must be in [0, n labels) for ' | |
| 'multilabel targets. ' | |
| 'Got %d > %d' % | |
| (np.max(labels), np.max(present_labels))) | |
| if np.min(labels) < 0: | |
| raise ValueError('All labels must be in [0, n labels) for ' | |
| 'multilabel targets. ' | |
| 'Got %d < 0' % np.min(labels)) | |
| if n_labels is not None: | |
| y_true = y_true[:, labels[:n_labels]] | |
| y_pred = y_pred[:, labels[:n_labels]] | |
| # calculate weighted counts | |
| true_and_pred = y_true.multiply(y_pred) | |
| tp_sum = count_nonzero(true_and_pred, axis=sum_axis, | |
| sample_weight=sample_weight) | |
| pred_sum = count_nonzero(y_pred, axis=sum_axis, | |
| sample_weight=sample_weight) | |
| true_sum = count_nonzero(y_true, axis=sum_axis, | |
| sample_weight=sample_weight) | |
| fp = pred_sum - tp_sum | |
| fn = true_sum - tp_sum | |
| tp = tp_sum | |
| if sample_weight is not None and samplewise: | |
| sample_weight = np.array(sample_weight) | |
| tp = np.array(tp) | |
| fp = np.array(fp) | |
| fn = np.array(fn) | |
| tn = sample_weight * y_true.shape[1] - tp - fp - fn | |
| elif sample_weight is not None: | |
| tn = sum(sample_weight) - tp - fp - fn | |
| elif samplewise: | |
| tn = y_true.shape[1] - tp - fp - fn | |
| else: | |
| tn = y_true.shape[0] - tp - fp - fn | |
| return np.array([tn, fp, fn, tp]).T.reshape(-1, 2, 2) | |
| def cohen_kappa_score(y1, y2, labels=None, weights=None, sample_weight=None): | |
| r"""Cohen's kappa: a statistic that measures inter-annotator agreement. | |
| This function computes Cohen's kappa [1]_, a score that expresses the level | |
| of agreement between two annotators on a classification problem. It is | |
| defined as | |
| .. math:: | |
| \kappa = (p_o - p_e) / (1 - p_e) | |
| where :math:`p_o` is the empirical probability of agreement on the label | |
| assigned to any sample (the observed agreement ratio), and :math:`p_e` is | |
| the expected agreement when both annotators assign labels randomly. | |
| :math:`p_e` is estimated using a per-annotator empirical prior over the | |
| class labels [2]_. | |
| Read more in the :ref:`User Guide <cohen_kappa>`. | |
| Parameters | |
| ---------- | |
| y1 : array, shape = [n_samples] | |
| Labels assigned by the first annotator. | |
| y2 : array, shape = [n_samples] | |
| Labels assigned by the second annotator. The kappa statistic is | |
| symmetric, so swapping ``y1`` and ``y2`` doesn't change the value. | |
| labels : array, shape = [n_classes], optional | |
| List of labels to index the matrix. This may be used to select a | |
| subset of labels. If None, all labels that appear at least once in | |
| ``y1`` or ``y2`` are used. | |
| weights : str, optional | |
Weighting type to calculate the score. None means not weighted;
| "linear" means linear weighted; "quadratic" means quadratic weighted. | |
| sample_weight : array-like of shape (n_samples,), default=None | |
| Sample weights. | |
| Returns | |
| ------- | |
| kappa : float | |
| The kappa statistic, which is a number between -1 and 1. The maximum | |
| value means complete agreement; zero or lower means chance agreement. | |
| References | |
| ---------- | |
| .. [1] J. Cohen (1960). "A coefficient of agreement for nominal scales". | |
| Educational and Psychological Measurement 20(1):37-46. | |
| doi:10.1177/001316446002000104. | |
| .. [2] `R. Artstein and M. Poesio (2008). "Inter-coder agreement for | |
| computational linguistics". Computational Linguistics 34(4):555-596. | |
| <https://www.mitpressjournals.org/doi/pdf/10.1162/coli.07-034-R2>`_ | |
| .. [3] `Wikipedia entry for the Cohen's kappa. | |
| <https://en.wikipedia.org/wiki/Cohen%27s_kappa>`_ | |
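Examples
--------
A small, hand-checked sketch: the two annotators agree on 3 of 4 samples
(observed agreement 0.75) while chance agreement is 0.5, so
kappa = (0.75 - 0.5) / (1 - 0.5):

>>> from sklearn.metrics import cohen_kappa_score
>>> cohen_kappa_score([0, 0, 1, 1], [0, 1, 1, 1])
0.5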
| """ | |
| confusion = confusion_matrix(y1, y2, labels=labels, | |
| sample_weight=sample_weight) | |
| n_classes = confusion.shape[0] | |
| sum0 = np.sum(confusion, axis=0) | |
| sum1 = np.sum(confusion, axis=1) | |
| expected = np.outer(sum0, sum1) / np.sum(sum0) | |
| if weights is None: | |
w_mat = np.ones([n_classes, n_classes], dtype=int)
| w_mat.flat[:: n_classes + 1] = 0 | |
| elif weights == "linear" or weights == "quadratic": | |
w_mat = np.zeros([n_classes, n_classes], dtype=int)
| w_mat += np.arange(n_classes) | |
| if weights == "linear": | |
| w_mat = np.abs(w_mat - w_mat.T) | |
| else: | |
| w_mat = (w_mat - w_mat.T) ** 2 | |
| else: | |
| raise ValueError("Unknown kappa weighting type.") | |
| k = np.sum(w_mat * confusion) / np.sum(w_mat * expected) | |
| return 1 - k | |
| def jaccard_similarity_score(y_true, y_pred, normalize=True, | |
| sample_weight=None): | |
| """Jaccard similarity coefficient score | |
| .. deprecated:: 0.21 | |
| This is deprecated to be removed in 0.23, since its handling of | |
| binary and multiclass inputs was broken. `jaccard_score` has an API | |
| that is consistent with precision_score, f_score, etc. | |
| Read more in the :ref:`User Guide <jaccard_similarity_score>`. | |
| Parameters | |
| ---------- | |
| y_true : 1d array-like, or label indicator array / sparse matrix | |
| Ground truth (correct) labels. | |
| y_pred : 1d array-like, or label indicator array / sparse matrix | |
| Predicted labels, as returned by a classifier. | |
| normalize : bool, optional (default=True) | |
| If ``False``, return the sum of the Jaccard similarity coefficient | |
| over the sample set. Otherwise, return the average of Jaccard | |
| similarity coefficient. | |
| sample_weight : array-like of shape (n_samples,), default=None | |
| Sample weights. | |
| Returns | |
| ------- | |
| score : float | |
| If ``normalize == True``, return the average Jaccard similarity | |
| coefficient, else it returns the sum of the Jaccard similarity | |
| coefficient over the sample set. | |
| The best performance is 1 with ``normalize == True`` and the number | |
| of samples with ``normalize == False``. | |
| See also | |
| -------- | |
| accuracy_score, hamming_loss, zero_one_loss | |
| Notes | |
| ----- | |
| In binary and multiclass classification, this function is equivalent | |
| to the ``accuracy_score``. It differs in the multilabel classification | |
| problem. | |
| References | |
| ---------- | |
| .. [1] `Wikipedia entry for the Jaccard index | |
| <https://en.wikipedia.org/wiki/Jaccard_index>`_ | |
| """ | |
| warnings.warn('jaccard_similarity_score has been deprecated and replaced ' | |
| 'with jaccard_score. It will be removed in version 0.23. ' | |
| 'This implementation has surprising behavior for binary ' | |
| 'and multiclass classification tasks.', | |
| FutureWarning) | |
| # Compute accuracy for each possible representation | |
| y_type, y_true, y_pred = _check_targets(y_true, y_pred) | |
| check_consistent_length(y_true, y_pred, sample_weight) | |
| if y_type.startswith('multilabel'): | |
| with np.errstate(divide='ignore', invalid='ignore'): | |
| # oddly, we may get an "invalid" rather than a "divide" error here | |
| pred_or_true = count_nonzero(y_true + y_pred, axis=1) | |
| pred_and_true = count_nonzero(y_true.multiply(y_pred), axis=1) | |
| score = pred_and_true / pred_or_true | |
| score[pred_or_true == 0.0] = 1.0 | |
| else: | |
| score = y_true == y_pred | |
| return _weighted_sum(score, sample_weight, normalize) | |
| def jaccard_score(y_true, y_pred, labels=None, pos_label=1, | |
| average='binary', sample_weight=None): | |
| """Jaccard similarity coefficient score | |
| The Jaccard index [1], or Jaccard similarity coefficient, defined as | |
| the size of the intersection divided by the size of the union of two label | |
sets, is used to compare the set of predicted labels for a sample to the
| corresponding set of labels in ``y_true``. | |
| Read more in the :ref:`User Guide <jaccard_similarity_score>`. | |
| Parameters | |
| ---------- | |
| y_true : 1d array-like, or label indicator array / sparse matrix | |
| Ground truth (correct) labels. | |
| y_pred : 1d array-like, or label indicator array / sparse matrix | |
| Predicted labels, as returned by a classifier. | |
| labels : list, optional | |
| The set of labels to include when ``average != 'binary'``, and their | |
| order if ``average is None``. Labels present in the data can be | |
| excluded, for example to calculate a multiclass average ignoring a | |
| majority negative class, while labels not present in the data will | |
| result in 0 components in a macro average. For multilabel targets, | |
| labels are column indices. By default, all labels in ``y_true`` and | |
| ``y_pred`` are used in sorted order. | |
| pos_label : str or int, 1 by default | |
| The class to report if ``average='binary'`` and the data is binary. | |
| If the data are multiclass or multilabel, this will be ignored; | |
| setting ``labels=[pos_label]`` and ``average != 'binary'`` will report | |
| scores for that label only. | |
| average : string, [None, 'binary' (default), 'micro', 'macro', 'samples', \ | |
| 'weighted'] | |
| If ``None``, the scores for each class are returned. Otherwise, this | |
| determines the type of averaging performed on the data: | |
| ``'binary'``: | |
| Only report results for the class specified by ``pos_label``. | |
| This is applicable only if targets (``y_{true,pred}``) are binary. | |
| ``'micro'``: | |
| Calculate metrics globally by counting the total true positives, | |
| false negatives and false positives. | |
| ``'macro'``: | |
| Calculate metrics for each label, and find their unweighted | |
| mean. This does not take label imbalance into account. | |
| ``'weighted'``: | |
| Calculate metrics for each label, and find their average, weighted | |
| by support (the number of true instances for each label). This | |
| alters 'macro' to account for label imbalance. | |
| ``'samples'``: | |
| Calculate metrics for each instance, and find their average (only | |
| meaningful for multilabel classification). | |
| sample_weight : array-like of shape (n_samples,), default=None | |
| Sample weights. | |
| Returns | |
| ------- | |
| score : float (if average is not None) or array of floats, shape =\ | |
| [n_unique_labels] | |
| See also | |
| -------- | |
| accuracy_score, f_score, multilabel_confusion_matrix | |
| Notes | |
| ----- | |
| :func:`jaccard_score` may be a poor metric if there are no | |
| positives for some samples or classes. Jaccard is undefined if there are | |
| no true or predicted labels, and our implementation will return a score | |
| of 0 with a warning. | |
| References | |
| ---------- | |
| .. [1] `Wikipedia entry for the Jaccard index | |
| <https://en.wikipedia.org/wiki/Jaccard_index>`_ | |
| Examples | |
| -------- | |
| >>> import numpy as np | |
| >>> from sklearn.metrics import jaccard_score | |
| >>> y_true = np.array([[0, 1, 1], | |
| ... [1, 1, 0]]) | |
| >>> y_pred = np.array([[1, 1, 1], | |
| ... [1, 0, 0]]) | |
| In the binary case: | |
| >>> jaccard_score(y_true[0], y_pred[0]) | |
| 0.6666... | |
| In the multilabel case: | |
| >>> jaccard_score(y_true, y_pred, average='samples') | |
| 0.5833... | |
| >>> jaccard_score(y_true, y_pred, average='macro') | |
| 0.6666... | |
| >>> jaccard_score(y_true, y_pred, average=None) | |
| array([0.5, 0.5, 1. ]) | |
| In the multiclass case: | |
| >>> y_pred = [0, 2, 1, 2] | |
| >>> y_true = [0, 1, 2, 2] | |
| >>> jaccard_score(y_true, y_pred, average=None) | |
| array([1. , 0. , 0.33...]) | |
| """ | |
| labels = _check_set_wise_labels(y_true, y_pred, average, labels, | |
| pos_label) | |
| samplewise = average == 'samples' | |
| MCM = multilabel_confusion_matrix(y_true, y_pred, | |
| sample_weight=sample_weight, | |
| labels=labels, samplewise=samplewise) | |
| numerator = MCM[:, 1, 1] | |
| denominator = MCM[:, 1, 1] + MCM[:, 0, 1] + MCM[:, 1, 0] | |
| if average == 'micro': | |
| numerator = np.array([numerator.sum()]) | |
| denominator = np.array([denominator.sum()]) | |
| jaccard = _prf_divide(numerator, denominator, 'jaccard', | |
| 'true or predicted', average, ('jaccard',)) | |
| if average is None: | |
| return jaccard | |
| if average == 'weighted': | |
| weights = MCM[:, 1, 0] + MCM[:, 1, 1] | |
| if not np.any(weights): | |
| # numerator is 0, and warning should have already been issued | |
| weights = None | |
| elif average == 'samples' and sample_weight is not None: | |
| weights = sample_weight | |
| else: | |
| weights = None | |
| return np.average(jaccard, weights=weights) | |
| def matthews_corrcoef(y_true, y_pred, sample_weight=None): | |
| """Compute the Matthews correlation coefficient (MCC) | |
| The Matthews correlation coefficient is used in machine learning as a | |
| measure of the quality of binary and multiclass classifications. It takes | |
| into account true and false positives and negatives and is generally | |
| regarded as a balanced measure which can be used even if the classes are of | |
| very different sizes. The MCC is in essence a correlation coefficient value | |
| between -1 and +1. A coefficient of +1 represents a perfect prediction, 0 | |
| an average random prediction and -1 an inverse prediction. The statistic | |
| is also known as the phi coefficient. [source: Wikipedia] | |
| Binary and multiclass labels are supported. Only in the binary case does | |
| this relate to information about true and false positives and negatives. | |
| See references below. | |
| Read more in the :ref:`User Guide <matthews_corrcoef>`. | |
| Parameters | |
| ---------- | |
| y_true : array, shape = [n_samples] | |
| Ground truth (correct) target values. | |
| y_pred : array, shape = [n_samples] | |
| Estimated targets as returned by a classifier. | |
| sample_weight : array-like of shape (n_samples,), default=None | |
| Sample weights. | |
| Returns | |
| ------- | |
| mcc : float | |
| The Matthews correlation coefficient (+1 represents a perfect | |
prediction, 0 an average random prediction and -1 an inverse
| prediction). | |
| References | |
| ---------- | |
| .. [1] `Baldi, Brunak, Chauvin, Andersen and Nielsen, (2000). Assessing the | |
| accuracy of prediction algorithms for classification: an overview | |
| <https://doi.org/10.1093/bioinformatics/16.5.412>`_ | |
| .. [2] `Wikipedia entry for the Matthews Correlation Coefficient | |
| <https://en.wikipedia.org/wiki/Matthews_correlation_coefficient>`_ | |
| .. [3] `Gorodkin, (2004). Comparing two K-category assignments by a | |
| K-category correlation coefficient | |
| <https://www.sciencedirect.com/science/article/pii/S1476927104000799>`_ | |
| .. [4] `Jurman, Riccadonna, Furlanello, (2012). A Comparison of MCC and CEN | |
| Error Measures in MultiClass Prediction | |
| <https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0041882>`_ | |
| Examples | |
| -------- | |
| >>> from sklearn.metrics import matthews_corrcoef | |
| >>> y_true = [+1, +1, +1, -1] | |
| >>> y_pred = [+1, -1, +1, +1] | |
| >>> matthews_corrcoef(y_true, y_pred) | |
| -0.33... | |
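A perfectly matching prediction (hand-checked) reaches the upper bound:

>>> matthews_corrcoef([1, 1, 0, 0], [1, 1, 0, 0])
1.0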
| """ | |
| y_type, y_true, y_pred = _check_targets(y_true, y_pred) | |
| check_consistent_length(y_true, y_pred, sample_weight) | |
| if y_type not in {"binary", "multiclass"}: | |
| raise ValueError("%s is not supported" % y_type) | |
| lb = LabelEncoder() | |
| lb.fit(np.hstack([y_true, y_pred])) | |
| y_true = lb.transform(y_true) | |
| y_pred = lb.transform(y_pred) | |
| C = confusion_matrix(y_true, y_pred, sample_weight=sample_weight) | |
| t_sum = C.sum(axis=1, dtype=np.float64) | |
| p_sum = C.sum(axis=0, dtype=np.float64) | |
| n_correct = np.trace(C, dtype=np.float64) | |
| n_samples = p_sum.sum() | |
| cov_ytyp = n_correct * n_samples - np.dot(t_sum, p_sum) | |
| cov_ypyp = n_samples ** 2 - np.dot(p_sum, p_sum) | |
| cov_ytyt = n_samples ** 2 - np.dot(t_sum, t_sum) | |
| mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp) | |
| if np.isnan(mcc): | |
| return 0. | |
| else: | |
| return mcc | |
| def zero_one_loss(y_true, y_pred, normalize=True, sample_weight=None): | |
| """Zero-one classification loss. | |
| If normalize is ``True``, return the fraction of misclassifications | |
| (float), else it returns the number of misclassifications (int). The best | |
| performance is 0. | |
| Read more in the :ref:`User Guide <zero_one_loss>`. | |
| Parameters | |
| ---------- | |
| y_true : 1d array-like, or label indicator array / sparse matrix | |
| Ground truth (correct) labels. | |
| y_pred : 1d array-like, or label indicator array / sparse matrix | |
| Predicted labels, as returned by a classifier. | |
| normalize : bool, optional (default=True) | |
| If ``False``, return the number of misclassifications. | |
| Otherwise, return the fraction of misclassifications. | |
| sample_weight : array-like of shape (n_samples,), default=None | |
| Sample weights. | |
| Returns | |
| ------- | |
| loss : float or int, | |
| If ``normalize == True``, return the fraction of misclassifications | |
| (float), else it returns the number of misclassifications (int). | |
| Notes | |
| ----- | |
| In multilabel classification, the zero_one_loss function corresponds to | |
| the subset zero-one loss: for each sample, the entire set of labels must be | |
| correctly predicted, otherwise the loss for that sample is equal to one. | |
| See also | |
| -------- | |
| accuracy_score, hamming_loss, jaccard_score | |
| Examples | |
| -------- | |
| >>> from sklearn.metrics import zero_one_loss | |
| >>> y_pred = [1, 2, 3, 4] | |
| >>> y_true = [2, 2, 3, 4] | |
| >>> zero_one_loss(y_true, y_pred) | |
| 0.25 | |
| >>> zero_one_loss(y_true, y_pred, normalize=False) | |
| 1 | |
| In the multilabel case with binary label indicators: | |
| >>> import numpy as np | |
| >>> zero_one_loss(np.array([[0, 1], [1, 1]]), np.ones((2, 2))) | |
| 0.5 | |
| """ | |
| score = accuracy_score(y_true, y_pred, | |
| normalize=normalize, | |
| sample_weight=sample_weight) | |
| if normalize: | |
| return 1 - score | |
| else: | |
| if sample_weight is not None: | |
| n_samples = np.sum(sample_weight) | |
| else: | |
| n_samples = _num_samples(y_true) | |
| return n_samples - score | |
| def f1_score(y_true, y_pred, labels=None, pos_label=1, average='binary', | |
| sample_weight=None, zero_division="warn"): | |
| """Compute the F1 score, also known as balanced F-score or F-measure | |
| The F1 score can be interpreted as a weighted average of the precision and | |
| recall, where an F1 score reaches its best value at 1 and worst score at 0. | |
| The relative contribution of precision and recall to the F1 score are | |
| equal. The formula for the F1 score is:: | |
| F1 = 2 * (precision * recall) / (precision + recall) | |
| In the multi-class and multi-label case, this is the average of | |
| the F1 score of each class with weighting depending on the ``average`` | |
| parameter. | |
| Read more in the :ref:`User Guide <precision_recall_f_measure_metrics>`. | |
| Parameters | |
| ---------- | |
| y_true : 1d array-like, or label indicator array / sparse matrix | |
| Ground truth (correct) target values. | |
| y_pred : 1d array-like, or label indicator array / sparse matrix | |
| Estimated targets as returned by a classifier. | |
| labels : list, optional | |
| The set of labels to include when ``average != 'binary'``, and their | |
| order if ``average is None``. Labels present in the data can be | |
| excluded, for example to calculate a multiclass average ignoring a | |
| majority negative class, while labels not present in the data will | |
| result in 0 components in a macro average. For multilabel targets, | |
| labels are column indices. By default, all labels in ``y_true`` and | |
| ``y_pred`` are used in sorted order. | |
| .. versionchanged:: 0.17 | |
| parameter *labels* improved for multiclass problem. | |
| pos_label : str or int, 1 by default | |
| The class to report if ``average='binary'`` and the data is binary. | |
| If the data are multiclass or multilabel, this will be ignored; | |
| setting ``labels=[pos_label]`` and ``average != 'binary'`` will report | |
| scores for that label only. | |
| average : string, [None, 'binary' (default), 'micro', 'macro', 'samples', \ | |
| 'weighted'] | |
| This parameter is required for multiclass/multilabel targets. | |
| If ``None``, the scores for each class are returned. Otherwise, this | |
| determines the type of averaging performed on the data: | |
| ``'binary'``: | |
| Only report results for the class specified by ``pos_label``. | |
| This is applicable only if targets (``y_{true,pred}``) are binary. | |
| ``'micro'``: | |
| Calculate metrics globally by counting the total true positives, | |
| false negatives and false positives. | |
| ``'macro'``: | |
| Calculate metrics for each label, and find their unweighted | |
| mean. This does not take label imbalance into account. | |
| ``'weighted'``: | |
| Calculate metrics for each label, and find their average weighted | |
| by support (the number of true instances for each label). This | |
| alters 'macro' to account for label imbalance; it can result in an | |
| F-score that is not between precision and recall. | |
| ``'samples'``: | |
| Calculate metrics for each instance, and find their average (only | |
| meaningful for multilabel classification where this differs from | |
| :func:`accuracy_score`). | |
| sample_weight : array-like of shape (n_samples,), default=None | |
| Sample weights. | |
| zero_division : "warn", 0 or 1, default="warn" | |
| Sets the value to return when there is a zero division, i.e. when all | |
| predictions and labels are negative. If set to "warn", this acts as 0, | |
| but warnings are also raised. | |
| Returns | |
| ------- | |
| f1_score : float or array of float, shape = [n_unique_labels] | |
| F1 score of the positive class in binary classification or weighted | |
| average of the F1 scores of each class for the multiclass task. | |
| See also | |
| -------- | |
| fbeta_score, precision_recall_fscore_support, jaccard_score, | |
| multilabel_confusion_matrix | |
| References | |
| ---------- | |
| .. [1] `Wikipedia entry for the F1-score | |
| <https://en.wikipedia.org/wiki/F1_score>`_ | |
| Examples | |
| -------- | |
| >>> from sklearn.metrics import f1_score | |
| >>> y_true = [0, 1, 2, 0, 1, 2] | |
| >>> y_pred = [0, 2, 1, 0, 0, 1] | |
| >>> f1_score(y_true, y_pred, average='macro') | |
| 0.26... | |
| >>> f1_score(y_true, y_pred, average='micro') | |
| 0.33... | |
| >>> f1_score(y_true, y_pred, average='weighted') | |
| 0.26... | |
| >>> f1_score(y_true, y_pred, average=None) | |
| array([0.8, 0. , 0. ]) | |
| >>> y_true = [0, 0, 0, 0, 0, 0] | |
| >>> y_pred = [0, 0, 0, 0, 0, 0] | |
| >>> f1_score(y_true, y_pred, zero_division=1) | |
| 1.0... | |
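In the binary case (default ``average='binary'``), a quick hand-checked
sketch: precision is 2/3 and recall is 1, so F1 = 0.8:

>>> f1_score([0, 1, 1, 0], [1, 1, 1, 0])
0.8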
| Notes | |
| ----- | |
| When ``true positive + false positive == 0``, precision is undefined; | |
| When ``true positive + false negative == 0``, recall is undefined. | |
| In such cases, by default the metric will be set to 0, as will f-score, | |
| and ``UndefinedMetricWarning`` will be raised. This behavior can be | |
| modified with ``zero_division``. | |
| """ | |
| return fbeta_score(y_true, y_pred, 1, labels=labels, | |
| pos_label=pos_label, average=average, | |
| sample_weight=sample_weight, | |
| zero_division=zero_division) | |
| def fbeta_score(y_true, y_pred, beta, labels=None, pos_label=1, | |
| average='binary', sample_weight=None, zero_division="warn"): | |
| """Compute the F-beta score | |
| The F-beta score is the weighted harmonic mean of precision and recall, | |
| reaching its optimal value at 1 and its worst value at 0. | |
| The `beta` parameter determines the weight of recall in the combined | |
| score. ``beta < 1`` lends more weight to precision, while ``beta > 1`` | |
| favors recall (``beta -> 0`` considers only precision, ``beta -> +inf`` | |
| only recall). | |
| Read more in the :ref:`User Guide <precision_recall_f_measure_metrics>`. | |
| Parameters | |
| ---------- | |
| y_true : 1d array-like, or label indicator array / sparse matrix | |
| Ground truth (correct) target values. | |
| y_pred : 1d array-like, or label indicator array / sparse matrix | |
| Estimated targets as returned by a classifier. | |
| beta : float | |
| Determines the weight of recall in the combined score. | |
| labels : list, optional | |
| The set of labels to include when ``average != 'binary'``, and their | |
| order if ``average is None``. Labels present in the data can be | |
| excluded, for example to calculate a multiclass average ignoring a | |
| majority negative class, while labels not present in the data will | |
| result in 0 components in a macro average. For multilabel targets, | |
| labels are column indices. By default, all labels in ``y_true`` and | |
| ``y_pred`` are used in sorted order. | |
| .. versionchanged:: 0.17 | |
| parameter *labels* improved for multiclass problem. | |
| pos_label : str or int, 1 by default | |
| The class to report if ``average='binary'`` and the data is binary. | |
| If the data are multiclass or multilabel, this will be ignored; | |
| setting ``labels=[pos_label]`` and ``average != 'binary'`` will report | |
| scores for that label only. | |
| average : string, [None, 'binary' (default), 'micro', 'macro', 'samples', \ | |
| 'weighted'] | |
| This parameter is required for multiclass/multilabel targets. | |
| If ``None``, the scores for each class are returned. Otherwise, this | |
| determines the type of averaging performed on the data: | |
| ``'binary'``: | |
| Only report results for the class specified by ``pos_label``. | |
| This is applicable only if targets (``y_{true,pred}``) are binary. | |
| ``'micro'``: | |
| Calculate metrics globally by counting the total true positives, | |
| false negatives and false positives. | |
| ``'macro'``: | |
| Calculate metrics for each label, and find their unweighted | |
| mean. This does not take label imbalance into account. | |
| ``'weighted'``: | |
| Calculate metrics for each label, and find their average weighted | |
| by support (the number of true instances for each label). This | |
| alters 'macro' to account for label imbalance; it can result in an | |
| F-score that is not between precision and recall. | |
| ``'samples'``: | |
| Calculate metrics for each instance, and find their average (only | |
| meaningful for multilabel classification where this differs from | |
| :func:`accuracy_score`). | |
| sample_weight : array-like of shape (n_samples,), default=None | |
| Sample weights. | |
| zero_division : "warn", 0 or 1, default="warn" | |
| Sets the value to return when there is a zero division, i.e. when all | |
| predictions and labels are negative. If set to "warn", this acts as 0, | |
| but warnings are also raised. | |
| Returns | |
| ------- | |
| fbeta_score : float (if average is not None) or array of float, shape =\ | |
| [n_unique_labels] | |
| F-beta score of the positive class in binary classification or weighted | |
| average of the F-beta score of each class for the multiclass task. | |
| See also | |
| -------- | |
| precision_recall_fscore_support, multilabel_confusion_matrix | |
| References | |
| ---------- | |
| .. [1] R. Baeza-Yates and B. Ribeiro-Neto (2011). | |
| Modern Information Retrieval. Addison Wesley, pp. 327-328. | |
| .. [2] `Wikipedia entry for the F1-score | |
| <https://en.wikipedia.org/wiki/F1_score>`_ | |
| Examples | |
| -------- | |
| >>> from sklearn.metrics import fbeta_score | |
| >>> y_true = [0, 1, 2, 0, 1, 2] | |
| >>> y_pred = [0, 2, 1, 0, 0, 1] | |
| >>> fbeta_score(y_true, y_pred, average='macro', beta=0.5) | |
| 0.23... | |
| >>> fbeta_score(y_true, y_pred, average='micro', beta=0.5) | |
| 0.33... | |
| >>> fbeta_score(y_true, y_pred, average='weighted', beta=0.5) | |
| 0.23... | |
| >>> fbeta_score(y_true, y_pred, average=None, beta=0.5) | |
| array([0.71..., 0. , 0. ]) | |
| Notes | |
| ----- | |
| When ``true positive + false positive == 0`` or | |
| ``true positive + false negative == 0``, f-score returns 0 and raises | |
| ``UndefinedMetricWarning``. This behavior can be | |
| modified with ``zero_division``. | |
| """ | |
| _, _, f, _ = precision_recall_fscore_support(y_true, y_pred, | |
| beta=beta, | |
| labels=labels, | |
| pos_label=pos_label, | |
| average=average, | |
| warn_for=('f-score',), | |
| sample_weight=sample_weight, | |
| zero_division=zero_division) | |
| return f | |
| def _prf_divide(numerator, denominator, metric, | |
| modifier, average, warn_for, zero_division="warn"): | |
| """Performs division and handles divide-by-zero. | |
| On zero-division, sets the corresponding result elements equal to | |
| 0 or 1 (according to ``zero_division``). Plus, if | |
| ``zero_division != "warn"`` raises a warning. | |
| The metric, modifier and average arguments are used only for determining | |
| an appropriate warning. | |
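A rough, hand-checked sketch (private helper; with ``zero_division=1`` the
masked position is filled with 1 and no warning is issued):

>>> import numpy as np
>>> _prf_divide(np.array([0.0]), np.array([0.0]), 'precision',
...             'predicted', None, (), zero_division=1).tolist()
[1.0]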
| """ | |
| mask = denominator == 0.0 | |
| denominator = denominator.copy() | |
| denominator[mask] = 1 # avoid infs/nans | |
| result = numerator / denominator | |
| if not np.any(mask): | |
| return result | |
| # if ``zero_division=1``, set those with denominator == 0 equal to 1 | |
| result[mask] = 0.0 if zero_division in ["warn", 0] else 1.0 | |
# Setting ``zero_division`` to anything other than its default ("warn")
# suppresses this warning. If we are computing only the f-score, the
# warning will be raised only if precision and recall are ill-defined.
| if zero_division != "warn" or metric not in warn_for: | |
| return result | |
| # build appropriate warning | |
| # E.g. "Precision and F-score are ill-defined and being set to 0.0 in | |
| # labels with no predicted samples. Use ``zero_division`` parameter to | |
| # control this behavior." | |
| if metric in warn_for and 'f-score' in warn_for: | |
| msg_start = '{0} and F-score are'.format(metric.title()) | |
| elif metric in warn_for: | |
| msg_start = '{0} is'.format(metric.title()) | |
| elif 'f-score' in warn_for: | |
| msg_start = 'F-score is' | |
| else: | |
| return result | |
| _warn_prf(average, modifier, msg_start, len(result)) | |
| return result | |
| def _warn_prf(average, modifier, msg_start, result_size): | |
| axis0, axis1 = 'sample', 'label' | |
| if average == 'samples': | |
| axis0, axis1 = axis1, axis0 | |
| msg = ('{0} ill-defined and being set to 0.0 {{0}} ' | |
| 'no {1} {2}s. Use `zero_division` parameter to control' | |
| ' this behavior.'.format(msg_start, modifier, axis0)) | |
| if result_size == 1: | |
| msg = msg.format('due to') | |
| else: | |
| msg = msg.format('in {0}s with'.format(axis1)) | |
| warnings.warn(msg, UndefinedMetricWarning, stacklevel=2) | |
| def _check_set_wise_labels(y_true, y_pred, average, labels, pos_label): | |
| """Validation associated with set-wise metrics | |
| Returns identified labels | |
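A minimal, hand-checked sketch (private helper): for binary targets with
``average='binary'`` only ``pos_label`` is retained:

>>> _check_set_wise_labels([0, 1, 1], [0, 1, 0], 'binary', None, 1)
[1]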
| """ | |
| average_options = (None, 'micro', 'macro', 'weighted', 'samples') | |
| if average not in average_options and average != 'binary': | |
| raise ValueError('average has to be one of ' + | |
| str(average_options)) | |
| y_type, y_true, y_pred = _check_targets(y_true, y_pred) | |
| present_labels = unique_labels(y_true, y_pred) | |
| if average == 'binary': | |
| if y_type == 'binary': | |
| if pos_label not in present_labels: | |
| if len(present_labels) >= 2: | |
| raise ValueError("pos_label=%r is not a valid label: " | |
| "%r" % (pos_label, present_labels)) | |
| labels = [pos_label] | |
| else: | |
| average_options = list(average_options) | |
| if y_type == 'multiclass': | |
| average_options.remove('samples') | |
| raise ValueError("Target is %s but average='binary'. Please " | |
| "choose another average setting, one of %r." | |
| % (y_type, average_options)) | |
| elif pos_label not in (None, 1): | |
| warnings.warn("Note that pos_label (set to %r) is ignored when " | |
| "average != 'binary' (got %r). You may use " | |
| "labels=[pos_label] to specify a single positive class." | |
| % (pos_label, average), UserWarning) | |
| return labels | |
| def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, | |
| pos_label=1, average=None, | |
| warn_for=('precision', 'recall', | |
| 'f-score'), | |
| sample_weight=None, | |
| zero_division="warn"): | |
| """Compute precision, recall, F-measure and support for each class | |
| The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of | |
| true positives and ``fp`` the number of false positives. The precision is | |
| intuitively the ability of the classifier not to label as positive a sample | |
| that is negative. | |
| The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of | |
| true positives and ``fn`` the number of false negatives. The recall is | |
| intuitively the ability of the classifier to find all the positive samples. | |
| The F-beta score can be interpreted as a weighted harmonic mean of | |
| the precision and recall, where an F-beta score reaches its best | |
| value at 1 and worst score at 0. | |
| The F-beta score weights recall more than precision by a factor of | |
| ``beta``. ``beta == 1.0`` means recall and precision are equally important. | |
| The support is the number of occurrences of each class in ``y_true``. | |
| If ``pos_label is None`` and in binary classification, this function | |
| returns the average precision, recall and F-measure if ``average`` | |
| is one of ``'micro'``, ``'macro'``, ``'weighted'`` or ``'samples'``. | |
| Read more in the :ref:`User Guide <precision_recall_f_measure_metrics>`. | |
| Parameters | |
| ---------- | |
| y_true : 1d array-like, or label indicator array / sparse matrix | |
| Ground truth (correct) target values. | |
| y_pred : 1d array-like, or label indicator array / sparse matrix | |
| Estimated targets as returned by a classifier. | |
| beta : float, 1.0 by default | |
| The strength of recall versus precision in the F-score. | |
| labels : list, optional | |
| The set of labels to include when ``average != 'binary'``, and their | |
| order if ``average is None``. Labels present in the data can be | |
| excluded, for example to calculate a multiclass average ignoring a | |
| majority negative class, while labels not present in the data will | |
| result in 0 components in a macro average. For multilabel targets, | |
| labels are column indices. By default, all labels in ``y_true`` and | |
| ``y_pred`` are used in sorted order. | |
| pos_label : str or int, 1 by default | |
| The class to report if ``average='binary'`` and the data is binary. | |
| If the data are multiclass or multilabel, this will be ignored; | |
| setting ``labels=[pos_label]`` and ``average != 'binary'`` will report | |
| scores for that label only. | |
| average : string, [None (default), 'binary', 'micro', 'macro', 'samples', \ | |
| 'weighted'] | |
| If ``None``, the scores for each class are returned. Otherwise, this | |
| determines the type of averaging performed on the data: | |
| ``'binary'``: | |
| Only report results for the class specified by ``pos_label``. | |
| This is applicable only if targets (``y_{true,pred}``) are binary. | |
| ``'micro'``: | |
| Calculate metrics globally by counting the total true positives, | |
| false negatives and false positives. | |
| ``'macro'``: | |
| Calculate metrics for each label, and find their unweighted | |
| mean. This does not take label imbalance into account. | |
| ``'weighted'``: | |
| Calculate metrics for each label, and find their average weighted | |
| by support (the number of true instances for each label). This | |
| alters 'macro' to account for label imbalance; it can result in an | |
| F-score that is not between precision and recall. | |
| ``'samples'``: | |
| Calculate metrics for each instance, and find their average (only | |
| meaningful for multilabel classification where this differs from | |
| :func:`accuracy_score`). | |
| warn_for : tuple or set, for internal use | |
| This determines which warnings will be made in the case that this | |
| function is being used to return only one of its metrics. | |
| sample_weight : array-like of shape (n_samples,), default=None | |
| Sample weights. | |
| zero_division : "warn", 0 or 1, default="warn" | |
| Sets the value to return when there is a zero division: | |
| - recall: when there are no positive labels | |
| - precision: when there are no positive predictions | |
| - f-score: both | |
| If set to "warn", this acts as 0, but warnings are also raised. | |
| Returns | |
| ------- | |
| precision : float (if average is not None) or array of float, shape =\ | |
| [n_unique_labels] | |
recall : float (if average is not None) or array of float, shape =\
| [n_unique_labels] | |
| fbeta_score : float (if average is not None) or array of float, shape =\ | |
| [n_unique_labels] | |
| support : None (if average is not None) or array of int, shape =\ | |
| [n_unique_labels] | |
| The number of occurrences of each label in ``y_true``. | |
| References | |
| ---------- | |
| .. [1] `Wikipedia entry for the Precision and recall | |
| <https://en.wikipedia.org/wiki/Precision_and_recall>`_ | |
| .. [2] `Wikipedia entry for the F1-score | |
| <https://en.wikipedia.org/wiki/F1_score>`_ | |
| .. [3] `Discriminative Methods for Multi-labeled Classification Advances | |
| in Knowledge Discovery and Data Mining (2004), pp. 22-30 by Shantanu | |
| Godbole, Sunita Sarawagi | |
| <http://www.godbole.net/shantanu/pubs/multilabelsvm-pakdd04.pdf>`_ | |
| Examples | |
| -------- | |
| >>> import numpy as np | |
| >>> from sklearn.metrics import precision_recall_fscore_support | |
| >>> y_true = np.array(['cat', 'dog', 'pig', 'cat', 'dog', 'pig']) | |
| >>> y_pred = np.array(['cat', 'pig', 'dog', 'cat', 'cat', 'dog']) | |
| >>> precision_recall_fscore_support(y_true, y_pred, average='macro') | |
| (0.22..., 0.33..., 0.26..., None) | |
| >>> precision_recall_fscore_support(y_true, y_pred, average='micro') | |
| (0.33..., 0.33..., 0.33..., None) | |
| >>> precision_recall_fscore_support(y_true, y_pred, average='weighted') | |
| (0.22..., 0.33..., 0.26..., None) | |
| It is possible to compute per-label precisions, recalls, F1-scores and | |
| supports instead of averaging: | |
| >>> precision_recall_fscore_support(y_true, y_pred, average=None, | |
| ... labels=['pig', 'dog', 'cat']) | |
| (array([0. , 0. , 0.66...]), | |
| array([0., 0., 1.]), array([0. , 0. , 0.8]), | |
| array([2, 2, 2])) | |
| Notes | |
| ----- | |
| When ``true positive + false positive == 0``, precision is undefined; | |
| When ``true positive + false negative == 0``, recall is undefined. | |
| In such cases, by default the metric will be set to 0, as will f-score, | |
| and ``UndefinedMetricWarning`` will be raised. This behavior can be | |
| modified with ``zero_division``. | |
| """ | |
| _check_zero_division(zero_division) | |
| if beta < 0: | |
| raise ValueError("beta should be >=0 in the F-beta score") | |
| labels = _check_set_wise_labels(y_true, y_pred, average, labels, | |
| pos_label) | |
| # Calculate tp_sum, pred_sum, true_sum ### | |
| samplewise = average == 'samples' | |
| MCM = multilabel_confusion_matrix(y_true, y_pred, | |
| sample_weight=sample_weight, | |
| labels=labels, samplewise=samplewise) | |
| tp_sum = MCM[:, 1, 1] | |
| pred_sum = tp_sum + MCM[:, 0, 1] | |
| true_sum = tp_sum + MCM[:, 1, 0] | |
| if average == 'micro': | |
| tp_sum = np.array([tp_sum.sum()]) | |
| pred_sum = np.array([pred_sum.sum()]) | |
| true_sum = np.array([true_sum.sum()]) | |
| # Finally, we have all our sufficient statistics. Divide! # | |
| beta2 = beta ** 2 | |
| # Divide, and on zero-division, set scores and/or warn according to | |
| # zero_division: | |
| precision = _prf_divide(tp_sum, pred_sum, 'precision', | |
| 'predicted', average, warn_for, zero_division) | |
| recall = _prf_divide(tp_sum, true_sum, 'recall', | |
| 'true', average, warn_for, zero_division) | |
| # warn for f-score only if zero_division is warn, it is in warn_for | |
| # and BOTH prec and rec are ill-defined | |
| if zero_division == "warn" and ("f-score",) == warn_for: | |
| if (pred_sum[true_sum == 0] == 0).any(): | |
| _warn_prf( | |
| average, "true nor predicted", 'F-score is', len(true_sum) | |
| ) | |
| # if tp == 0 F will be 1 only if all predictions are zero, all labels are | |
# zero, and zero_division=1. In all other cases, 0.
| if np.isposinf(beta): | |
| f_score = recall | |
| else: | |
| denom = beta2 * precision + recall | |
| denom[denom == 0.] = 1 # avoid division by 0 | |
| f_score = (1 + beta2) * precision * recall / denom | |
| # Average the results | |
| if average == 'weighted': | |
| weights = true_sum | |
| if weights.sum() == 0: | |
| zero_division_value = 0.0 if zero_division in ["warn", 0] else 1.0 | |
| # precision is zero_division if there are no positive predictions | |
| # recall is zero_division if there are no positive labels | |
| # fscore is zero_division if all labels AND predictions are | |
| # negative | |
| return (zero_division_value if pred_sum.sum() == 0 else 0, | |
| zero_division_value, | |
| zero_division_value if pred_sum.sum() == 0 else 0, | |
| None) | |
| elif average == 'samples': | |
| weights = sample_weight | |
| else: | |
| weights = None | |
| if average is not None: | |
| assert average != 'binary' or len(precision) == 1 | |
| precision = np.average(precision, weights=weights) | |
| recall = np.average(recall, weights=weights) | |
| f_score = np.average(f_score, weights=weights) | |
| true_sum = None # return no support | |
| return precision, recall, f_score, true_sum | |
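| # Editor's illustrative sketch, not part of scikit-learn (the helper name is | |
| # hypothetical): it recomputes the macro-averaged precision/recall/F1 from | |
| # the docstring example by hand, using the same sufficient statistics | |
| # (tp, predicted positives, true positives) that the function derives from | |
| # the multilabel confusion matrix above. | |
| def _prf_macro_sketch(): | |
|     y_true = np.array([0, 1, 2, 0, 1, 2]) | |
|     y_pred = np.array([0, 2, 1, 0, 0, 1]) | |
|     labels = np.unique(y_true) | |
|     tp = np.array([np.sum((y_true == c) & (y_pred == c)) for c in labels]) | |
|     pred = np.array([np.sum(y_pred == c) for c in labels]) | |
|     true = np.array([np.sum(y_true == c) for c in labels]) | |
|     # zero_division=0 behaviour: an undefined ratio is reported as 0 | |
|     precision = tp / np.maximum(pred, 1) | |
|     recall = tp / np.maximum(true, 1) | |
|     denom = np.where(precision + recall == 0, 1, precision + recall) | |
|     f1 = 2 * precision * recall / denom | |
|     # the macro average is the unweighted mean over labels | |
|     return precision.mean(), recall.mean(), f1.mean()  # ~(0.22, 0.33, 0.27) | |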
| def precision_score(y_true, y_pred, labels=None, pos_label=1, | |
| average='binary', sample_weight=None, | |
| zero_division="warn"): | |
| """Compute the precision | |
| The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of | |
| true positives and ``fp`` the number of false positives. The precision is | |
| intuitively the ability of the classifier not to label as positive a sample | |
| that is negative. | |
| The best value is 1 and the worst value is 0. | |
| Read more in the :ref:`User Guide <precision_recall_f_measure_metrics>`. | |
| Parameters | |
| ---------- | |
| y_true : 1d array-like, or label indicator array / sparse matrix | |
| Ground truth (correct) target values. | |
| y_pred : 1d array-like, or label indicator array / sparse matrix | |
| Estimated targets as returned by a classifier. | |
| labels : list, optional | |
| The set of labels to include when ``average != 'binary'``, and their | |
| order if ``average is None``. Labels present in the data can be | |
| excluded, for example to calculate a multiclass average ignoring a | |
| majority negative class, while labels not present in the data will | |
| result in 0 components in a macro average. For multilabel targets, | |
| labels are column indices. By default, all labels in ``y_true`` and | |
| ``y_pred`` are used in sorted order. | |
| .. versionchanged:: 0.17 | |
| Parameter *labels* improved for multiclass problems. | |
| pos_label : str or int, 1 by default | |
| The class to report if ``average='binary'`` and the data is binary. | |
| If the data are multiclass or multilabel, this will be ignored; | |
| setting ``labels=[pos_label]`` and ``average != 'binary'`` will report | |
| scores for that label only. | |
| average : string, [None, 'binary' (default), 'micro', 'macro', 'samples', \ | |
| 'weighted'] | |
| This parameter is required for multiclass/multilabel targets. | |
| If ``None``, the scores for each class are returned. Otherwise, this | |
| determines the type of averaging performed on the data: | |
| ``'binary'``: | |
| Only report results for the class specified by ``pos_label``. | |
| This is applicable only if targets (``y_{true,pred}``) are binary. | |
| ``'micro'``: | |
| Calculate metrics globally by counting the total true positives, | |
| false negatives and false positives. | |
| ``'macro'``: | |
| Calculate metrics for each label, and find their unweighted | |
| mean. This does not take label imbalance into account. | |
| ``'weighted'``: | |
| Calculate metrics for each label, and find their average weighted | |
| by support (the number of true instances for each label). This | |
| alters 'macro' to account for label imbalance; it can result in an | |
| F-score that is not between precision and recall. | |
| ``'samples'``: | |
| Calculate metrics for each instance, and find their average (only | |
| meaningful for multilabel classification where this differs from | |
| :func:`accuracy_score`). | |
| sample_weight : array-like of shape (n_samples,), default=None | |
| Sample weights. | |
| zero_division : "warn", 0 or 1, default="warn" | |
| Sets the value to return when there is a zero division. If set to | |
| "warn", this acts as 0, but warnings are also raised. | |
| Returns | |
| ------- | |
| precision : float (if average is not None) or array of float, shape =\ | |
| [n_unique_labels] | |
| Precision of the positive class in binary classification or weighted | |
| average of the precision of each class for the multiclass task. | |
| See also | |
| -------- | |
| precision_recall_fscore_support, multilabel_confusion_matrix | |
| Examples | |
| -------- | |
| >>> from sklearn.metrics import precision_score | |
| >>> y_true = [0, 1, 2, 0, 1, 2] | |
| >>> y_pred = [0, 2, 1, 0, 0, 1] | |
| >>> precision_score(y_true, y_pred, average='macro') | |
| 0.22... | |
| >>> precision_score(y_true, y_pred, average='micro') | |
| 0.33... | |
| >>> precision_score(y_true, y_pred, average='weighted') | |
| 0.22... | |
| >>> precision_score(y_true, y_pred, average=None) | |
| array([0.66..., 0. , 0. ]) | |
| >>> y_pred = [0, 0, 0, 0, 0, 0] | |
| >>> precision_score(y_true, y_pred, average=None) | |
| array([0.33..., 0. , 0. ]) | |
| >>> precision_score(y_true, y_pred, average=None, zero_division=1) | |
| array([0.33..., 1. , 1. ]) | |
| Notes | |
| ----- | |
| When ``true positive + false positive == 0``, precision returns 0 and | |
| raises ``UndefinedMetricWarning``. This behavior can be | |
| modified with ``zero_division``. | |
| """ | |
| p, _, _, _ = precision_recall_fscore_support(y_true, y_pred, | |
| labels=labels, | |
| pos_label=pos_label, | |
| average=average, | |
| warn_for=('precision',), | |
| sample_weight=sample_weight, | |
| zero_division=zero_division) | |
| return p | |
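| # Editor's illustrative sketch, not part of scikit-learn (the helper name is | |
| # hypothetical): with y_pred = [0, 0, 0, 0, 0, 0] classes 1 and 2 receive no | |
| # predictions, so tp + fp == 0 for them and their precision is undefined; | |
| # zero_division chooses the value reported for those classes. | |
| def _precision_zero_division_sketch(): | |
|     y_true = [0, 1, 2, 0, 1, 2] | |
|     y_pred = [0, 0, 0, 0, 0, 0] | |
|     as_zero = precision_score(y_true, y_pred, average=None, zero_division=0) | |
|     as_one = precision_score(y_true, y_pred, average=None, zero_division=1) | |
|     return as_zero, as_one  # (array([0.33, 0., 0.]), array([0.33, 1., 1.])) | |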
| def recall_score(y_true, y_pred, labels=None, pos_label=1, average='binary', | |
| sample_weight=None, zero_division="warn"): | |
| """Compute the recall | |
| The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of | |
| true positives and ``fn`` the number of false negatives. The recall is | |
| intuitively the ability of the classifier to find all the positive samples. | |
| The best value is 1 and the worst value is 0. | |
| Read more in the :ref:`User Guide <precision_recall_f_measure_metrics>`. | |
| Parameters | |
| ---------- | |
| y_true : 1d array-like, or label indicator array / sparse matrix | |
| Ground truth (correct) target values. | |
| y_pred : 1d array-like, or label indicator array / sparse matrix | |
| Estimated targets as returned by a classifier. | |
| labels : list, optional | |
| The set of labels to include when ``average != 'binary'``, and their | |
| order if ``average is None``. Labels present in the data can be | |
| excluded, for example to calculate a multiclass average ignoring a | |
| majority negative class, while labels not present in the data will | |
| result in 0 components in a macro average. For multilabel targets, | |
| labels are column indices. By default, all labels in ``y_true`` and | |
| ``y_pred`` are used in sorted order. | |
| .. versionchanged:: 0.17 | |
| Parameter *labels* improved for multiclass problems. | |
| pos_label : str or int, 1 by default | |
| The class to report if ``average='binary'`` and the data is binary. | |
| If the data are multiclass or multilabel, this will be ignored; | |
| setting ``labels=[pos_label]`` and ``average != 'binary'`` will report | |
| scores for that label only. | |
| average : string, [None, 'binary' (default), 'micro', 'macro', 'samples', \ | |
| 'weighted'] | |
| This parameter is required for multiclass/multilabel targets. | |
| If ``None``, the scores for each class are returned. Otherwise, this | |
| determines the type of averaging performed on the data: | |
| ``'binary'``: | |
| Only report results for the class specified by ``pos_label``. | |
| This is applicable only if targets (``y_{true,pred}``) are binary. | |
| ``'micro'``: | |
| Calculate metrics globally by counting the total true positives, | |
| false negatives and false positives. | |
| ``'macro'``: | |
| Calculate metrics for each label, and find their unweighted | |
| mean. This does not take label imbalance into account. | |
| ``'weighted'``: | |
| Calculate metrics for each label, and find their average weighted | |
| by support (the number of true instances for each label). This | |
| alters 'macro' to account for label imbalance; it can result in an | |
| F-score that is not between precision and recall. | |
| ``'samples'``: | |
| Calculate metrics for each instance, and find their average (only | |
| meaningful for multilabel classification where this differs from | |
| :func:`accuracy_score`). | |
| sample_weight : array-like of shape (n_samples,), default=None | |
| Sample weights. | |
| zero_division : "warn", 0 or 1, default="warn" | |
| Sets the value to return when there is a zero division. If set to | |
| "warn", this acts as 0, but warnings are also raised. | |
| Returns | |
| ------- | |
| recall : float (if average is not None) or array of float, shape =\ | |
| [n_unique_labels] | |
| Recall of the positive class in binary classification or weighted | |
| average of the recall of each class for the multiclass task. | |
| See also | |
| -------- | |
| precision_recall_fscore_support, balanced_accuracy_score, | |
| multilabel_confusion_matrix | |
| Examples | |
| -------- | |
| >>> from sklearn.metrics import recall_score | |
| >>> y_true = [0, 1, 2, 0, 1, 2] | |
| >>> y_pred = [0, 2, 1, 0, 0, 1] | |
| >>> recall_score(y_true, y_pred, average='macro') | |
| 0.33... | |
| >>> recall_score(y_true, y_pred, average='micro') | |
| 0.33... | |
| >>> recall_score(y_true, y_pred, average='weighted') | |
| 0.33... | |
| >>> recall_score(y_true, y_pred, average=None) | |
| array([1., 0., 0.]) | |
| >>> y_true = [0, 0, 0, 0, 0, 0] | |
| >>> recall_score(y_true, y_pred, average=None) | |
| array([0.5, 0. , 0. ]) | |
| >>> recall_score(y_true, y_pred, average=None, zero_division=1) | |
| array([0.5, 1. , 1. ]) | |
| Notes | |
| ----- | |
| When ``true positive + false negative == 0``, recall returns 0 and raises | |
| ``UndefinedMetricWarning``. This behavior can be modified with | |
| ``zero_division``. | |
| """ | |
| _, r, _, _ = precision_recall_fscore_support(y_true, y_pred, | |
| labels=labels, | |
| pos_label=pos_label, | |
| average=average, | |
| warn_for=('recall',), | |
| sample_weight=sample_weight, | |
| zero_division=zero_division) | |
| return r | |
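| # Editor's illustrative sketch, not part of scikit-learn (the helper name is | |
| # hypothetical): when every class appears in y_true, the support-weighted | |
| # mean of per-class recalls is simply the overall accuracy, since it sums | |
| # the per-class true positives and divides by n_samples. | |
| def _weighted_recall_sketch(): | |
|     y_true = [0, 1, 2, 0, 1, 2] | |
|     y_pred = [0, 2, 1, 0, 0, 1] | |
|     weighted_recall = recall_score(y_true, y_pred, average='weighted') | |
|     return np.isclose(weighted_recall, accuracy_score(y_true, y_pred))  # True | |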
| def balanced_accuracy_score(y_true, y_pred, sample_weight=None, | |
| adjusted=False): | |
| """Compute the balanced accuracy | |
| The balanced accuracy in binary and multiclass classification problems is | |
| used to deal with imbalanced datasets. It is defined as the average of | |
| recall obtained on each class. | |
| The best value is 1 and the worst value is 0 when ``adjusted=False``. | |
| Read more in the :ref:`User Guide <balanced_accuracy_score>`. | |
| Parameters | |
| ---------- | |
| y_true : 1d array-like | |
| Ground truth (correct) target values. | |
| y_pred : 1d array-like | |
| Estimated targets as returned by a classifier. | |
| sample_weight : array-like of shape (n_samples,), default=None | |
| Sample weights. | |
| adjusted : bool, default=False | |
| When true, the result is adjusted for chance, so that random | |
| performance would score 0, and perfect performance scores 1. | |
| Returns | |
| ------- | |
| balanced_accuracy : float | |
| See also | |
| -------- | |
| recall_score, roc_auc_score | |
| Notes | |
| ----- | |
| Some literature promotes alternative definitions of balanced accuracy. Our | |
| definition is equivalent to :func:`accuracy_score` with class-balanced | |
| sample weights, and shares desirable properties with the binary case. | |
| See the :ref:`User Guide <balanced_accuracy_score>`. | |
| References | |
| ---------- | |
| .. [1] Brodersen, K.H.; Ong, C.S.; Stephan, K.E.; Buhmann, J.M. (2010). | |
| The balanced accuracy and its posterior distribution. | |
| Proceedings of the 20th International Conference on Pattern | |
| Recognition, 3121-24. | |
| .. [2] John. D. Kelleher, Brian Mac Namee, Aoife D'Arcy, (2015). | |
| `Fundamentals of Machine Learning for Predictive Data Analytics: | |
| Algorithms, Worked Examples, and Case Studies | |
| <https://mitpress.mit.edu/books/fundamentals-machine-learning-predictive-data-analytics>`_. | |
| Examples | |
| -------- | |
| >>> from sklearn.metrics import balanced_accuracy_score | |
| >>> y_true = [0, 1, 0, 0, 1, 0] | |
| >>> y_pred = [0, 1, 0, 0, 0, 1] | |
| >>> balanced_accuracy_score(y_true, y_pred) | |
| 0.625 | |
| """ | |
| C = confusion_matrix(y_true, y_pred, sample_weight=sample_weight) | |
| with np.errstate(divide='ignore', invalid='ignore'): | |
| per_class = np.diag(C) / C.sum(axis=1) | |
| if np.any(np.isnan(per_class)): | |
| warnings.warn('y_pred contains classes not in y_true') | |
| per_class = per_class[~np.isnan(per_class)] | |
| score = np.mean(per_class) | |
| if adjusted: | |
| n_classes = len(per_class) | |
| chance = 1 / n_classes | |
| score -= chance | |
| score /= 1 - chance | |
| return score | |
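| # Editor's illustrative sketch, not part of scikit-learn (the helper name is | |
| # hypothetical): balanced accuracy is the unweighted mean of per-class | |
| # recalls, so it coincides with recall_score(..., average='macro') on the | |
| # docstring example; with adjusted=True the score is rescaled so that | |
| # chance-level performance maps to 0. | |
| def _balanced_accuracy_sketch(): | |
|     y_true = [0, 1, 0, 0, 1, 0] | |
|     y_pred = [0, 1, 0, 0, 0, 1] | |
|     macro_recall = recall_score(y_true, y_pred, average='macro') | |
|     # both equal (3/4 + 1/2) / 2 == 0.625 | |
|     return np.isclose(balanced_accuracy_score(y_true, y_pred), macro_recall) | |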
| def classification_report(y_true, y_pred, labels=None, target_names=None, | |
| sample_weight=None, digits=2, output_dict=False, | |
| zero_division="warn"): | |
| """Build a text report showing the main classification metrics | |
| Read more in the :ref:`User Guide <classification_report>`. | |
| Parameters | |
| ---------- | |
| y_true : 1d array-like, or label indicator array / sparse matrix | |
| Ground truth (correct) target values. | |
| y_pred : 1d array-like, or label indicator array / sparse matrix | |
| Estimated targets as returned by a classifier. | |
| labels : array, shape = [n_labels] | |
| Optional list of label indices to include in the report. | |
| target_names : list of strings | |
| Optional display names matching the labels (same order). | |
| sample_weight : array-like of shape (n_samples,), default=None | |
| Sample weights. | |
| digits : int | |
| Number of digits for formatting output floating point values. | |
| When ``output_dict`` is ``True``, this will be ignored and the | |
| returned values will not be rounded. | |
| output_dict : bool (default = False) | |
| If True, return output as dict | |
| zero_division : "warn", 0 or 1, default="warn" | |
| Sets the value to return when there is a zero division. If set to | |
| "warn", this acts as 0, but warnings are also raised. | |
| Returns | |
| ------- | |
| report : string / dict | |
| Text summary of the precision, recall, F1 score for each class. | |
| Dictionary returned if output_dict is True. Dictionary has the | |
| following structure:: | |
| {'label 1': {'precision':0.5, | |
| 'recall':1.0, | |
| 'f1-score':0.67, | |
| 'support':1}, | |
| 'label 2': { ... }, | |
| ... | |
| } | |
| The reported averages include macro average (averaging the unweighted | |
| mean per label), weighted average (averaging the support-weighted mean | |
| per label), and sample average (only for multilabel classification). | |
| Micro average (averaging the total true positives, false negatives and | |
| false positives) is only shown for multi-label or multi-class | |
| with a subset of classes, because it corresponds to accuracy otherwise. | |
| See also :func:`precision_recall_fscore_support` for more details | |
| on averages. | |
| Note that in binary classification, recall of the positive class | |
| is also known as "sensitivity"; recall of the negative class is | |
| "specificity". | |
| See also | |
| -------- | |
| precision_recall_fscore_support, confusion_matrix, | |
| multilabel_confusion_matrix | |
| Examples | |
| -------- | |
| >>> from sklearn.metrics import classification_report | |
| >>> y_true = [0, 1, 2, 2, 2] | |
| >>> y_pred = [0, 0, 2, 2, 1] | |
| >>> target_names = ['class 0', 'class 1', 'class 2'] | |
| >>> print(classification_report(y_true, y_pred, target_names=target_names)) | |
| precision recall f1-score support | |
| <BLANKLINE> | |
| class 0 0.50 1.00 0.67 1 | |
| class 1 0.00 0.00 0.00 1 | |
| class 2 1.00 0.67 0.80 3 | |
| <BLANKLINE> | |
| accuracy 0.60 5 | |
| macro avg 0.50 0.56 0.49 5 | |
| weighted avg 0.70 0.60 0.61 5 | |
| <BLANKLINE> | |
| >>> y_pred = [1, 1, 0] | |
| >>> y_true = [1, 1, 1] | |
| >>> print(classification_report(y_true, y_pred, labels=[1, 2, 3])) | |
| precision recall f1-score support | |
| <BLANKLINE> | |
| 1 1.00 0.67 0.80 3 | |
| 2 0.00 0.00 0.00 0 | |
| 3 0.00 0.00 0.00 0 | |
| <BLANKLINE> | |
| micro avg 1.00 0.67 0.80 3 | |
| macro avg 0.33 0.22 0.27 3 | |
| weighted avg 1.00 0.67 0.80 3 | |
| <BLANKLINE> | |
| """ | |
| y_type, y_true, y_pred = _check_targets(y_true, y_pred) | |
| labels_given = True | |
| if labels is None: | |
| labels = unique_labels(y_true, y_pred) | |
| labels_given = False | |
| else: | |
| labels = np.asarray(labels) | |
| # labelled micro average | |
| micro_is_accuracy = ((y_type == 'multiclass' or y_type == 'binary') and | |
| (not labels_given or | |
| (set(labels) == set(unique_labels(y_true, y_pred))))) | |
| if target_names is not None and len(labels) != len(target_names): | |
| if labels_given: | |
| warnings.warn( | |
| "labels size, {0}, does not match size of target_names, {1}" | |
| .format(len(labels), len(target_names)) | |
| ) | |
| else: | |
| raise ValueError( | |
| "Number of classes, {0}, does not match size of " | |
| "target_names, {1}. Try specifying the labels " | |
| "parameter".format(len(labels), len(target_names)) | |
| ) | |
| if target_names is None: | |
| target_names = ['%s' % l for l in labels] | |
| headers = ["precision", "recall", "f1-score", "support"] | |
| # compute per-class results without averaging | |
| p, r, f1, s = precision_recall_fscore_support(y_true, y_pred, | |
| labels=labels, | |
| average=None, | |
| sample_weight=sample_weight, | |
| zero_division=zero_division) | |
| rows = zip(target_names, p, r, f1, s) | |
| if y_type.startswith('multilabel'): | |
| average_options = ('micro', 'macro', 'weighted', 'samples') | |
| else: | |
| average_options = ('micro', 'macro', 'weighted') | |
| if output_dict: | |
| report_dict = {label[0]: label[1:] for label in rows} | |
| for label, scores in report_dict.items(): | |
| report_dict[label] = dict(zip(headers, | |
| [i.item() for i in scores])) | |
| else: | |
| longest_last_line_heading = 'weighted avg' | |
| name_width = max(len(cn) for cn in target_names) | |
| width = max(name_width, len(longest_last_line_heading), digits) | |
| head_fmt = '{:>{width}s} ' + ' {:>9}' * len(headers) | |
| report = head_fmt.format('', *headers, width=width) | |
| report += '\n\n' | |
| row_fmt = '{:>{width}s} ' + ' {:>9.{digits}f}' * 3 + ' {:>9}\n' | |
| for row in rows: | |
| report += row_fmt.format(*row, width=width, digits=digits) | |
| report += '\n' | |
| # compute all applicable averages | |
| for average in average_options: | |
| if average.startswith('micro') and micro_is_accuracy: | |
| line_heading = 'accuracy' | |
| else: | |
| line_heading = average + ' avg' | |
| # compute averages with specified averaging method | |
| avg_p, avg_r, avg_f1, _ = precision_recall_fscore_support( | |
| y_true, y_pred, labels=labels, | |
| average=average, sample_weight=sample_weight, | |
| zero_division=zero_division) | |
| avg = [avg_p, avg_r, avg_f1, np.sum(s)] | |
| if output_dict: | |
| report_dict[line_heading] = dict( | |
| zip(headers, [i.item() for i in avg])) | |
| else: | |
| if line_heading == 'accuracy': | |
| row_fmt_accuracy = '{:>{width}s} ' + \ | |
| ' {:>9.{digits}}' * 2 + ' {:>9.{digits}f}' + \ | |
| ' {:>9}\n' | |
| report += row_fmt_accuracy.format(line_heading, '', '', | |
| *avg[2:], width=width, | |
| digits=digits) | |
| else: | |
| report += row_fmt.format(line_heading, *avg, | |
| width=width, digits=digits) | |
| if output_dict: | |
| if 'accuracy' in report_dict.keys(): | |
| report_dict['accuracy'] = report_dict['accuracy']['precision'] | |
| return report_dict | |
| else: | |
| return report | |
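| # Editor's illustrative sketch, not part of scikit-learn (the helper name is | |
| # hypothetical): with output_dict=True the same report is returned as a | |
| # nested dict keyed by class label and by the average rows, which is easier | |
| # to consume programmatically than the formatted string. | |
| def _classification_report_dict_sketch(): | |
|     y_true = [0, 1, 2, 2, 2] | |
|     y_pred = [0, 0, 2, 2, 1] | |
|     report = classification_report(y_true, y_pred, output_dict=True) | |
|     # e.g. report['2'] ~= {'precision': 1.0, 'recall': 0.67, | |
|     #                      'f1-score': 0.8, 'support': 3} | |
|     return report['accuracy'], report['macro avg']['f1-score']  # (0.6, ~0.49) | |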
| def hamming_loss(y_true, y_pred, labels=None, sample_weight=None): | |
| """Compute the average Hamming loss. | |
| The Hamming loss is the fraction of labels that are incorrectly predicted. | |
| Read more in the :ref:`User Guide <hamming_loss>`. | |
| Parameters | |
| ---------- | |
| y_true : 1d array-like, or label indicator array / sparse matrix | |
| Ground truth (correct) labels. | |
| y_pred : 1d array-like, or label indicator array / sparse matrix | |
| Predicted labels, as returned by a classifier. | |
| labels : array, shape = [n_labels], optional (default=None) | |
| Integer array of labels. If not provided, labels will be inferred | |
| from y_true and y_pred. | |
| .. versionadded:: 0.18 | |
| .. deprecated:: 0.21 | |
| This parameter ``labels`` is deprecated in version 0.21 and will | |
| be removed in version 0.23. Hamming loss uses ``y_true.shape[1]`` | |
| for the number of labels when y_true is binary label indicators, | |
| so it is unnecessary for the user to specify. | |
| sample_weight : array-like of shape (n_samples,), default=None | |
| Sample weights. | |
| .. versionadded:: 0.18 | |
| Returns | |
| ------- | |
| loss : float or int | |
| Return the average Hamming loss between the elements of ``y_true`` and | |
| ``y_pred``. | |
| See Also | |
| -------- | |
| accuracy_score, jaccard_score, zero_one_loss | |
| Notes | |
| ----- | |
| In multiclass classification, the Hamming loss corresponds to the Hamming | |
| distance between ``y_true`` and ``y_pred``, which is equivalent to the | |
| subset ``zero_one_loss`` function when the ``normalize`` parameter is set | |
| to True. | |
| In multilabel classification, the Hamming loss is different from the | |
| subset zero-one loss. The zero-one loss considers the entire set of labels | |
| for a given sample incorrect if it does not entirely match the true set of | |
| labels. Hamming loss is more forgiving in that it penalizes only the | |
| individual labels. | |
| The Hamming loss is upper-bounded by the subset zero-one loss when the | |
| ``normalize`` parameter is set to True. It is always between 0 and 1, | |
| lower being better. | |
| References | |
| ---------- | |
| .. [1] Grigorios Tsoumakas, Ioannis Katakis. Multi-Label Classification: | |
| An Overview. International Journal of Data Warehousing & Mining, | |
| 3(3), 1-13, July-September 2007. | |
| .. [2] `Wikipedia entry on the Hamming distance | |
| <https://en.wikipedia.org/wiki/Hamming_distance>`_ | |
| Examples | |
| -------- | |
| >>> from sklearn.metrics import hamming_loss | |
| >>> y_pred = [1, 2, 3, 4] | |
| >>> y_true = [2, 2, 3, 4] | |
| >>> hamming_loss(y_true, y_pred) | |
| 0.25 | |
| In the multilabel case with binary label indicators: | |
| >>> import numpy as np | |
| >>> hamming_loss(np.array([[0, 1], [1, 1]]), np.zeros((2, 2))) | |
| 0.75 | |
| """ | |
| y_type, y_true, y_pred = _check_targets(y_true, y_pred) | |
| check_consistent_length(y_true, y_pred, sample_weight) | |
| if labels is not None: | |
| warnings.warn("The labels parameter is unused. It was" | |
| " deprecated in version 0.21 and" | |
| " will be removed in version 0.23", | |
| FutureWarning) | |
| if sample_weight is None: | |
| weight_average = 1. | |
| else: | |
| weight_average = np.mean(sample_weight) | |
| if y_type.startswith('multilabel'): | |
| n_differences = count_nonzero(y_true - y_pred, | |
| sample_weight=sample_weight) | |
| return (n_differences / | |
| (y_true.shape[0] * y_true.shape[1] * weight_average)) | |
| elif y_type in ["binary", "multiclass"]: | |
| return _weighted_sum(y_true != y_pred, sample_weight, normalize=True) | |
| else: | |
| raise ValueError("{0} is not supported".format(y_type)) | |
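| # Editor's illustrative sketch, not part of scikit-learn (the helper name is | |
| # hypothetical): for multilabel indicators the Hamming loss is just the | |
| # fraction of label entries on which y_true and y_pred disagree. | |
| def _hamming_loss_sketch(): | |
|     y_true = np.array([[0, 1], [1, 1]]) | |
|     y_pred = np.zeros((2, 2)) | |
|     manual = np.mean(y_true != y_pred)  # 3 wrong cells out of 4 -> 0.75 | |
|     return np.isclose(manual, hamming_loss(y_true, y_pred))  # True | |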
| def log_loss(y_true, y_pred, eps=1e-15, normalize=True, sample_weight=None, | |
| labels=None): | |
| """Log loss, aka logistic loss or cross-entropy loss. | |
| This is the loss function used in (multinomial) logistic regression | |
| and extensions of it such as neural networks, defined as the negative | |
| log-likelihood of the true labels given a probabilistic classifier's | |
| predictions. The log loss is only defined for two or more labels. | |
| For a single sample with true label yt in {0,1} and | |
| estimated probability yp that yt = 1, the log loss is | |
| -log P(yt|yp) = -(yt log(yp) + (1 - yt) log(1 - yp)) | |
| Read more in the :ref:`User Guide <log_loss>`. | |
| Parameters | |
| ---------- | |
| y_true : array-like or label indicator matrix | |
| Ground truth (correct) labels for n_samples samples. | |
| y_pred : array-like of float, shape = (n_samples, n_classes) or (n_samples,) | |
| Predicted probabilities, as returned by a classifier's | |
| predict_proba method. If ``y_pred.shape = (n_samples,)`` | |
| the probabilities provided are assumed to be that of the | |
| positive class. The labels in ``y_pred`` are assumed to be | |
| ordered alphabetically, as done by | |
| :class:`preprocessing.LabelBinarizer`. | |
| eps : float | |
| Log loss is undefined for p=0 or p=1, so probabilities are | |
| clipped to max(eps, min(1 - eps, p)). | |
| normalize : bool, optional (default=True) | |
| If true, return the mean loss per sample. | |
| Otherwise, return the sum of the per-sample losses. | |
| sample_weight : array-like of shape (n_samples,), default=None | |
| Sample weights. | |
| labels : array-like, optional (default=None) | |
| If not provided, labels will be inferred from y_true. If ``labels`` | |
| is ``None`` and ``y_pred`` has shape (n_samples,) the labels are | |
| assumed to be binary and are inferred from ``y_true``. | |
| .. versionadded:: 0.18 | |
| Returns | |
| ------- | |
| loss : float | |
| Examples | |
| -------- | |
| >>> from sklearn.metrics import log_loss | |
| >>> log_loss(["spam", "ham", "ham", "spam"], | |
| ... [[.1, .9], [.9, .1], [.8, .2], [.35, .65]]) | |
| 0.21616... | |
| References | |
| ---------- | |
| C.M. Bishop (2006). Pattern Recognition and Machine Learning. Springer, | |
| p. 209. | |
| Notes | |
| ----- | |
| The logarithm used is the natural logarithm (base-e). | |
| """ | |
| y_pred = check_array(y_pred, ensure_2d=False) | |
| check_consistent_length(y_pred, y_true, sample_weight) | |
| lb = LabelBinarizer() | |
| if labels is not None: | |
| lb.fit(labels) | |
| else: | |
| lb.fit(y_true) | |
| if len(lb.classes_) == 1: | |
| if labels is None: | |
| raise ValueError('y_true contains only one label ({0}). Please ' | |
| 'provide the true labels explicitly through the ' | |
| 'labels argument.'.format(lb.classes_[0])) | |
| else: | |
| raise ValueError('The labels array needs to contain at least two ' | |
| 'labels for log_loss, ' | |
| 'got {0}.'.format(lb.classes_)) | |
| transformed_labels = lb.transform(y_true) | |
| if transformed_labels.shape[1] == 1: | |
| transformed_labels = np.append(1 - transformed_labels, | |
| transformed_labels, axis=1) | |
| # Clipping | |
| y_pred = np.clip(y_pred, eps, 1 - eps) | |
| # If y_pred is of single dimension, assume y_true to be binary | |
| # and then check. | |
| if y_pred.ndim == 1: | |
| y_pred = y_pred[:, np.newaxis] | |
| if y_pred.shape[1] == 1: | |
| y_pred = np.append(1 - y_pred, y_pred, axis=1) | |
| # Check if dimensions are consistent. | |
| transformed_labels = check_array(transformed_labels) | |
| if len(lb.classes_) != y_pred.shape[1]: | |
| if labels is None: | |
| raise ValueError("y_true and y_pred contain different number of " | |
| "classes {0}, {1}. Please provide the true " | |
| "labels explicitly through the labels argument. " | |
| "Classes found in " | |
| "y_true: {2}".format(transformed_labels.shape[1], | |
| y_pred.shape[1], | |
| lb.classes_)) | |
| else: | |
| raise ValueError('The number of classes in labels is different ' | |
| 'from that in y_pred. Classes found in ' | |
| 'labels: {0}'.format(lb.classes_)) | |
| # Renormalize | |
| y_pred /= y_pred.sum(axis=1)[:, np.newaxis] | |
| loss = -(transformed_labels * np.log(y_pred)).sum(axis=1) | |
| return _weighted_sum(loss, sample_weight, normalize) | |
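| # Editor's illustrative sketch, not part of scikit-learn (the helper name is | |
| # hypothetical): recompute the docstring example by hand. Classes are | |
| # binarized in sorted order ('ham' < 'spam'), so column 1 of y_pred holds | |
| # the probability of 'spam'; the loss is the mean negative log of the | |
| # probability assigned to each true label. | |
| def _log_loss_sketch(): | |
|     y_true = ["spam", "ham", "ham", "spam"] | |
|     y_pred = np.array([[.1, .9], [.9, .1], [.8, .2], [.35, .65]]) | |
|     is_spam = np.array([label == "spam" for label in y_true]) | |
|     p_true = np.where(is_spam, y_pred[:, 1], y_pred[:, 0]) | |
|     manual = -np.mean(np.log(p_true)) | |
|     return np.isclose(manual, log_loss(y_true, y_pred))  # True, ~0.21616 | |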
| def hinge_loss(y_true, pred_decision, labels=None, sample_weight=None): | |
| """Average hinge loss (non-regularized) | |
| In the binary case, assuming labels in y_true are encoded with +1 and -1, | |
| when a prediction mistake is made, ``margin = y_true * pred_decision`` is | |
| always negative (since the signs disagree), implying ``1 - margin`` is | |
| always greater than 1. The cumulated hinge loss is therefore an upper | |
| bound on the number of mistakes made by the classifier. | |
| In the multiclass case, the function expects that either all the labels | |
| are included in y_true or an optional labels argument is provided which | |
| contains all the labels. The multiclass margin is calculated according | |
| to the Crammer-Singer method. As in the binary case, the cumulated hinge | |
| loss is an upper bound on the number of mistakes made by the classifier. | |
| Read more in the :ref:`User Guide <hinge_loss>`. | |
| Parameters | |
| ---------- | |
| y_true : array, shape = [n_samples] | |
| True target, consisting of integers of two values. The positive label | |
| must be greater than the negative label. | |
| pred_decision : array, shape = [n_samples] or [n_samples, n_classes] | |
| Predicted decisions, as output by decision_function (floats). | |
| labels : array, optional, default None | |
| Contains all the labels for the problem. Used in multiclass hinge loss. | |
| sample_weight : array-like of shape (n_samples,), default=None | |
| Sample weights. | |
| Returns | |
| ------- | |
| loss : float | |
| References | |
| ---------- | |
| .. [1] `Wikipedia entry on the Hinge loss | |
| <https://en.wikipedia.org/wiki/Hinge_loss>`_ | |
| .. [2] Koby Crammer, Yoram Singer. On the Algorithmic | |
| Implementation of Multiclass Kernel-based Vector | |
| Machines. Journal of Machine Learning Research 2, | |
| (2001), 265-292 | |
| .. [3] `L1 AND L2 Regularization for Multiclass Hinge Loss Models | |
| by Robert C. Moore, John DeNero. | |
| <http://www.ttic.edu/sigml/symposium2011/papers/ | |
| Moore+DeNero_Regularization.pdf>`_ | |
| Examples | |
| -------- | |
| >>> from sklearn import svm | |
| >>> from sklearn.metrics import hinge_loss | |
| >>> X = [[0], [1]] | |
| >>> y = [-1, 1] | |
| >>> est = svm.LinearSVC(random_state=0) | |
| >>> est.fit(X, y) | |
| LinearSVC(random_state=0) | |
| >>> pred_decision = est.decision_function([[-2], [3], [0.5]]) | |
| >>> pred_decision | |
| array([-2.18..., 2.36..., 0.09...]) | |
| >>> hinge_loss([-1, 1, 1], pred_decision) | |
| 0.30... | |
| In the multiclass case: | |
| >>> import numpy as np | |
| >>> X = np.array([[0], [1], [2], [3]]) | |
| >>> Y = np.array([0, 1, 2, 3]) | |
| >>> labels = np.array([0, 1, 2, 3]) | |
| >>> est = svm.LinearSVC() | |
| >>> est.fit(X, Y) | |
| LinearSVC() | |
| >>> pred_decision = est.decision_function([[-1], [2], [3]]) | |
| >>> y_true = [0, 2, 3] | |
| >>> hinge_loss(y_true, pred_decision, labels) | |
| 0.56... | |
| """ | |
| check_consistent_length(y_true, pred_decision, sample_weight) | |
| pred_decision = check_array(pred_decision, ensure_2d=False) | |
| y_true = column_or_1d(y_true) | |
| y_true_unique = np.unique(y_true) | |
| if y_true_unique.size > 2: | |
| if (labels is None and pred_decision.ndim > 1 and | |
| (np.size(y_true_unique) != pred_decision.shape[1])): | |
| raise ValueError("Please include all labels in y_true " | |
| "or pass labels as third argument") | |
| if labels is None: | |
| labels = y_true_unique | |
| le = LabelEncoder() | |
| le.fit(labels) | |
| y_true = le.transform(y_true) | |
| mask = np.ones_like(pred_decision, dtype=bool) | |
| mask[np.arange(y_true.shape[0]), y_true] = False | |
| margin = pred_decision[~mask] | |
| margin -= np.max(pred_decision[mask].reshape(y_true.shape[0], -1), | |
| axis=1) | |
| else: | |
| # Handles binary class case | |
| # this code assumes that positive and negative labels | |
| # are encoded as +1 and -1 respectively | |
| pred_decision = column_or_1d(pred_decision) | |
| pred_decision = np.ravel(pred_decision) | |
| lbin = LabelBinarizer(neg_label=-1) | |
| y_true = lbin.fit_transform(y_true)[:, 0] | |
| try: | |
| margin = y_true * pred_decision | |
| except TypeError: | |
| raise TypeError("pred_decision should be an array of floats.") | |
| losses = 1 - margin | |
| # The hinge_loss doesn't penalize good enough predictions. | |
| np.clip(losses, 0, None, out=losses) | |
| return np.average(losses, weights=sample_weight) | |
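| # Editor's illustrative sketch, not part of scikit-learn (the helper name is | |
| # hypothetical): in the binary case the hinge loss averages | |
| # max(0, 1 - y * decision) with y encoded as -1/+1, so a well-classified | |
| # sample with margin >= 1 contributes nothing. | |
| def _hinge_loss_sketch(): | |
|     y_true = np.array([-1, 1, 1]) | |
|     pred_decision = np.array([-2.18, 2.36, 0.09]) | |
|     manual = np.mean(np.maximum(0, 1 - y_true * pred_decision)) | |
|     return np.isclose(manual, hinge_loss(y_true, pred_decision))  # True, ~0.30 | |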
| def brier_score_loss(y_true, y_prob, sample_weight=None, pos_label=None): | |
| """Compute the Brier score. | |
| The smaller the Brier score, the better, hence the naming with "loss". | |
| Across all items in a set of N predictions, the Brier score measures the | |
| mean squared difference between (1) the predicted probability assigned | |
| to the possible outcomes for item i, and (2) the actual outcome. | |
| Therefore, the lower the Brier score is for a set of predictions, the | |
| better the predictions are calibrated. Note that the Brier score always | |
| takes on a value between zero and one, since this is the largest | |
| possible difference between a predicted probability (which must be | |
| between zero and one) and the actual outcome (which can take on values | |
| of only 0 and 1). The Brier loss is composed of refinement loss and | |
| calibration loss. | |
| The Brier score is appropriate for binary and categorical outcomes that | |
| can be structured as true or false, but is inappropriate for ordinal | |
| variables which can take on three or more values (this is because the | |
| Brier score assumes that all possible outcomes are equivalently | |
| "distant" from one another). Which label is considered to be the positive | |
| label is controlled via the parameter pos_label, which defaults to 1. | |
| Read more in the :ref:`User Guide <calibration>`. | |
| Parameters | |
| ---------- | |
| y_true : array, shape (n_samples,) | |
| True targets. | |
| y_prob : array, shape (n_samples,) | |
| Probabilities of the positive class. | |
| sample_weight : array-like of shape (n_samples,), default=None | |
| Sample weights. | |
| pos_label : int or str, default=None | |
| Label of the positive class. | |
| Defaults to the greater label unless y_true is all 0 or all -1 | |
| in which case pos_label defaults to 1. | |
| Returns | |
| ------- | |
| score : float | |
| Brier score | |
| Examples | |
| -------- | |
| >>> import numpy as np | |
| >>> from sklearn.metrics import brier_score_loss | |
| >>> y_true = np.array([0, 1, 1, 0]) | |
| >>> y_true_categorical = np.array(["spam", "ham", "ham", "spam"]) | |
| >>> y_prob = np.array([0.1, 0.9, 0.8, 0.3]) | |
| >>> brier_score_loss(y_true, y_prob) | |
| 0.037... | |
| >>> brier_score_loss(y_true, 1-y_prob, pos_label=0) | |
| 0.037... | |
| >>> brier_score_loss(y_true_categorical, y_prob, pos_label="ham") | |
| 0.037... | |
| >>> brier_score_loss(y_true, np.array(y_prob) > 0.5) | |
| 0.0 | |
| References | |
| ---------- | |
| .. [1] `Wikipedia entry for the Brier score. | |
| <https://en.wikipedia.org/wiki/Brier_score>`_ | |
| """ | |
| y_true = column_or_1d(y_true) | |
| y_prob = column_or_1d(y_prob) | |
| assert_all_finite(y_true) | |
| assert_all_finite(y_prob) | |
| check_consistent_length(y_true, y_prob, sample_weight) | |
| labels = np.unique(y_true) | |
| if len(labels) > 2: | |
| raise ValueError("Only binary classification is supported. " | |
| "Labels in y_true: %s." % labels) | |
| if y_prob.max() > 1: | |
| raise ValueError("y_prob contains values greater than 1.") | |
| if y_prob.min() < 0: | |
| raise ValueError("y_prob contains values less than 0.") | |
| # if pos_label=None, when y_true is in {-1, 1} or {0, 1}, | |
| # pos_label is set to 1 (consistent with precision_recall_curve/roc_curve), | |
| # otherwise pos_label is set to the greater label | |
| # (different from precision_recall_curve/roc_curve, | |
| # the purpose is to keep backward compatibility). | |
| if pos_label is None: | |
| if (np.array_equal(labels, [0]) or | |
| np.array_equal(labels, [-1])): | |
| pos_label = 1 | |
| else: | |
| pos_label = y_true.max() | |
| y_true = np.array(y_true == pos_label, int) | |
| return np.average((y_true - y_prob) ** 2, weights=sample_weight) |
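| # Editor's illustrative sketch, not part of scikit-learn (the helper name is | |
| # hypothetical): the Brier score is the mean squared difference between the | |
| # predicted probability of the positive class and the 0/1 outcome. | |
| def _brier_score_sketch(): | |
|     y_true = np.array([0, 1, 1, 0]) | |
|     y_prob = np.array([0.1, 0.9, 0.8, 0.3]) | |
|     manual = np.mean((y_true - y_prob) ** 2)  # -> 0.0375 | |
|     return np.isclose(manual, brier_score_loss(y_true, y_prob))  # True | |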