[Datumaro] Mean and std for dataset (cvat-ai#1734)

* Add meanstd * Add stats cli * Update changelog Co-authored-by: Nikita Manovich <40690625+nmanovic@users.noreply.github.com>
signatrix · Aug 3, 2020 · 7b99c6b · 7b99c6b
1 parent de6fd07
commit 7b99c6b
Show file tree

Hide file tree

Showing 4 changed files with 148 additions and 0 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Built-in search for labels when create an object or change a label (<https://github.com/opencv/cvat/pull/1683>)
 - Better validation of labels and attributes in raw viewer (<https://github.com/opencv/cvat/pull/1727>)
 - ClamAV antivirus integration (<https://github.com/opencv/cvat/pull/1712>)
+- [Datumaro] Added `stats` command, which shows some dataset statistics like image mean and std (<https://github.com/opencv/cvat/pull/1734>)
 - Add option to upload annotations upon task creation on CLI 
 - Polygon and polylines interpolation (<https://github.com/opencv/cvat/pull/1571>)
 - Ability to redraw shape from scratch (Shift + N) for an activated shape (<https://github.com/opencv/cvat/pull/1571>)

diff --git a/datumaro/datumaro/cli/contexts/project/__init__.py b/datumaro/datumaro/cli/contexts/project/__init__.py
@@ -16,6 +16,7 @@
 from datumaro.components.dataset_filter import DatasetItemEncoder
 from datumaro.components.extractor import AnnotationType
 from datumaro.components.cli_plugin import CliPlugin
+from datumaro.components.operations import mean_std
 from .diff import DiffVisualizer
 from ...util import add_subparser, CliException, MultilineFormatter, \
     make_file_name
@@ -623,6 +624,38 @@ def transform_command(args):
 
     return 0
 
+def build_stats_parser(parser_ctor=argparse.ArgumentParser):
+    """
+    Builds the argument parser of the 'stats' subcommand.
+
+    parser_ctor - callable used to create the parser
+        (argparse.ArgumentParser by default)
+    Returns the created parser; its 'command' default is stats_command.
+    """
+    stats_parser = parser_ctor(
+        help="Get project statistics",
+        description="""
+            Outputs project statistics.
+        """,
+        formatter_class=MultilineFormatter,
+    )
+
+    stats_parser.add_argument(
+        '-p', '--project',
+        dest='project_dir',
+        default='.',
+        help="Directory of the project to operate on (default: current dir)",
+    )
+    stats_parser.set_defaults(command=stats_command)
+
+    return stats_parser
+
+def stats_command(args):
+    """
+    Prints channel-wise image mean and std. dev. for the project dataset
+    and, if the dataset has more than one subset, for every subset.
+
+    args - parsed command line arguments; 'args.project_dir' is the
+        directory of the project to load
+    Returns 0 on success.
+    """
+    project = load_project(args.project_dir)
+    dataset = project.make_dataset()
+
+    def print_extractor_info(extractor, indent=''):
+        # Statistics must be computed for the extractor passed in,
+        # otherwise every subset would report the whole dataset's values
+        mean, std = mean_std(extractor)
+        print("%sImage mean:" % indent, ', '.join('%.3f' % n for n in mean))
+        print("%sImage std:" % indent, ', '.join('%.3f' % n for n in std))
+
+    print("Dataset: ")
+    print_extractor_info(dataset)
+
+    # Per-subset output only adds information when there are several subsets
+    if 1 < len(dataset.subsets()):
+        print("Subsets: ")
+        for subset_name in dataset.subsets():
+            subset = dataset.get_subset(subset_name)
+            print("  %s:" % subset_name)
+            print_extractor_info(subset, " " * 4)
+
+    return 0
+
 def build_info_parser(parser_ctor=argparse.ArgumentParser):
     parser = parser_ctor(help="Get project info",
         description="""
@@ -718,5 +751,6 @@ def build_parser(parser_ctor=argparse.ArgumentParser):
     add_subparser(subparsers, 'diff', build_diff_parser)
     add_subparser(subparsers, 'transform', build_transform_parser)
     add_subparser(subparsers, 'info', build_info_parser)
+    add_subparser(subparsers, 'stats', build_stats_parser)
 
     return parser
diff --git a/datumaro/datumaro/components/operations.py b/datumaro/datumaro/components/operations.py
@@ -0,0 +1,82 @@
+
+# Copyright (C) 2020 Intel Corporation
+#
+# SPDX-License-Identifier: MIT
+
+import cv2
+import numpy as np
+
+
+def mean_std(dataset):
+    """
+    Computes unbiased mean and std. dev. for dataset images, channel-wise.
+
+    dataset - sized iterable of items; every item must provide
+        'image.size' (its product is used as the pixel count) and
+        'image.data' (pixel array, 2-D or with channels as the last axis)
+    Returns a (mean, std) pair in the same value scale as the input pixels.
+
+    NOTE(review): the per-image buffers have 3 channel slots; verify how
+    cv2.meanStdDev fills them for single-channel images.
+    """
+    # Use an online algorithm to:
+    # - handle different image sizes
+    # - avoid cancellation problem
+
+    length = len(dataset)
+
+    # Per image: row 0 receives the channel means, row 1 receives the
+    # channel std. devs, which are converted to variances after the loop
+    stats = np.empty((length, 2, 3), dtype=np.double)
+    # 64-bit counters: the merged pixel count of a large dataset
+    # does not fit into 32 bits and would silently wrap around
+    counts = np.empty(length, dtype=np.uint64)
+
+    get_mean = lambda i, s: s[i][0]
+    get_var = lambda i, s: s[i][1]
+
+    for i, item in enumerate(dataset):
+        counts[i] = np.prod(item.image.size)
+
+        image = item.image.data
+        if len(image.shape) == 2:
+            image = image[:, :, np.newaxis]
+        else:
+            # keep at most the first 3 channels
+            image = image[:, :, :3]
+        # opencv is much faster than numpy here
+        # The results are written into the views of 'stats'
+        cv2.meanStdDev(image.astype(np.double) / 255,
+            mean=get_mean(i, stats), stddev=get_var(i, stats))
+
+    # make variance unbiased
+    # (square the std. dev. and apply the n / (n - 1) correction)
+    np.multiply(np.square(stats[:, 1]),
+        (counts / (counts - 1))[:, np.newaxis],
+        out=stats[:, 1])
+
+    _, mean, var = StatsCounter().compute_stats(stats, counts,
+        get_mean, get_var)
+    # Undo the 1 / 255 scaling applied before accumulation
+    return mean * 255, np.sqrt(var) * 255
+
+class StatsCounter:
+    """
+    Merges per-element sample statistics (count, mean, unbiased variance)
+    into the statistics of all the samples together.
+
+    Implements online parallel computation of sample variance
+    https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
+
+    Needed to avoid catastrophic cancellation in floating point computations
+    """
+
+    @staticmethod
+    def pairwise_stats(count_a, mean_a, var_a, count_b, mean_b, var_b):
+        """
+        Merges the statistics of two disjoint sample sets A and B.
+
+        count_* - number of samples in the set
+        mean_* - sample mean of the set (scalar or array)
+        var_* - unbiased sample variance of the set (scalar or array)
+        Returns (count, mean, unbiased variance) of the united set.
+        """
+        total = count_a + count_b
+        delta = mean_b - mean_a
+        # Sums of squared deviations from the own mean of each set
+        m_a = var_a * (count_a - 1)
+        m_b = var_b * (count_b - 1)
+        M2 = m_a + m_b + delta ** 2 * count_a * count_b / total
+        return (
+            total,
+            # The mean must be weighted by the sample counts,
+            # a plain average is only right for sets of equal size
+            mean_a + delta * (count_b / total),
+            M2 / (total - 1)
+        )
+
+    @staticmethod
+    def compute_stats(stats, counts, mean_accessor, variance_accessor):
+        """
+        Recursively computes total count, mean and variance of N elements.
+        The recursion depth is O(log(N)).
+
+        stats - sequence of N per-element records, which is only read
+            through the accessors
+        counts - sequence of N sample counts
+        mean_accessor - function(idx, stats) to retrieve element mean
+        variance_accessor - function(idx, stats) to retrieve
+            element variance
+        Returns (count, mean, variance) for all the elements together.
+        Raises ValueError if there are no elements.
+        """
+        m = mean_accessor
+        v = variance_accessor
+        n = len(stats)
+        if n == 0:
+            # Splitting an empty sequence would recurse endlessly
+            raise ValueError("Can't compute statistics of an empty sequence")
+        if n == 1:
+            return counts[0], m(0, stats), v(0, stats)
+        if n == 2:
+            return StatsCounter.pairwise_stats(
+                counts[0], m(0, stats), v(0, stats),
+                counts[1], m(1, stats), v(1, stats)
+                )
+        # Merge the halves pairwise to keep the operands of similar size
+        h = n // 2
+        return StatsCounter.pairwise_stats(
+            *StatsCounter.compute_stats(stats[:h], counts[:h], m, v),
+            *StatsCounter.compute_stats(stats[h:], counts[h:], m, v)
+            )
diff --git a/datumaro/tests/test_ops.py b/datumaro/tests/test_ops.py
@@ -0,0 +1,31 @@
+import numpy as np
+
+from datumaro.components.extractor import Extractor, DatasetItem
+from datumaro.components.operations import mean_std
+
+from unittest import TestCase
+
+
+class TestOperations(TestCase):
+    def test_mean_std(self):
+        """
+        Checks that mean_std recovers the channel-wise parameters of
+        normally distributed images of different sizes.
+        """
+        expected_mean = [100, 50, 150]
+        expected_std = [20, 50, 10]
+
+        # NOTE(review): all the images are drawn from the same distribution,
+        # so an incorrectly weighted merge of per-image means would not be
+        # detected here
+        class TestExtractor(Extractor):
+            def __iter__(self):
+                return iter([
+                    # every item gets its own id
+                    DatasetItem(id=i, image=np.random.normal(
+                        expected_mean, expected_std,
+                        size=(w, h, 3))
+                    )
+                    for i, (w, h) in enumerate([
+                        (3000, 100), (800, 600), (400, 200), (700, 300)
+                    ])
+                ])
+
+        actual_mean, actual_std = mean_std(TestExtractor())
+
+        # The inputs are random, so compare with the integer precision only
+        for em, am in zip(expected_mean, actual_mean):
+            self.assertAlmostEqual(em, am, places=0)
+        for estd, astd in zip(expected_std, actual_std):
+            self.assertAlmostEqual(estd, astd, places=0)