diff --git a/docs/background.rst b/docs/background.rst index 93b0c4d..46708a6 100644 --- a/docs/background.rst +++ b/docs/background.rst @@ -45,6 +45,21 @@ As example consider this Python code: This counts as 1 line of code and 3 lines of comments. The line with ``pass`` is considered a "no operation" and thus not taken into account. +.. _binary: + +Binary files +------------ + +A file is considered to be binary when all of the following conditions +match: + +1. The file does not start with a BOM for UTF-8, UTF-16 or UTF-32 (which + indicates text files). +2. The initial 8192 bytes contain at least one 0 byte. + +In this case, pygount assigns it the pseudo language ``__binary__`` and +performs no further analysis. + Comparison with other tools ----------------------------------- diff --git a/docs/changes.rst b/docs/changes.rst index c08317d..938ef25 100644 --- a/docs/changes.rst +++ b/docs/changes.rst @@ -7,6 +7,9 @@ This chapter describes the changes coming with each new version of pygount. Version 1.3.0, 2022-xx-xx +* Added JSON as additional output :option:`--format`, see :doc:`json` for + details (issue `#62 <https://github.com/roskakori/pygount/issues/62>`_). + * Changed build process to `poetry <https://python-poetry.org/>`_ to change several messy configuration files into a single even more messy configuration file. diff --git a/docs/index.rst b/docs/index.rst index 5293b68..692f0d7 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -22,6 +22,7 @@ code is available from https://github.com/roskakori/pygount. installation usage continuous-integration + json background api contributing diff --git a/docs/json.rst b/docs/json.rst new file mode 100644 index 0000000..9fb5635 --- /dev/null +++ b/docs/json.rst @@ -0,0 +1,119 @@ +JSON +==== + +.. program:: pygount + +The JavaScript Object Notation (JSON) is widely used to interchange data. +Running pygount with :option:`--format` "json" is a simple way to provide +the results of an analysis for further processing. 
+ + +General format +-------------- + +The general structure of the resulting JSON is: + +.. code-block:: JavaScript + + { + "formatVersion": "1.0.0", + "pygountVersion": "1.3.0", + "files": [...], + "languages": [...], + "runtime": {...}, + "summary": {...} + } + +The naming of the entries deliberately uses camel case to conform to the +`JSLint <https://www.jslint.com/>`_ guidelines. + +Both ``formatVersion`` and ``pygountVersion`` use +`semantic versioning <https://semver.org/>`_. The other entries contain the following information: + +With ``files`` you can access a list of files analyzed, for example: + +.. code-block:: JavaScript + + { + "path": "/Users/someone/workspace/pygount/pygount/write.py", + "sourceCount": 253, + "emptyCount": 60, + "documentationCount": 27, + "group": "pygount", + "isCountable": true, + "language": "Python", + "state": "analyzed", + "stateInfo": null + } + +Here, ``sourceCount`` is the number of source lines of code (SLOC), +``documentationCount`` the number of lines containing comments and +``emptyCount`` the number of empty lines (which includes "no operation" +lines). + +The ``state`` can have one of the following values: + +* analyzed: successfully analyzed +* binary: the file is a :ref:`binary file <binary>` +* duplicate: the file is a :ref:`duplicate <duplicates>` of another +* empty: the file is empty (file size = 0) +* error: the source could not be parsed; in this case, ``stateInfo`` + contains a message with more details +* generated: the file has been generated as specified with :option:`--generated` +* unknown: pygments does not offer any lexer to analyze the file + +In ``languages`` the summary for each language is available, for example: + +.. code-block:: JavaScript + + { + "documentationCount": 406, + "emptyCount": 631, + "fileCount": 18, + "isPseudoLanguage": false, + "language": "Python", + "sourceCount": 2332 + } + +In ``summary`` the total counts across the whole project can be accessed, for +example: + +.. 
code-block:: JavaScript + + "summary": { + "totalDocumentationCount": 410, + "totalEmptyCount": 869, + "totalFileCount": 32, + "totalSourceCount": 2930 + } + +The ``runtime`` entry collects general information about how well pygount performed +in collecting the information, for example: + +.. code-block:: JavaScript + + "runtime": { + "durationInSeconds": 0.712625, + "filesPerSecond": 44.904402736362044, + "finishedAt": "2022-01-05T11:49:27.009310", + "linesPerSecond": 5906.332222417121, + "startedAt": "2022-01-05T11:49:26.296685" + } + + +Pretty printing +--------------- + +Because the output is concise and consequently mostly illegible for a +human reader, you might want to pipe it through a pretty printer. As you +already have python installed, the easiest way is: + +.. code-block:: sh + + pygount --format json | python -m json.tool + +Another alternative would be `jq <https://jqlang.github.io/jq/>`_: + +.. code-block:: sh + + pygount --format json | jq . diff --git a/docs/usage.rst b/docs/usage.rst index 689d84d..2a5a053 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -36,6 +36,12 @@ To limit the analysis on certain file types, you can specify a comma separated list of suffixes to take into account, for example ``--suffix=py,sql,xml``. .. option:: --out FILE + +By default the results of the analysis are written to the standard output. To +redirect the output to a file, use for example ``--out=counts.txt``. + +To explicitly redirect to the standard output specify ``--out=STDOUT``. + .. option:: --format FORMAT By default the result of the analysis are written to the standard output in a @@ -64,6 +70,9 @@ overview and a sum total. For example pygount's summary looks like this:: The summary output is designed for human readers and the column widths adjust to the data. +To further process the results of pygount, ``--format=json`` should be the +easiest to deal with. For more information see :doc:`json`. 
+ Patterns -------- @@ -90,6 +99,8 @@ So for example to specify that generated code can also contain the German word ``--generated="[regex][...](?i).*generiert"``. +.. _duplicates: + Counting duplicates ------------------- @@ -154,16 +165,13 @@ Pseudo languages If a source code is not counted, the number of lines is 0 and the language shown is a pseudo language indicating the reason: -* ``__binary__`` - the source code is a binary file; the detection of binary files - first ensures that file does not start with a BOM for UTF-8, UTF-16 or - UTF-32 (which indicates text files). After that it checks for zero bytes - within the initial 8192 bytes of the file. +* ``__binary__`` - used for :ref:`binary`. * ``__duplicate__`` - the source code duplicate as described at the command line option :option:`--duplicates`. * ``__empty__`` - the source code is an empty file with a size of 0 bytes. * ``__error__`` - the source code could not be parsed e.g. due to an I/O error. * ``__generated__`` - the source code is generated according to the command line - option ``--generated``. + option :option:`--generated`. * ``__unknown__`` - pygments does not provide a lexer to parse the source code. diff --git a/pygount/analysis.py b/pygount/analysis.py index 7870c14..6e1dbae 100644 --- a/pygount/analysis.py +++ b/pygount/analysis.py @@ -391,6 +391,11 @@ def string_count(self) -> int: """number of lines containing only strings but no other code""" return self._string + @property + def source_count(self) -> int: + """number of source lines of code (the sum of code_count and string_count)""" + return self.code_count + self.string_count + @property def code(self) -> int: # TODO #47: Remove deprecated property. diff --git a/pygount/command.py b/pygount/command.py index 12de360..8bb71b4 100644 --- a/pygount/command.py +++ b/pygount/command.py @@ -14,7 +14,7 @@ import pygount.write #: Valid formats for option --format. 
-VALID_OUTPUT_FORMATS = ("cloc-xml", "sloccount", "summary") +VALID_OUTPUT_FORMATS = ("cloc-xml", "json", "sloccount", "summary") _DEFAULT_ENCODING = "automatic" _DEFAULT_OUTPUT_FORMAT = "sloccount" @@ -55,6 +55,7 @@ _OUTPUT_FORMAT_TO_WRITER_CLASS_MAP = { "cloc-xml": pygount.write.ClocXmlWriter, + "json": pygount.write.JsonWriter, "sloccount": pygount.write.LineWriter, "summary": pygount.write.SummaryWriter, } diff --git a/pygount/summary.py b/pygount/summary.py index 0fae10d..783b60a 100644 --- a/pygount/summary.py +++ b/pygount/summary.py @@ -57,6 +57,11 @@ def string_count(self) -> int: """sum number of lines containing only strings for this language""" return self._string_count + @property + def source_count(self) -> int: + """sum number of source lines of code""" + return self.code_count + self.string_count + @property def is_pseudo_language(self) -> bool: """``True`` if the language is not a real programming language""" @@ -127,14 +132,16 @@ def total_documentation_count(self) -> int: @property def total_empty_count(self) -> int: - return self._total_empty_count @property def total_string_count(self) -> int: - return self._total_string_count + @property + def total_source_count(self) -> int: + return self.total_code_count + self.total_string_count + @property def total_file_count(self) -> int: return self._total_file_count diff --git a/pygount/write.py b/pygount/write.py index 599120a..7d30556 100644 --- a/pygount/write.py +++ b/pygount/write.py @@ -4,12 +4,21 @@ # Copyright (c) 2016-2022, Thomas Aglassinger. # All rights reserved. Distributed under the BSD License. import datetime +import json import math import os from xml.etree import ElementTree +import pygount + +from . import SourceAnalysis from .summary import ProjectSummary +#: Version of cloc the --format=cloc-xml pretends to be. 
+CLOC_VERSION = "1.60" + +JSON_FORMAT_VERSION = "1.0.0" + class BaseWriter: def __init__(self, target_stream): @@ -19,12 +28,12 @@ def __init__(self, target_stream): except AttributeError: self.target_name = "" self.project_summary = ProjectSummary() - self.start_time = datetime.datetime.now() - self.end_time = None - self.sources_per_second = 0 + self.started_at = datetime.datetime.utcnow() + self.finished_at = None + self.files_per_second = 0 self.lines_per_second = 0 self.duration = None - self.duration_in_seconds = None + self.duration_in_seconds = 0.0 def __enter__(self): return self @@ -36,14 +45,14 @@ def add(self, source_analysis): self.project_summary.add(source_analysis) def close(self): - self.end_time = datetime.datetime.now() - self.duration = self.end_time - self.start_time + self.finished_at = datetime.datetime.utcnow() + self.duration = self.finished_at - self.started_at self.duration_in_seconds = ( (1e-6 * self.duration.microseconds) + self.duration.seconds + self.duration.days * 3600 * 24 ) if self.duration_in_seconds > 0: self.lines_per_second = self.project_summary.total_file_count / self.duration_in_seconds - self.sources_per_second = self.project_summary.total_file_count / self.duration_in_seconds + self.files_per_second = self.project_summary.total_file_count / self.duration_in_seconds class LineWriter(BaseWriter): @@ -75,7 +84,7 @@ def __init__(self, target_stream): self._results_element = ElementTree.Element("results") self._header_element = ElementTree.SubElement(self._results_element, "header") ElementTree.SubElement(self._header_element, "cloc_url", text="https://github.com/roskakori/pygount") - ElementTree.SubElement(self._header_element, "cloc_version", text="1.60") + ElementTree.SubElement(self._header_element, "cloc_version", text=CLOC_VERSION) self._files_element = ElementTree.SubElement(self._results_element, "files") def __exit__(self, exc_type, exc_val, exc_tb): @@ -83,11 +92,11 @@ def __exit__(self, exc_type, exc_val, exc_tb): # 
Only write the XML if everything works out. self.close() - def add(self, source_analysis): + def add(self, source_analysis: SourceAnalysis): super().add(source_analysis) file_attributes = { "blank": str(source_analysis.empty_count), - "code": str(source_analysis.code_count + source_analysis.string_count), + "code": str(source_analysis.source_count), "comment": str(source_analysis.documentation_count), "language": source_analysis.language, "name": source_analysis.path, @@ -100,7 +109,7 @@ def close(self): ElementTree.SubElement(self._header_element, "elapsed_seconds", text="%.f" % self.duration_in_seconds) ElementTree.SubElement(self._header_element, "n_files", text="%d" % self.project_summary.total_file_count) ElementTree.SubElement(self._header_element, "n_lines", text="%d" % self.project_summary.total_line_count) - ElementTree.SubElement(self._header_element, "files_per_second", text="%.f" % self.sources_per_second) + ElementTree.SubElement(self._header_element, "files_per_second", text="%.f" % self.files_per_second) ElementTree.SubElement(self._header_element, "lines_per_second", text="%.f" % self.lines_per_second) ElementTree.SubElement(self._header_element, "report_file", text=self.target_name) @@ -269,6 +278,66 @@ def close(self): self._target_stream.write(summary_footer + os.linesep) +class JsonWriter(BaseWriter): + """ + Writer JSON output, ideal for further automatic processing. 
+ """ + + def __init__(self, target_stream): + super().__init__(target_stream) + self.source_analyses = [] + + def add(self, source_analysis: SourceAnalysis): + super().add(source_analysis) + self.source_analyses.append( + { + "path": source_analysis.path, + "sourceCount": source_analysis.source_count, + "emptyCount": source_analysis.empty_count, + "documentationCount": source_analysis.documentation_count, + "group": source_analysis.group, + "isCountable": source_analysis.is_countable, + "language": source_analysis.language, + "state": source_analysis.state.name, + "stateInfo": source_analysis.state_info, + } + ) + + def close(self): + # NOTE: We are using camel case for naming here to follow JSLint's guidelines, see . + super().close() + json_map = { + "formatVersion": JSON_FORMAT_VERSION, + "pygountVersion": pygount.__version__, + "files": self.source_analyses, + "languages": [ + { + "documentationCount": language_summary.documentation_count, + "emptyCount": language_summary.empty_count, + "fileCount": language_summary.file_count, + "isPseudoLanguage": language_summary.is_pseudo_language, + "language": language_summary.language, + "sourceCount": language_summary.source_count, + } + for language_summary in self.project_summary.language_to_language_summary_map.values() + ], + "runtime": { + "durationInSeconds": self.duration_in_seconds, + "filesPerSecond": self.files_per_second, + "finishedAt": self.finished_at.isoformat(), + "linesPerSecond": self.lines_per_second, + "startedAt": self.started_at.isoformat(), + }, + "summary": { + "totalDocumentationCount": self.project_summary.total_documentation_count, + "totalEmptyCount": self.project_summary.total_empty_count, + "totalFileCount": self.project_summary.total_file_count, + "totalSourceCount": self.project_summary.total_source_count, + }, + } + json.dump(json_map, self._target_stream) + + def digit_width(line_count): assert line_count >= 0 return math.ceil(math.log10(line_count + 1)) if line_count != 0 else 1 diff 
--git a/tests/test_command.py b/tests/test_command.py index 17d7265..8fa684f 100644 --- a/tests/test_command.py +++ b/tests/test_command.py @@ -3,15 +3,18 @@ """ # Copyright (c) 2016-2022, Thomas Aglassinger. # All rights reserved. Distributed under the BSD License. +import json import os import tempfile from xml.etree import ElementTree import pytest +import pygount from pygount import command from pygount.command import VALID_OUTPUT_FORMATS, Command from pygount.common import OptionError +from pygount.write import JSON_FORMAT_VERSION from ._common import PYGOUNT_PROJECT_FOLDER, PYGOUNT_SOURCE_FOLDER, TempFolderTest @@ -149,6 +152,22 @@ def test_can_analyze_pygount_source_code_as_cloc_xml(self): assert file_elements is not None assert len(file_elements) >= 1 + def test_can_analyze_pygount_source_code_as_json(self): + pygount_json_path = os.path.join(self.tests_temp_folder, "pygount.json") + exit_code = command.pygount_command( + ["--verbose", "--format", "json", "--out", pygount_json_path, PYGOUNT_SOURCE_FOLDER] + ) + assert exit_code == 0 + assert os.path.exists(pygount_json_path) + with open(pygount_json_path, encoding="utf-8") as pygount_json_file: + json_map = json.load(pygount_json_file) + assert json_map.get("pygountVersion") == pygount.__version__ + assert json_map.get("formatVersion") == JSON_FORMAT_VERSION + assert "files" in json_map + assert "languages" in json_map + assert "runtime" in json_map + assert "summary" in json_map + def test_can_detect_duplicates(self): source_code = "# Duplicate source\nprint('duplicate code')\n" original_path = os.path.join(self.tests_temp_folder, "original.py")