Merge pull request #147 from roskakori/105-add-option-to-merge-embedd…

…ed-languages #105 Add option to merge embedded languages
roskakori · May 12, 2024 · 8127879 · 8127879
2 parents d671461 + 4676492
commit 8127879
Show file tree

Hide file tree

Showing 8 changed files with 130 additions and 23 deletions.
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -43,7 +43,7 @@ jobs:
         if: ${{ matrix.python-version == env.MAIN_PYTHON_VERSION }}
         run: |
           poetry run sh scripts/build_documentation.sh
-      - name: Update coveralls
+      - name: Update coveralls statistics
         if: ${{ matrix.python-version == env.MAIN_PYTHON_VERSION }}
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

diff --git a/.idea/misc.xml b/.idea/misc.xml
diff --git a/docs/changes.rst b/docs/changes.rst
@@ -5,8 +5,11 @@ Changes
 
 This chapter describes the changes coming with each new version of pygount.
 
-Version 1.7.0, 2023-07-02
+Version 1.7.0, 2024-05-13
 
+* Add command line option ``--merge-embedded-languages`` to merge embedded
+  languages into their base language. For example, "HTML+Django/Jinja" counts
+  as "HTML" (issue `#105 <https://github.com/roskakori/pygount/issues/105>`_).
 * Add Python 3.12 and made it the main version for CI (issue
   `#145 <https://github.com/roskakori/pygount/issues/145>`_).
 

diff --git a/docs/usage.rst b/docs/usage.rst
@@ -73,6 +73,22 @@ to the data.
 For further processing the results of pygount, ``--format=json`` should be the
 easiest to deal with. For more information see :doc:`json`.
 
+.. option:: --merge-embedded-languages
+
+Some languages such as HTML or JavaScript allow embedding other languages in their source code. In that case, the source code is assigned to a language
+that contains both the base and the embedded language in its name, for example:
+
+- HTML+Jinja
+- JavaScript+Lasso
+
+If you prefer to count all variants of a base language only under its own name,
+specify ``--merge-embedded-languages``. The example above will then show as:
+
+- HTML
+- JavaScript
+
+Consequently, multiple different embedded languages will all count toward their
+common base language.
 
 Remote repositories
 -------------------

diff --git a/pygount/analysis.py b/pygount/analysis.py
@@ -14,7 +14,7 @@
 import re
 from enum import Enum
 from io import SEEK_CUR, BufferedIOBase, IOBase, RawIOBase, TextIOBase
-from typing import Dict, Iterator, List, Optional, Pattern, Sequence, Set, Tuple, Union
+from typing import Iterator, List, Optional, Pattern, Sequence, Set, Tuple, Union
 
 import pygments.lexer
 import pygments.lexers
@@ -48,6 +48,8 @@
 #: Pygments token type; we need to define our own type because pygments' ``_TokenType`` is internal.
 TokenType = type(pygments.token.Token)
 
+_BASE_LANGUAGE_REGEX = re.compile(r"^(?P<base_language>[^+]+)\+[^+].*$")
+
 
 class SourceState(Enum):
     """
@@ -113,7 +115,7 @@ class SourceState(Enum):
     "news",
     "readme",
     "thanks",
-    # Github community recommendations, see
+    # GitHub community recommendations, see
     # <https://docs.github.com/en/communities/setting-up-your-project-for-healthy-contributions>.
     # By now, in practice most projects use a suffix like "*.md" but some older ones
     # still might have such files without suffix.
@@ -246,12 +248,9 @@ def from_state(
     @staticmethod
     def _check_state_info(state: SourceState, state_info: Optional[str]):
         states_that_require_state_info = [SourceState.duplicate, SourceState.error, SourceState.generated]
-        assert (state in states_that_require_state_info) == (
-            state_info is not None
-        ), "state={} and state_info={} but state_info must be specified for the following states: {}".format(
-            state,
-            state_info,
-            states_that_require_state_info,
+        assert (state in states_that_require_state_info) == (state_info is not None), (
+            f"state={state} and state_info={state_info} "
+            f"but state_info must be specified for the following states: {states_that_require_state_info}"
         )
 
     @staticmethod
@@ -260,16 +259,17 @@ def from_file(
         group: str,
         encoding: str = "automatic",
         fallback_encoding: str = "cp1252",
-        generated_regexes=pygount.common.regexes_from(DEFAULT_GENERATED_PATTERNS_TEXT),
+        generated_regexes: Optional[List[Pattern]] = None,
         duplicate_pool: Optional[DuplicatePool] = None,
         file_handle: Optional[IOBase] = None,
+        merge_embedded_language: bool = False,
     ) -> "SourceAnalysis":
         """
         Factory method to create a :py:class:`SourceAnalysis` by analyzing
         the source code in ``source_path`` or the open file ``file_handle``.
 
         :param source_path: path to source code to analyze
-        :param group: name of a logical group the sourc code belongs to, e.g. a
+        :param group: name of a logical group the source code belongs to, e.g. a
           package.
         :param encoding: encoding according to :func:`encoding_for`
         :param fallback_encoding: fallback encoding according to
@@ -281,9 +281,11 @@ def from_file(
         :param file_handle: a file-like object, or ``None`` to read and open the file from
           ``source_path``. If the file is open in text mode, it must be opened with the correct
           encoding.
+        :param merge_embedded_language: If pygments detects a base and embedded language, the source
+          code counts towards the base language. For example: "JavaScript+Lasso" counts as
+          "JavaScript".
         """
         assert encoding is not None
-        assert generated_regexes is not None
 
         result = None
         lexer = None
@@ -323,8 +325,15 @@ def from_file(
             if result is None:
                 lexer = guess_lexer(source_path, source_code)
                 assert lexer is not None
-        if (result is None) and (len(generated_regexes) != 0):
-            number_line_and_regex = matching_number_line_and_regex(pygount.common.lines(source_code), generated_regexes)
+        actual_generated_regexes = (
+            generated_regexes
+            if generated_regexes is not None
+            else pygount.common.regexes_from(DEFAULT_GENERATED_PATTERNS_TEXT)
+        )
+        if (result is None) and (len(actual_generated_regexes) != 0):
+            number_line_and_regex = matching_number_line_and_regex(
+                pygount.common.lines(source_code), actual_generated_regexes
+            )
             if number_line_and_regex is not None:
                 number, _, regex = number_line_and_regex
                 message = f"line {number} matches {regex}"
@@ -333,7 +342,7 @@ def from_file(
         if result is None:
             assert lexer is not None
             assert source_code is not None
-            language = lexer.name
+            language = base_language(lexer.name) if merge_embedded_language else lexer.name
             if ("xml" in language.lower()) or (language == "Genshi"):
                 dialect = pygount.xmldialect.xml_dialect(source_path, source_code)
                 if dialect is not None:
@@ -452,7 +461,7 @@ def state_info(self) -> Optional[Union[str, Exception]]:
           the :py:attr:`path` is a duplicate of
         * :py:attr:`SourceState.error`: the :py:exc:`Exception` causing the
           error
-        * :py:attr:`SourceState.generated`: a human readable explanation why
+        * :py:attr:`SourceState.generated`: a human-readable explanation why
           the file is considered to be generated
         """
         return self._state_info
@@ -625,7 +634,7 @@ def source_paths(self) -> Iterator[str]:
 
 
 def matching_number_line_and_regex(
-    source_lines: Sequence[str], generated_regexes: Sequence[Pattern], max_line_count: int = 15
+    source_lines: Iterator[str], generated_regexes: Sequence[Pattern], max_line_count: int = 15
 ) -> Optional[Tuple[int, str, Pattern]]:
     """
     The first line and its number (starting with 0) in the source code that
@@ -661,7 +670,7 @@ def white_characters(language_id: str) -> str:
     return "(),:;[]{}"
 
 
-def white_code_words(language_id: str) -> Dict[str, List[str]]:
+def white_code_words(language_id: str) -> Set[str]:
     """
     Words that do not count as code if it is the only word in a line.
     """
@@ -683,7 +692,7 @@ def _delined_tokens(tokens: Sequence[Tuple[TokenType, str]]) -> Iterator[TokenTy
 
 def _pythonized_comments(tokens: Sequence[Tuple[TokenType, str]]) -> Iterator[TokenType]:
     """
-    Similar to tokens but converts strings after a colon (:) to comments.
+    Similar to tokens but converts strings after a colon (`:`) to comments.
     """
     is_after_colon = True
     for token_type, token_text in tokens:
@@ -890,7 +899,19 @@ def source_analysis(
     group,
     encoding="automatic",
     fallback_encoding="cp1252",
-    generated_regexes=pygount.common.regexes_from(DEFAULT_GENERATED_PATTERNS_TEXT),
+    generated_regexes: Optional[List[Pattern]] = None,
     duplicate_pool: Optional[DuplicatePool] = None,
 ):
-    return SourceAnalysis.from_file(source_path, group, encoding, fallback_encoding, generated_regexes, duplicate_pool)
+    actual_generated_regexes = (
+        generated_regexes
+        if generated_regexes is not None
+        else pygount.common.regexes_from(DEFAULT_GENERATED_PATTERNS_TEXT)
+    )
+    return SourceAnalysis.from_file(
+        source_path, group, encoding, fallback_encoding, actual_generated_regexes, duplicate_pool
+    )
+
+
+def base_language(language: str) -> str:
+    base_language_match = _BASE_LANGUAGE_REGEX.match(language)
+    return language if base_language_match is None else base_language_match.group("base_language")
diff --git a/pygount/command.py b/pygount/command.py
@@ -45,6 +45,9 @@
 _HELP_GENERATED = """comma separated list of regular expressions to detect
  generated code; default: %(default)s"""
 
+_HELP_MERGE_EMBEDDED_LANGUAGES = """merge counts for embedded languages into
+ their base language; for example, HTML+Jinja2 counts as HTML"""
+
 _HELP_FOLDERS_TO_SKIP = """comma separated list of glob patterns for folder
  names not to analyze. Use "..." as first entry to append patterns to the
  default patterns; default: %(default)s"""
@@ -102,6 +105,7 @@ def __init__(self):
         self._generated_regexs = pygount.common.regexes_from(pygount.analysis.DEFAULT_GENERATED_PATTERNS_TEXT)
         self._has_duplicates = False
         self._has_summary = False
+        self._has_to_merge_embedded_languages = False
         self._is_verbose = False
         self._names_to_skip = pygount.common.regexes_from(pygount.analysis.DEFAULT_NAME_PATTERNS_TO_SKIP_TEXT)
         self._output = _DEFAULT_OUTPUT
@@ -168,6 +172,13 @@ def has_duplicates(self):
     def set_has_duplicates(self, has_duplicates, source=None):
         self._has_duplicates = bool(has_duplicates)
 
+    @property
+    def has_to_merge_embedded_languages(self):
+        return self._has_to_merge_embedded_languages
+
+    def set_has_to_merge_embedded_languages(self, has_to_merge_embedded_languages, source=None):
+        self._has_to_merge_embedded_languages = bool(has_to_merge_embedded_languages)
+
     @property
     def is_verbose(self):
         return self._is_verbose
@@ -247,6 +258,12 @@ def argument_parser(self):
             default=pygount.analysis.DEFAULT_GENERATED_PATTERNS_TEXT,
             help=_HELP_GENERATED,
         )
+        parser.add_argument(
+            "--merge-embedded-languages",
+            "-m",
+            action="store_true",
+            help=_HELP_MERGE_EMBEDDED_LANGUAGES,
+        )
         parser.add_argument(
             "--names-to-skip",
             "-N",
@@ -313,6 +330,7 @@ def apply_arguments(self, arguments=None):
         self.set_folders_to_skip(args.folders_to_skip, "option --folders-to-skip")
         self.set_generated_regexps(args.generated, "option --generated")
         self.set_has_duplicates(args.duplicates, "option --duplicates")
+        self.set_has_to_merge_embedded_languages(args.merge_embedded_languages, "option --merge-embedded-languages")
         self.set_is_verbose(args.verbose, "option --verbose")
         self.set_names_to_skip(args.names_to_skip, "option --folders-to-skip")
         self.set_output(args.out, "option --out")
@@ -346,6 +364,7 @@ def execute(self):
                                     self.fallback_encoding,
                                     generated_regexes=self._generated_regexs,
                                     duplicate_pool=duplicate_pool,
+                                    merge_embedded_language=self.has_to_merge_embedded_languages,
                                 )
                             )
                     finally:

diff --git a/tests/test_analysis.py b/tests/test_analysis.py
@@ -15,7 +15,7 @@
 
 from pygount import Error as PygountError
 from pygount import analysis, common
-from pygount.analysis import guess_lexer
+from pygount.analysis import base_language, guess_lexer
 
 from ._common import PYGOUNT_PROJECT_FOLDER, PYGOUNT_SOURCE_FOLDER, TempFolderTest
 from .test_xmldialect import EXAMPLE_ANT_CODE
@@ -230,6 +230,26 @@ def test_can_analyze_bytesio(self):
         assert source_analysis.language == "Python"
         assert source_analysis.code_count == 2
 
+    def test_can_analyze_embedded_language(self):
+        test_html_django_path = self.create_temp_file(
+            "some.html",
+            ["<!DOCTYPE html>", "{% load i18n %}", '<html lang="{{ language_code }}" />'],
+        )
+        source_analysis = analysis.SourceAnalysis.from_file(test_html_django_path, "test", encoding="utf-8")
+        assert source_analysis.language.lower() == "html+django/jinja"
+        assert source_analysis.code_count == 3
+
+    def test_can_merge_embedded_language(self):
+        test_html_django_path = self.create_temp_file(
+            "some.html",
+            ["<!DOCTYPE html>", "{% load i18n %}", '<html lang="{{ language_code }}" />'],
+        )
+        source_analysis = analysis.SourceAnalysis.from_file(
+            test_html_django_path, "test", encoding="utf-8", merge_embedded_language=True
+        )
+        assert source_analysis.language.lower() == "html"
+        assert source_analysis.code_count == 3
+
     def test_fails_on_non_seekable_file_handle_with_encoding_automatic(self):
         file_handle = _NonSeekableEmptyBytesIO()
 
@@ -474,6 +494,15 @@ def test_can_match_deprecated_functions():
     )
 
 
+def test_can_compute_base_language():
+    assert base_language("JavaScript") == "JavaScript"
+    assert base_language("JavaScript+Lasso") == "JavaScript"
+    assert base_language("JavaScript+") == "JavaScript+"  # no actual language
+    assert base_language("C++") == "C++"
+    assert base_language("++C") == "++C"  # no actual language
+    assert base_language("") == ""  # no actual language, but should not crash either
+
+
 class DuplicatePoolTest(TempFolderTest):
     def test_can_distinguish_different_files(self):
         some_path = self.create_temp_file(__name__ + "_some", "some")

diff --git a/tests/test_command.py b/tests/test_command.py
@@ -211,3 +211,19 @@ def test_can_write_all_output_formats(self):
         for output_format in VALID_OUTPUT_FORMATS:
             exit_code = command.pygount_command(["--format", output_format, PYGOUNT_SOURCE_FOLDER])
             self.assertEqual(exit_code, 0)
+
+    def test_can_merge_embedded_languages(self):
+        test_html_django_path = self.create_temp_file(
+            "some.html",
+            ["<!DOCTYPE html>", "{% load i18n %}", '<html lang="{{ language_code }}" />'],
+        )
+        cloc_xml_path = os.path.join(self.tests_temp_folder, "cloc.xml")
+        exit_code = command.pygount_command(
+            ["--merge-embedded-languages", "--format", "cloc-xml", "--out", cloc_xml_path, test_html_django_path]
+        )
+        assert exit_code == 0
+        assert os.path.exists(cloc_xml_path)
+        cloc_xml_root = ElementTree.parse(cloc_xml_path)
+        file_elements = cloc_xml_root.findall("files/file[@language='HTML']")
+        assert file_elements is not None
+        assert len(file_elements) == 1