diff --git a/docs/_quarto.yml b/docs/_quarto.yml index edd83adf4..62b591348 100644 --- a/docs/_quarto.yml +++ b/docs/_quarto.yml @@ -113,6 +113,7 @@ quartodoc: - GT.fmt_time - GT.fmt_datetime - GT.fmt_markdown + - GT.fmt_units - GT.fmt_image - GT.fmt_nanoplot - GT.fmt @@ -158,6 +159,7 @@ quartodoc: - html - from_column - system_fonts + - define_units - nanoplot_options - title: Table options desc: > diff --git a/docs/styles.css b/docs/styles.css index c5264e6b3..de81a3547 100644 --- a/docs/styles.css +++ b/docs/styles.css @@ -56,6 +56,14 @@ p,h1,h2,h3,#toc-title,#toc-function-reference,.nav-link,.table { content: "()" } +[id^=table-options] td a:after { + content: "()" +} + +[id^=export] td a:after { + content: "()" +} + [id^=value-formatting] td a:after { content: "()" } diff --git a/great_tables/__init__.py b/great_tables/__init__.py index d8ef0f2a8..0ce2e57d7 100644 --- a/great_tables/__init__.py +++ b/great_tables/__init__.py @@ -12,7 +12,18 @@ from . import loc from . import style from ._styles import FromColumn as from_column -from ._helpers import letters, LETTERS, px, pct, md, html, random_id, system_fonts, nanoplot_options +from ._helpers import ( + letters, + LETTERS, + px, + pct, + md, + html, + random_id, + system_fonts, + define_units, + nanoplot_options, +) __all__ = ( @@ -25,6 +36,7 @@ "md", "html", "system_fonts", + "define_units", "nanoplot_options", "random_id", "from_column", diff --git a/great_tables/_formats.py b/great_tables/_formats.py index 87850aa7f..56fc80907 100644 --- a/great_tables/_formats.py +++ b/great_tables/_formats.py @@ -2114,6 +2114,163 @@ def fmt_markdown_fn(x: Any) -> str: return fmt(self, fns=fmt_markdown_fn, columns=columns, rows=rows) +def fmt_units( + self: GTSelf, + columns: SelectExpr = None, + rows: int | list[int] | None = None, + pattern: str = "{x}", +) -> GTSelf: + """ + Format measurement units. + + The `fmt_units()` method lets you better format measurement units in the table body. 
These must + conform to the **Great Tables** *units notation*; as an example of this, `"J Hz^-1 mol^-1"` can + be used to generate units for the *molar Planck constant*. The notation here provides several + conveniences for defining units, so as long as the values to be formatted conform to this + syntax, you'll obtain nicely-formatted inline units. Details pertaining to *units notation* can + be found in the section entitled *How to use units notation*. + + Parameters + ---------- + columns + The columns to target. Can either be a single column name or a series of column names + provided in a list. + rows + In conjunction with `columns=`, we can specify which of their rows should undergo + formatting. The default is all rows, resulting in all rows in targeted columns being + formatted. Alternatively, we can supply a list of row indices. + pattern + A formatting pattern that allows for decoration of the formatted value. The formatted value + is represented by the `{x}` (which can be used multiple times, if needed) and all other + characters will be interpreted as string literals. + + How to use units notation + ------------------------- + The **Great Tables** units notation involves a shorthand of writing units that feels familiar + and is fine-tuned for the task at hand. Each unit is treated as a separate entity (parentheses + and other symbols included) and the addition of subscript text and exponents is flexible and + relatively easy to formulate. 
This is all best shown with examples: + + - `"m/s"` and `"m / s"` both render as `"m/s"` + - `"m s^-1"` will appear with the `"-1"` exponent intact + - `"m /s"` gives the same result, as `"/"` is equivalent to `"^-1"` + - `"E_h"` will render an `"E"` with the `"h"` subscript + - `"t_i^2.5"` provides a `t` with an `"i"` subscript and a `"2.5"` exponent + - `"m[_0^2]"` will use overstriking to set both scripts vertically + - `"g/L %C6H12O6%"` uses a chemical formula (enclosed in a pair of `"%"` characters) as a unit + partial, and the formula will render correctly with subscripted numbers + - Common units that are difficult to write using ASCII text may be implicitly converted to the + correct characters (e.g., the `"u"` in `"ug"`, `"um"`, `"uL"`, and `"umol"` will be converted to + the Greek *mu* symbol; `"degC"` and `"degF"` will render a degree sign before the temperature + unit) + - We can transform shorthand symbol/unit names enclosed in `":"` (e.g., `":angstrom:"`, + `":ohm:"`, etc.) into proper symbols + - Greek letters can be added by enclosing a letter name in `":"`; you can use lowercase letters + (e.g., `":beta:"`, `":sigma:"`, etc.) and uppercase letters too (e.g., `":Alpha:"`, `":Zeta:"`, + etc.) + - The components of a unit (unit name, subscript, and exponent) can be fully or partially + italicized/emboldened by surrounding text with `"*"` or `"**"` + + Returns + ------- + GT + The GT object is returned. This is the same object that the method is called on so that we + can facilitate method chaining. + + Examples + -------- + Let's use the `illness` dataset and create a new table. The `units` column happens to contain + string values in *units notation* (e.g., `"x10^9 / L"`). Using the `fmt_units()` method here + will improve the formatting of those measurement units. 
+ + ```{python} + from great_tables import GT, style, loc + from great_tables.data import illness + + ( + GT(illness, rowname_col="test") + .fmt_units(columns="units") + .fmt_number(columns=lambda x: x.startswith("day"), decimals=2, drop_trailing_zeros=True) + .tab_header(title="Laboratory Findings for the YF Patient") + .tab_spanner(label="Day", columns=lambda x: x.startswith("day")) + .tab_spanner(label="Normal Range", columns=lambda x: x.startswith("norm")) + .cols_label( + norm_l="Lower", + norm_u="Upper", + units="Units" + ) + .opt_vertical_padding(scale=0.4) + .opt_align_table_header(align="left") + .tab_options(heading_padding="10px") + .tab_style( + locations=loc.body(columns="norm_l"), + style=style.borders(sides="left") + ) + .opt_vertical_padding(scale=0.5) + ) + ``` + + The `constants` dataset contains values for hundreds of fundamental physical constants. We'll + take a subset of values that have some molar basis and generate a new display table from that. + Like the `illness` dataset, this one has a `units` column so, again, the `fmt_units()` method + will be used to format those units. Here, the preference for typesetting measurement units is to + have positive and negative exponents (e.g., not `" / "` but rather + `" ^-1"`). 
+ + ```{python} + from great_tables.data import constants + import polars as pl + import polars.selectors as cs + + constants_mini = ( + pl.from_pandas(constants) + .filter(pl.col("name").str.contains("molar")).sort("value") + .with_columns( + name=pl.col("name") + .str.to_titlecase() + .str.replace("Kpa", "kpa") + .str.replace("Of", "of") + ) + ) + + ( + GT(constants_mini) + .cols_hide(columns=["uncert", "sf_value", "sf_uncert"]) + .fmt_units(columns="units") + .fmt_scientific(columns="value", decimals=3) + .tab_header(title="Physical Constants Having a Molar Basis") + .tab_options(column_labels_hidden=True) + ) + ``` + + See Also + -------- + The [`define_units()`](`great_tables.define_units`) function can be used as a standalone utility + for working with units notation. It can parses strings in *units notation* and can emit + formatted units with its `.to_html()` method. + """ + + def fmt_units_fn( + x: str, + pattern: str = pattern, + ): + # If the `x` value is a missing value, then return the same value + if is_na(self._tbl_data, x): + return x + + from great_tables._helpers import define_units + + x_formatted = define_units(x).to_html() + + # Use a supplied pattern specification to decorate the formatted value + if pattern != "{x}": + x_formatted = pattern.replace("{x}", x_formatted) + + return x_formatted + + return fmt(self, fns=fmt_units_fn, columns=columns, rows=rows) + + def _value_to_decimal_notation( value: int | float, decimals: int = 2, @@ -3276,12 +3433,12 @@ def fmt_image( Examples -------- - Using a small portion of [`metro`] dataset, let's create a **gt** table. We will only include a - few columns and rows from that table. The `lines` column has comma-separated listings of numbers + Using a small portion of `metro` dataset, let's create a new table. We will only include a few + columns and rows from that table. The `lines` column has comma-separated listings of numbers corresponding to lines served at each station. 
We have a directory of SVG graphics for all of these lines in the package (the path for the image directory can be accessed via `files("great_tables") / "data/metro_images"`, using the `importlib_resources` package). The - filenames roughly corresponds to the data in the `lines` column. The `fmt_image()` function can + filenames roughly corresponds to the data in the `lines` column. The `fmt_image()` method can be used with these inputs since the `path=` and `file_pattern=` arguments allow us to compose complete and valid file locations. What you get from this are sequences of images in the table cells, taken from the referenced graphics files on disk. diff --git a/great_tables/_helpers.py b/great_tables/_helpers.py index 9922db24b..8d62b4432 100644 --- a/great_tables/_helpers.py +++ b/great_tables/_helpers.py @@ -1,5 +1,6 @@ from __future__ import annotations + import random import string from typing import Any, Callable, Literal @@ -8,6 +9,11 @@ from ._text import Text +import re +from dataclasses import dataclass + +from great_tables._text import _md_html + FontStackName: TypeAlias = Literal[ "system-ui", "transitional", @@ -518,6 +524,501 @@ def _get_font_stack(name: FontStackName = "system-ui", add_emoji: bool = True) - return font_stack +def _generate_tokens_list(units_notation: str) -> list[str]: + + # Remove any surrounding double braces before splitting the string into a list of tokens + tokens_list = re.split(r"\s+", re.sub(r"^\{\{\s*|\s*\}\}$", "", units_notation)) + + # Remove any empty tokens (i.e., `None` or `""`) + tokens_list = [token for token in tokens_list if token != "" and token is not None] + + # Replace any instances of `/` with `^-1` + tokens_list = [ + re.sub(r"^/", "", x) + "^-1" if re.match(r"^/", x) and len(x) > 1 else x + for x in tokens_list + ] + + return tokens_list + + +@dataclass +class UnitDefinition: + token: str + unit: str + unit_subscript: str | None = None + exponent: str | None = None + sub_super_overstrike: bool = False + 
chemical_formula: bool = False + built: str | None = None + + @classmethod + def from_token(cls, token: str) -> UnitDefinition: + + unit_subscript = None + sub_super_overstrike = False + chemical_formula = False + exponent = None + + # Case: Chemical formula + # * e.g. "%C6H12O6%", where the '%' characters are used to denote a chemical formula + if re.match(r"^%.*%$", token) and len(token) > 2: + + chemical_formula = True + + # Extract the formula w/o the surrounding `%` signs + unit = re.sub(r"^%|%$", "", token) + + # Case: Subscript and exponent present inside square brackets, so overstriking required + # * e.g., 'm_[0^3]' + elif re.search(r".+?\[_.+?\^.+?\]", token): + + sub_super_overstrike = True + + # Extract the unit w/o subscript from the string + unit = re.sub(r"(.+?)\[_.+?\^.+?\]", r"\1", token) + + # Obtain only the subscript/exponent of the string + sub_exponent = re.sub(r".+?\[(_.+?\^.+?)\]", r"\1", token) + + # Extract the content after the underscore + unit_subscript = re.sub(r"^_(.+?)(\^.+?)$", r"\1", sub_exponent) + + # Extract the content after the caret + exponent = re.sub(r"_.+?\^(.+?)", r"\1", sub_exponent) + + # Case: Subscript and exponent present (overstriking is *not* required here) + # * e.g., 'm_2^3' + elif re.search(r".+?_.+?\^.+?", token): + + # Extract the unit w/o subscript from the string + unit = re.sub(r"^(.+?)_.+?\^.+?$", r"\1", token) + + # Obtain only the subscript/exponent portion of the string + sub_exponent = re.sub(r".+?(_.+?\^.+?)$", r"\1", token) + + # Extract the content after the underscore + unit_subscript = re.sub(r"^_(.+?)\^.+?$", r"\1", sub_exponent) + + # Extract the content after the caret + exponent = re.sub(r"^_.+?\^(.+?)$", r"\1", sub_exponent) + + # Case: Only an exponent is present + # * the previous cases handled the presence of a subscript and exponent, but this case + # only handles the presence of an exponent (indicated by the '^' character anywhere + # in the string) + # * e.g., 'm^2' + elif 
re.search(r"\^", token): + + # Extract the unit w/o exponent from the string + unit = re.sub(r"^(.+?)\^.+?$", r"\1", token) + + # Obtain only the exponent portion of the string + exponent = re.sub(r"^.+?\^(.+?)$", r"\1", token) + + # Case: Only a subscript is present + # * this case handles the presence of a single subscript (indicated by the '_' character + # anywhere in the string) + # * e.g., 'm_2' + elif re.search(r"_", token): + + # Extract the unit w/o subscript from the string + unit = re.sub(r"^(.+?)_.+?$", r"\1", token) + + # Obtain only the subscript portion of the string + unit_subscript = re.sub(r"^.+?_(.+?)$", r"\1", token) + else: + unit = token + + return cls(token, unit, unit_subscript, exponent, sub_super_overstrike, chemical_formula) + + def to_html(self): + units_str = "" + + units_object = self + + # Perform formatting of of the unit: + # * The `unit` attribute is the main part of the unit (e.g., 'm' in 'm^2') + # * The `unit` component should never be `None` + # * We take a simpler approach to formatting the unit when it only contains + # a single character (no use of `_units_symbol_replacements()` here) + if len(units_object.unit) > 1: + unit = _md_html( + _escape_html_tags( + _units_symbol_replacements(text=units_object.unit.replace("-", "−")) + ) + ) + + else: + unit = _md_html(units_object.unit.replace("-", "−")) + + # In the special case where the unit is 'x10', we replace the 'x' with a + # multiplication symbol: + # * This isn't done unit is a chemical formula since it's not necessary + # * This is practical for having scalar multipliers mixed in with units and typically + # this is raised to a power (e.g., 'x10^6') and often placed before the inline units + if "x10" in unit and not units_object.chemical_formula: + unit = unit.replace("x", "×") + + # Perform formatting of the exponent: + # * The `exponent` attribute is the exponent part of the unit (e.g., '2' in 'm^2') + # * The `exponent` component can be `None` if the unit does not 
have an exponent + # * When the `exponent` component is a string of length greater than 2, we also use + # `_units_symbol_replacements()` function to format the exponent) + if units_object.exponent is None: + exponent = None + + elif len(units_object.exponent) > 2: + exponent = _units_to_superscript( + _md_html( + _escape_html_tags( + _units_symbol_replacements( + text=units_object.exponent.replace("-", "−") + ) + ) + ) + ) + + else: + exponent = _units_to_superscript(content=units_object.exponent.replace("-", "−")) + + # Perform formatting of the unit subscript: + # * The `unit_subscript` attribute is the subscript part of the unit (e.g., '2' in + # 'm_2') + # * The `unit_subscript` component can be `None` if the unit does not have a subscript + # * When the `unit_subscript` component is a string of length greater than 2, we also + # use `_units_symbol_replacements()` function to format the subscript) + if units_object.unit_subscript is None: + unit_subscript = None + + elif len(units_object.unit_subscript) > 2: + unit_subscript = _units_to_subscript( + _md_html( + _escape_html_tags( + _units_symbol_replacements( + text=units_object.unit_subscript.replace("-", "−") + ) + ) + ) + ) + + else: + unit_subscript = _units_to_subscript( + content=units_object.unit_subscript.replace("-", "−") + ) + + units_str += unit + + # In the special case where the subscript and exponents are present and overstriking + # is required, we use the `_units_html_sub_super()` function to format the subscript + # and exponent: + # * The subscript and exponent are placed on top of each other, with left alignment + # * This bypasses the earlier formatting of the subscript and exponent + # * The result is placed to the right of the unit + if ( + units_object.sub_super_overstrike + and units_object.unit_subscript is not None + and units_object.exponent is not None + ): + + units_str += _units_html_sub_super( + content_sub=_md_html( + _escape_html_tags( + _units_symbol_replacements( + 
text=units_object.unit_subscript.replace("-", "−") + ) + ) + ), + content_sup=_md_html( + _escape_html_tags( + _units_symbol_replacements( + text=units_object.exponent.replace("-", "−") + ) + ) + ), + ) + + # In the special case where the unit is a chemical formula, we take the formatted unit + # and place all numbers (which are recognized now to be part of the chemical formula) + # into spans that are styled to be subscripts: + elif units_object.chemical_formula: + + units_str = re.sub( + "(\\d+)", + '\\1', + units_str, + ) + + else: + + if unit_subscript is not None: + units_str += unit_subscript + + if exponent is not None: + units_str += exponent + + return units_str + + +class UnitDefinitionList: + def __init__(self, units_list: list[UnitDefinition]): + self.units_list = units_list + + def __repr__(self) -> str: + return f"UnitDefinitionList({self.__dict__})" + + def __len__(self) -> int: + return len(self.units_list) + + def __getitem__(self, index: int) -> UnitDefinition: + return self.units_list[index] + + def to_html(self) -> str: + built_units = [unit_def.to_html() for unit_def in self.units_list] + + units_str = "" + + for unit_add in built_units: + + if re.search("\\($|\\[$", units_str) or re.search("^\\)|^\\]", unit_add): + spacer = "" + else: + spacer = " " + + if len(self) == 3 and self[1].unit == "/": + spacer = "" + + units_str += f"{spacer}{unit_add}" + + units_str = re.sub("^\\s+|\\s+$", "", units_str) + + return units_str + + def _repr_html_(self): + return self.to_html() + + +def _units_to_subscript(content: str) -> str: + return ( + '' + content + "" + ) + + +def _units_to_superscript(content: str) -> str: + return ( + '' + content + "" + ) + + +def _units_html_sub_super(content_sub: str, content_sup: str) -> str: + return ( + '' + + content_sup + + "
" + + content_sub + + "
" + ) + + +def _replace_units_symbol(text: str, detect: str, pattern: str, replace: str) -> str: + + if re.search(detect, text): + text = re.sub(pattern, replace, text) + + return text + + +def _units_symbol_replacements(text: str) -> str: + + # Replace certain units symbols with HTML entities; these are cases where the parsed + # text should be at the beginning of a string (or should be the entire string) + text = _replace_units_symbol(text, "^-", "^-", "−") + text = _replace_units_symbol(text, "^um$", "um", "µm") + text = _replace_units_symbol(text, "^uL$", "uL", "µL") + text = _replace_units_symbol(text, "^umol", "^umol", "µmol") + text = _replace_units_symbol(text, "^ug$", "ug", "µg") + text = _replace_units_symbol(text, "^ohm$", "ohm", "Ω") + + # Loop through the dictionary of units symbols and replace them with their HTML entities + for key, value in UNITS_SYMBOLS_HTML.items(): + text = _replace_units_symbol(text, key, key, value) + + return text + + +def _escape_html_tags(text: str) -> str: + + # Replace the '<' and '>' characters with their HTML entity equivalents + text = text.replace("<", "<") + text = text.replace(">", ">") + + return text + + +UNITS_SYMBOLS_HTML = { + "degC": "°C", + "degF": "°F", + ":pm:": "±", + ":mp:": "∓", + ":lt:": "<", + ":gt:": ">", + ":le:": "≤", + ":ge:": "≥", + ":cdot:": "⋅", + ":times:": "×", + ":div:": "÷", + ":ne:": "≠", + ":prime:": "′", + ":rightarrow:": "→", + ":leftarrow:": "←", + ":micro:": "µ", + ":ohm:": "Ω", + ":angstrom:": "Å", + ":times:": "×", + ":plusminus:": "±", + ":permil:": "‰", + ":permille:": "‰", + ":degree:": "°", + ":degrees:": "°", + ":space:": " ", + ":Alpha:": "Α", + ":alpha:": "α", + ":Beta:": "Β", + ":beta:": "β", + ":Gamma:": "Γ", + ":gamma:": "γ", + ":Delta:": "Δ", + ":delta:": "δ", + ":Epsilon:": "Ε", + ":epsilon:": "ε", + ":varepsilon:": "ϵ", + ":Zeta:": "Ζ", + ":zeta:": "ζ", + ":Eta:": "Η", + ":eta:": "η", + ":Theta:": "Θ", + ":theta:": "θ", + ":vartheta:": "ϑ", + ":Iota:": "Ι", + ":iota:": 
"ι", + ":Kappa:": "Κ", + ":kappa:": "κ", + ":Lambda:": "Λ", + ":lambda:": "λ", + ":Mu:": "Μ", + ":mu:": "μ", + ":Nu:": "Ν", + ":nu:": "ν", + ":Xi:": "Ξ", + ":xi:": "ξ", + ":Omicron:": "Ο", + ":omicron:": "ο", + ":Pi:": "Π", + ":pi:": "π", + ":Rho:": "Ρ", + ":rho:": "ρ", + ":Sigma:": "Σ", + ":sigma:": "σ", + ":sigmaf:": "ς", + ":varsigma:": "ς", + ":Tau:": "Τ", + ":tau:": "τ", + ":Upsilon:": "Υ", + ":upsilon:": "υ", + ":Phi:": "Φ", + ":phi:": "φ", + ":Chi:": "Χ", + ":chi:": "χ", + ":Psi:": "Ψ", + ":psi:": "ψ", + ":Omega:": "Ω", + ":omega:": "ω", +} + + +def define_units(units_notation: str) -> UnitDefinitionList: + """ + With `define_units()` you can work with a specially-crafted units notation string and emit the + units as HTML (with the `.to_html()` method). This function is useful as a standalone utility + and it powers the `fmt_units()` method in **Great Tables**. + + Parameters + ---------- + units_notation : str + A string of units notation. + + Returns + ------- + UnitDefinitionList + A list of unit definitions. + + Specification of units notation + ------------------------------- + + The following table demonstrates the various ways in which units can be specified in the + `units_notation` string and how the input is processed by the `define_units()` function. The + concluding step for display of the units in HTML is to use the `to_html()` method. 
+ + ```{python} + #| echo: false + + from great_tables import GT, style, loc + import polars as pl + + units_tbl = pl.DataFrame( + { + "rule": [ + "'^' creates a superscript", + "'_' creates a subscript", + "subscripts and superscripts can be combined", + "use '[_subscript^superscript]' to create an overstrike", + "a '/' at the beginning adds the superscript '-1'", + "hyphen is transformed to minus sign when preceding a unit", + "'x' at the beginning is transformed to '×'", + "ASCII terms from biology/chemistry turned into terminology forms", + "can create italics with '*' or '_'; create bold text with '**' or '__'", + "special symbol set surrounded by colons", + "chemistry notation: '%C6H6%'", + ], + "input": [ + "m^2", + "h_0", + "h_0^3", + "h[_0^3]", + "/s", + "-h^2", + "x10^3 kg^2 m^-1", + "ug", + "*m*^**2**", + ":permille:C", + "g/L %C6H12O6%", + ], + } + ).with_columns(output=pl.col("input")) + + ( + GT(units_tbl) + .fmt_units(columns="output") + .tab_style( + style=style.text(font="courier"), + locations=loc.body(columns="input") + ) + ) + ``` + """ + + # Get a list of raw tokens + tokens_list = _generate_tokens_list(units_notation=units_notation) + + # Initialize a list to store the units + units_list = [] + + if len(tokens_list) == 0: + return UnitDefinitionList(units_list=[]) + + units_list = [UnitDefinition.from_token(token) for token in tokens_list] + return UnitDefinitionList(units_list=units_list) + + # This could probably be removed and nanoplot_options made into a dataclass # the built-in dataclass decorator doesn't do any validation / coercion, but # we could do that in the a __post_init__ hook. 
(I would switch it over to a diff --git a/great_tables/gt.py b/great_tables/gt.py index 21ebcedb4..81a5fd32d 100644 --- a/great_tables/gt.py +++ b/great_tables/gt.py @@ -26,6 +26,7 @@ fmt_roman, fmt_scientific, fmt_time, + fmt_units, ) from great_tables._heading import tab_header from great_tables._helpers import random_id @@ -222,6 +223,7 @@ def __init__( fmt_datetime = fmt_datetime fmt_markdown = fmt_markdown fmt_image = fmt_image + fmt_units = fmt_units fmt_nanoplot = fmt_nanoplot data_color = data_color diff --git a/tests/test_formats.py b/tests/test_formats.py index 6f6b4e7f8..b021c4964 100644 --- a/tests/test_formats.py +++ b/tests/test_formats.py @@ -1489,6 +1489,97 @@ def test_fmt_image_path(): assert 'src="/a/b/c"' in strip_windows_drive(res) +@pytest.mark.parametrize( + "src,dst", + [ + # 1. unit with superscript + ("m^2", 'm2'), + # 2. unit with subscript + ("h_0", 'h0'), + # 3. unit with superscript and subscript + ( + "h_0^3", + 'h03', + ), + # 4. unit with superscript and subscript (using overstriking) + ( + "h[_0^3]", + 'h3
0
', + ), + # 5. slashed-unit shorthand for a '-1' exponent + ( + "/s", + 's−1', + ), + # 6. slashes between units normalized + ( + "t_0 / t_n", + 't0/tn', + ), + # 7. multiple inline units, separating by a space + ( + "kg^2 m^-1", + 'kg2 m−1', + ), + # 8. use of a number allowed with previous rules + ( + "10^3 kg^2 m^-1", + '103 kg2 m−1', + ), + # 9. use of 'x' preceding number to form scalar multiplier + ( + "x10^3 kg^2 m^-1", + '×103 kg2 m−1', + ), + # 10. hyphen is transformed to minus sign when preceding a unit + ( + "-h^2", + '−h2', + ), + # 11. italicization of base unit + ( + "*m*^2", + 'm2', + ), + # 12. emboldening of base unit + ( + "**m**^2", + 'm2', + ), + # 13. italicizing and emboldening of base unit + ( + "_**m**_^2", + 'm2', + ), + # 14. styling of subscripts and superscripts + ( + "h_*0*^**3**", + 'h03', + ), + # 15. transformation of common units from ASCII to preferred form + ("ug", "µg"), + # 16. insertion of common symbols and Greek letters via `:[symbol name]:` + (":angstrom:", "Å"), + # 17. use of chemical formulas via `%[chemical formula]%` + ( + "%C6H12O6%", + 'C6H12O6', + ), + # 18. 
Any '<' and '>' characters from input are escaped to prevent HTML rendering as tags + ( + "m^2 s_0", + 'm2 <tag> s0', + ), + ], +) +def test_fmt_units(src: str, dst: str): + + units_tbl = pl.DataFrame({"units": [src]}) + gt_tbl = GT(units_tbl).fmt_units(columns="units") + + assert dst == _get_column_of_values(gt_tbl, column_name="units", context="html")[0] + + # ------------------------------------------------------------------------------ # Test `fmt_nanoplot()` # ------------------------------------------------------------------------------ diff --git a/tests/test_helpers.py b/tests/test_helpers.py index b945f03ce..176fdc8a0 100644 --- a/tests/test_helpers.py +++ b/tests/test_helpers.py @@ -1,4 +1,19 @@ -from great_tables._helpers import LETTERS, letters, pct, px, random_id, _get_font_stack, FONT_STACKS +from great_tables._helpers import ( + LETTERS, + letters, + pct, + px, + random_id, + _get_font_stack, + define_units, + FONT_STACKS, + _generate_tokens_list, + _units_to_subscript, + _units_to_superscript, + _units_html_sub_super, + _replace_units_symbol, + _units_symbol_replacements, +) import pytest @@ -81,3 +96,185 @@ def test_get_font_stack_add_emoji_true(font_stack_names): assert all( extended_emoji.issubset(_get_font_stack(name, add_emoji=True)) for name in font_stack_names ) + + +@pytest.mark.parametrize( + "units, x_out", + [ + ("m^2", ["m^2"]), + ("m s^-1", ["m", "s^-1"]), + ], +) +def assert_generate_tokens_list(units: str, x_out: str): + + x = _generate_tokens_list(units_notation=units) + assert x == x_out + + +@pytest.mark.parametrize( + "content, x_out", + [ + ("2"), + (""), + ], +) +def assert_units_to_subscript(content: str): + x = _units_to_subscript(content=content) + assert ( + x == f'{content_sup}
{content_sub}
' + ) + + +@pytest.mark.parametrize( + "text, detect, pattern, replace, x_out", + [ + ("-10^-5", "^-", "^-", "−", "&minus10^-5"), + ("uL", "^uL$", "uL", "µL", "µL"), + ("umol_0", "^umol", "^umol", "µmol", "µmol_0"), + ], +) +def assert_replace_units_symbol(text: str, detect: str, pattern: str, replace: str, x_out: str): + x = _replace_units_symbol(text=text, detect=detect, pattern=pattern, replace=replace) + assert x == x_out + + +@pytest.mark.parametrize( + "text, x_out", + [ + ("um", "µm"), + (":Omicron:", "Ο"), + ], +) +def assert_units_symbol_replacements(text: str, x_out: str): + x = _units_symbol_replacements(text=text) + assert x == x_out + + +@pytest.mark.parametrize( + "units_notation, x_out", + [ + # unit with superscript + ( + "m^2", + 'm2', + ), + # unit with subscript + ( + "h_0", + 'h0', + ), + # unit with superscript and subscript + ( + "h_0^3", + 'h03', + ), + # unit with superscript and subscript (using overstriking) + ( + "h[_0^3]", + 'h3
0
', + ), + # slashed-unit shorthand for a '-1' exponent + ( + "/s", + 's−1', + ), + # slashes between units normalized + ( + "t_0 / t_n", + 't0/tn', + ), + # multiple inline units, separating by a space + ( + "kg^2 m^-1", + 'kg2 m−1', + ), + # use of a number allowed with previous rules + ( + "10^3 kg^2 m^-1", + '103 kg2 m−1', + ), + # "use of 'x' preceding number to form scalar multiplier + ( + "x10^3 kg^2 m^-1", + '×103 kg2 m−1', + ), + # hyphen is transformed to minus sign when preceding a unit + ( + "-h^2", + '−h2', + ), + # italicization of base unit + ( + "*m*^2", + 'm2', + ), + # emboldening of base unit + ( + "**m**^2", + 'm2', + ), + # italicizing and emboldening of base unit + ( + "_**m**_^2", + 'm2', + ), + # styling of subscripts and superscripts + ( + "h_*0*^**3**", + 'h03', + ), + # transformation of common units from ASCII to preferred form + ( + "ug", + "µg", + ), + # insertion of common symbols and Greek letters via `:[symbol name]:` + ( + ":angstrom:", + "Å", + ), + # use of chemical formulas via `%[chemical formula]%` + ( + "%C6H12O6%", + 'C6H12O6', + ), + ], +) +def assert_define_units_html_superscript(): + x = define_units(units_notation="m^2").to_html() + assert x == 'm2'