-
Notifications
You must be signed in to change notification settings - Fork 0
/
mathAlphanumerics.py
executable file
·1626 lines (1404 loc) · 65.8 KB
/
mathAlphanumerics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python3
#
# mathAlphanumerics.py: Map Latin, Greek, and digits to special math variants,
# such as bold, italic, fraktur, etc.
# Written <2006-10-04, Steven J. DeRose.
#
#pylint: disable=W0603
#
import sys
import re
import unicodedata
import string
from typing import List, Dict
# Descriptive metadata for this module: title, rights, dates, and license.
__metadata__ = {
    "title": "mathAlphanumerics",
    "description": "Map Latin, Greek, and digits to special math variants.",
    "rightsHolder": "Steven J. DeRose",
    "creator": "http://viaf.org/viaf/50334488",
    "type": "http://purl.org/dc/dcmitype/Software",
    "language": "Python 3.7",
    "created": "<2006-10-04",
    "modified": "2023-12-01",
    "publisher": "http://github.com/sderose",
    "license": "https://creativecommons.org/licenses/by-sa/3.0/",
}

# The version string is the last-modified date recorded in the metadata.
__version__ = __metadata__["modified"]
descr = """
=Description=
Provide support for using the many Unicode font-like variations on the
Latin and Greek alphabets and digits,
either as a command-line filter or via an API.
==Command line usage==
To see samples of all the available variations, use:
mathAlphanumerics.py --script Latin --test
The alternatives for `--script` are "Greek" and "Digits". The sample text is
chosen randomly from a set of pangrams (phrases that use all the letters).
The sample set defaults to English; you can instead specify
`--language [Greek|Latin]`, or your own text with `--sample [text]`.
N.B.: My `ord --math` displays a similar list and samples (`ord` is for finding
detailed information about Unicode characters, finding them based on their
properties).
To pipe Unicode text through `mathAlphanumerics.py`, use options to select
the desired script
and variation (called `--font` because that's how it's being used here; `--font`
values ignore case):
cat eggs.txt | mathAlphanumerics.py --script Latin --font 'BOLD ITALIC'
To convert text on the command line rather than using stdin (for example,
to prepare a message to copy into a Web form that doesn't support markup),
set the messages as the `--sample` text:
mathAlphanumerics.py --font 'FRAKTUR' --sample "Spam and eggs"
To handle accented characters, specify `--decompose` and the diacritics will
be separated first so that the base characters are converted. This will
not, however, make the diacritics themselves bold or italic or fraktur, etc.
In some cases, the font used by your system may not space some variants
correctly. For example, I see Fraktur, Circled, Squared, and Parenthesized
squeezed together. The workaround --spread will insert alternating spaces
to accommodate this.
==Library usage==
To call the package from Python:
import mathAlphanumerics
s2 = mathAlphanumerics.convert(text,
script="Latin", font="Mathematical Bold", decompose=True)
To generate the corresponding translation table separately:
xtab = mathAlphanumerics.getTranslateTable(
"Latin", "Mathematical Sans-serif Bold Italic")
s = s.translate(xtab)
See the "Methods" section below for more details.
The sample sentences are available in mathAlphanumerics.pangrams,
a dict keyed by language name.
==Available "scripts" and "fonts"==
The `--script` choices are "Latin" (the default), "Greek", or "Digits".
Each has a different selection of available "font" variations.
To see a list of the "fonts" available for a "script"
(add -v to include samples and the code point where each starts):
mathAlphanumerics.py --script Greek --show
The `--font` options (the default is "ITALIC") are:
For Latin:
"BOLD" UPPER LOWER DIGITS
"ITALIC" UPPER LOWER
"BOLD ITALIC" UPPER LOWER
"SANS-SERIF" UPPER LOWER DIGITS
"SANS-SERIF BOLD" UPPER LOWER DIGITS
"SANS-SERIF ITALIC" UPPER LOWER
"SANS-SERIF BOLD ITALIC" UPPER LOWER
"SCRIPT" UPPER LOWER
"BOLD SCRIPT" UPPER LOWER
"FRAKTUR" UPPER LOWER
"BOLD FRAKTUR" UPPER LOWER
"DOUBLE-STRUCK" UPPER LOWER DIGITS
"MONOSPACE" UPPER LOWER DIGITS
"CIRCLED" UPPER LOWER DIGITS
"PARENTHESIZED" UPPER LOWER DIGITS
"FULLWIDTH" UPPER LOWER DIGITS
"SQUARED" UPPER
"NEGATIVE SQUARED" UPPER
"REGIONAL INDICATOR SYMBOL" UPPER (???)
"NEGATIVE CIRCLED" UPPER
"SUPERSCRIPT" DIGITS (alphabet unfinished)
"SUBSCRIPT" DIGITS (alphabet unfinished)
For Greek:
"BOLD" UPPER LOWER
"ITALIC" UPPER LOWER
"BOLD ITALIC" UPPER LOWER
"SANS-SERIF BOLD" UPPER LOWER
"SANS-SERIF BOLD ITALIC" UPPER LOWER
For Digits, many additional sets are available, mainly for a variety of orthographies.
In addition to those already listed (I cannot personally evaluate the results
for most of these; error reports are welcome):
"DIGIT COMMA"
"DIGIT FULL STOP"
"ARABIC-INDIC"
"EXTENDED ARABIC-INDIC"
"NKO"
"DEVANAGARI"
"BENGALI"
"GURMUKHI"
"GUJARATI"
"ORIYA"
"TAMIL"
"TELUGU"
"KANNADA"
"MALAYALAM"
"SINHALA LITH"
"THAI"
"LAO"
"TIBETAN"
"MYANMAR"
"MYANMAR SHAN"
"KHMER"
"MONGOLIAN"
"LIMBU"
"NEW TAI LUE"
"TAI THAM HORA"
"TAI THAM THAM"
"BALINESE"
"SUNDANESE"
"LEPCHA"
"OL CHIKI"
"IDEOGRAPHIC NUMBER"
"VAI"
"SAURASHTRA"
"COMBINING DEVANAGARI"
"KAYAH LI"
"JAVANESE"
"CHAM"
"MEETEI MAYEK"
Some additional sets of digits are available except for ZERO:
"CIRCLED"
"DINGBAT NEGATIVE CIRCLED"
"DOUBLE CIRCLED"
"PARENTHESIZED"
"FULL STOP"
"DINGBAT CIRCLED SANS-SERIF"
"DINGBAT NEGATIVE CIRCLED SANS-SERIF"
I expect to add other special "effects", but some might not be supported in the "translate table" method:
"TURNED" (aka ROTATED)
"STRIKETHROUGH"
"UNDERLINE"
"OVERLINE"
=Cautions=
* This will only work if your display medium supports Unicode,
and the exact results depend on the font(s) in use.
* When a mapping is not available, the character is left unchanged.
* Some of the "fonts" are available only in uppercase, or lack digits.
* Some of the digit sets lack zero, as noted in the list above.
* Some fonts may not include all these characters.
* Some fonts may not be aesthetically consistent for all these sets.
As an example, the role of MATHEMATICAL ITALIC SMALL H is filled by
PLANCK CONSTANT, which was added to Unicode much earlier than the rest
of that set. Some font designers might not have co-ordinated its exact
size, stroke weight, alignment, or other characteristics with the rest
of the MATHEMATICAL ITALIC characters.
* Accented characters are only supported if decomposed. See the next
section for more detail on that.
* This should probably not be used for output destined for an audio screen-reader.
==Accented and other characters==
This program only translates the unaccented basic letters and digits.
Use Unicode `canonical decomposition` first if desired, then modify the base characters:
import unicodedata
s = unicodedata.normalize('NFD', myString).translate(xtab)
The result should be reasonable, if imperfect. For example,
diacritic placement could conflict with an enclosing circle,
or be off center for italics. And the diacritic itself will not be bold, etc.
Re-composing the result is probably pointless, because few if any of the MATHEMATICAL and
other visually special characters come in accented versions anyway.
''Note:'' Throughout this package, "font" does not refer to typographic fonts
per se, but to Unicode's sets of variations (mostly intended for special
uses in mathematics),
such as "MATHEMATICAL SANS-SERIF BOLD ITALIC". The names used here are taken
directly from the Unicode character names, except that "MATHEMATICAL" may be
omitted and case is ignored.
=Methods=
Methods in this package are called statically.
==getFontDict(script="Latin")==
Return `LatinFontDict`, `GreekFontDict`, or `DigitsFontDict`, as appropriate.
See their descriptions below.
==getStartCodePoint(script="Latin", font="BOLD", group="U")==
Return the code point of the first character for the given "script" in
the given "font" variation (such as "SANS-SERIF BOLD", etc.). By default, gets
the position of A for Latin, alpha for Greek, or 0 for Digits. Pass `group`
as "U", "L", or "D" respectively, to get the position of the first uppercase,
lowercase, or digit for "fonts" as needed.
Typically, the uppercase range is first, immediately followed by the lowercase
range, and the digits are elsewhere.
The "starting" code point is reported as where the "font" ''would'' begin if
it were all there. Thus, the "PARENTHESIZED" Latin digits are listed as
beginning at U+02473 even though U+2473 is actually "CIRCLED NUMBER TWENTY".
But the script knows there is no "PARENTHESIZED DIGIT ZERO", and will
leave "0" untranslated in this case.
==convert(s, script="Latin", font="Bold")==
Just convert characters in the string `s` that are from the given "script" to the specified "font".
For "Latin", uppercase, lowercase, and digits are converted (when available).
For "Greek", uppercase and lowercase are converted (when available).
For "Digits", only digits 0-9 are converted (and only when available).
==getTranslateTable(script="Latin", font="BOLD")==
Return a Python 3 translation table generated for the specified "script"
and "font". Exceptions are integrated, and omissions are omitted.
==makePartialXtab(srcStart, srcEnd, tgtStart)==
=Related Commands=
My `ord --math` displays a list of these character variations, with samples.
My `bitmapSpell.py` maps ASCII text to 5x7 ASCII art letters.
=Known bugs and Limitations=
"font" names are not recognized with re-ordered tokens. For example,
"BOLD FRAKTUR" works (regardless of case), but not "FRAKTUR BOLD".
Some of the sets have only upper or only lower case, or are missing
some specific characters. Fraktur, in particular, does not include
the traditional long s, "beta", and umlauted vowels (though you can
apply Unicode COMBINING DIAERESIS (U+00308)).
With `--makeHtmlComparison`, the generated HTML includes a few custom elements,
with styles applied via CSS in <head>:
sans { font-family:sans-serif; }
serif { font-family:serif; }
cursive { font-family:cursive; }
monospace { font-family:monospace; }
Modern browsers are fine with this, but it's not precisely "HTML". If that's a
problem, change them to something like '<span class="sans">', etc. Or change the
DOCTYPE to reference a schema that adds them.
The `--family` choice (if any) is applied to <body>, so these will override it for
the appropriate cases (I don't know what happens, for example, if you specify
a cursive font for --family, and then the SCRIPT row applies cursive on top of it --
perhaps it notices it's already cursive and keeps the active one, or perhaps it does
its own search and gives you the first one it finds).
==Script-specific issues==
"REGIONAL INDICATOR SYMBOL" is odd but included -- I gather these 26 are intended
to enable ISO 3166-1 alpha-2 two-letter country codes to be displayed as the
corresponding flags (of course not all pairs are assigned).
This program cannot translate multiple scripts simultaneously.
Punctuation is not yet supported, such as superscript and subscript
parentheses, plus, minus, etc. For many of the (pseudo-) fonts it might not
make sense anyway; but for some it does.
Superscript, subscript, turned, and strikethrough are not finished.
Note that the superscript and subscript characters commonly do not look and align
the same as browser-rendered HTML <sup> and <sub>
(cf [https://en.wikipedia.org/wiki/Unicode_subscripts_and_superscripts])
Also, Unicode superscripts provide only two Latin letters (i and n), although most
can be found as similar combining marks (apparently meant for representing German
medieval practice):
U+00363 COMBINING LATIN SMALL LETTER A ok
U+01de8 COMBINING LATIN SMALL LETTER B (missing in my test font)
U+00368 COMBINING LATIN SMALL LETTER C ok
U+00369 COMBINING LATIN SMALL LETTER D ok
U+00364 COMBINING LATIN SMALL LETTER E ok
U+01deb COMBINING LATIN SMALL LETTER F (missing in my test font)
U+01dda COMBINING LATIN SMALL LETTER G (missing in my test font) (uc at U+01ddb)
U+0036a COMBINING LATIN SMALL LETTER H ok
U+00365 COMBINING LATIN SMALL LETTER I ok
*** J?
U+01ddc COMBINING LATIN SMALL LETTER K ok
U+01ddd COMBINING LATIN SMALL LETTER L ok (uc at U+01DDE)
U+0036b COMBINING LATIN SMALL LETTER M ok (uc at U+01DDF)
U+01de0 COMBINING LATIN SMALL LETTER N ok (uc at U+01DE1)
U+00366 COMBINING LATIN SMALL LETTER O ok
U+01dee COMBINING LATIN SMALL LETTER P (missing in my test font)
*** Q?
U+0036c COMBINING LATIN SMALL LETTER R ok (uc at U+01DE2)
U+01de4 COMBINING LATIN SMALL LETTER S ok
U+0036d COMBINING LATIN SMALL LETTER T ok
U+00367 COMBINING LATIN SMALL LETTER U ok
U+0036e COMBINING LATIN SMALL LETTER V ok
U+01df1 COMBINING LATIN SMALL LETTER W (missing in my test font)
U+0036f COMBINING LATIN SMALL LETTER X ok
*** Y?
U+01de6 COMBINING LATIN SMALL LETTER Z ok
Non-Latin digit series are not well integrated or tested.
Numbers > 9 (such as for roman numerals, circled numbers, etc.) are not supported.
"Modifier letters" are not supported.
This package does not provide a way to translate the various alternate
sets back to plain Latin or Greek or Digits. However, this can be done
pretty well with Unicode "compatibility decomposition":
import unicodedata
s = unicodedata.normalize("NFKD", s)
[https://www.unicode.org/reports/tr25/tr25-6.html#_Toc2] notes that the
Mathematical Greek sets include several less-used characters,
such as uppercase nabla (U+2207) and a variant of theta (U+03F4);
and lowercase partial differential sign (U+2202) and glyph variants of
epsilon (U+03F5), theta (U+03D1), kappa (U+03F0),
phi (U+03D5), rho (U+03F1), and pi (U+03D6).
They are not supported here (yet).
==Issues outside the script's control==
If the display environment doesn't handle Unicode, or the font in use has
problems with any of the characters needed, the result may not be ideal.
Unicode intends most of these characters for special mathematical uses, such
as ensuring that you get the fancy "R" (U+0211d) needed to refer to
the set of real numbers (a symbol is needed because it takes too long to
write the full set of real numbers on a blackboard). Using
these characters for formatting is a little weird. But Gæð a wyrd swa hio scel.
Sets such as MATHEMATICAL ITALIC are generally defined in Unicode as a contiguous
range, but occasionally one or a few members are somewhere else
(such as MATHEMATICAL ITALIC SMALL H), and the "expected" slot among
the rest of the letters is left undefined. In practice, this also means that
Unicode fonts do not always define quite the same "look" to those characters.
It strikes me that leaving those slots blank is mainly useful because it
slightly simplifies programs like this: ones that want to translate the entire
block rather than particular characters like the "R" for real numbers.
Monospace fonts for Unicode may not always display with all the characters the
same width. "FULLWIDTH" may pose similar problems.
Also, "MONOSPACE" is not very distinctive on terminals that always use monospace
anyway.
"SANS-SERIF" is not very distinctive if your default font is that way.
"SCRIPT" may look a lot like italic, especially to the unpracticed eye.
The implementation here knows where
the ranges start, and then has a list of "exceptions" (in a class variable
of that name). There is an edge case when the ''first'' character of the
range does not exist, or exists but is exceptional. In such cases, the code
treats the range as beginning at where the first character *would* be if it
were not missing or an exception. This ''may'' lead to problems if your data
contains that character.
For example, PARENTHESIZED DIGIT ONE is U+2474, so we would expect the ZERO
at U+2473. But U+2473 is CIRCLED NUMBER TWENTY. This package takes the
exceptions into account when building a translation table. When the character
is entirely missing (as in this example), the "basic" character (in this
case "0") is left unchanged (it is not even added to the translation table).
Characters with diacritics must be decomposed first, because the specialized
"fonts" do not generally include composed characters. To decompose
(that is, to split off the diacritic to a separate overstruck character),
do this in Python:
import unicodedata
s = unicodedata.normalize("NFD", s)
=Notes=
Using Unicode Mathematical characters to achieve formatting may be slightly odd,
but alternatives are limited given current terminal and shell technology,
which typically support Unicode but not font-changes or
effects (other than ANSI terminal color and effect escapes).
Some additional scripts and forms are not supported:
Some squared CJK
Parenthesized ideographs, Hangul, and Korean characters.
Circled italic latin (missing C, R)
Double-Struck Italic (missing D, e, i, j)
Squared Latin (only Small Letter D?)
Old Italic letters at U+10300 and following
circled katakana U+032d0
some circled hangul and ideographs U+3260
circled number X on black square U+3248
aeox schwa hklmnpst; i=1d62, r=1d63, u=1d64, v=1d65, j=2c7c
[ "subscript latin upper (...209c)", 0x02090 ],
and suppl for rest of lc latin except jqy (seriously???)
spacing modifier letters a few
phonetic extensions has some latin/cyr/ipa
2145-2149 double struck italics???
213c-40 double struck pi, gamma, sigma
(couple extras at 1d6a4, dotless i, j)
213c-40 double struck pi, gamma, sigma
=History=
# Written (originally in Perl) sometime before 2006-10-04, by Steven J. DeRose.
* 2008-02-11 sjd: Add `--perl`, `perl -w`.
* 2008-09-03 sjd: BSD. Improve doc, error-checking, fix bug in `-all`.
* 2010-03-28 sjd: perldoc. Add [] to `-ps`.
* 2010-09-20ff sjd: Cleanup. Add `--color`; ls and dircolors support. Simplify
numeric handling of codes. Support color combinations. Add `-setenv`.
Change 'fg2_' prefix to 'bold_' and factor out of code.
* 2013-06-11: Add `--xterm256`, but just for `--list`.
* 2013-06-27: Add `--table`. Ditch "fg2_" and "b_" prefixes.
* 2014-07-09: Clean up doc. Add `--python`. Clean up `--perl`. fix `--list`.
* 2015-02-04: Support rest of effects beyond bold.
* 2015-08-25: Start syncing color-refs with sjdUtils.pm.
* 2016-01-01: Get rid of extraneous final newline with `-m`.
* 2016-07-21: Merge doc on color names w/ sjdUtils.p[my], etc.
* 2016-10-25: Clean up to integrate w/ ColorManager. Change names.
Debug new (hashless) way of doing colors.
* 2018-08-29ff: Port to Python. Split from Perl colorstring.
* 2018-09-04: Merged from incomplete `UnicodeAltLatin.py`
* 2020-07-25: Lose remaining upper/lower separations. Big cleanup.
Support complete upper/lower/digit translation tables. Add `--test`.
Add support for in-pipe translation.
* 2020-09-03: Improve translation-table construction.
* 2021-02-18: Fix bug that dropped part of translate tables.
* 2021-02-23: Add option for Unicode normalization.
* 2021-12-20: Add --makeHtmlComparison. Add type hints.
* 2022-01-07: Add SMALLCAP, SUBSCRIPT, SUPERSCRIPT, ROTATED, UNDERLINE, DUNDERLINE,
OVERLINE, DOVERLINE, STRIKE, SLASHED, DSLASHED.
* 2023-03-08: Clean up sample generation, proof help.
=To do=
* Cleaner way to request, like splitting out bold and italic -- though
you only get the full set of { roman, bold, italic, bold-italic } for
plain and sans-serif.
* Add method to turn any of the Mathematical ones to plain.
* Add support for small capitals (Latin 'X' is not defined; only a few Greek
and one Cyrillic).
* Perhaps add feature to turn markup into fonts -- such as
<i> ITALIC
<b> BOLD
<tt> MONOSPACE
SANS SERIF
<u>
<strike>
<em>
<strong>
<big>
<sub> SUBSCRIPT
<sup> SUPERSCRIPT
But what of FRAKTUR, DOUBLE-STRUCK, SCRIPT, FULLWIDTH, and the enclosed ones? <span?>
* Better testing for non-Latin digits.
* Possibly add combining characters such as underscore, strike-through, and
overline. There's even COMBINING ENCLOSING CIRCLE (and SQUARE).
* Greek has some turned letters, too:
"a": chr(0x00252), # A chr(0x2C6F)
"b": #
"g": #
"d": chr(0x0018D), # D chr(0x2207) nabla
"e": chr(0x01D08),
"z": # =Zeta
"h": # =Eta
"q": # =Theta
"i": chr(0x02129), # Iota
"k": chr(0x0029e), # chr(0x0029e) LATIN SMALL LETTER TURNED K
"l": # V
"m": chr(0x0019c), # LATIN CAPITAL LETTER TURNED M
"n": chr(0x0028c), #LATIN SMALL LETTER TURNED V
"c": #
"o": # =Omicron
"p": #
"r": # ~~~d~~~
"s": #
"t": # =Tau
"u": # =Upsilon
"f": phi # =Phi
"x": xi # =Xi
"ps": #
"w": #
* Quarter-turned instead of half?
"q": # q CCW
"q": # m CW chr(0x01d1f)
* Possibly add turned (cf reversed), using:
Problem: there isn't a reserved range for these
--- uppercase ---
A U+02c6f LATIN CAPITAL LETTER TURNED A
B ???
C ???
D ???
E => exists U+02c7b(smallcap)
F U+02132 TURNED CAPITAL F
G U+02141 TURNED SANS-SERIF CAPITAL G
H U+0a78d LATIN CAPITAL LETTER TURNED H (why is this not itself?)
I => I
J ? ???
K ? ???
L => U+0a780 LATIN CAPITAL LETTER TURNED L
#U+02142 TURNED SANS-SERIF CAPITAL L
M => U+0019c LATIN CAPITAL LETTER TURNED M
N => N
O => O
P => ~~~d~~~ ??? eth?
Q => ???
R => ??? U+01d1a Latin Letter Small Capital Turned R
S => S
T => ???
U => U+2229 intersection?
V => U+00245 LATIN CAPITAL LETTER TURNED V
W => ~~~M~~~ ???
X => X
Y => U+02144 TURNED SANS-SERIF CAPITAL Y
Z => Z ???
--- lowercase ---
a => U+00250 LATIN SMALL LETTER TURNED A
b => q
c => U+02184 LATIN SMALL LETTER REVERSED C
d => p
e => U+001dd LATIN SMALL LETTER TURNED E (or e => schwa)
f => U+0214e TURNED SMALL F
g => U+01d77 LATIN SMALL LETTER TURNED G
h => U+00265 LATIN SMALL LETTER TURNED H
i => U+01d09 LATIN SMALL LETTER TURNED I
j => medial s?
k => U+0029e LATIN SMALL LETTER TURNED K
l => U+0a781 LATIN SMALL LETTER TURNED L
m => U+0026f LATIN SMALL LETTER TURNED M
n => u
o = o
p => d
q => b
r => U+00279 LATIN SMALL LETTER TURNED R
s = s
t => U+00287 LATIN SMALL LETTER TURNED T
u => n
v => U+0028c LATIN SMALL LETTER TURNED V
w => U+0028d LATIN SMALL LETTER TURNED W
x = x
y => U+0028e LATIN SMALL LETTER TURNED Y
z = z
* Possibly add reversed (horizontally)
--- uppercase ---
"A": "A",
"B": ???
"C": chr(0x02183), # ROMAN NUMERAL REVERSED ONE HUNDRED
"D":
"E": chr(0x0018e), # LATIN CAPITAL LETTER REVERSED E
"F":
"G":
"H": "H",
"I": "I",
"J":
"K":
"L": chr(0x02143), # REVERSED SANS-SERIF CAPITAL L
"M": "M",
"N":
"O": "O",
"P": chr(0x0a7fc), # LATIN EPIGRAPHIC LETTER REVERSED P
"Q":
"R":
"S": # https://en.wikipedia.org/wiki/%C6%A7
"T": "T",
"U": "U",
"V": "V",
"W": "W",
"X": "X",
"Y": "Y",
"Z":
--- lowercase ---
"a":
"b": "d",
"c": chr(0x02184), # LATIN SMALL LETTER REVERSED C
"d": "b",
"e": chr(0x00258), # LATIN SMALL LETTER REVERSED E
"f":
"g":
"h":
"i": "i",
"j":
"k":
"l": "l",
"m": m?
"n": n?
"o": "o",
"p": "q",
"q": "p",
"r": chr(0x0027f), # LATIN SMALL LETTER REVERSED R WITH FISHHOOK
"s":
"t":
"u": u?
"v": "v",
"w": "w",
"x": "x",
"y":
"z":
* Hook up smallCapMap (missing Q and X)
* Possibly add superscript
"i": chr(0x02071), # SUPERSCRIPT LATIN SMALL LETTER I
"n": chr(0x0207f), # SUPERSCRIPT LATIN SMALL LETTER N
* Non-alphanumeric variants: punctuation, esp. for superscript and subscript
=Rights=
Copyright 2006 by Steven J. DeRose. This work is licensed under a
Creative Commons Attribution-Share-alike 3.0 unported license.
For further information on this license, see
[https://creativecommons.org/licenses/by-sa/3.0].
For the most recent version, see [http://www.derose.net/steve/utilities]
or [https://github.com/sderose].
=Options=
"""
###############################################################################
#
def warning(msg):
    """Write `msg` to standard error, terminated by a newline.

    `msg` must be a string: it is concatenated with the newline before
    being written, so the whole line goes out in a single write call.
    """
    line = msg + "\n"
    sys.stderr.write(line)
###############################################################################
# Support Unicode alternate forms of Latin, Greek, and digits.
# See also bin/data/unicodeLatinAlphabets.py
#
class mathAlphanumerics:
# Names of the variant-form families, one per entry.
# NOTE(review): presumably used as one-hot feature labels -- confirm with callers.
oneHotFeatures = [
    "Fullwidth",
    "Script",
    "Fraktur",
    "Double-Struck",
    "Sans-serif",
    "Monospace",
    "Parenthesized",
    "Circled",
    "Squared",
    "Negative-circled",
    "Negative-squared",
    "Regional symbol",
]
# These are the "fonts" available as Unicode "Mathematical" variations on
# Latin. A similar list is available for Greek, and for digits.
# NOTE: "MATHEMATICAL" is omitted for compactness.
#
LatinFontDict = {
    # Family name: (capital A, small a, digit zero, exception characters).
    # A None slot means the family has no such range. The last field lists
    # the characters flagged as exceptions for that family.
    #
    # ---- Families whose Unicode names begin with "MATHEMATICAL" ----
    "BOLD":                      (0x1D400, 0x1D41A, 0x1D7CE, ""),
    "ITALIC":                    (0x1D434, 0x1D44E, None, "h"),
    "BOLD ITALIC":               (0x1D468, 0x1D482, None, ""),
    "SANS-SERIF":                (0x1D5A0, 0x1D5BA, 0x1D7E2, ""),
    "SANS-SERIF BOLD":           (0x1D5D4, 0x1D5EE, 0x1D7EC, ""),
    "SANS-SERIF ITALIC":         (0x1D608, 0x1D622, None, ""),
    "SANS-SERIF BOLD ITALIC":    (0x1D63C, 0x1D656, None, ""),
    "SCRIPT":                    (0x1D49C, 0x1D4B6, None, "BEFHILMR ego"),
    "BOLD SCRIPT":               (0x1D4D0, 0x1D4EA, None, ""),
    "FRAKTUR":                   (0x1D504, 0x1D51E, None, "CHIRZ"),
    "BOLD FRAKTUR":              (0x1D56C, 0x1D586, None, ""),
    "DOUBLE-STRUCK":             (0x1D538, 0x1D552, 0x1D7D8, "CHNPQRZ"),
    "MONOSPACE":                 (0x1D670, 0x1D68A, 0x1D7F6, ""),
    # ---- Families that are not "MATHEMATICAL" ----
    "CIRCLED":                   (0x24B6, 0x24D0, 0x245F, ""),
    "PARENTHESIZED":             (0x1F110, 0x249C, 0x2473, ""),
    "FULLWIDTH":                 (0xFF21, 0xFF41, 0xFF10, ""),
    # ---- No lower-case range ----
    "SQUARED":                   (0x1F130, None, None, ""),
    "NEGATIVE SQUARED":          (0x1F170, None, None, ""),
    "REGIONAL INDICATOR SYMBOL": (0x1F1E6, None, None, ""),  # ???
    "NEGATIVE CIRCLED":          (0x1F150, None, None, ""),  # 0x02775 ???
    "SUPERSCRIPT":               (None, None, 0x2070, "123"),
    "SUBSCRIPT":                 (None, None, 0x2080, ""),
    # ---- Unfinished ----
    # "Subscript Latin Small": [],  # aehijklmnoprstuvx
}
GreekFontDict = {
    # Family name: (capital alpha, small alpha, digits, exception characters).
    # Greek families carry no digit range, hence None in the third slot.
    "BOLD":                   (0x1D6A8, 0x1D6C2, None, ""),
    "ITALIC":                 (0x1D6E2, 0x1D6FC, None, ""),
    "BOLD ITALIC":            (0x1D71C, 0x1D736, None, ""),
    "SANS-SERIF BOLD":        (0x1D756, 0x1D770, None, ""),
    # No Mathematical Greek Sans Serif Italic, apparently?
    "SANS-SERIF BOLD ITALIC": (0x1D790, 0x1D7AA, None, ""),
    # ---- Unfinished ----
    # "SUPERSCRIPT GREEK SMALL": (),
    # "SUBSCRIPT GREEK SMALL": (),
}
# Some of these sets lack a zero. In those cases the set is listed as
# beginning where the zero *would* be naturally -- just before the 1.
# TODO: Delete ones redundant with Latin list above
#
DigitsFontDict = {
# Each value is [ upper-case start, lower-case start, digit-zero code point,
# exception characters ]. Only the digit slot is populated in this table.
# For sets that have no zero, the digit slot holds the code point just
# before the set's digit ONE, and "0" is listed as an exception.
#
# [ NAME UC LC DIGITS exceptions ]
# These are covered above:
"BOLD": [ None, None, 0x1d7Ce, "" ],
# no italic or bold italic
"SANS SERIF": [ None, None, 0x1d7e2, "" ],
"SANS SERIF BOLD": [ None, None, 0x1d7ec, "" ],
# no sans serif italic or bold italic
# no script or fraktur
"DOUBLE STRUCK": [ None, None, 0x1d7d8, "" ],
"MONOSPACE": [ None, None, 0x1d7f6, "" ],
# Was 0x0ff110 (a stray extra digit, landing in a private-use plane);
# FULLWIDTH DIGIT ZERO is U+FF10.
"FULLWIDTH": [ None, None, 0x0ff10, "" ],
# no squared, negative squared, or regional indicator symbol
#"NEGATIVE CIRCLED": [ None, None, 0x024eb, "0" ],
"SUPERSCRIPT LATIN": [ None, None, 0x02070, "" ],
"SUBSCRIPT LATIN": [ None, None, 0x02080, "" ],
"DIGIT COMMA": [ None, None, 0x1f101, "" ],
# NOTE(review): 0x02488 is where "FULL STOP" below places its digit ONE,
# so this entry looks shifted by one for 1-9 -- confirm before relying on it.
"DIGIT FULL STOP": [ None, None, 0x02488, "" ],
# Starting at 1 (but offset is to where zero *would* be)
"CIRCLED": [ None, None, 0x0245f, "0" ],
"DINGBAT NEGATIVE CIRCLED": [ None, None, 0x02775, "0" ],
# Was 0x024f3; DOUBLE CIRCLED DIGIT ONE is U+24F5, so the would-be-zero
# slot is U+24F4.
"DOUBLE CIRCLED": [ None, None, 0x024f4, "0" ],
"PARENTHESIZED": [ None, None, 0x02473, "0" ],
"FULL STOP": [ None, None, 0x02487, "0" ],
"DINGBAT CIRCLED SANS-SERIF": [ None, None, 0x0277f, "0" ],
"DINGBAT NEGATIVE CIRCLED SANS-SERIF": [ None, None, 0x02789, "0" ],
# circled number on black square 10-80 by 10 @ U+03248, 0 @ ????
"ARABIC-INDIC": [ None, None, 0x00660, "" ],
"EXTENDED ARABIC-INDIC": [ None, None, 0x006F0, "" ],
"NKO": [ None, None, 0x007c0, "" ],
"DEVANAGARI": [ None, None, 0x00966, "" ],
"BENGALI": [ None, None, 0x009e6, "" ],
"GURMUKHI": [ None, None, 0x00a66, "" ],
"GUJARATI": [ None, None, 0x00aE6, "" ],
"ORIYA": [ None, None, 0x00b66, "" ],
"TAMIL": [ None, None, 0x00bE6, "" ],
"TELUGU": [ None, None, 0x00c66, "" ],
"KANNADA": [ None, None, 0x00cE6, "" ],
"MALAYALAM": [ None, None, 0x00d66, "" ],
"SINHALA LITH": [ None, None, 0x00dE6, "" ],
"THAI": [ None, None, 0x00E50, "" ],
"LAO": [ None, None, 0x00Ed0, "" ],
"TIBETAN": [ None, None, 0x00f20, "" ],
"MYANMAR": [ None, None, 0x01040, "" ],
"MYANMAR SHAN": [ None, None, 0x01090, "" ],
"KHMER": [ None, None, 0x017e0, "" ],
"MONGOLIAN": [ None, None, 0x01810, "" ],
"LIMBU": [ None, None, 0x01946, "" ],
"NEW TAI LUE": [ None, None, 0x019d0, "" ],
"TAI THAM HORA": [ None, None, 0x01a80, "" ],
"TAI THAM THAM": [ None, None, 0x01a90, "" ],
"BALINESE": [ None, None, 0x01b50, "" ],
"SUNDANESE": [ None, None, 0x01bb0, "" ],
"LEPCHA": [ None, None, 0x01c40, "" ],
"OL CHIKI": [ None, None, 0x01c50, "" ],
"IDEOGRAPHIC NUMBER": [ None, None, 0x03007, "" ],
"VAI": [ None, None, 0x0a620, "" ],
"SAURASHTRA": [ None, None, 0x0a8d0, "" ],
"COMBINING DEVANAGARI": [ None, None, 0x0a8e0, "" ],
"KAYAH LI": [ None, None, 0x0a900, "" ],
"JAVANESE": [ None, None, 0x0a9d0, "" ],
"CHAM": [ None, None, 0x0aa50, "" ],
"MEETEI MAYEK": [ None, None, 0x0abf0, "" ],
# Some other related sets
"ROMAN NUMERAL": [ None, None, 0x0215f, "0" ],
"SMALL ROMAN NUMERAL": [ None, None, 0x0216f, "0" ],
"PLAYING CARDS, SPADE": [ None, None, 0x1f0a0, "0" ],
"PLAYING CARDS, HEART": [ None, None, 0x1f0b0, "0" ],
"PLAYING CARDS, DIAMOND": [ None, None, 0x1f0c0, "0" ],
"PLAYING CARDS, CLUB": [ None, None, 0x1f0d0, "0" ],
"MAHJONG TILES, CHARACTER": [ None, None, 0x1f006, "0" ],
"MAHJONG TILES, BAMBOO": [ None, None, 0x1f00f, "0" ],
"MAHJONG TILES, CIRCLE": [ None, None, 0x1f018, "0" ],
} # digitSets
# Map from expected but undefined code points, to where the char really is
#
exceptions = {
# Keys are the code points a regular range would predict (range start plus
# offset); values are the code points where the character is actually
# encoded. Some entries carry None instead of a replacement.
# NOTE(review): how callers treat a None value is not visible here -- confirm.
#
# expect actual name
0X02071: 0X000B9, # SUPERSCRIPT LATIN DIGIT ONE
0X02072: 0X000B2, # SUPERSCRIPT LATIN DIGIT TWO
0X02073: 0X000B3, # SUPERSCRIPT LATIN DIGIT THREE
#
0X1D455: 0X210E, # MATHEMATICAL ITALIC SMALL H (PLANCK CONSTANT)
0X1D49D: 0X212C, # MATHEMATICAL SCRIPT CAPITAL B
0X1D4A0: 0X2130, # MATHEMATICAL SCRIPT CAPITAL E
0X1D4A1: 0X2131, # MATHEMATICAL SCRIPT CAPITAL F
0X1D4A3: 0X210B, # MATHEMATICAL SCRIPT CAPITAL H
0X1D4A4: 0X2110, # MATHEMATICAL SCRIPT CAPITAL I
0X1D4A7: 0X2112, # MATHEMATICAL SCRIPT CAPITAL L
0X1D4A8: 0X2133, # MATHEMATICAL SCRIPT CAPITAL M
0X1D4AD: 0X211B, # MATHEMATICAL SCRIPT CAPITAL R
0X1D4BA: 0X212F, # MATHEMATICAL SCRIPT SMALL E
# Was 0X0261 (LATIN SMALL LETTER SCRIPT G, an IPA letter); the script-style
# small g in the Letterlike Symbols block is U+210A SCRIPT SMALL G.
0X1D4BC: 0X210A, # MATHEMATICAL SCRIPT SMALL G
0X1D4C4: 0X2134, # MATHEMATICAL SCRIPT SMALL O
0X1D506: 0X212D, # MATHEMATICAL FRAKTUR CAPITAL C
0X1D50B: 0X210C, # MATHEMATICAL FRAKTUR CAPITAL H
0X1D50C: 0X2111, # MATHEMATICAL FRAKTUR CAPITAL I
0X1D515: 0X211C, # MATHEMATICAL FRAKTUR CAPITAL R
0X1D51D: 0X2128, # MATHEMATICAL FRAKTUR CAPITAL Z
0X1D53A: 0X2102, # MATHEMATICAL DOUBLE-STRUCK CAPITAL C
0X1D53F: 0X210D, # MATHEMATICAL DOUBLE-STRUCK CAPITAL H
0X1D545: 0X2115, # MATHEMATICAL DOUBLE-STRUCK CAPITAL N
0X1D547: 0X2119, # MATHEMATICAL DOUBLE-STRUCK CAPITAL P
0X1D548: 0X211A, # MATHEMATICAL DOUBLE-STRUCK CAPITAL Q
0X1D549: 0X211D, # MATHEMATICAL DOUBLE-STRUCK CAPITAL R
0X1D551: 0X2124, # MATHEMATICAL DOUBLE-STRUCK CAPITAL Z
# Missing or displaced zeros in digit sets (re-check)
# NOTE(review): several keys in this section (e.g. 0x02780, 0x02776, 0x0278a,
# 0x02160, 0x02170, and the card/tile keys) are one past the would-be-zero
# starts listed in DigitsFontDict, i.e. they sit on the digit-ONE position,
# and some name comments look swapped. Left unchanged until the lookup code
# is checked.
0x0245f: 0x024ea, # CIRCLED DIGIT ZERO
0x02774: 0x024ff, # DINGBAT NEGATIVE CIRCLED DIGIT ZERO
0x024f4: None, # DOUBLE CIRCLED DIGIT ZERO
0x02789: None, # DINGBAT CIRCLED SANS-SERIF DIGIT ZERO
0x02775: None, # DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT ZERO
0x02473: None, # PARENTHESIZED ??? DIGIT ZERO
0x024eb: 0x024ff, # NEGATIVE CIRCLED DIGIT ZERO
0x02488: 0x1f100, # FULL STOPPED DIGIT ZERO
0x024f5: None , # DOUBLE CIRCLED DIGIT ZERO
0x02780: 0x1f10b, # DINGBAT CIRCLED SANS-SERIF DIGIT ZERO
0x02776: 0x024ff, # DINGBAT NEGATIVE CIRCLED DIGIT ZERO
0x0278a: 0x1f10c, # DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT ZERO
#
0x02160: None, # ROMAN NUMERALS DIGIT ZERO
0x02170: None, # SMALL ROMAN NUMERALS DIGIT ZERO
0x1f0a1: None, # PLAYING CARDS, SPADES DIGIT ZERO
0x1f0b1: None, # PLAYING CARDS, HEARTS DIGIT ZERO
0x1f0c1: None, # PLAYING CARDS, DIAMONDS DIGIT ZERO
0x1f0d1: None, # PLAYING CARDS, CLUBS DIGIT ZERO
0x1f007: None, # MAHJONG TILES, CHARACTERS DIGIT ZERO
0x1f010: None, # MAHJONG TILES, BAMBOOS DIGIT ZERO
0x1f019: None, # MAHJONG TILES, CIRCLES DIGIT ZERO
}
# Small caps should probably just apply to lowercase?
smallCapMap = {
    # Lower-case ASCII letter -> code point of LATIN LETTER SMALL CAPITAL <X>.
    # Entries tagged "far" lie outside the U+1D00 run used by most letters.
    # There is no entry for "x".
    "a": 0x1D00,
    "b": 0x0299,  # far
    "c": 0x1D04,
    "d": 0x1D05,
    "e": 0x1D07,
    "f": 0xA730,  # far; Unicode 5.1 (2008)
    "g": 0x0262,  # far
    "h": 0x029C,  # far
    "i": 0x026A,  # far
    "j": 0x1D0A,
    "k": 0x1D0B,
    "l": 0x029F,  # far
    "m": 0x1D0D,
    "n": 0x0274,  # far
    "o": 0x1D0F,
    "p": 0x1D18,
    "q": 0xA7AF,  # far; Unicode 11.0 (2018?)
    "r": 0x0280,  # far
    "s": 0xA731,  # far; Unicode 5.1 (2008)
    "t": 0x1D1B,
    "u": 0x1D1C,
    "v": 0x1D20,
    "w": 0x1D21,
    # "x": None,
    "y": 0x028F,  # far
    "z": 0x1D22,
}
# https://en.wikipedia.org/wiki/Unicode_subscripts_and_superscripts#Uses,
subscriptMap = {
    # Lower-case ASCII letter -> code point of its subscript form
    # (LATIN SUBSCRIPT SMALL LETTER <X>).
    # No entries for b (beta?), c, d, f, g, q, w, y, z.
    "a": 0x2090,
    "e": 0x2091,
    "h": 0x2095,  # (MISSING on MAC?)... through T
    "i": 0x1D62,  # far
    "j": 0x2C7C,  # far
    "k": 0x2096,
    "l": 0x2097,
    "m": 0x2098,
    "n": 0x2099,
    "o": 0x2092,
    "p": 0x209A,
    "r": 0x1D63,  # far
    "s": 0x209B,
    "t": 0x209C,
    "u": 0x1D64,  # far
    "v": 0x1D65,  # far
    "x": 0x2093,
}
# Greek subscripts: bgrfx 0x1d62...0x1d6a aeoxhklmnpst
# combining diacriticals marks has aeioucdhmrtvx
# See https://en.wikipedia.org/wiki/Unicode_subscripts_and_superscripts
# Combining Diacritical Marks Supplement has most of the rest, but
# we'd need to insert something to place them over
# Phonetic Extensions and Phonetic Extensions Supplement have a bunch
superscriptMap = {
    # Lower-case ASCII letter -> code point of its superscript form.
    # Superscript forms of + - = ( ) also exist but are not listed here.
    # "a": feminine ordinal indicator
    "i": 0x2071,
    "n": 0x207F,
    # "o": masculine ordinal indicator
    # "v": in Latin Extended-C
}
# Turned/rotated characters. Uppercase are mainly based on "Fraser" orthography,
# for "Lisu" script.
#
lisuMap = {
# # Name, alternatives?
"A": 0x0a4ef, # U+02c6f LATIN CAPITAL LETTER TURNED A
"B": 0x0a4ed, #