# Unicode
Normalization and regexes

In [1]:
# Authorship metadata for this notebook.
__author__ = "Pierre Nugues"

## We import `regex`
The standard `re` module does not support Unicode property classes such as `\p{Greek}`. We use the third-party `regex` module instead, imported under the name `re`.

In [2]:
import regex as re
import unicodedata

## Code points

In [3]:
# A \N{...} escape in a string literal inserts the character with that Unicode name.
'\N{LATIN CAPITAL LETTER C}'

'C'

In [4]:
# The same named escape works for an accented (precomposed) letter.
'\N{LATIN CAPITAL LETTER E WITH CIRCUMFLEX}'

'Ê'

In [5]:
# The named escape and the character typed directly produce the same string.
'\N{LATIN CAPITAL LETTER E WITH CIRCUMFLEX}' == 'Ê'


True

In [6]:
# Named escapes are not limited to Latin letters: here a Greek capital.
'\N{GREEK CAPITAL LETTER GAMMA}'

'Γ'

In [7]:
# ord() maps each character to its code point (an integer).
tuple(ord(char) for char in ('C', 'Γ'))

(67, 915)

In [8]:
# chr() is the inverse of ord(): code point -> character.
tuple(chr(code_point) for code_point in (67, 915))

('C', 'Γ')

In [9]:
# The same code points written in hexadecimal, the notation used by Unicode charts.
tuple(hex(code_point) for code_point in (67, 915))

('0x43', '0x393')

## Composing characters

In [18]:
# Precomposed spelling: one named character that already carries the accent.
e_1 = '\N{LATIN CAPITAL LETTER E WITH CIRCUMFLEX}'
e_1

'Ê'

In [19]:
# Decomposed spelling: the base letter followed by a separate combining accent.
e_2 = '\N{LATIN CAPITAL LETTER E}\N{COMBINING CIRCUMFLEX ACCENT}'
e_2

'Ê'

Visually equivalent, but are they equal?

In [20]:
# String equality compares the underlying code points, not the rendered glyphs.
e_1 == e_2

False

In [21]:
# Hexadecimal code point of every character in e_1.
list(map(hex, map(ord, e_1)))

['0xca']

In [22]:
# Hexadecimal code point of every character in e_2.
list(map(hex, map(ord, e_2)))

['0x45', '0x302']

## Normalization

In [23]:
# Decomposition mapping recorded in the Unicode database for this character,
# returned as a string of space-separated hexadecimal code points.
unicodedata.decomposition(e_1)

'0045 0302'

In [24]:
# Code points of e_1 after NFD (canonical decomposition).
list(map(hex, map(ord, unicodedata.normalize('NFD', e_1))))

['0x45', '0x302']

In [26]:
# Code points of e_2 after NFD (canonical decomposition).
list(map(hex, map(ord, unicodedata.normalize('NFD', e_2))))

['0x45', '0x302']

In [27]:
# Code points of e_1 after NFC (canonical composition).
list(map(hex, map(ord, unicodedata.normalize('NFC', e_1))))

['0xca']

In [28]:
# Code points of e_2 after NFC (canonical composition).
list(map(hex, map(ord, unicodedata.normalize('NFC', e_2))))

['0xca']

In [29]:
# Normalizing both spellings to the same form (here NFC) before comparing
# makes the two visually identical strings compare equal.
unicodedata.normalize('NFC', e_1) == unicodedata.normalize('NFC', e_2)

True

## Unicode Database

In [30]:
# Sample character to look up in the Unicode database.
c = 'Γ'

In [31]:
# Code point, official Unicode name, and general category of the character
# ('Lu' is the category "Letter, uppercase").
ord(c), unicodedata.name(c), unicodedata.category(c)

(915, 'GREEK CAPITAL LETTER GAMMA', 'Lu')

### Western or Eastern Empire?

In [32]:
# The lowercase Greek letters used as the search target in the cells below.
alphabet = 'αβγδεζηθικλμνξοπρστυφχψω'
# \p{InBasic_Latin} matches characters of the Basic Latin block; the ^...+$
# anchors require the whole string to consist of such characters.
match = re.search(r'^\p{InBasic_Latin}+$', alphabet)
match  # None: the Greek letters are outside the Basic Latin block

#### Eastern!

In [33]:
# Same test against the Greek and Coptic block: every letter belongs to it,
# so the anchored pattern matches the whole string.
match = re.search(r'^\p{InGreek_and_Coptic}+$', alphabet)
match  # matches alphabet

<regex.Match object; span=(0, 24), match='αβγδεζηθικλμνξοπρστυφχψω'>

### Ἑλληνική

In [34]:
# \p{Greek} is the short property form, without the In prefix used for blocks.
# NOTE(review): presumably resolved by regex as the Greek *script* -- confirm
# against the regex documentation on short-form property lookup order.
match = re.search(r'^\p{Greek}+$', alphabet)
match  # matches alphabet

<regex.Match object; span=(0, 24), match='αβγδεζηθικλμνξοπρστυφχψω'>

#### Searching with Unicode character names

In [35]:
# In a raw string the \N{...} sequences reach the regex engine unchanged,
# and the engine itself resolves each Unicode character name.
match = re.search(r'\N{GREEK SMALL LETTER ALPHA}\N{GREEK SMALL LETTER BETA}', alphabet)
match  # matches 'αβ'

<regex.Match object; span=(0, 2), match='αβ'>

#### Searching a string

In [36]:
# Equivalent search with the characters typed directly in the pattern.
match = re.search('αβ', alphabet)
match  # matches 'αβ'

<regex.Match object; span=(0, 2), match='αβ'>