diff --git a/README.md b/README.md index 4e8490d..3bcc038 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ Import as any other module using the name `greeklt` - **capitalize(word)** -Makes a word all caps without accentuation but adds dieresis when necessary: +Makes a word all caps without accentuation, adding dieresis when necessary: ``` word = "γάιδαρος" @@ -25,7 +25,7 @@ word = "γάιδαρος" ΓΑΪΔΑΡΟΣ ``` -- **remove_accentuation(word)** +- **remove_accentuation(word, modifier=0)** Removes accentuation but adds dieresis when necessary, without capitalizing: @@ -36,7 +36,7 @@ word = "γάιδαρος" γαϊδαρος ``` -Works exceptionally well when you want to sort a list aphabetically and not based on unicode: +Works exceptionally well when you want to sort a list alphabetically and not based on unicode: ``` cities = ["Όσλο", "Λευκωσία", "Άκαμπα", "Ζυρίχη", "Ρώμη"] @@ -47,7 +47,14 @@ cities = ["Όσλο", "Λευκωσία", "Άκαμπα", "Ζυρίχη", "Ρώ >>> sorted(cities, key=remove_accentuation) ["Άκαμπα", "Ζυρίχη", "Λευκωσία", "Όσλο", "Ρώμη"] ``` +Also contains an option to never add dieresis, enabled by setting modifier to 1: +``` +string = "Αιδόνι" + +>>> print(remove_accentuation(string, 1)) +Αιδονι +``` - **convert_final_s(word)** Checks last letter of each word in a string. 
If it is a `σ` it is converted into a `ς` (final `σ`): @@ -67,6 +74,7 @@ Transliterates a string written with latin characters into it's equivalent Greek ``` This can come quite in handy when a user forgets to change the language and the word looks the same both in latin and Greek: + ``` # ANNA written in latin (Anna) name = "ANNA" @@ -84,17 +92,13 @@ There's also the abillity to convert a word from latin to it's intended accentua ``` >>> greek_transliteration("P;ita soybl;aki") Πίτα σουβλάκι - >>> greek_transliteration("kaWiki") καΐκι - >>> greek_transliteration("pro:yp;ouesh") προϋπόθεση - >>> greek_transliteration("GA:IDAROS") ΓΑΪΔΑΡΟΣ ``` - Note: The function takes as given that the user intended to write the work in Greek using the correct key sequence but just didn't switch their keyboard to Greek. It doesn't convert from Greeklish! ``` @@ -102,4 +106,18 @@ Note: The function takes as given that the user intended to write the work in Gr # They're supposed to press SHIFT + W and not just w for the ΅ character to appear. 
def greek_elot_transliteration(string):
    """Transliterate Greek text into Latin characters following ELOT 743.

    The text is scanned once, left to right, applying these rules:

    * "γγ", "γξ", "γχ" become "ng", "nx", "nch".
    * "μπ" becomes "b" when it starts a word and "mp" otherwise.
    * "ου" becomes "ou".
    * "αυ", "ευ", "ηυ" become "av", "ev", "iv" before a vowel or one of
      β γ δ ζ λ μ ν ρ, and "af", "ef", "if" before any other letter or at
      the end of a word.
    * A vowel pair is only treated as a digraph when its first vowel is
      unaccented and its "υ" carries no dieresis, so "άυλος" gives "aylos"
      and "προϋπόθεση" gives "proypothesi".
    * Every other Greek letter is mapped on its own; accents and dieresis
      are dropped and final "ς" is handled like "σ".

    Letter case follows the Greek source letter by letter. A capital that
    maps to two Latin letters (Θ, Χ, Ψ) is written "Th"/"Ch"/"Ps" when the
    next character is lower case and "TH"/"CH"/"PS" otherwise.

    Args:
        string: Text to transliterate. Characters that are not Greek
            letters (spaces, digits, punctuation, Latin text) are copied
            to the output.

    Returns:
        The transliterated text. Spacing is taken from the input as is;
        no separator is appended after the last word.
    """
    # Imported locally so the block is self-contained: the diacritics are
    # inspected through Unicode decomposition instead of a sibling module,
    # which also keeps the dieresis visible to the digraph rules below.
    import unicodedata

    latin = {
        'α': 'a', 'β': 'v', 'γ': 'g', 'δ': 'd', 'ε': 'e', 'ζ': 'z',
        'η': 'i', 'θ': 'th', 'ι': 'i', 'κ': 'k', 'λ': 'l', 'μ': 'm',
        'ν': 'n', 'ξ': 'x', 'ο': 'o', 'π': 'p', 'ρ': 'r', 'σ': 's',
        'ς': 's', 'τ': 't', 'υ': 'y', 'φ': 'f', 'χ': 'ch', 'ψ': 'ps',
        'ω': 'o',
    }
    # Letters after which the "υ" of αυ/ευ/ηυ is voiced ("v"); anything
    # else, including the end of a word, gives "f".
    voiced_followers = frozenset("βγδζλμνραεηιουω")
    acute = "\u0301"
    diaeresis = "\u0308"

    # One entry per character: (base letter, has accent, has dieresis).
    # Composing first makes pre-decomposed input behave like composed input.
    units = []
    for char in unicodedata.normalize("NFC", string):
        base, *marks = unicodedata.normalize("NFD", char)
        if base.lower() in latin:
            units.append((base, acute in marks, diaeresis in marks))
        else:
            # Not a Greek letter: keep the character exactly as composed.
            units.append((char, False, False))
    count = len(units)

    def base_at(index):
        """Return the base character at *index*, or "" outside the text."""
        return units[index][0] if 0 <= index < count else ""

    def cased(text, index):
        """Give *text* the letter case of the Greek letter at *index*."""
        if not units[index][0].isupper():
            return text
        if len(text) > 1 and base_at(index + 1).islower():
            return text.capitalize()
        return text.upper()

    pieces = []
    index = 0
    while index < count:
        base, accented, _ = units[index]
        lower = base.lower()
        if lower not in latin:
            pieces.append(base)
            index += 1
            continue

        following = base_at(index + 1).lower()
        if lower == "γ" and following in ("γ", "ξ", "χ"):
            pieces.append(
                cased("n", index) + cased(latin[following], index + 1)
            )
            index += 2
        elif lower == "μ" and following == "π":
            if base_at(index - 1).isalpha():
                # Inside a word both letters are kept.
                pieces.append(cased("m", index) + cased("p", index + 1))
            else:
                pieces.append(cased("b", index))
            index += 2
        elif (
            following == "υ"
            and lower in ("α", "ε", "η", "ο")
            and not accented
            and not units[index + 1][2]
        ):
            if lower == "ο":
                second = "u"
            elif base_at(index + 2).lower() in voiced_followers:
                second = "v"
            else:
                second = "f"
            pieces.append(
                cased(latin[lower], index) + cased(second, index + 1)
            )
            index += 2
        else:
            pieces.append(cased(latin[lower], index))
            index += 1
    return "".join(pieces)
def test_convert_final_s():
    """A word-final σ becomes ς; other forms are left as they are."""
    converted = convert_final_s("Φάροσ ΦΑΡΟΣ φάρος")
    assert converted == "Φάρος ΦΑΡΟΣ φάρος"


def test_elot_transliteration():
    """Check the ELOT transliteration against one sample per rule."""
    # (Greek input, expected Latin output, message shown on failure)
    samples = (
        ("αυγό", "avgo", "Failure in vowel+υ conversion, v"),
        ("αυτο", "afto", "Failure in vowel+υ conversion, f"),
        (
            "αγγελος το αγχος του",
            "angelos to anchos tou",
            "Failure in simple diphthongs",
        ),
        (
            "Θανασης",
            "Thanasis",
            "Failure in diphthong capital normalization",
        ),
    )
    for greek, expected, failure_message in samples:
        assert greek_elot_transliteration(greek) == expected, failure_message