From 718917e187fbc4b802104e7458822665b309dac1 Mon Sep 17 00:00:00 2001 From: athenionn Date: Wed, 4 Jan 2023 21:28:39 +0200 Subject: [PATCH 1/8] WIP: Doesn't work properly. --- src/greeklt/greek_iso_transliteration.py | 256 +++++++++++++++++++++++ 1 file changed, 256 insertions(+) create mode 100644 src/greeklt/greek_iso_transliteration.py diff --git a/src/greeklt/greek_iso_transliteration.py b/src/greeklt/greek_iso_transliteration.py new file mode 100644 index 0000000..9b84dac --- /dev/null +++ b/src/greeklt/greek_iso_transliteration.py @@ -0,0 +1,256 @@ +def greek_iso_transliteration(string: str): + el_low = [ + "α" + "β" + "γ" + "δ" + "ε" + "ζ" + "η" + "θ" + "ι" + "κ" + "λ" + "μ" + "ν" + "ξ" + "ο" + "π" + "ρ" + "σς" + "τ" + "υ" + "φ" + "χ" + "ψ" + "ω" + ] + el_cap = [ + "Α" + "Β" + "Γ" + "Δ" + "Ε" + "Ζ" + "Η" + "Θ" + "Ι" + "Κ" + "Λ" + "Μ" + "Ν" + "Ξ" + "Ο" + "Π" + "Ρ" + "Σ" + "Τ" + "Υ" + "Φ" + "Χ" + "Ψ" + "Ω" + ] + # TODO FIX Psari, not PSari + eng_low = [ + "a" + "v" + "g" + "d" + "e" + "z" + "i" + "th" + "i" + "k" + "l" + "m" + "n" + "x" + "o" + "p" + "r" + "s" + "t" + "y" + "f" + + "ch" + "ps" + "o" + ] + eng_cap = [ + "A" + "V" + "G" + "D" + "E" + "Z" + "I" + "TH" + "I" + "K" + "L" + "M" + "N" + "X" + "O" + "P" + "R" + "S" + "T" + "Y" + "F" + "CH" + "PS" + "O" + ] + # Simple digraphs with no extra rules or edge cases + # No need for mixed casing "Γγ" or "Γξ" against Greek phonology + el_simple_digraphs = [ + "γγ" + "γξ" + "γχ" + ] + el_simple_cap_digraphs = [ + "ΓΓ" + "ΓΞ" + "ΓΧ" + ] + eng_simple_digraphs = [ + "ng" + "nx" + "nch" + ] + eng_simple_cap_digraphs = [ + "NG" + "NX" + "NCH" + ] +# Accent based digraphs + el_low_acc_digraphs = [ + "άυ" + "αϋ" + + "έυ" + "εϋ" + + "ήυ" + "ηϋ" + + ] + el_mix_acc_digraphs = [ + "Άυ" + "Αϋ" + + "Έυ" + "Εϋ" + + "Ήυ" + "Ηϋ" + ] + el_cap_acc_digraphs = [ + "ΆΥ" + "ΑΫ" + + "ΈΥ" + "ΕΫ" + + "ΉΥ" + "ΗΫ" + ] + el_mp_digraph = [ + "ΜΠ" + "Μπ" + "μπ" + ] + eng_mp_digraph_0 = [ + "B" + "B" + "b" + ] + eng_mp_digraph_1 = [ + "MP" 
+ "Mp" + "mp" + ] + el_xu_digraphs = [ + "αυ" + "ευ" + "ηυ" + ] + eng_xu_digraphs = [ + "av" + "af" + "ev" + "ef" + "iv" + "if" + ] +# List related to xu lists + xu_sound_modifiers_v = [ +# β, γ, δ, ζ, λ, μ, ν, ρ, α, ε, η, ι, ο, υ, ω + "β" + "γ" + "δ" + "ζ" + "λ" + "μ" + "ν" + "ρ" + + "α" + "ε" + "η" + "ι" + "ο" + "υ" + "ω" + ] + xu_sound_modifiers_f = [ +# θ, κ, ξ, π, σ, τ, φ, χ, ψ, empty space + "θ" + "κ" + "ξ" + "π" + "σ" + "τ" + "φ" + "χ" + "ψ" + ] + print(string) + new_string = string +# if el_low_acc_digraphs or el_mix_acc_digraphs or el_cap_acc_digraphs in string: +# Do nothing, we don't care +# Replace all digraphs so they're ignored by the simple transcription + for i in el_simple_digraphs: + if i in string: + new_string.replace(i, eng_simple_digraphs[el_simple_digraphs.index(i)]) + for i in el_simple_cap_digraphs: + if i in string: + new_string.replace(i, eng_simple_cap_digraphs[el_simple_digraphs.index(i)]) +# At the moment we don't care, but in future we need to account for accents too +# for i in el_low_acc_digraphs +# if i in string: +# new_string.replace(i, ) + for i in el_mp_digraph: + if i in string: + if string.startswith(i): + new_string.replace(i, eng_mp_digraph_0[el_mp_digraph.index(i)], 1) + new_string.replace(i, eng_mp_digraph_1[el_mp_digraph.index(i)]) + for i in el_xu_digraphs: + if i in string: + for loop in xu_sound_modifiers_f: + if string[string.find(i)+1] in loop: + new_string.replace(i, eng_xu_digraphs[el_xu_digraphs.index(i)+1]) + for loop in xu_sound_modifiers_v: + if string[string.find(i)+1] in loop: + new_string.replace(i, eng_xu_digraphs[el_xu_digraphs.index(i)]) + for i in el_low: + if i in el_low: + new_string.replace(i, eng_low[el_low.index(i)]) + for i in el_cap: + if i in el_cap: + new_string.replace(i, eng_cap[el_cap.index(i)]) + return new_string From 65f3a64719466c3f755f9fe4e54551a0533491a1 Mon Sep 17 00:00:00 2001 From: athenionn Date: Wed, 4 Jan 2023 22:47:26 +0200 Subject: [PATCH 2/8] FIXED IT! SHOULD WORK FINE! 
--- src/greeklt/greek_iso_transliteration.py | 320 ++++++++++++----------- 1 file changed, 162 insertions(+), 158 deletions(-) diff --git a/src/greeklt/greek_iso_transliteration.py b/src/greeklt/greek_iso_transliteration.py index 9b84dac..beabb20 100644 --- a/src/greeklt/greek_iso_transliteration.py +++ b/src/greeklt/greek_iso_transliteration.py @@ -1,235 +1,238 @@ def greek_iso_transliteration(string: str): el_low = [ - "α" - "β" - "γ" - "δ" - "ε" - "ζ" - "η" - "θ" - "ι" - "κ" - "λ" - "μ" - "ν" - "ξ" - "ο" - "π" - "ρ" - "σς" - "τ" - "υ" - "φ" - "χ" - "ψ" + "α", + "β", + "γ", + "δ", + "ε", + "ζ", + "η", + "θ", + "ι", + "κ", + "λ", + "μ", + "ν", + "ξ", + "ο", + "π", + "ρ", + "σ", + "τ", + "υ", + "φ", + "χ", + "ψ", "ω" ] el_cap = [ - "Α" - "Β" - "Γ" - "Δ" - "Ε" - "Ζ" - "Η" - "Θ" - "Ι" - "Κ" - "Λ" - "Μ" - "Ν" - "Ξ" - "Ο" - "Π" - "Ρ" - "Σ" - "Τ" - "Υ" - "Φ" - "Χ" - "Ψ" + "Α", + "Β", + "Γ", + "Δ", + "Ε", + "Ζ", + "Η", + "Θ", + "Ι", + "Κ", + "Λ", + "Μ", + "Ν", + "Ξ", + "Ο", + "Π", + "Ρ", + "Σ", + "Τ", + "Υ", + "Φ", + "Χ", + "Ψ", "Ω" ] # TODO FIX Psari, not PSari eng_low = [ - "a" - "v" - "g" - "d" - "e" - "z" - "i" - "th" - "i" - "k" - "l" - "m" - "n" - "x" - "o" - "p" - "r" - "s" - "t" - "y" - "f" + "a", + "v", + "g", + "d", + "e", + "z", + "i", + "th", + "i", + "k", + "l", + "m", + "n", + "x", + "o", + "p", + "r", + "s", + "t", + "y", + "f", - "ch" - "ps" + "ch", + "ps", "o" ] eng_cap = [ - "A" - "V" - "G" - "D" - "E" - "Z" - "I" - "TH" - "I" - "K" - "L" - "M" - "N" - "X" - "O" - "P" - "R" - "S" - "T" - "Y" - "F" - "CH" - "PS" + "A", + "V", + "G", + "D", + "E", + "Z", + "I", + "TH", + "I", + "K", + "L", + "M", + "N", + "X", + "O", + "P", + "R", + "S", + "T", + "Y", + "F", + "CH", + "PS", "O" ] # Simple digraphs with no extra rules or edge cases # No need for mixed casing "Γγ" or "Γξ" against Greek phonology el_simple_digraphs = [ - "γγ" - "γξ" + "γγ", + "γξ", "γχ" ] el_simple_cap_digraphs = [ - "ΓΓ" - "ΓΞ" + "ΓΓ", + "ΓΞ", "ΓΧ" ] eng_simple_digraphs = [ - "ng" - "nx" + 
"ng", + "nx", "nch" ] eng_simple_cap_digraphs = [ - "NG" - "NX" + "NG", + "NX", "NCH" ] # Accent based digraphs el_low_acc_digraphs = [ - "άυ" - "αϋ" + "άυ", + "αϋ", - "έυ" - "εϋ" + "έυ", + "εϋ", - "ήυ" + "ήυ", "ηϋ" ] el_mix_acc_digraphs = [ - "Άυ" - "Αϋ" + "Άυ", + "Αϋ", - "Έυ" - "Εϋ" + "Έυ", + "Εϋ", - "Ήυ" + "Ήυ", "Ηϋ" ] el_cap_acc_digraphs = [ - "ΆΥ" - "ΑΫ" + "ΆΥ", + "ΑΫ", - "ΈΥ" - "ΕΫ" + "ΈΥ", + "ΕΫ", - "ΉΥ" + "ΉΥ", "ΗΫ" ] el_mp_digraph = [ - "ΜΠ" - "Μπ" + "ΜΠ", + "Μπ", "μπ" ] eng_mp_digraph_0 = [ - "B" - "B" + "B", + "B", "b" ] eng_mp_digraph_1 = [ - "MP" - "Mp" + "MP", + "Mp", "mp" ] el_xu_digraphs = [ - "αυ" - "ευ" + "αυ", + "ευ", "ηυ" ] eng_xu_digraphs = [ - "av" - "af" - "ev" - "ef" - "iv" + "av", + "af", + "ev", + "ef", + "iv", "if" ] # List related to xu lists xu_sound_modifiers_v = [ # β, γ, δ, ζ, λ, μ, ν, ρ, α, ε, η, ι, ο, υ, ω - "β" - "γ" - "δ" - "ζ" - "λ" - "μ" - "ν" - "ρ" + "β", + "γ", + "δ", + "ζ", + "λ", + "μ", + "ν", + "ρ", - "α" - "ε" - "η" - "ι" - "ο" - "υ" + "α", + "ε", + "η", + "ι", + "ο", + "υ", "ω" ] xu_sound_modifiers_f = [ # θ, κ, ξ, π, σ, τ, φ, χ, ψ, empty space - "θ" - "κ" - "ξ" - "π" - "σ" - "τ" - "φ" - "χ" + "θ", + "κ", + "ξ", + "π", + "σ", + "τ", + "φ", + "χ", "ψ" ] print(string) new_string = string +# Program can't handle ς, replace it + new_string = new_string.replace("ς","σ") # if el_low_acc_digraphs or el_mix_acc_digraphs or el_cap_acc_digraphs in string: # Do nothing, we don't care # Replace all digraphs so they're ignored by the simple transcription for i in el_simple_digraphs: if i in string: - new_string.replace(i, eng_simple_digraphs[el_simple_digraphs.index(i)]) + print(new_string.replace(i, eng_simple_digraphs[el_simple_digraphs.index(i)])) + new_string = new_string.replace(i, eng_simple_digraphs[el_simple_digraphs.index(i)]) for i in el_simple_cap_digraphs: if i in string: - new_string.replace(i, eng_simple_cap_digraphs[el_simple_digraphs.index(i)]) + new_string = new_string.replace(i, 
eng_simple_cap_digraphs[el_simple_digraphs.index(i)]) # At the moment we don't care, but in future we need to account for accents too # for i in el_low_acc_digraphs # if i in string: @@ -237,20 +240,21 @@ def greek_iso_transliteration(string: str): for i in el_mp_digraph: if i in string: if string.startswith(i): - new_string.replace(i, eng_mp_digraph_0[el_mp_digraph.index(i)], 1) - new_string.replace(i, eng_mp_digraph_1[el_mp_digraph.index(i)]) + new_string = new_string.replace(i, eng_mp_digraph_0[el_mp_digraph.index(i)], 1) + new_string = new_string.replace(i, eng_mp_digraph_1[el_mp_digraph.index(i)]) for i in el_xu_digraphs: if i in string: for loop in xu_sound_modifiers_f: if string[string.find(i)+1] in loop: - new_string.replace(i, eng_xu_digraphs[el_xu_digraphs.index(i)+1]) + new_string = new_string.replace(i, eng_xu_digraphs[el_xu_digraphs.index(i)+1]) for loop in xu_sound_modifiers_v: if string[string.find(i)+1] in loop: - new_string.replace(i, eng_xu_digraphs[el_xu_digraphs.index(i)]) + new_string = new_string.replace(i, eng_xu_digraphs[el_xu_digraphs.index(i)]) for i in el_low: if i in el_low: - new_string.replace(i, eng_low[el_low.index(i)]) + new_string = new_string.replace(i, eng_low[el_low.index(i)]) for i in el_cap: if i in el_cap: - new_string.replace(i, eng_cap[el_cap.index(i)]) + new_string = new_string.replace(i, eng_cap[el_cap.index(i)]) return new_string +print(greek_iso_transliteration("Μπαμπας")) From b3561ba2a33c1e731970d24bb8af6811a3727604 Mon Sep 17 00:00:00 2001 From: athenionn Date: Wed, 4 Jan 2023 22:57:40 +0200 Subject: [PATCH 3/8] FIXED IT! SHOULD WORK FINE! 
--- src/greeklt/greek_iso_transliteration.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/greeklt/greek_iso_transliteration.py b/src/greeklt/greek_iso_transliteration.py index beabb20..5cabeba 100644 --- a/src/greeklt/greek_iso_transliteration.py +++ b/src/greeklt/greek_iso_transliteration.py @@ -257,4 +257,3 @@ def greek_iso_transliteration(string: str): if i in el_cap: new_string = new_string.replace(i, eng_cap[el_cap.index(i)]) return new_string -print(greek_iso_transliteration("Μπαμπας")) From 20247564a6ce55014c8d4f27bfb369e7b9a3950d Mon Sep 17 00:00:00 2001 From: athenionn Date: Thu, 5 Jan 2023 19:09:18 +0200 Subject: [PATCH 4/8] =?UTF-8?q?Bugfixing,=20cleaned=20up,=20added=20mixed-?= =?UTF-8?q?casing=20support=20(=CE=A8=CE=B1=CF=81=CE=B9=20is=20Psari,=20no?= =?UTF-8?q?t=20PSari,=20but=20=CE=A8=CE=91=CE=A1=CE=99=20is=20still=20PSAR?= =?UTF-8?q?I)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/greeklt/greek_iso_transliteration.py | 85 ++++++++++++++---------- 1 file changed, 49 insertions(+), 36 deletions(-) diff --git a/src/greeklt/greek_iso_transliteration.py b/src/greeklt/greek_iso_transliteration.py index 5cabeba..afa517d 100644 --- a/src/greeklt/greek_iso_transliteration.py +++ b/src/greeklt/greek_iso_transliteration.py @@ -1,4 +1,5 @@ def greek_iso_transliteration(string: str): +# This should all probably be a CSV or something el_low = [ "α", "β", @@ -127,38 +128,44 @@ def greek_iso_transliteration(string: str): "NX", "NCH" ] -# Accent based digraphs - el_low_acc_digraphs = [ - "άυ", - "αϋ", - - "έυ", - "εϋ", - - "ήυ", - "ηϋ" + el_mono_digraph_sub = [ + "Θ", + "Χ", + "Ψ" ] - el_mix_acc_digraphs = [ - "Άυ", - "Αϋ", - - "Έυ", - "Εϋ", - - "Ήυ", - "Ηϋ" - ] - el_cap_acc_digraphs = [ - "ΆΥ", - "ΑΫ", - - "ΈΥ", - "ΕΫ", - - "ΉΥ", - "ΗΫ" - ] +# Accent based digraphs +# el_low_acc_digraphs = [ +# "άυ", +# "αϋ", +# +# "έυ", +# "εϋ", +# +# "ήυ", +# "ηϋ" +# +# ] +# el_mix_acc_digraphs = [ +# "Άυ", +# "Αϋ", +# 
+# "Έυ", +# "Εϋ", +# +# "Ήυ", +# "Ηϋ" +# ] +# el_cap_acc_digraphs = [ +# "ΆΥ", +# "ΑΫ", +# +# "ΈΥ", +# "ΕΫ", +# +# "ΉΥ", +# "ΗΫ" +# ] el_mp_digraph = [ "ΜΠ", "Μπ", @@ -221,10 +228,11 @@ def greek_iso_transliteration(string: str): ] print(string) new_string = string -# Program can't handle ς, replace it +# Replace ς with σ new_string = new_string.replace("ς","σ") # if el_low_acc_digraphs or el_mix_acc_digraphs or el_cap_acc_digraphs in string: -# Do nothing, we don't care +# Do nothing, we don't care with current implementation + # Replace all digraphs so they're ignored by the simple transcription for i in el_simple_digraphs: if i in string: @@ -233,10 +241,7 @@ def greek_iso_transliteration(string: str): for i in el_simple_cap_digraphs: if i in string: new_string = new_string.replace(i, eng_simple_cap_digraphs[el_simple_digraphs.index(i)]) -# At the moment we don't care, but in future we need to account for accents too -# for i in el_low_acc_digraphs -# if i in string: -# new_string.replace(i, ) +# TODO: ROMANIZE ACCENTS for i in el_mp_digraph: if i in string: if string.startswith(i): @@ -250,10 +255,18 @@ def greek_iso_transliteration(string: str): for loop in xu_sound_modifiers_v: if string[string.find(i)+1] in loop: new_string = new_string.replace(i, eng_xu_digraphs[el_xu_digraphs.index(i)]) +# Simple transliteration for i in el_low: if i in el_low: new_string = new_string.replace(i, eng_low[el_low.index(i)]) for i in el_cap: if i in el_cap: new_string = new_string.replace(i, eng_cap[el_cap.index(i)]) + #Normalize capital letters if needed + print(string) + for i in el_mono_digraph_sub: + if string.startswith(i): + if string[2].islower() == True: + new_string = new_string.replace(new_string[1], new_string[1].lower()) return new_string + From a5d8fbe64bbb5fa195fd37bebd37edcb9e2115e8 Mon Sep 17 00:00:00 2001 From: athenionn Date: Wed, 11 Jan 2023 22:25:30 +0200 Subject: [PATCH 5/8] =?UTF-8?q?Bugfixing,=20cleaned=20up,=20added=20mixed-?= 
=?UTF-8?q?casing=20support=20(=CE=A8=CE=B1=CF=81=CE=B9=20is=20Psari,=20no?= =?UTF-8?q?t=20PSari,=20but=20=CE=A8=CE=91=CE=A1=CE=99=20is=20still=20PSAR?= =?UTF-8?q?I)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ...ek_iso_transliteration.py => greek_elot_transliteration.py} | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) rename src/greeklt/{greek_iso_transliteration.py => greek_elot_transliteration.py} (99%) diff --git a/src/greeklt/greek_iso_transliteration.py b/src/greeklt/greek_elot_transliteration.py similarity index 99% rename from src/greeklt/greek_iso_transliteration.py rename to src/greeklt/greek_elot_transliteration.py index afa517d..6eaa6cd 100644 --- a/src/greeklt/greek_iso_transliteration.py +++ b/src/greeklt/greek_elot_transliteration.py @@ -1,4 +1,4 @@ -def greek_iso_transliteration(string: str): +def greek_elot_transliteration(string: str): # This should all probably be a CSV or something el_low = [ "α", @@ -269,4 +269,3 @@ def greek_iso_transliteration(string: str): if string[2].islower() == True: new_string = new_string.replace(new_string[1], new_string[1].lower()) return new_string - From 0815eec8bebaf9005b6ab10cba4fb747985995c8 Mon Sep 17 00:00:00 2001 From: athenionn Date: Thu, 12 Jan 2023 19:53:27 +0200 Subject: [PATCH 6/8] =?UTF-8?q?Added=20capability=20for=20transliterating?= =?UTF-8?q?=20entire=20sentences.=20Fixed=20bugs=20that=20arose=20from=20t?= =?UTF-8?q?hat.=20Made=20lists=20into=20dictionary=20and=20simplified=20th?= =?UTF-8?q?e=20"dumb"=20transliteration=20code=20thanks=20to=20that.=20Use?= =?UTF-8?q?d=20remove=5Faccentuation=20to=20allow=20for=20input=20of=20acc?= =?UTF-8?q?ented=20sentences.=20Added=20proper=20transliteration=20for=20"?= =?UTF-8?q?=CE=BF=CF=85"=20in=20all=20cases=20accented=20or=20unaccented.?= =?UTF-8?q?=20To=20facilitate=20that,=20changed=20remove=5Faccentuation.py?= =?UTF-8?q?=20to=20have=20an=20optional=20input=20that=20skips=20the=20dia?= 
=?UTF-8?q?iresis=20step?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/greeklt/greek_elot_transliteration.py | 295 ++++++++++------------ src/greeklt/remove_accentuation.py | 7 +- 2 files changed, 133 insertions(+), 169 deletions(-) diff --git a/src/greeklt/greek_elot_transliteration.py b/src/greeklt/greek_elot_transliteration.py index 6eaa6cd..0d0636a 100644 --- a/src/greeklt/greek_elot_transliteration.py +++ b/src/greeklt/greek_elot_transliteration.py @@ -1,128 +1,76 @@ def greek_elot_transliteration(string: str): -# This should all probably be a CSV or something - el_low = [ - "α", - "β", - "γ", - "δ", - "ε", - "ζ", - "η", - "θ", - "ι", - "κ", - "λ", - "μ", - "ν", - "ξ", - "ο", - "π", - "ρ", - "σ", - "τ", - "υ", - "φ", - "χ", - "ψ", - "ω" - ] - el_cap = [ - "Α", - "Β", - "Γ", - "Δ", - "Ε", - "Ζ", - "Η", - "Θ", - "Ι", - "Κ", - "Λ", - "Μ", - "Ν", - "Ξ", - "Ο", - "Π", - "Ρ", - "Σ", - "Τ", - "Υ", - "Φ", - "Χ", - "Ψ", - "Ω" - ] - # TODO FIX Psari, not PSari - eng_low = [ - "a", - "v", - "g", - "d", - "e", - "z", - "i", - "th", - "i", - "k", - "l", - "m", - "n", - "x", - "o", - "p", - "r", - "s", - "t", - "y", - "f", - - "ch", - "ps", - "o" - ] - eng_cap = [ - "A", - "V", - "G", - "D", - "E", - "Z", - "I", - "TH", - "I", - "K", - "L", - "M", - "N", - "X", - "O", - "P", - "R", - "S", - "T", - "Y", - "F", - "CH", - "PS", - "O" - ] + from remove_accentuation import remove_accentuation + reference_string = string + string = remove_accentuation(string, 1) + lowcase = { + 'α': 'a', + 'β': 'v', + 'γ': 'g', + 'δ': 'd', + 'ε': 'e', + 'ζ': 'z', + 'η': 'i', + 'θ': 'th', + 'ι': 'i', + 'κ': 'k', + 'λ': 'l', + 'μ': 'm', + 'ν': 'n', + 'ξ': 'x', + 'ο': 'o', + 'π': 'p', + 'ρ': 'r', + 'σ': 's', + 'τ': 't', + 'υ': 'y', + 'φ': 'f', + 'χ': 'ch', + 'ψ': 'ps', + 'ω': 'o' + } + caps = { + 'Α': 'A', + 'Β': 'V', + 'Γ': 'G', + 'Δ': 'D', + 'Ε': 'E', + 'Ζ': 'Z', + 'Η': 'I', + 'Θ': 'TH', + 'Ι': 'I', + 'Κ': 'K', + 'Λ': 'L', + 'Μ': 'M', + 'Ν': 
'N', + 'Ξ': 'X', + 'Ο': 'O', + 'Π': 'P', + 'Ρ': 'R', + 'Σ': 'S', + 'Τ': 'T', + 'Υ': 'Y', + 'Φ': 'F', + 'Χ': 'CH', + 'Ψ': 'PS', + 'Ω': 'O' + } # Simple digraphs with no extra rules or edge cases # No need for mixed casing "Γγ" or "Γξ" against Greek phonology el_simple_digraphs = [ - "γγ", - "γξ", - "γχ" + 'γγ', + 'γξ', + 'γχ' ] - el_simple_cap_digraphs = [ + eng_simple_digraphs = [ + 'ng', + 'nx', + 'nch' + ] + el_simple_cap_digraphs = { "ΓΓ", "ΓΞ", "ΓΧ" - ] - eng_simple_digraphs = [ - "ng", - "nx", - "nch" - ] + } eng_simple_cap_digraphs = [ "NG", "NX", @@ -130,9 +78,9 @@ def greek_elot_transliteration(string: str): ] el_mono_digraph_sub = [ - "Θ", - "Χ", - "Ψ" + "TH", + "CH", + "PS" ] # Accent based digraphs # el_low_acc_digraphs = [ @@ -151,7 +99,7 @@ def greek_elot_transliteration(string: str): # "Αϋ", # # "Έυ", -# "Εϋ", +# "Εϋ", # # "Ήυ", # "Ηϋ" @@ -186,17 +134,18 @@ def greek_elot_transliteration(string: str): "ευ", "ηυ" ] - eng_xu_digraphs = [ + eng_xu_digraphs_v = [ "av", - "af", "ev", + "iv" + ] + eng_xu_digraphs_f = [ + "af", "ef", - "iv", "if" ] -# List related to xu lists xu_sound_modifiers_v = [ -# β, γ, δ, ζ, λ, μ, ν, ρ, α, ε, η, ι, ο, υ, ω + "β", "γ", "δ", @@ -214,8 +163,9 @@ def greek_elot_transliteration(string: str): "υ", "ω" ] +# +empty space (accounted for in code) xu_sound_modifiers_f = [ -# θ, κ, ξ, π, σ, τ, φ, χ, ψ, empty space + "θ", "κ", "ξ", @@ -226,46 +176,59 @@ def greek_elot_transliteration(string: str): "χ", "ψ" ] - print(string) - new_string = string # Replace ς with σ - new_string = new_string.replace("ς","σ") + prep_string = string.replace("ς", "σ") # if el_low_acc_digraphs or el_mix_acc_digraphs or el_cap_acc_digraphs in string: # Do nothing, we don't care with current implementation +# Prepare the Unicode tables for use with translate() + lowcase = string.maketrans(lowcase) + caps = string.maketrans(caps) + reference_string_list = reference_string.split(" ") + new_string_list = prep_string.split(" ") + output = "" + 
current_iteration = 0 + for new_string in new_string_list: + # Replace all digraphs, so they're ignored by the simple transcription + for i in el_simple_digraphs: + if i in string: + new_string = new_string.replace(i, eng_simple_digraphs[el_simple_digraphs.index(i)]) + for i in el_simple_cap_digraphs: + if i in string: + new_string = new_string.replace(i, eng_simple_cap_digraphs[el_simple_digraphs.index(i)]) +# Check which "mp" sound to use depending on if it's at word start + for i in el_mp_digraph: + if i in string: + if string.startswith(i): + new_string = new_string.replace(i, eng_mp_digraph_0[el_mp_digraph.index(i)], 1) + new_string = new_string.replace(i, eng_mp_digraph_1[el_mp_digraph.index(i)]) +# Check what VOWEL+"υ" should transliterate to depending on the following letter. + for i in el_xu_digraphs: + if i in new_string: + if len(new_string) > 2: # Make sure we're not calling an out of range index + for loop in xu_sound_modifiers_f: + if new_string[new_string.find(i)+2] in loop: + new_string = new_string.replace(i, eng_xu_digraphs_f[el_xu_digraphs.index(i)]) + + for loop in xu_sound_modifiers_v: + if new_string[new_string.find(i)+2] in loop: + new_string = new_string.replace(i, eng_xu_digraphs_v[el_xu_digraphs.index(i)]) + if len(new_string) == 2: # Account for VOWEL+"υ" at end of sentence + new_string = new_string.replace(i, eng_xu_digraphs_f[el_xu_digraphs.index(i)]) + if "ου" in reference_string[current_iteration]: + new_string = new_string.replace("ου", "ou") + if "όυ" or "οϋ" in reference_string[current_iteration]: + new_string = new_string.replace("ου", "oy") + current_iteration += 1 -# Replace all digraphs so they're ignored by the simple transcription - for i in el_simple_digraphs: - if i in string: - print(new_string.replace(i, eng_simple_digraphs[el_simple_digraphs.index(i)])) - new_string = new_string.replace(i, eng_simple_digraphs[el_simple_digraphs.index(i)]) - for i in el_simple_cap_digraphs: - if i in string: - new_string = 
new_string.replace(i, eng_simple_cap_digraphs[el_simple_digraphs.index(i)]) -# TODO: ROMANIZE ACCENTS - for i in el_mp_digraph: - if i in string: - if string.startswith(i): - new_string = new_string.replace(i, eng_mp_digraph_0[el_mp_digraph.index(i)], 1) - new_string = new_string.replace(i, eng_mp_digraph_1[el_mp_digraph.index(i)]) - for i in el_xu_digraphs: - if i in string: - for loop in xu_sound_modifiers_f: - if string[string.find(i)+1] in loop: - new_string = new_string.replace(i, eng_xu_digraphs[el_xu_digraphs.index(i)+1]) - for loop in xu_sound_modifiers_v: - if string[string.find(i)+1] in loop: - new_string = new_string.replace(i, eng_xu_digraphs[el_xu_digraphs.index(i)]) -# Simple transliteration - for i in el_low: - if i in el_low: - new_string = new_string.replace(i, eng_low[el_low.index(i)]) - for i in el_cap: - if i in el_cap: - new_string = new_string.replace(i, eng_cap[el_cap.index(i)]) - #Normalize capital letters if needed - print(string) - for i in el_mono_digraph_sub: - if string.startswith(i): - if string[2].islower() == True: - new_string = new_string.replace(new_string[1], new_string[1].lower()) - return new_string +# Simple transcription + new_string = new_string.translate(caps) + new_string = new_string.translate(lowcase) +# Normalize capital letters if needed + for i in el_mono_digraph_sub: + if new_string.startswith(i): + if new_string[3].islower() is True: + new_string = new_string.replace(new_string[1], new_string[1].lower()) + new_string += " " + output += new_string + return output +print(greek_elot_transliteration("Με λένε στέλιο και λατρεύω το τρόυ")) \ No newline at end of file diff --git a/src/greeklt/remove_accentuation.py b/src/greeklt/remove_accentuation.py index 978dd54..dfbedfc 100644 --- a/src/greeklt/remove_accentuation.py +++ b/src/greeklt/remove_accentuation.py @@ -1,4 +1,4 @@ -def remove_accentuation(string: str): +def remove_accentuation(string: str, modulus=0): accents = { "ά": "α", "έ": "ε", @@ -23,8 +23,9 @@ def 
remove_accentuation(string: str): char = c if c in accents.keys(): char = accents[c] - if c in dieresis.keys() and prev_char in ("ά", "ό", "έ"): - char = dieresis[c] + if modulus == 0: + if c in dieresis.keys() and prev_char in ("ά", "ό", "έ"): + char = dieresis[c] prev_char = c new_string += char return new_string From 9954e623f87c1315c7b0f1ff406542b1851f3d6f Mon Sep 17 00:00:00 2001 From: athenionn Date: Thu, 19 Jan 2023 19:48:30 +0200 Subject: [PATCH 7/8] In remove_accentuation.py: Made readability & common sense changes, added features as required for greek_elot_transliteration.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In greek_elot_transliteration.py: Fixed bugs revolving around ου, made some mild readability improvements In README.md: Minor grammar changes, documented changes to remove_accentuation.py as well as usage of greek_elot_transliteration.py In test_tests.py: Added rudimentary tests. Not sure if they're properly implemented. 
--- README.md | 29 +++++++++++++++--- src/greeklt/greek_elot_transliteration.py | 37 +++++++++++------------ src/greeklt/remove_accentuation.py | 8 +++-- tests/test_tests.py | 9 ++++++ 4 files changed, 57 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index ed8b184..0d17e58 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ Import as any other module using the name `greeklt` - **capitalize(word)** -Makes a word all caps without accentuation but adds dieresis when necessary: +Makes a word all caps without accentuation, adding dieresis when necessary: ``` word = "γάιδαρος" @@ -25,7 +25,7 @@ word = "γάιδαρος" ΓΑΪΔΑΡΟΣ ``` -- **remove_accentuation(word)** +- **remove_accentuation(word, modifier=0)** Removes accentuation but adds dieresis when necessary, without capitalizing: @@ -36,7 +36,7 @@ word = "γάιδαρος" γαϊδαρος ``` -Works exceptionally well when you want to sort a list aphabetically and not based on unicode: +Works exceptionally well when you want to sort a list alphabetically and not based on unicode: ``` cities = ["Όσλο", "Λευκωσία", "Άκαμπα", "Ζυρίχη", "Ρώμη"] @@ -47,7 +47,14 @@ cities = ["Όσλο", "Λευκωσία", "Άκαμπα", "Ζυρίχη", "Ρώ >>> sorted(cities, key=remove_accentuation) ["Άκαμπα", "Ζυρίχη", "Λευκωσία", "Όσλο", "Ρώμη"] ``` +Also contains an option to never add dieresis, enabled by setting modifier to 1: +``` +string = "Αιδόνι" + +>>> print(remove_accentuation(string)) +Αιδονι +``` - **convert_final_s(word)** Checks last letter of each word in a string. 
If it is a `σ` it is converted into a `ς` (final `σ`): @@ -67,6 +74,7 @@ Transliterates a string written with latin characters into it's equivalent Greek ``` This can come quite in handy when a user forgets to change the language and the word looks the same both in latin and Greek: + ``` # ANNA written in latin (Anna) name = "ANNA" @@ -77,4 +85,17 @@ False # Both look the same but are different unnicode characters >>> ANNA == ΑΝΝΑ False -``` \ No newline at end of file +``` + +- **greek_elot_Transliteration(word/sentence)** + +Transliterates a string according to the ELOT 743 standard, frequently used in formal transcription settings (e.g passports, ID, etc) + +``` + +name = "Ελευθέριος Βενιζέλος" + +>>> latin_name = greek_elot_transliteration(name) +>>> print(latin_name) +Eleftherios Venizelos + diff --git a/src/greeklt/greek_elot_transliteration.py b/src/greeklt/greek_elot_transliteration.py index 0d0636a..5539a2d 100644 --- a/src/greeklt/greek_elot_transliteration.py +++ b/src/greeklt/greek_elot_transliteration.py @@ -1,8 +1,8 @@ -def greek_elot_transliteration(string: str): +def greek_elot_transliteration(string): from remove_accentuation import remove_accentuation - reference_string = string +# reference_string = string string = remove_accentuation(string, 1) - lowcase = { + lowercase = { 'α': 'a', 'β': 'v', 'γ': 'g', @@ -59,22 +59,26 @@ def greek_elot_transliteration(string: str): el_simple_digraphs = [ 'γγ', 'γξ', - 'γχ' + 'γχ', + 'ου' ] eng_simple_digraphs = [ 'ng', 'nx', - 'nch' + 'nch', + 'ou' ] - el_simple_cap_digraphs = { + el_simple_cap_digraphs = [ "ΓΓ", "ΓΞ", - "ΓΧ" - } + "ΓΧ", + "ΟΥ" + ] eng_simple_cap_digraphs = [ "NG", "NX", - "NCH" + "NCH", + "OU" ] el_mono_digraph_sub = [ @@ -181,12 +185,11 @@ def greek_elot_transliteration(string: str): # if el_low_acc_digraphs or el_mix_acc_digraphs or el_cap_acc_digraphs in string: # Do nothing, we don't care with current implementation # Prepare the Unicode tables for use with translate() - lowcase = 
string.maketrans(lowcase) + lowercase = string.maketrans(lowercase) caps = string.maketrans(caps) - reference_string_list = reference_string.split(" ") +# reference_string_list = reference_string.split(" ") new_string_list = prep_string.split(" ") output = "" - current_iteration = 0 for new_string in new_string_list: # Replace all digraphs, so they're ignored by the simple transcription for i in el_simple_digraphs: @@ -194,7 +197,7 @@ def greek_elot_transliteration(string: str): new_string = new_string.replace(i, eng_simple_digraphs[el_simple_digraphs.index(i)]) for i in el_simple_cap_digraphs: if i in string: - new_string = new_string.replace(i, eng_simple_cap_digraphs[el_simple_digraphs.index(i)]) + new_string = new_string.replace(i, eng_simple_cap_digraphs[el_simple_cap_digraphs.index(i)]) # Check which "mp" sound to use depending on if it's at word start for i in el_mp_digraph: if i in string: @@ -214,15 +217,10 @@ def greek_elot_transliteration(string: str): new_string = new_string.replace(i, eng_xu_digraphs_v[el_xu_digraphs.index(i)]) if len(new_string) == 2: # Account for VOWEL+"υ" at end of sentence new_string = new_string.replace(i, eng_xu_digraphs_f[el_xu_digraphs.index(i)]) - if "ου" in reference_string[current_iteration]: - new_string = new_string.replace("ου", "ou") - if "όυ" or "οϋ" in reference_string[current_iteration]: - new_string = new_string.replace("ου", "oy") - current_iteration += 1 # Simple transcription new_string = new_string.translate(caps) - new_string = new_string.translate(lowcase) + new_string = new_string.translate(lowercase) # Normalize capital letters if needed for i in el_mono_digraph_sub: if new_string.startswith(i): @@ -231,4 +229,3 @@ def greek_elot_transliteration(string: str): new_string += " " output += new_string return output -print(greek_elot_transliteration("Με λένε στέλιο και λατρεύω το τρόυ")) \ No newline at end of file diff --git a/src/greeklt/remove_accentuation.py b/src/greeklt/remove_accentuation.py index 
dfbedfc..05d7eaa 100644 --- a/src/greeklt/remove_accentuation.py +++ b/src/greeklt/remove_accentuation.py @@ -1,4 +1,4 @@ -def remove_accentuation(string: str, modulus=0): +def remove_accentuation(string: str, modifier=0): accents = { "ά": "α", "έ": "ε", @@ -17,15 +17,19 @@ def remove_accentuation(string: str, modulus=0): "Ώ": "Ω", } dieresis = {"ι": "ϊ", "υ": "ϋ"} + dieresis_reverse = {"ϊ": "ι", "ϋ": "υ"} new_string = "" prev_char = 0 for c in string: char = c if c in accents.keys(): char = accents[c] - if modulus == 0: + if modifier == 0: if c in dieresis.keys() and prev_char in ("ά", "ό", "έ"): char = dieresis[c] + if modifier == 1: # Remove dieresis + if c in dieresis_reverse.keys(): + char = dieresis_reverse[c] prev_char = c new_string += char return new_string diff --git a/tests/test_tests.py b/tests/test_tests.py index cf4758c..915f779 100644 --- a/tests/test_tests.py +++ b/tests/test_tests.py @@ -2,6 +2,7 @@ from src.greeklt.remove_accentuation import remove_accentuation from src.greeklt.greek_transliteration import greek_transliteration from src.greeklt.convert_final_s import convert_final_s +from src.greeklt.greek_elot_transliteration import greek_elot_transliteration def test_capitalize(): @@ -28,3 +29,11 @@ def test_transilteration(): def test_convert_final_s(): assert convert_final_s("Φάροσ ΦΑΡΟΣ φάρος") == "Φάρος ΦΑΡΟΣ φάρος" + + +def test_elot_transliteration(): + + assert greek_elot_transliteration("αυγό") == "avgo", "Failure in vowel+υ conversion, v" + assert greek_elot_transliteration("αυτο") == "afto", "Failure in vowel+υ conversion, f" + assert greek_elot_transliteration("αγγελος το αγχος του") == "angelos to anchos tou", "Failure in simple diphthongs" + assert greek_elot_transliteration("Θανασης") == "Thanasis", "Failure in diphthong capital normalization" From da43311fd15c54b4770e2eaf558b2d10819e5af7 Mon Sep 17 00:00:00 2001 From: athenionn Date: Thu, 19 Jan 2023 19:55:56 +0200 Subject: [PATCH 8/8] Fixed merge conflicts in a very weird way. 
I need to get better with git. --- README.md | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/README.md b/README.md index 0d17e58..3bcc038 100644 --- a/README.md +++ b/README.md @@ -87,6 +87,27 @@ False False ``` +There's also the ability to convert a word from Latin to its intended accented form in Greek: + +``` +>>> greek_transliteration("P;ita soybl;aki") +Πίτα σουβλάκι +>>> greek_transliteration("kaWiki") +καΐκι +>>> greek_transliteration("pro:yp;ouesh") +προϋπόθεση +>>> greek_transliteration("GA:IDAROS") +ΓΑΪΔΑΡΟΣ +``` +Note: The function assumes that the user intended to write the word in Greek using the correct key sequence but just didn't switch their keyboard to Greek. It doesn't convert from Greeklish! + +``` +# Wrong key sequence by user. +# They're supposed to press SHIFT + W and not just w for the ΅ character to appear. +>>> greek_transliteration("kawiki") +καςικι +``` + - **greek_elot_Transliteration(word/sentence)** Transliterates a string according to the ELOT 743 standard, frequently used in formal transcription settings (e.g passports, ID, etc) @@ -99,3 +120,4 @@ name = "Ελευθέριος Βενιζέλος" >>> print(latin_name) Eleftherios Venizelos +``` \ No newline at end of file