Skip to content

Commit

Permalink
Russian: update rules and cleanup
Browse files Browse the repository at this point in the history
  • Loading branch information
kdarkhan committed Dec 6, 2017
1 parent eff67f0 commit c8fc20e
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 64 deletions.
78 changes: 25 additions & 53 deletions shared/src/main/scala/translit/Russian.scala
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,12 @@ object Russian extends Language {
val uniGrams = Map(
'a' -> 'а',
'b' -> 'б',
'v' -> 'в',
'g' -> 'г',
'c' -> 'ц',
'd' -> 'д',
'e' -> 'е',
'z' -> 'з',
'f' -> 'ф',
'g' -> 'г',
'h' -> 'х',
'i' -> 'и',
'j' -> 'й',
'k' -> 'к',
Expand All @@ -19,64 +20,33 @@ object Russian extends Language {
'n' -> 'н',
'o' -> 'о',
'p' -> 'п',
'q' -> 'щ',
'r' -> 'р',
's' -> 'с',
't' -> 'т',
'u' -> 'у',
'f' -> 'ф',
'v' -> 'в',
'w' -> 'ш',
'x' -> 'х',
'h' -> 'х',
'c' -> 'ц',
'w' -> 'щ',
'#' -> 'ъ',
'y' -> 'ы'
'y' -> 'ы',
'z' -> 'з',
'\"' -> 'ъ'
)

val biGrams = Map(
"jo" -> 'ё',
"yo" -> 'ё',
"zh" -> 'ж',
"ch" -> 'ч',
"sh" -> 'ш',
"ye" -> 'э',
"yu" -> 'ю',
"ju" -> 'ю',
"ya" -> 'я',
"ja" -> 'я'
)

val triGrams = Map(
"shh" -> 'щ'
"ye" -> 'э',
"zh" -> 'ж',
"yo" -> 'ё',
"yu" -> 'ю'
)

// Tried to use prefix rules, but there are many exceptions in the Russian language,
// e.g. фольклор, пальцем.
val apostropheSuffix = Set(
"ya",
"ja",
"yo",
"jo",
"i",
"e",
"yu",
"yu",
"",
)
val triGrams = Map.empty[String, Char]

val apostrophePrefix = Set(
"b",
"v",
"d",
"z",
"k",
"l",
"m",
"n",
"p",
"r",
"c",
"t",
"sh"
val fourGrams = Map(
"shch" -> 'щ'
)

/**
Expand All @@ -90,8 +60,13 @@ object Russian extends Language {
offset: Int,
apostrophes: Boolean = true): (Int, Char) = {
val ofs = offset + 1
if (ofs >= 3 &&
triGrams.contains(text.substring(ofs - 3, ofs).toLowerCase)) {
if (ofs >= 4 &&
fourGrams.contains(text.substring(ofs - 4, ofs).toLowerCase)) {
val chars = text.substring(ofs - 4, ofs)
val cyrillic = fourGrams(chars.toLowerCase)
(-2, restoreCaseFirst(chars, cyrillic))
} else if (ofs >= 3 &&
triGrams.contains(text.substring(ofs - 3, ofs).toLowerCase)) {
val chars = text.substring(ofs - 3, ofs)
val cyrillic = triGrams(chars.toLowerCase)
(-1, restoreCaseAll(chars, cyrillic))
Expand All @@ -103,10 +78,7 @@ object Russian extends Language {
} else if (uniGrams.contains(text(ofs - 1).toLower)) {
val cyrillic = uniGrams(text(ofs - 1).toLower)
(0, if (text(ofs - 1).isUpper) cyrillic.toUpper else cyrillic)
} else if (text(ofs - 1) == '\'' && apostrophes && (
apostrophePrefix.contains(text.slice(ofs - 3, ofs - 1)) ||
apostrophePrefix.contains(text.slice(ofs - 2, ofs - 1))
)) {
} else if (text(ofs - 1) == '\'' && apostrophes) {
if (text(ofs - 2).isUpper) (0, 'Ь') else (0, 'ь')
} else {
(0, text(ofs - 1))
Expand Down
16 changes: 5 additions & 11 deletions shared/src/test/scala/translit/RussianSpec.scala
Original file line number Diff line number Diff line change
Expand Up @@ -8,22 +8,21 @@ class RussianSpec extends FunSuite {
"Борис" -> "Boris",
"Валера" -> "Valera",
"гвоздь" -> "gvozd'",
"днище" -> "dnishhe",
"днище" -> "dnishche",
"Емеля" -> "Emelya",
"ёлка" -> "yolka",
"ёлка" -> "jolka",
"железо" -> "zhelezo",
"зыбь" -> "zyb'",
"Ильин" -> "Il'in",
"Йемен" -> "Jemen",
"киянка" -> "kiyanka",
"лещ" -> "leshh",
"лещ" -> "leshch",
"мышьяк" -> "mysh'yak",
"Новгород" -> "Novgorod",
"овраг" -> "ovrag",
"пьянство" -> "p'yanstvo",
"роща" -> "roshha",
"съел" -> "s#el",
"роща" -> "roshcha",
"съел" -> "s\"el",
"тележка" -> "telezhka",
"ухват" -> "uxvat",
"ухват" -> "uhvat",
Expand All @@ -32,17 +31,12 @@ class RussianSpec extends FunSuite {
"цвет" -> "cvet",
"червь" -> "cherv'",
"швея" -> "shveya",
"щавель" -> "shhavel'",
"щавель" -> "shchavel'",
"электровоз" -> "yelektrovoz",
"юла" -> "yula",
"ягненок" -> "yagnenok",
)

def removeApostropheAndSoftSign(str: String): String =
str
.replaceAll("ь", "")
.replaceAll("'", "")

correctMapping.foreach {
case (cyrillic, latin) =>
test(s"$latin -> $cyrillic") {
Expand Down

0 comments on commit c8fc20e

Please sign in to comment.