Skip to content

Commit

Permalink
implement incremental translit
Browse files Browse the repository at this point in the history
  • Loading branch information
kdarkhan committed Dec 8, 2017
1 parent 0907259 commit 17b0c49
Show file tree
Hide file tree
Showing 4 changed files with 63 additions and 16 deletions.
9 changes: 6 additions & 3 deletions shared/src/main/scala/translit/Language.scala
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,17 @@ package translit
trait Language {
def latinToCyrillicOfs(text: String,
offset: Int,
apostrophes: Boolean = true): (Int, Char)
apostrophes: Boolean = true,
incrementalTranslit: Boolean = false): (Int, Char)

def latinToCyrillic(text: String, apostrophes: Boolean = true): String = {
def latinToCyrillic(text: String,
apostrophes: Boolean = true,
incrementalTranslit: Boolean = false): String = {
val result = new StringBuilder(text.length)
var offset = 0

while (offset < text.length) {
val (length, c) = latinToCyrillicOfs(text, offset, apostrophes)
val (length, c) = latinToCyrillicOfs(text, offset, apostrophes, incrementalTranslit)
if (length < 0) result.setLength(result.length + length)
result.append(c)
offset += 1
Expand Down
32 changes: 27 additions & 5 deletions shared/src/main/scala/translit/Russian.scala
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package translit
import translit.Helpers._

object Russian extends Language {

val uniGrams = Map(
'a' -> 'а',
'b' -> 'б',
Expand Down Expand Up @@ -43,9 +44,21 @@ object Russian extends Language {
"yu" -> 'ю'
)

// Bigram table used in incremental mode: the regular bigrams plus the variants
// produced by getIncrementalNgram, whose leading part is already Cyrillic.
// NOTE(review): this initializer calls latinToCyrillic(..., incrementalTranslit = true)
// while biGramsIncremental, triGramsIncremental and fourGramsIncremental are still
// unassigned. It appears to work only because the converted prefixes are a single
// character, so the `ofs >= 2/3/4` guards in latinToCyrillicOfs short-circuit before
// any table is dereferenced — assumes every biGrams key is exactly two characters; confirm.
val biGramsIncremental = getIncrementalNgram(biGrams)

// No trigrams in regular mode; the empty table lets latinToCyrillicOfs pick a
// (bigram, trigram, four-gram) triple uniformly for both modes.
val triGrams = Map.empty[String, Char]
// Incremental-mode trigram: Cyrillic "шц" followed by a Latin 'h' collapses to 'щ'
// (presumably the state reached after "shc" has already been converted — confirm).
val triGramsIncremental = Map(
"шцh" -> 'щ'
)

// Four-letter Latin sequence recognised in regular mode.
val fourGrams = Map(
"shch" -> 'щ'
)
// Empty: incremental mode performs no four-character lookups.
val fourGramsIncremental = Map.empty[String, Char]

/**
 * Extends an n-gram table with the keys that can show up during incremental input.
 *
 * For each entry, everything but the last character of the key is passed through
 * `latinToCyrillic` in incremental mode and the untouched last character is appended;
 * the resulting key is mapped to the same value. The original entries are kept.
 *
 * @param ngram table from a character sequence to its replacement character
 * @return `ngram` plus the derived entries (a derived entry wins if its key clashes)
 */
def getIncrementalNgram(ngram: Map[String, Char]): Map[String, Char] = {
  val derived = ngram.map { case (key, cyrillic) =>
    // Convert the head first, then read the last character, as the key is consumed left to right.
    val convertedHead = latinToCyrillic(key.dropRight(1), incrementalTranslit = true)
    val lastChar = key.last
    s"$convertedHead$lastChar" -> cyrillic
  }
  ngram ++ derived
}

/**
* Converts one character starting from `offset`
Expand All @@ -56,17 +69,26 @@ object Russian extends Language {
*/
def latinToCyrillicOfs(text: String,
offset: Int,
apostrophes: Boolean = true): (Int, Char) = {
apostrophes: Boolean = true,
incrementalTranslit: Boolean = false): (Int, Char) = {
val (biGramsL, triGramsL, fourGramsL) =
if (incrementalTranslit) (biGramsIncremental, triGramsIncremental, fourGramsIncremental)
else (biGrams, triGrams, fourGrams)
val ofs = offset + 1
if (ofs >= 4 &&
fourGrams.contains(text.substring(ofs - 4, ofs).toLowerCase)) {
fourGramsL.contains(text.substring(ofs - 4, ofs).toLowerCase)) {
val chars = text.substring(ofs - 4, ofs)
val cyrillic = fourGrams(chars.toLowerCase)
val cyrillic = fourGramsL(chars.toLowerCase)
(-2, restoreCaseFirst(chars, cyrillic))
} else if (ofs >= 3 &&
triGramsL.contains(text.substring(ofs - 3, ofs).toLowerCase)) {
val chars = text.substring(ofs - 3, ofs)
val cyrillic = triGramsL(chars.toLowerCase)
(-2, restoreCaseFirst(chars, cyrillic))
} else if (ofs >= 2 &&
biGrams.contains(text.substring(ofs - 2, ofs).toLowerCase)) {
biGramsL.contains(text.substring(ofs - 2, ofs).toLowerCase)) {
val chars = text.substring(ofs - 2, ofs)
val cyrillic = biGrams(chars.toLowerCase)
val cyrillic = biGramsL(chars.toLowerCase)
(-1, restoreCaseFirst(chars, cyrillic))
} else if (uniGrams.contains(text(ofs - 1).toLower)) {
val cyrillic = uniGrams(text(ofs - 1).toLower)
Expand Down
30 changes: 22 additions & 8 deletions shared/src/main/scala/translit/Ukrainian.scala
Original file line number Diff line number Diff line change
Expand Up @@ -48,14 +48,20 @@ object Ukrainian extends Language {
"zh" -> 'ж'
)

// Bigram table used in incremental mode: the regular bigrams plus the variants
// produced by getIncrementalNgram, whose leading part is already Cyrillic.
// NOTE(review): evaluated during object initialization, before the other
// incremental tables declared further down are assigned; getIncrementalNgram
// calls latinToCyrillic in incremental mode, so this relies on the length guards
// in latinToCyrillicOfs short-circuiting for one-character prefixes — confirm.
val biGramsIncremental = getIncrementalNgram(biGrams)

val triGrams = Map(
"zgh" -> 'г'
"zgh" -> 'г',
)

// Trigram table for incremental mode: the regular trigrams, their partially
// converted variants, and Cyrillic "шц" + Latin 'h' -> 'щ'.
// Must stay below biGramsIncremental: deriving the variants converts the
// two-character prefix of each trigram, and that lookup reads biGramsIncremental.
// NOTE(review): the trigram branch of latinToCyrillicOfs in this object rewinds the
// output by 1 character, whereas the Russian implementation rewinds by 2 for the same
// "шцh" key. If 'ш' and 'ц' are both already in the output, this likely yields "шщ"
// instead of "щ" — verify and add a test.
val triGramsIncremental = getIncrementalNgram(triGrams) + ("шцh" -> 'щ')

// Four-letter Latin sequence recognised in regular mode.
val fourGrams = Map(
"shch" -> 'щ'
)

// Empty: incremental mode performs no four-character lookups.
// NOTE(review): named `fourGramIncremental` (no 's'), unlike `fourGramsIncremental`
// in the Russian object; renaming requires updating latinToCyrillicOfs as well.
val fourGramIncremental = Map.empty[String, Char]

val apostrophePatterns = Set(
('b', "ya"),
('b', "ye"),
Expand Down Expand Up @@ -84,6 +90,10 @@ object Ukrainian extends Language {
('z', "yi")
)

/**
 * Extends an n-gram table with the keys that can show up during incremental input.
 *
 * For each entry, everything but the last character of the key is passed through
 * `latinToCyrillic` in incremental mode and the untouched last character is appended;
 * the resulting key is mapped to the same value. The original entries are kept.
 *
 * @param ngram table from a character sequence to its replacement character
 * @return `ngram` plus the derived entries (a derived entry wins if its key clashes)
 */
def getIncrementalNgram(ngram: Map[String, Char]): Map[String, Char] = {
  val derived = ngram.map { case (key, cyrillic) =>
    // Convert the head first, then read the last character, as the key is consumed left to right.
    val convertedHead = latinToCyrillic(key.dropRight(1), incrementalTranslit = true)
    val lastChar = key.last
    s"$convertedHead$lastChar" -> cyrillic
  }
  ngram ++ derived
}

/**
* Converts one character starting from `offset`
*
Expand All @@ -93,25 +103,29 @@ object Ukrainian extends Language {
*/
def latinToCyrillicOfs(text: String,
offset: Int,
apostrophes: Boolean = true): (Int, Char) = {
apostrophes: Boolean = true,
incrementalTranslit: Boolean = false): (Int, Char) = {
val (biGramsL, triGramsL, fourGramsL) =
if (incrementalTranslit) (biGramsIncremental, triGramsIncremental, fourGramIncremental)
else (biGrams, triGrams, fourGrams)
val ofs = offset + 1
if (ofs >= 4 &&
fourGrams.contains(text.substring(ofs - 4, ofs).toLowerCase)
fourGramsL.contains(text.substring(ofs - 4, ofs).toLowerCase)
) {
val chars = text.substring(ofs - 4, ofs)
val cyrillic = fourGrams(chars.toLowerCase)
val cyrillic = fourGramsL(chars.toLowerCase)
(-2, restoreCaseFirst(chars, cyrillic))
} else if (ofs >= 3 &&
triGrams.contains(text.substring(ofs - 3, ofs).toLowerCase)
triGramsL.contains(text.substring(ofs - 3, ofs).toLowerCase)
) {
val chars = text.substring(ofs - 3, ofs)
val cyrillic = triGrams(chars.toLowerCase)
val cyrillic = triGramsL(chars.toLowerCase)
(-1, restoreCaseAll(chars, cyrillic))
} else if (ofs >= 2 &&
biGrams.contains(text.substring(ofs - 2, ofs).toLowerCase)
biGramsL.contains(text.substring(ofs - 2, ofs).toLowerCase)
) {
val chars = text.substring(ofs - 2, ofs)
val cyrillic = biGrams(chars.toLowerCase)
val cyrillic = biGramsL(chars.toLowerCase)
(-1, restoreCaseFirst(chars, cyrillic))
} else if (uniGrams.contains(text(ofs - 1).toLower)) {
val cyrillic = uniGrams(text(ofs - 1).toLower)
Expand Down
8 changes: 8 additions & 0 deletions shared/src/test/scala/translit/RussianSpec.scala
Original file line number Diff line number Diff line change
Expand Up @@ -45,4 +45,12 @@ class RussianSpec extends FunSuite {
cyrillic)
}
}

// Incremental mode: the inputs below mix Latin letters with Cyrillic letters,
// i.e. text in which earlier characters have already been converted.
test("Incremental translit") {
// Cyrillic 'с' followed by Latin 'h' is expected to combine into 'ш'.
assert(Russian.latinToCyrillic("peсhkom", incrementalTranslit = true) == "пешком")
// Cyrillic 'з' followed by Latin 'h' is expected to combine into 'ж'.
assert(Russian.latinToCyrillic("зhizn'", incrementalTranslit = true) == "жизнь")
// Purely Latin input must give the same result in incremental mode.
assert(Russian.latinToCyrillic("zhizn'", incrementalTranslit = true) == "жизнь")
// Same 'з' + 'h' combination at the end of an otherwise Cyrillic word.
assert(Russian.latinToCyrillic("багазh", incrementalTranslit = true) == "багаж")
// Cyrillic "шц" followed by Latin 'h' is expected to collapse into a single 'щ'.
assert(Russian.latinToCyrillic("шцhetka", incrementalTranslit = true) == "щетка")
}
}

0 comments on commit 17b0c49

Please sign in to comment.