Skip to content
/ tha Public

📢 Tha (ថអ) - A Khmer Text Normalization and Verbalization Toolkit

License

Notifications You must be signed in to change notification settings

seanghay/tha

Folders and files

Name
Last commit message
Last commit date

Latest commit

Β 

History

14 Commits
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 
Β 

Repository files navigation

Tha (ថអ)

Khmer Text Normalization and Verbalization Toolkit.

Install

pip install tha
import tha.normalize
import tha.phone_numbers
import tha.urls
import tha.datetime
import tha.hashtags
import tha.ascii_lines
import tha.license_plate
import tha.cardinals
import tha.decimals
import tha.ordinals
import tha.currency
import tha.parenthesis
import tha.repeater

# Usage examples, one section per processor. Every Khmer literal below is plain
# UTF-8 text; the "▁" that appears in expected values is U+2581.

## Normalize
# In this example the zero-width space (U+200B) is removed and "ឲ" becomes "ឱ".
assert tha.normalize.processor("មិន\u200bឲ្យ") == "មិនឱ្យ"

## Phone Numbers
# chunk_size changes how the digits are grouped in the output.
assert tha.phone_numbers.processor("010123123", chunk_size=2) == "0▁10▁12▁31▁23"
assert tha.phone_numbers.processor("010123123", chunk_size=3) == "0▁10▁123▁123"
assert tha.phone_numbers.processor("0961231234", chunk_size=3) == "0▁96▁123▁1234"

## URLs and emails
# The scheme is dropped, "." is spoken as "dot" and "@" as "at".
assert tha.urls.processor("example@gmail.com") == "example at g▁mail dot com"
assert tha.urls.processor("https://google.com") == "google dot com"
assert tha.urls.processor("http://google.com") == "google dot com"
assert tha.urls.processor("google.com") == "google dot com"
assert tha.urls.processor("google.gov.kh") == "google dot gov dot k▁h"
assert tha.urls.processor("google.com.kh") == "google dot com dot k▁h"

## Time
assert tha.datetime.time_processor("10:23AM") == "10 23▁A▁M"
assert tha.datetime.time_processor("10:23PM") == "10 23▁P▁M"
assert tha.datetime.time_processor("1:23PM") == "1 23▁P▁M"

## Date
assert tha.datetime.date_processor("2024-01-02") == "2024 01 02"
assert tha.datetime.date_processor("01-02-2034") == "01 02 2034"

## Hashtags
# The hashtag is removed but the spaces on both sides of it are kept, hence the
# double space in the expected values.
assert (
  tha.hashtags.processor("Hello world #this_will_remove hello") == "Hello world  hello"
)
assert tha.hashtags.processor("Hello world #លុប hello") == "Hello world  hello"
assert tha.hashtags.processor("Hello world #លុប1234 hello") == "Hello world  hello"

## ASCII Lines
assert tha.ascii_lines.processor("Remove --- asdasd") == "Remove  asdasd"
assert tha.ascii_lines.processor("Remove\n###\nasdasd") == "Remove\n\nasdasd"

## Cambodia License Plate
assert tha.license_plate.processor("1A 1234") == "1 A 12▁34"
assert tha.license_plate.processor("1A 4444") == "1 A ការ៉េ4"

## Number - Cardinals
# Only the digit runs are verbalized; "." and "," are passed through unchanged.
assert tha.cardinals.processor("1234") == "មួយពាន់▁ពីររយ▁សាមសិបបួន"
assert tha.cardinals.processor("1") == "មួយ"
assert tha.cardinals.processor("1▁2") == "មួយ▁ពីរ"
assert tha.cardinals.processor("-1") == "ដក▁មួយ"
assert tha.cardinals.processor("10") == "ដប់"
assert tha.cardinals.processor("15") == "ដប់ប្រាំ"
assert tha.cardinals.processor("100") == "មួយរយ"
assert tha.cardinals.processor("10000") == "មួយម៉ឺន"
assert tha.cardinals.processor("10000.234") == "មួយម៉ឺន.ពីររយ▁សាមសិបបួន"
assert tha.cardinals.processor("-10000.234") == "ដក▁មួយម៉ឺន.ពីររយ▁សាមសិបបួន"
assert tha.cardinals.processor("-10000,234") == "ដក▁មួយម៉ឺន,ពីររយ▁សាមសិបបួន"

## Number - Decimals
# "." is spoken as "ចុច" and "," as "ក្បៀស"; leading zeros of the fraction are
# read out one by one.
assert tha.decimals.processor("123.324") == "មួយរយ▁ម្ភៃបី▁ចុច▁បីរយ▁ម្ភៃបួន"
assert tha.decimals.processor("123.001") == "មួយរយ▁ម្ភៃបី▁ចុច▁សូន្យ▁សូន្យ▁មួយ"
assert tha.decimals.processor("-123.0012") == "ដក▁មួយរយ▁ម្ភៃបី▁ចុច▁សូន្យ▁សូន្យ▁ដប់ពីរ"
assert tha.decimals.processor("-123,0012") == "ដក▁មួយរយ▁ម្ភៃបី▁ក្បៀស▁សូន្យ▁សូន្យ▁ដប់ពីរ"

## Number - Ordinals
# Input without an ordinal suffix is returned untouched (last case).
assert tha.ordinals.processor("5th") == "ទី▁ប្រាំ"
assert tha.ordinals.processor("3rd") == "ទី▁បី"
assert tha.ordinals.processor("1st") == "ទី▁មួយ"
assert tha.ordinals.processor("10th") == "ទី▁ដប់"
assert tha.ordinals.processor("10") == "10"

## Number - Currency
assert tha.currency.processor("$100.01") == "មួយរយដុល្លារ▁មួយសេន"
assert tha.currency.processor("$100") == "មួយរយ▁ដុល្លារ"
assert tha.currency.processor("100$") == "មួយរយដុល្លារ"
assert tha.currency.processor("100៛") == "មួយរយរៀល"
assert tha.currency.processor("100.32៛") == "មួយរយ▁ចុច▁សាមសិបពីររៀល"
assert tha.currency.processor("100.0032៛") == "មួយរយ▁ចុច▁សូន្យ▁សូន្យ▁សាមសិបពីររៀល"

## Parenthesis
assert tha.parenthesis.processor("Hello (this will be ignored) world") == "Hello world"


## Iteration Mark
def fake_tokenizer(_):
  """Stand-in tokenizer for the repeater example.

  Ignores its argument and always returns the same five Khmer tokens, which
  are the words that precede the repetition mark "ៗ" in the example sentence
  below.
  """
  return ["គាត់", "បាន", "ទៅ", "បន្តិច", "ម្ដង"]


# In the expected value "ៗ" is gone and "បន្តិចម្ដង" appears twice, joined by
# "▁" (U+2581).
assert (
  tha.repeater.processor("គាត់បានទៅបន្តិចម្ដងៗហើយ", tokenizer=fake_tokenizer)
  == "គាត់បានទៅបន្តិចម្ដង▁បន្តិចម្ដងហើយ"
)

About

📢 Tha (ថអ) - A Khmer Text Normalization and Verbalization Toolkit

Topics

Resources

License

Stars

Watchers

Forks

Releases

No releases published