# Generate image dataset for Khmer text recognition

1. Install Required Libraries

    brew install freetype harfbuzz fribidi
2. Installing pillow

    pip install pillow --no-binary :all:
3. Verify raqm Support (Return "True" if support)

In [2]:
from PIL import features
print(features.check('raqm'))

True


# 1. Loading the Data

## 1.1. Loading words data

In [6]:
import pandas as pd

# Reading words from Chuon Nath Dictionary
source_cn = pd.read_csv("source/cn.csv", encoding="utf-8", header=None, skiprows=19)
source_cn.columns = ["word", "description"]
words_cn = pd.DataFrame(source_cn["word"], columns=["word"])
words_cn["category"]="Chuon Nath"
desc_cn = pd.DataFrame(source_cn["description"], columns=["description"])
desc_cn = pd.DataFrame(desc_cn["description"].str.replace(r'<.*?>|[a-zA-Z]', '', regex=True))

# Reading from SBBIC
##- SBBIC stands for Society for Better Books in Cambodia.
##- SBBIC List of Khmer Word using in Khmer Spelling Checker - SBBIC Version
words_sbbic = pd.read_csv("source/SBBICkm_KH.txt", encoding="utf-8", header=None, skiprows=1)
words_sbbic.columns = ["word"]
words_sbbic["category"] = "SBBIC"

## Merge the words from Chuon Nath and SBBIC
data_words = pd.concat([words_cn,words_sbbic])

# Reading Geographic locations from Cambodia Gazetteer
## Url: https://data.opendevelopmentcambodia.net/en/dataset/cambodia-gazetteer
## The Cambodia Gazetteer is an official and comprehensive reference database that provides detailed information 
## about Cambodia’s administrative divisions and geographic locations. 
## It is commonly used for geographic and demographic research, policy planning, and spatial analysis. Here’s a detailed explanation
source_admin = pd.read_excel("source/AdministrativeData.xlsx")
geo_gazetteer = pd.DataFrame(source_admin["word"])
geo_gazetteer["category"]="Geographic Locations"

# Merge the words from Chuon Nath, SBBIC, Geographic Locations
# data_words = pd.concat([words_cn,words_sbbic,geo_gazetteer])
# Chuon Nath + Geographic Locations
data_words = pd.concat([words_cn,geo_gazetteer])
data_dict = {
    "Chuon Nath":len(words_cn),
    "SBBIC":len(words_sbbic),
    "Geographic Locations":len(geo_gazetteer),
    "Data Total":len(data_words)
}
print(data_dict)

{'Chuon Nath': 18947, 'SBBIC': 76894, 'Geographic Locations': 16457, 'Data Total': 35404}


## 1.2. Loading sentences data

In [7]:
import pandas as pd
import random
import re
# Read content from a text file and split into a list
file_path = "source/kh-search-QNA-10000.txt"  # Replace with your file path

try:
    # Open and read the file
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()  # Read the entire content of the file

    # Use regex to split the text while keeping "។" as a part of the result
    split_content = re.split(r'(\s+|។)', content)  # Splits by spaces or "។" and keeps them
    split_content = [item.strip() for item in split_content if item.strip()]  # Clean empty items
    sentenses = pd.DataFrame(split_content, columns=['word'])
    data_sentences = sentenses[sentenses['word'].apply(lambda x: 20 <= len(x) <= 25)]
except FileNotFoundError:
    print(f"File not found: {file_path}")
except Exception as e:
    print(f"An error occurred: {e}")

data_sentences.reset_index(drop=True, inplace=True)
data_sentences['word'] = data_sentences['word'].apply(lambda x: x + '។' if random.choice([False] * 8 + [True] * 2) else x)
data_sentences["category"]="Sentence"

data_dict = {
    "Sentences":len(data_sentences)
}
print(data_dict)

{'Sentences': 13421}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_sentences['word'] = data_sentences['word'].apply(lambda x: x + '។' if random.choice([False] * 8 + [True] * 2) else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_sentences["category"]="Sentence"


## 1.3. Merge all Words and Sentences Data

In [8]:
data = pd.concat([data_words,data_sentences])
data.reset_index(drop=True, inplace=True)

data_dict = {
    "Chuon Nath":len(words_cn),
    "SBBIC":len(words_sbbic),
    "Geographic Locations":len(geo_gazetteer),
    "Sentences":len(data_sentences),
    "All Data:":len(data)
}
print(data_dict)

{'Chuon Nath': 18947, 'SBBIC': 76894, 'Geographic Locations': 16457, 'Sentences': 13421, 'All Data:': 48825}


# 2. Generate text to image

## 2.1. Import function

In [9]:
from cls_global import gen_khmer_text_image

## 2.2. Define Variant values for Function Parameters

In [10]:
# Font Variants
## Font name parameters
fonts=["KhmerFont/KhmerOS.ttf",
       "KhmerFont/KhmerOSsys.ttf",
       "KhmerFont/KhmerOScontent.ttf",
       "KhmerFont/KhmerOSmuol.ttf",
       "KhmerFont/KhmerOSmuollight.ttf",
       "KhmerFont/KhmerOSbattambang.ttf",
       "KhmerFont/KhmerOSbokor.ttf",
       "KhmerFont/KhmerOSfasthand.ttf",
       "KhmerFont/Kantumruy-Regular.ttf",
       "KhmerFont/Dangrek.ttf"
       ]
## Font sizes parameters
font_sizes=[
       12,
       16
]

# Background Variants
## White: (255, 255, 255, 255),
## Plain light yellow paper: (250,249,247,255),
## Light grey background: (240, 240, 240, 255),
## Brownish paper background   (245, 222, 179, 255)
bg_colors = [
    (255,255,255,255),
    (250,249,247,255),
    (240,240,240,255),
    (245,222,179,255)
]
noise_levels=["low", "medium", "high", "none"]
## Blur levels
## 0: no blur
## 2: Mild blur
## 3: Moderate blur
## 4: Strong blur
blur_levels=[0,2,3,4]

## 2.3 Splitting The Dataset: Train, Validation, Test

In [11]:
from sklearn.model_selection import train_test_split
train_valid, test = train_test_split(
    data, 
    test_size=0.2, 
    stratify=data["category"], 
    random_state=42
)
train, valid = train_test_split(
    train_valid, 
    test_size=0.1, 
    stratify=train_valid["category"], 
    random_state=42
)

In [12]:
train["category"].value_counts()/len(train)

category
Chuon Nath              0.388036
Geographic Locations    0.337088
Sentence                0.274876
Name: count, dtype: float64

In [13]:
valid["category"].value_counts() / len(valid)

category
Chuon Nath              0.388121
Geographic Locations    0.336918
Sentence                0.274962
Name: count, dtype: float64

In [14]:
test["category"].value_counts()/len(test)

category
Chuon Nath              0.388121
Geographic Locations    0.337020
Sentence                0.274859
Name: count, dtype: float64

In [15]:
print(train)

                             word              category
20306                  កំពង់អំពិល  Geographic Locations
36480      ក្នុងសម័យវេទដើមបង្អស់។              Sentence
46846   ក្នុងការចូលជាសមាជិកអាស៊ាន              Sentence
29872                   បឹងប្រីយ៍  Geographic Locations
29404                   អន្លង់ចារ  Geographic Locations
...                           ...                   ...
37148  ​មាន​ផ្ទៃពោះ​ជិត​គ្រប់​ខែ។              Sentence
24291                  វាលព្រីងលើ  Geographic Locations
15267                     អធិការី            Chuon Nath
20647                     ស្វាយពក  Geographic Locations
9421                     រត្តិភាគ            Chuon Nath

[35154 rows x 2 columns]


## 2.4 Generating Text to Images

In [16]:
# Generating training data to image
data_folder = "data_v1"
i = 1
n = len(train)
for index, row in train.iterrows():
    font_size = random.choice(font_sizes)
    font = random.choice(fonts)
    bg = random.choice(bg_colors)
    noise_level = random.choice(noise_levels)
    blur_level = random.choice(blur_levels)
    gen_khmer_text_image(
        index=index+1, 
        content=row["word"],
        data_type="train", 
        bg=bg, 
        noise_level=noise_level, 
        blur_level=blur_level,
        font_path=font, 
        font_size=font_size,
        data_folder=data_folder
    )
    print(f"{i} of {n}: complete")
    i = i+1

1 of 35154: complete
2 of 35154: complete
3 of 35154: complete
4 of 35154: complete
5 of 35154: complete
6 of 35154: complete
7 of 35154: complete
8 of 35154: complete
9 of 35154: complete
10 of 35154: complete
11 of 35154: complete
12 of 35154: complete
13 of 35154: complete
14 of 35154: complete
15 of 35154: complete
16 of 35154: complete
17 of 35154: complete
18 of 35154: complete
19 of 35154: complete
20 of 35154: complete
21 of 35154: complete
22 of 35154: complete
23 of 35154: complete
24 of 35154: complete
25 of 35154: complete
26 of 35154: complete
27 of 35154: complete
28 of 35154: complete
29 of 35154: complete
30 of 35154: complete
31 of 35154: complete
32 of 35154: complete
33 of 35154: complete
34 of 35154: complete
35 of 35154: complete
36 of 35154: complete
37 of 35154: complete
38 of 35154: complete
39 of 35154: complete
40 of 35154: complete
41 of 35154: complete
42 of 35154: complete
43 of 35154: complete
44 of 35154: complete
45 of 35154: complete
46 of 35154: comple

In [17]:
# Generating validation data to image
i = 1
n = len(valid)
for index, row in valid.iterrows():
    font_size = random.choice(font_sizes)
    font = random.choice(fonts)
    bg = random.choice(bg_colors)
    noise_level = random.choice(noise_levels)
    blur_level = random.choice(blur_levels)
    gen_khmer_text_image(
        index=index+1, 
        content=row["word"],
        data_type="valid", 
        bg=bg, 
        noise_level=noise_level, 
        blur_level=blur_level,
        font_path=font, 
        font_size=font_size,
        data_folder=data_folder
    )
    print(f"{i} of {n}: complete")
    i = i +1

1 of 3906: complete
2 of 3906: complete
3 of 3906: complete
4 of 3906: complete
5 of 3906: complete
6 of 3906: complete
7 of 3906: complete
8 of 3906: complete
9 of 3906: complete
10 of 3906: complete
11 of 3906: complete
12 of 3906: complete
13 of 3906: complete
14 of 3906: complete
15 of 3906: complete
16 of 3906: complete
17 of 3906: complete
18 of 3906: complete
19 of 3906: complete
20 of 3906: complete
21 of 3906: complete
22 of 3906: complete
23 of 3906: complete
24 of 3906: complete
25 of 3906: complete
26 of 3906: complete
27 of 3906: complete
28 of 3906: complete
29 of 3906: complete
30 of 3906: complete
31 of 3906: complete
32 of 3906: complete
33 of 3906: complete
34 of 3906: complete
35 of 3906: complete
36 of 3906: complete
37 of 3906: complete
38 of 3906: complete
39 of 3906: complete
40 of 3906: complete
41 of 3906: complete
42 of 3906: complete
43 of 3906: complete
44 of 3906: complete
45 of 3906: complete
46 of 3906: complete
47 of 3906: complete
48 of 3906: complete
4

In [18]:
# Generating testing data to image
i = 1
n = len(test)
for index, row in test.iterrows():
    font_size = random.choice(font_sizes)
    font = random.choice(fonts)
    bg = random.choice(bg_colors)
    noise_level = random.choice(noise_levels)
    blur_level = random.choice(blur_levels)
    gen_khmer_text_image(
        index=index+1, 
        content=row["word"],
        data_type="test", 
        bg=bg, 
        noise_level=noise_level, 
        blur_level=blur_level,
        font_path=font, 
        font_size=font_size,
        data_folder=data_folder
    )
    print(f"{i} of {n}: complete")
    i = i +1

1 of 9765: complete
2 of 9765: complete
3 of 9765: complete
4 of 9765: complete
5 of 9765: complete
6 of 9765: complete
7 of 9765: complete
8 of 9765: complete
9 of 9765: complete
10 of 9765: complete
11 of 9765: complete
12 of 9765: complete
13 of 9765: complete
14 of 9765: complete
15 of 9765: complete
16 of 9765: complete
17 of 9765: complete
18 of 9765: complete
19 of 9765: complete
20 of 9765: complete
21 of 9765: complete
22 of 9765: complete
23 of 9765: complete
24 of 9765: complete
25 of 9765: complete
26 of 9765: complete
27 of 9765: complete
28 of 9765: complete
29 of 9765: complete
30 of 9765: complete
31 of 9765: complete
32 of 9765: complete
33 of 9765: complete
34 of 9765: complete
35 of 9765: complete
36 of 9765: complete
37 of 9765: complete
38 of 9765: complete
39 of 9765: complete
40 of 9765: complete
41 of 9765: complete
42 of 9765: complete
43 of 9765: complete
44 of 9765: complete
45 of 9765: complete
46 of 9765: complete
47 of 9765: complete
48 of 9765: complete
4