This notebook contains the original experimental workflow used during the project.  
Some cells are environment-specific (e.g., Colab) and are kept for reproducibility.

1. **Preprocessing**

In [1]:
'''
D:\PHONG\Coding\RandomPrj\OCR textDetection\Image_Preprocessor_Ver2
'''
'''
pipeline:
(Input) → Deskew (this's optional, but who knows :>>) → CLAHE (enhance contrast)
→ Denoise → Sharpen → Adaptive Threshold → Sharpen → (Output .gif)

1 Deskew: from becomehuman.com, somehow this code works pretty good, better than chatGPT's :)))

2 CLAHE: contrast limited adaptive histogram equalization -> helpful for images with imbalanced
brightness. clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8,8))
    clipLimit (default 2 - 3): the enhancement limitation. Low -> less noise, less constrast, vice versa
    tileGridSize (default(8,8)): I dunno what this do LOL

3: Denoise:
- GaussianBlur(image, (5,5), 0) -> use kernel to assign higer weight for pixels near center
and lesser weight for pixels far from center -> smoothen the image

    Cons: it might blur the word's bound if the text is small/thin/light -> lose sharpeness

- Bilateral Filter: it simultaneously smoothens and preserves the edges (better than Gaussian)

4: Sharpen: use the kernel with 5 at center, -1 on the main verticals and horizontals (ignore the
surroundings) -> enhance the edges

5: Adaptive threshold: automatically choose the level for each image's area to get binarized.
- Gray: convert to grayscale because all thresholds work on gray images (1 channel)

- MORPH_OPEN: erode then dilate, remove the small noises

'''


'''
What's new?
backup01: Convert the output into GIF format
backup02: beautiful deskew from stack overflow
backup03: add sharpen before and after adaptive threshold
'''
import cv2
import numpy as np
from pathlib import Path
import os
from PIL import Image, ImageSequence

  D:\PHONG\Coding\RandomPrj\OCR textDetection\Image_Preprocessor_Ver2


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
#Test preprocessing
class KoreanTextPreprocessorV3:

    def __init__(self, debug_mode=True):
        self.debug_mode = debug_mode
        self.debug_dir = Path("/content/debug_output_v7")
        if debug_mode:
            self.debug_dir.mkdir(exist_ok=True, parents=True)

    # --- Debug ---
    def _save_debug_image(self, image, name):
        if self.debug_mode:
            try:
                debug_path = self.debug_dir / f"{name}.png"
                cv2.imwrite(str(debug_path), image)
            except Exception as e:
                print(f"Warning: Could not save debug image {name}: {str(e)}")

    # --- Pipeline xử lý ảnh ---
    def enhance_local_contrast(self, image):
        lab = cv2.cvtColor(image, cv2.COLOR_BGR2LAB)
        l, a, b = cv2.split(lab)
        clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8,8))
        cl = clahe.apply(l)
        enhanced_lab = cv2.merge([cl, a, b])
        enhanced = cv2.cvtColor(enhanced_lab, cv2.COLOR_LAB2BGR)
        self._save_debug_image(enhanced, "1_clahe")
        return enhanced

    def denoise(self, image, method='gaussian'):
        if method == 'bilateral':
            denoised = cv2.bilateralFilter(image, d=7, sigmaColor=50, sigmaSpace=50)
        else:
            denoised = cv2.GaussianBlur(image, (3, 3), 0)
        self._save_debug_image(denoised, "2_denoised")
        return denoised

    def sharpen(self, image, kernel_type='light'):
        """
        kernel_type: 'light' (nhẹ), 'strong' (mạnh, mặc định cũ)
        """
        if kernel_type == 'light':
            kernel = np.array([[0, -1, 0],
                               [-1, 5, -1],
                               [0, -1, 0]])
        else:
            kernel = np.array([[-1, -1, -1],
                               [-1, 9, -1],
                               [-1, -1, -1]])
        sharpened = cv2.filter2D(image, -1, kernel)
        self._save_debug_image(sharpened, f"3_sharpened_{kernel_type}")
        return sharpened

    def adaptive_threshold(self, image, method='gaussian'):
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) if len(image.shape) == 3 else image
        self._save_debug_image(gray, "4_gray")
        th = cv2.ADAPTIVE_THRESH_MEAN_C if method == 'mean' else cv2.ADAPTIVE_THRESH_GAUSSIAN_C
        binary = cv2.adaptiveThreshold(gray, 255, th, cv2.THRESH_BINARY, blockSize=25, C=5)
        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2,2))
        binary = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel)
        self._save_debug_image(binary, "5_adaptive_threshold")
        return binary

    # --- Deskew ---
    def getSkewAngle(self, cvImage) -> float:
        gray = cv2.cvtColor(cvImage, cv2.COLOR_BGR2GRAY)
        blur = cv2.GaussianBlur(gray, (9, 9), 0)
        thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (30, 5))
        dilate = cv2.dilate(thresh, kernel, iterations=5)
        contours, _ = cv2.findContours(dilate, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
        if not contours:
            return 0.0
        largestContour = max(contours, key=cv2.contourArea)
        minAreaRect = cv2.minAreaRect(largestContour)
        angle = minAreaRect[-1]
        if angle < -45:
            angle = 90 + angle
        return -1.0 * angle

    def rotateImage(self, cvImage, angle: float):
        (h, w) = cvImage.shape[:2]
        center = (w // 2, h // 2)
        M = cv2.getRotationMatrix2D(center, angle, 1.0)
        rotated = cv2.warpAffine(cvImage, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
        return rotated

    def deskew(self, cvImage):
        angle = self.getSkewAngle(cvImage)
        if abs(angle) < 0.90:
            deskewed = self.rotateImage(cvImage, -1.0 * angle)
            self._save_debug_image(deskewed, "0_deskewed")
            return deskewed
        return cvImage

    # --- Smart pick frame ---
    def choose_best_frame(self, frames, max_frames=10):
        best_score = -1
        best_frame = frames[0]
        for i, f in enumerate(frames[:max_frames]):
            gray = cv2.cvtColor(f, cv2.COLOR_BGR2GRAY)
            _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
            score = cv2.countNonZero(binary)
            if score > best_score:
                best_score = score
                best_frame = f
        return best_frame

    # --- Xử lý ảnh / GIF / WebP / JPG động ---
    def process_image(self, image_path, output_path=None, denoise_method='gaussian',
                      threshold_method='gaussian', enable_deskew=True,
                      process_all_frames=False, smart_pick=True):
        image_path = Path(image_path)
        if not image_path.exists():
            raise ValueError(f"File does not exist: {image_path}")

        # luôn dùng Pillow để mở, để detect ảnh động kể cả .jpg
        img = Image.open(str(image_path))
        is_animated = getattr(img, "is_animated", False)

        frames = []
        if is_animated:
            for i, frame in enumerate(ImageSequence.Iterator(img)):
                if not process_all_frames and i > 9:  # giới hạn 10 frame để tiết kiệm
                    break
                frame_cv = cv2.cvtColor(np.array(frame.convert("RGB")), cv2.COLOR_RGB2BGR)
                if enable_deskew:
                    frame_cv = self.deskew(frame_cv)
                frames.append(frame_cv)

            if smart_pick:
                image_cv = self.choose_best_frame(frames)
            else:
                image_cv = frames[0]
        else:
            image_cv = cv2.imread(str(image_path))
            if enable_deskew and image_cv is not None:
                image_cv = self.deskew(image_cv)

        if image_cv is None:
            raise ValueError(f"Could not load image: {image_path}")

        # pipeline preprocess
        processed = self.enhance_local_contrast(image_cv)
        processed = self.denoise(processed, method=denoise_method)
        processed = self.sharpen(processed)
        processed = self.adaptive_threshold(processed, method=threshold_method)
        # processed = self.sharpen(processed)

        # save nếu cần
        if output_path:
            ext_out = Path(output_path).suffix.lower()
            if ext_out not in [".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".gif", ".webp"]:
                ext_out = ".png"
            output_path = str(output_path).rsplit('.', 1)[0] + ext_out
            os.makedirs(os.path.dirname(output_path), exist_ok=True)
            Image.fromarray(processed).save(output_path)

        return processed

    # --- Xử lý thư mục ---
    def process_directory(self, input_dir, output_dir, process_all_frames=False, smart_pick=True):
        input_dir = Path(input_dir)
        output_dir = Path(output_dir)
        output_dir.mkdir(exist_ok=True, parents=True)

        extensions = ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.gif', '.webp']
        all_files = []
        for ext in extensions:
            all_files.extend(input_dir.glob(f'*{ext}'))

        total_files = len(all_files)
        processed_files, failed_files = 0, 0

        print(f"\nFound {total_files} images to process in {input_dir}")

        for i, input_path in enumerate(all_files, 1):
            try:
                output_path = output_dir / f"processed_{input_path.name}"
                print(f"\nProcessing [{i}/{total_files}]: {input_path.name}")
                self.process_image(input_path, output_path,
                                   process_all_frames=process_all_frames,
                                   smart_pick=smart_pick)
                print(f"Success - Saved to: {output_path}")
                processed_files += 1
            except Exception as e:
                failed_files += 1
                print(f"Error processing {input_path.name}: {str(e)}")
                continue

        print(f"\nProcessing completed:")
        print(f"Total files: {total_files}")
        print(f"Successfully processed: {processed_files}")
        print(f"Failed: {failed_files}")


# --- Main chạy thẳng ---
def main():
    preprocessor = KoreanTextPreprocessorV3(debug_mode=True)

    base_dir = Path("/content/drive/MyDrive/OFFICIAL_TEST_FOR_PHASE1/TEST_FOR_PHASE1")
    input_dir = base_dir /  "images_hyecho"/"TCA20172_00.jpg"
    output_dir = base_dir / "OutputImages" / "images_hyecho_demo"

    # Nếu muốn xử lý toàn bộ frame GIF, set process_all_frames=True
    preprocessor.process_directory(input_dir, output_dir,
                                   process_all_frames=False,
                                   smart_pick=True)


if __name__ == "__main__":
    main()



Found 0 images to process in /content/drive/MyDrive/OFFICIAL_TEST_FOR_PHASE1/TEST_FOR_PHASE1/images_hyecho/TCA20172_00.jpg

Processing completed:
Total files: 0
Successfully processed: 0
Failed: 0


2. **Model**

In [None]:
#Gỡ torch để tránh xung đột với PaddleOCR
!pip uninstall -y torch torchvision torchaudio

[0m

In [None]:
#Install PaddleOCR (GPU, CUDA 12.6)
!python -m pip install paddlepaddle-gpu==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/

Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/usr/local/lib/python3.12/dist-packages/pip/__main__.py", line 24, in <module>
    sys.exit(_main())
             ^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/pip/_internal/cli/main.py", line 78, in main
    command = create_command(cmd_name, isolated=("--isolated" in cmd_args))
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/pip/_internal/commands/__init__.py", line 114, in create_command
    module = importlib.import_module(module_path)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/importlib/__init__.py", line 90, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "<frozen importlib._bootstrap>", line 1387, in _gcd_import


In [None]:
#Install paddle CPU
!pip install "paddleocr>=2.7"
!python -m pip install paddlepaddle==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/


Looking in indexes: https://www.paddlepaddle.org.cn/packages/stable/cpu/
Collecting paddlepaddle==3.2.0
  Downloading https://paddle-whl.bj.bcebos.com/stable/cpu/paddlepaddle/paddlepaddle-3.2.0-cp312-cp312-linux_x86_64.whl (189.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m189.0/189.0 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Collecting opt_einsum==3.3.0 (from paddlepaddle==3.2.0)
  Downloading https://paddle-whl.bj.bcebos.com/stable/cpu/opt-einsum/opt_einsum-3.3.0-py3-none-any.whl (65 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.5/65.5 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: opt_einsum, paddlepaddle
  Attempting uninstall: opt_einsum
    Found existing installation: opt_einsum 3.4.0
    Uninstalling opt_einsum-3.4.0:
      Successfully uninstalled opt_einsum-3.4.0
Successfully installed opt_einsum-3.3.0 paddlepaddle-3.2.0


In [None]:
# If you only want to use the basic text recognition feature (returns text position coordinates and content), including the PP-OCR series
!python -m pip install paddleocr
# If you want to use all features such as document parsing, document understanding, document translation, key information extraction, etc.
# python -m pip install "paddleocr[all]"



In [None]:
from paddleocr import PaddleOCR
# ===== Config =====
INPUT_DIR = "/content/drive/MyDrive/OFFICIAL_TEST_FOR_PHASE1/TEST_FOR_PHASE1/OutputImages_ver4/images_hyecho_demo"    #  folder chứa ảnh
OUTPUT_DIR = "/content/drive/MyDrive/OFFICIAL_TEST_FOR_PHASE1/TEST_FOR_PHASE1/OutputOCR_ver4/images_hyecho_demo_processed"  #  folder lưu kết quả


os.makedirs(OUTPUT_DIR, exist_ok=True)

# ===== Initialize PaddleOCR =====
ocr = PaddleOCR(
    lang='korean',
    use_doc_orientation_classify=False,
    use_doc_unwarping=False,
    use_textline_orientation=False
)

# ===== Run OCR on all images in folder =====
img_exts = [".png", ".jpg", ".jpeg", ".bmp", ".tif", ".tiff",".gif",".webp"]

for img_file in Path(INPUT_DIR).glob("*"):
    if img_file.suffix.lower() in img_exts:
        print(f"Processing: {img_file.name}")
        result = ocr.predict(str(img_file))

        # Tạo folder con cho mỗi ảnh
        save_base = Path(OUTPUT_DIR) / img_file.stem
        os.makedirs(save_base, exist_ok=True)

        # Lưu kết quả
        for res in result:
            res.print()
            try:
                # Một số kết quả có font_size = 0 gây lỗi -> bắt và bỏ qua
                res.save_to_img(str(save_base))
            except ValueError as e:
                if "font size must be greater than 0" in str(e):
                    print(f"Bỏ qua save_to_img cho {img_file.name} (font size = 0)")
                else:
                    raise
            res.save_to_json(str(save_base))

print("Done! Check results in:", OUTPUT_DIR)

[32mCreating model: ('PP-OCRv5_server_det', None)[0m
[32mUsing official model (PP-OCRv5_server_det), the model files will be automatically downloaded and saved in `/root/.paddlex/official_models/PP-OCRv5_server_det`.[0m
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

.gitattributes: 0.00B [00:00, ?B/s]

inference.json: 0.00B [00:00, ?B/s]

inference.yml:   0%|          | 0.00/903 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

inference.pdiparams:   0%|          | 0.00/87.9M [00:00<?, ?B/s]

[32mCreating model: ('korean_PP-OCRv5_mobile_rec', None)[0m
[32mUsing official model (korean_PP-OCRv5_mobile_rec), the model files will be automatically downloaded and saved in `/root/.paddlex/official_models/korean_PP-OCRv5_mobile_rec`.[0m


Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

README.md: 0.00B [00:00, ?B/s]

inference.pdiparams:   0%|          | 0.00/13.3M [00:00<?, ?B/s]

inference.json: 0.00B [00:00, ?B/s]

inference.yml: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

.gitattributes: 0.00B [00:00, ?B/s]

Processing: processed_TCA20172_00.jpg


[32m{'res': {'input_path': '/content/drive/MyDrive/OFFICIAL_TEST_FOR_PHASE1/TEST_FOR_PHASE1/OutputImages_ver4/images_hyecho_processed/processed_TCA20172_00.jpg', 'page_index': None, 'model_settings': {'use_doc_preprocessor': True, 'use_textline_orientation': False}, 'doc_preprocessor_res': {'input_path': None, 'page_index': None, 'model_settings': {'use_doc_orientation_classify': False, 'use_doc_unwarping': False}, 'angle': -1}, 'dt_polys': array([[[918,   0],
        ...,
        [919, 880]],

       ...,

       [[749, 659],
        ...,
        [749, 734]]], dtype=int16), 'text_det_params': {'limit_side_len': 64, 'limit_type': 'min', 'thresh': 0.3, 'max_side_limit': 4000, 'box_thresh': 0.6, 'unclip_ratio': 1.5}, 'text_type': 'general', 'textline_orientation_angles': array([-1, ..., -1]), 'text_rec_score_thresh': 0.0, 'return_word_box': False, 'rec_texts': ['닥이는별들 푸른초원에서의기억은잊을수없는추억이되었습니', '을할수있는점이위안이되고감사하다.-신 고객님-', '한풍광은 정말멋졌습니다. -아0 고객님-', '생생한 후기를 확인해보세요!', '시 가고 싶어 하는 다채로운 트려', 

Processing: processed_TCA20172_01.jpg


[32m{'res': {'input_path': '/content/drive/MyDrive/OFFICIAL_TEST_FOR_PHASE1/TEST_FOR_PHASE1/OutputImages_ver4/images_hyecho_processed/processed_TCA20172_01.jpg', 'page_index': None, 'model_settings': {'use_doc_preprocessor': True, 'use_textline_orientation': False}, 'doc_preprocessor_res': {'input_path': None, 'page_index': None, 'model_settings': {'use_doc_orientation_classify': False, 'use_doc_unwarping': False}, 'angle': -1}, 'dt_polys': array([[[1827, 4598],
        ...,
        [1827, 4766]],

       ...,

       [[2072, 5923],
        ...,
        [2075, 6442]]], dtype=int16), 'text_det_params': {'limit_side_len': 64, 'limit_type': 'min', 'thresh': 0.3, 'max_side_limit': 4000, 'box_thresh': 0.6, 'unclip_ratio': 1.5}, 'text_type': 'general', 'textline_orientation_angles': array([-1, ..., -1]), 'text_rec_score_thresh': 0.0, 'return_word_box': False, 'rec_texts': ['IN/OUT', '비슈케크', '•', 'Z', '유네스코 지정 청정국가를', '국내 최초 알틴아라산 상품판매', '천산산맥', '출폰아타', '키르기스스탄 트레킹의 매력 속으로 함께 떠나보실까요?', '', '

Processing: processed_TCA20172_02.jpg


[32m{'res': {'input_path': '/content/drive/MyDrive/OFFICIAL_TEST_FOR_PHASE1/TEST_FOR_PHASE1/OutputImages_ver4/images_hyecho_processed/processed_TCA20172_02.jpg', 'page_index': None, 'model_settings': {'use_doc_preprocessor': True, 'use_textline_orientation': False}, 'doc_preprocessor_res': {'input_path': None, 'page_index': None, 'model_settings': {'use_doc_orientation_classify': False, 'use_doc_unwarping': False}, 'angle': -1}, 'dt_polys': array([[[1216, 1829],
        ...,
        [1216, 2791]],

       ...,

       [[1304, 3011],
        ...,
        [1304, 3579]]], dtype=int16), 'text_det_params': {'limit_side_len': 64, 'limit_type': 'min', 'thresh': 0.3, 'max_side_limit': 4000, 'box_thresh': 0.6, 'unclip_ratio': 1.5}, 'text_type': 'general', 'textline_orientation_angles': array([-1, ..., -1]), 'text_rec_score_thresh': 0.0, 'return_word_box': False, 'rec_texts': ['야생화, 푸른 초원을 걸는 트레킹', '알틴아라산계곡을 따라', '', '트레킹둘째날', 'A', '일린아라산의온천수는 현지인들사이에서위장병 관절영류어리즘치료에 효과가있는것으로알려져있다고합니다', "알틴아라산의의

Processing: processed_TCA20172_03.jpg


[32m{'res': {'input_path': '/content/drive/MyDrive/OFFICIAL_TEST_FOR_PHASE1/TEST_FOR_PHASE1/OutputImages_ver4/images_hyecho_processed/processed_TCA20172_03.jpg', 'page_index': None, 'model_settings': {'use_doc_preprocessor': True, 'use_textline_orientation': False}, 'doc_preprocessor_res': {'input_path': None, 'page_index': None, 'model_settings': {'use_doc_orientation_classify': False, 'use_doc_unwarping': False}, 'angle': -1}, 'dt_polys': array([[[  48, 1524],
        ...,
        [  46, 3530]],

       ...,

       [[ 929, 2975],
        ...,
        [ 929, 2990]]], dtype=int16), 'text_det_params': {'limit_side_len': 64, 'limit_type': 'min', 'thresh': 0.3, 'max_side_limit': 4000, 'box_thresh': 0.6, 'unclip_ratio': 1.5}, 'text_type': 'general', 'textline_orientation_angles': array([-1, ..., -1]), 'text_rec_score_thresh': 0.0, 'return_word_box': False, 'rec_texts': ['악천후를 대비하여 바람막이,우비,여톱용 스패츠, 알은 경량패딩을 준비하는 것이 좋습니다. 트레킹 신발은 필히 발목을 보호할 수', '알틴아라산 트레킹기후는매우붙규칙합니다.한여름에도우박이내릴수있어요트레킹복장은봄,가

Processing: processed_TCA20184_00.png


[32m{'res': {'input_path': '/content/drive/MyDrive/OFFICIAL_TEST_FOR_PHASE1/TEST_FOR_PHASE1/OutputImages_ver4/images_hyecho_processed/processed_TCA20184_00.png', 'page_index': None, 'model_settings': {'use_doc_preprocessor': True, 'use_textline_orientation': False}, 'doc_preprocessor_res': {'input_path': None, 'page_index': None, 'model_settings': {'use_doc_orientation_classify': False, 'use_doc_unwarping': False}, 'angle': -1}, 'dt_polys': array([[[1037,    0],
        ...,
        [1035,  238]],

       ...,

       [[1169,  215],
        ...,
        [1176,  244]]], dtype=int16), 'text_det_params': {'limit_side_len': 64, 'limit_type': 'min', 'thresh': 0.3, 'max_side_limit': 4000, 'box_thresh': 0.6, 'unclip_ratio': 1.5}, 'text_type': 'general', 'textline_orientation_angles': array([-1, ..., -1]), 'text_rec_score_thresh': 0.0, 'return_word_box': False, 'rec_texts': ['은 여행 상품으', '발일 및 일정이', '를비중', 'nO'], 'rec_scores': array([0.83226204, ..., 0.40819865]), 'rec_polys': array([[[1037,  

Done! Check results in: /content/drive/MyDrive/OFFICIAL_TEST_FOR_PHASE1/TEST_FOR_PHASE1/OutputOCR_ver4/images_hyecho_ocr_processed


In [None]:
!pip install konlpy


Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting JPype1>=0.7.0 (from konlpy)
  Downloading jpype1-1.6.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (5.0 kB)
Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m75.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jpype1-1.6.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (495 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m495.9/495.9 kB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: JPype1, konlpy
Successfully installed JPype1-1.6.0 konlpy-0.6.0


In [None]:

from konlpy.tag import Okt
from pathlib import Path


def segment_korean_text(text: str, tokenizer=None) -> str:
    """
    text: chuỗi OCR tiếng Hàn
    tokenizer: instance Okt hoặc Mecab
    return: chuỗi đã tách từ bằng khoảng trắng
    """
    if tokenizer is None:
        tokenizer = Okt()
    words = tokenizer.morphs(text)  # tách từ
    return " ".join(words)


def improve_korean_ocr(input_path: str, output_path: str):
    input_file = Path(input_path)
    output_file = Path(output_path)

    if not input_file.exists():
        print(" File đầu vào không tồn tại:", input_file)
        return


    lines = input_file.read_text(encoding="utf-8").splitlines()


    okt = Okt()

    improved_lines = []
    for line in lines:
        line_clean = " ".join(line.split())  # chuẩn hóa khoảng trắng
        improved_line = segment_korean_text(line_clean, okt)
        improved_lines.append(improved_line)


    output_file.write_text("\n".join(improved_lines), encoding="utf-8")
    print(f" File đã được lưu tại: {output_file}")


input_drive_path = "/content/drive/MyDrive/OFFICIAL_TEST_FOR_PHASE1/TEST_FOR_PHASE1/OutputOCR_ver4/images_hyecho_demo_processed/processed_A000000173589_03_res.txt"
output_drive_path = "/content/drive/MyDrive/input_ocr_A000000173589_03_segmented.txt"
improve_korean_ocr(input_drive_path, output_drive_path)


✅ File đã được lưu tại: /content/drive/MyDrive/input_ocr_A000000173589_03_segmented.txt
