<a href="https://colab.research.google.com/github/seoseohee/embedded-pilot-project/blob/main/modelcomparison.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip -q install -U "pillow<12" easyocr pytesseract transformers sentencepiece accelerate
!apt-get -qq update
!apt-get -qq install -y tesseract-ocr
!pip -q install -U opencv-python-headless


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m42.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m54.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m180.7/180.7 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m978.2/978.2 kB[0m [31m35.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m300.6/300.6 kB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[?25hW: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)


In [2]:
# Clear /content: remove image files (.png/.jpg/.jpeg) and .txt files left over from earlier runs.
# NOTE: this deletes every matching file directly under /content, not only files uploaded by this notebook.
!rm -f /content/*.png /content/*.jpg /content/*.jpeg /content/*.txt


In [3]:
# Upload each image together with its ground-truth .txt file.
from google.colab import files

uploaded = files.upload()
# Show at most the first 20 uploaded file names.
print("Uploaded:", list(uploaded)[:20])

Saving 스크린샷 2026-01-21 111927.png to 스크린샷 2026-01-21 111927.png
Saving 스크린샷 2026-01-21 111927.txt to 스크린샷 2026-01-21 111927.txt
Uploaded: ['스크린샷 2026-01-21 111927.png', '스크린샷 2026-01-21 111927.txt']


In [4]:
# Verify that every uploaded image has a matching ground-truth .txt file.
import os, glob

# Collect all images under /content (sorted so the processing order is stable).
imgs = sorted(
    path
    for pattern in ("/content/*.png", "/content/*.jpg", "/content/*.jpeg")
    for path in glob.glob(pattern)
)
print("Images:", len(imgs))

# An image is "missing" when no file with the same stem and a .txt extension exists.
missing = [
    os.path.basename(path)
    for path in imgs
    if not os.path.exists(os.path.splitext(path)[0] + ".txt")
]
print("Missing GT txt for:", missing)

Images: 1
Missing GT txt for: []


In [5]:
# Run every OCR engine on each image and collect the results
import time, re, os
import cv2
import pytesseract
import easyocr
from PIL import Image
from transformers import TrOCRProcessor, VisionEncoderDecoderModel

def preprocess_gray(img_bgr, scale=2.0):
    """Convert a BGR image to grayscale and upscale it before OCR.

    Args:
        img_bgr: Image in BGR channel order (the caller passes the result of
            ``cv2.imread``).
        scale: Factor applied to both width and height. Defaults to 2.0.

    Returns:
        The grayscale image resized by ``scale`` using bicubic interpolation.

    Raises:
        ValueError: If ``img_bgr`` is None or ``scale`` is not positive.
    """
    # cv2.imread reports an unreadable file by returning None instead of raising;
    # fail here with a clear message rather than with an opaque cvtColor error.
    if img_bgr is None:
        raise ValueError("img_bgr is None; the image file could not be read")
    if scale <= 0:
        raise ValueError(f"scale must be positive, got {scale}")
    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    # dsize=None lets fx/fy determine the output size.
    return cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)

# EasyOCR: a single shared reader for English, constructed once with gpu=False.
reader = easyocr.Reader(lang_list=['en'], gpu=False)

def ocr_easy(gray):
    """Run EasyOCR on an image and time the recognition call.

    Args:
        gray: Image handed unchanged to ``reader.readtext``.

    Returns:
        A ``(text, seconds)`` tuple: the recognized strings joined with
        newlines, and the elapsed time of the ``readtext`` call in seconds.
    """
    # perf_counter is a monotonic clock; time.time() is wall-clock and can
    # jump if the system clock is adjusted mid-measurement.
    started = time.perf_counter()
    # The result is joined as plain strings below, so detail=0 is required here.
    out = reader.readtext(gray, detail=0, paragraph=False)
    elapsed = time.perf_counter() - started
    return "\n".join(out), elapsed

# Plain Tesseract (default settings)
def ocr_tess(gray):
    """Run Tesseract with its default configuration and time the call.

    Args:
        gray: Image handed unchanged to ``pytesseract.image_to_string``.

    Returns:
        A ``(text, seconds)`` tuple: the recognized text and the elapsed time
        of the call in seconds.
    """
    # perf_counter is a monotonic clock; time.time() is wall-clock and can
    # jump if the system clock is adjusted mid-measurement.
    started = time.perf_counter()
    txt = pytesseract.image_to_string(gray, lang="eng")
    elapsed = time.perf_counter() - started
    return txt, elapsed

# OpenCV + Tesseract (adaptive threshold, explicit page segmentation mode)
def ocr_opencv_tess(gray, block_size=31, c=10, psm=6):
    """Binarize with an adaptive Gaussian threshold, then run Tesseract.

    Args:
        gray: Single-channel image passed to ``cv2.adaptiveThreshold``.
        block_size: Neighborhood size for the adaptive threshold; must be an
            odd number greater than 1. Defaults to 31.
        c: Constant passed as the last ``adaptiveThreshold`` argument.
            Defaults to 10.
        psm: Tesseract page segmentation mode placed in the config string.
            Defaults to 6.

    Returns:
        A ``(text, seconds)`` tuple: the recognized text and the elapsed time
        of thresholding plus recognition, in seconds.

    Raises:
        ValueError: If ``block_size`` is even or not greater than 1.
    """
    if block_size <= 1 or block_size % 2 == 0:
        raise ValueError(f"block_size must be an odd number > 1, got {block_size}")

    # perf_counter is a monotonic clock; time.time() is wall-clock and can
    # jump if the system clock is adjusted mid-measurement.
    started = time.perf_counter()
    bin_img = cv2.adaptiveThreshold(
        gray, 255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY,
        block_size, c
    )
    txt = pytesseract.image_to_string(
        bin_img, lang="eng", config=f"--oem 3 --psm {psm}"
    )
    elapsed = time.perf_counter() - started
    return txt, elapsed

# TrOCR: load the processor and the encoder-decoder model from the same checkpoint.
TROCR_CHECKPOINT = "microsoft/trocr-base-printed"
processor = TrOCRProcessor.from_pretrained(TROCR_CHECKPOINT)
model = VisionEncoderDecoderModel.from_pretrained(TROCR_CHECKPOINT)

def ocr_trocr(img_bgr):
    """Run TrOCR on a BGR image and time the whole pipeline.

    The image is converted BGR -> RGB, wrapped in a PIL image, preprocessed by
    ``processor``, decoded with ``model.generate`` (at most 128 new tokens) and
    turned back into text.

    NOTE(review): the full image is fed to the model in one pass. Presumably
    this checkpoint expects a single cropped text line -- confirm against the
    model card; the recorded run in this notebook scored trocr_CER of about 0.99.

    Args:
        img_bgr: Image in BGR channel order.

    Returns:
        A ``(text, seconds)`` tuple: the first decoded sequence and the elapsed
        time of conversion, preprocessing, generation and decoding, in seconds.
    """
    # perf_counter is a monotonic clock; time.time() is wall-clock and can
    # jump if the system clock is adjusted mid-measurement.
    started = time.perf_counter()
    img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
    pil = Image.fromarray(img_rgb)
    pixel_values = processor(images=pil, return_tensors="pt").pixel_values
    ids = model.generate(pixel_values, max_new_tokens=128)
    # A single image was submitted, so only the first decoded string is used.
    txt = processor.batch_decode(ids, skip_special_tokens=True)[0]
    elapsed = time.perf_counter() - started
    return txt, elapsed

def read_gt(img_path):
    """Read the ground-truth text stored next to an image.

    The ground-truth file has the same path as the image with the extension
    replaced by ``.txt``.

    Args:
        img_path: Path of the image file.

    Returns:
        The full contents of the matching ``.txt`` file as a string.

    Raises:
        FileNotFoundError: If the ``.txt`` file does not exist.
    """
    base, _ = os.path.splitext(img_path)
    # utf-8-sig reads plain UTF-8 unchanged but drops a leading BOM, which
    # would otherwise be counted as an extra character in CER/WER.
    with open(base + ".txt", "r", encoding="utf-8-sig") as f:
        return f.read()

# Run all four OCR engines on every image and collect their texts and timings.
rows = []
for p in imgs:
    img = cv2.imread(p)
    if img is None:
        # cv2.imread reports an unreadable file by returning None instead of
        # raising; skip it so one bad file does not abort the whole run.
        print("Skipping unreadable image:", p)
        continue
    if not os.path.exists(os.path.splitext(p)[0] + ".txt"):
        # No ground truth means the image cannot be scored later.
        print("Skipping image without GT txt:", p)
        continue

    gray = preprocess_gray(img)
    gt = read_gt(p)

    # EasyOCR and both Tesseract variants receive the upscaled grayscale
    # image; TrOCR receives the original BGR image.
    e_txt, e_t = ocr_easy(gray)
    t_txt, t_t = ocr_tess(gray)
    ot_txt, ot_t = ocr_opencv_tess(gray)
    tr_txt, tr_t = ocr_trocr(img)

    rows.append({
        "image": os.path.basename(p),
        "gt": gt,
        "easyocr": e_txt,
        "tesseract": t_txt,
        "opencv+tess": ot_txt,
        "trocr": tr_txt,
        "easy_t": e_t,
        "tess_t": t_t,
        "ot_t": ot_t,
        "trocr_t": tr_t
    })

print("Done:", len(rows))




Progress: |██████████████████████████████████████████████████| 100.0% Complete



Progress: |--------------------------------------------------| 0.0% CompleteProgress: |--------------------------------------------------| 0.1% CompleteProgress: |--------------------------------------------------| 0.1% CompleteProgress: |--------------------------------------------------| 0.2% CompleteProgress: |--------------------------------------------------| 0.2% CompleteProgress: |--------------------------------------------------| 0.3% CompleteProgress: |--------------------------------------------------| 0.4% CompleteProgress: |--------------------------------------------------| 0.4% CompleteProgress: |--------------------------------------------------| 0.5% CompleteProgress: |--------------------------------------------------| 0.5% CompleteProgress: |--------------------------------------------------| 0.6% CompleteProgress: |--------------------------------------------------| 0.6% CompleteProgress: |--------------------------------------------------| 0.7% Complet

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/224 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-printed and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]



Done: 1


In [6]:
# Compute accuracy metrics (CER/WER) against the ground truth -- the actual comparison
import pandas as pd
import re

def normalize(s: str) -> str:
    """Canonicalize whitespace so OCR output and ground truth compare fairly.

    Carriage returns are removed, each run of spaces/tabs becomes one space,
    runs of three or more newlines shrink to two, and the result is stripped
    of leading and trailing whitespace.
    """
    without_cr = s.replace("\r", "")
    single_spaced = re.sub(r"[ \t]+", " ", without_cr)
    return re.sub(r"\n{3,}", "\n\n", single_spaced).strip()

def levenshtein(a, b):
    """Return the edit distance between two sequences.

    Insertions, deletions and substitutions each cost 1. Works on any pair of
    sequences whose items support ``==`` (strings for CER, word lists for WER).
    Only two rows of the distance table are held at a time.
    """
    # Row 0: turning an empty prefix of `a` into b[:col] takes `col` insertions.
    previous = list(range(len(b) + 1))
    for row, item_a in enumerate(a, start=1):
        # Column 0: turning a[:row] into an empty prefix takes `row` deletions.
        current = [row]
        for col, item_b in enumerate(b, start=1):
            substitution = previous[col - 1] + (0 if item_a == item_b else 1)
            deletion = previous[col] + 1
            insertion = current[col - 1] + 1
            current.append(min(deletion, insertion, substitution))
        previous = current
    return previous[-1]

def cer(pred, gt):
    """Character error rate of ``pred`` against ``gt``.

    Both strings are normalized first. The result is the character-level edit
    distance divided by the length of the normalized ground truth. When the
    ground truth is empty, the rate is 0.0 for an empty prediction and 1.0
    otherwise.
    """
    hyp = normalize(pred)
    ref = normalize(gt)
    if not ref:
        return 1.0 if hyp else 0.0
    return levenshtein(hyp, ref) / len(ref)

def wer(pred, gt):
    """Word error rate of ``pred`` against ``gt``.

    Both strings are normalized and split on whitespace. The result is the
    word-level edit distance divided by the number of ground-truth words. When
    the ground truth has no words, the rate is 0.0 for a prediction with no
    words and 1.0 otherwise.
    """
    hyp_words = normalize(pred).split()
    ref_words = normalize(gt).split()
    if not ref_words:
        return 1.0 if hyp_words else 0.0
    return levenshtein(hyp_words, ref_words) / len(ref_words)

# One record per image: CER and WER of every engine against the ground truth,
# followed by the per-engine timings carried over from `rows`.
score_rows = [
    {
        "image": row["image"],
        "easy_CER": cer(row["easyocr"], row["gt"]),
        "easy_WER": wer(row["easyocr"], row["gt"]),
        "tess_CER": cer(row["tesseract"], row["gt"]),
        "tess_WER": wer(row["tesseract"], row["gt"]),
        "ot_CER": cer(row["opencv+tess"], row["gt"]),
        "ot_WER": wer(row["opencv+tess"], row["gt"]),
        "trocr_CER": cer(row["trocr"], row["gt"]),
        "trocr_WER": wer(row["trocr"], row["gt"]),
        "easy_t": row["easy_t"],
        "tess_t": row["tess_t"],
        "ot_t": row["ot_t"],
        "trocr_t": row["trocr_t"],
    }
    for row in rows
]

# Build the frame once from the list of records, then display it.
score_df = pd.DataFrame(score_rows)
score_df

Unnamed: 0,image,easy_CER,easy_WER,tess_CER,tess_WER,ot_CER,ot_WER,trocr_CER,trocr_WER,easy_t,tess_t,ot_t,trocr_t
0,스크린샷 2026-01-21 111927.png,0.160656,0.226891,0.100546,0.210084,0.666667,1.0,0.993443,1.0,27.309038,4.694369,3.168855,5.839793


In [9]:
# Per-image comparison: for a given image, the engine with the lowest CER did best.
score_df.loc[:, [
    "image",
    "easy_CER", "tess_CER", "ot_CER", "trocr_CER",
]]


Unnamed: 0,image,easy_CER,tess_CER,ot_CER,trocr_CER
0,스크린샷 2026-01-21 111927.png,0.160656,0.100546,0.666667,0.993443


In [8]:
# Average performance per engine: the mean of every numeric column
# (error rates and timings alike), sorted in ascending order.
score_df.mean(numeric_only=True).sort_values(ascending=True)


Unnamed: 0,0
tess_CER,0.100546
easy_CER,0.160656
tess_WER,0.210084
easy_WER,0.226891
ot_CER,0.666667
trocr_CER,0.993443
ot_WER,1.0
trocr_WER,1.0
ot_t,3.168855
tess_t,4.694369
