In [11]:
import os
import warnings

import cv2
import numpy as np

def load_image(path):
    """Read an image file from disk and decode it as a colour image.

    The file is first read as raw bytes with ``np.fromfile`` and then decoded
    in memory with ``cv2.imdecode`` rather than handed to OpenCV as a path.
    NOTE(review): presumably done to cope with paths OpenCV cannot open
    directly (e.g. non-ASCII names on Windows) — confirm.

    Args:
        path: Location of the image file.

    Returns:
        The result of ``cv2.imdecode(..., cv2.IMREAD_COLOR)`` for the file's
        bytes. No check is made here that decoding succeeded.
    """
    raw_bytes = np.fromfile(path, dtype=np.uint8)
    decoded = cv2.imdecode(raw_bytes, cv2.IMREAD_COLOR)
    return decoded

def to_gray(img):
    """Convert a BGR image to a single-channel grayscale image.

    Args:
        img: Image passed straight to ``cv2.cvtColor`` with
            ``cv2.COLOR_BGR2GRAY``.

    Returns:
        The grayscale image produced by OpenCV.
    """
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    return gray

def clahe_and_denoise(gray, clip_limit=3.0, tile_grid_size=(8, 8),
                      denoise_strength=10, template_window=7, search_window=21):
    """Boost local contrast with CLAHE, then denoise with non-local means.

    The tuning values used to be hard-coded; they are now keyword parameters
    whose defaults reproduce the previous behaviour exactly.

    Args:
        gray: Grayscale image to enhance.
        clip_limit: ``clipLimit`` passed to ``cv2.createCLAHE``.
        tile_grid_size: ``tileGridSize`` passed to ``cv2.createCLAHE``.
        denoise_strength: Filter strength ``h`` for
            ``cv2.fastNlMeansDenoising``.
        template_window: ``templateWindowSize`` for the denoiser.
        search_window: ``searchWindowSize`` for the denoiser.

    Returns:
        The contrast-equalised and denoised image.
    """
    equalizer = cv2.createCLAHE(clipLimit=clip_limit, tileGridSize=tile_grid_size)
    equalized = equalizer.apply(gray)
    # The second positional argument is the optional destination buffer.
    return cv2.fastNlMeansDenoising(
        equalized, None, denoise_strength, template_window, search_window
    )

def adaptive_thresh(gray, block_size=31, c=15):
    """Binarise a grayscale image with Gaussian adaptive thresholding.

    Args:
        gray: Grayscale image to threshold.
        block_size: Side of the pixel neighbourhood used to compute each local
            threshold. Must be an odd integer greater than 1. Defaults to the
            previously hard-coded 31.
        c: Constant subtracted from the weighted neighbourhood mean. Defaults
            to the previously hard-coded 15.

    Returns:
        The image returned by ``cv2.adaptiveThreshold`` (called with a
        maximum value of 255 and ``cv2.THRESH_BINARY``).

    Raises:
        ValueError: If ``block_size`` is even or not greater than 1.
    """
    if block_size <= 1 or block_size % 2 != 1:
        # OpenCV rejects such sizes with an opaque assertion; fail early instead.
        raise ValueError(
            f"block_size must be an odd integer greater than 1, got {block_size!r}"
        )
    return cv2.adaptiveThreshold(
        gray,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        block_size,
        c,
    )

def preprocess_for_ocr(path):
    """Load an image and derive the variants that are fed to the OCR engines.

    Args:
        path: Location of the image file.

    Returns:
        A tuple ``(img, den, th)``: the decoded colour image, the
        contrast-enhanced and denoised grayscale image, and the adaptively
        thresholded image computed from the denoised one.

    Raises:
        ValueError: If the file could not be decoded as an image.
    """
    img = load_image(path)
    if img is None:
        # cv2.imdecode signals "not an image" by returning None; without this
        # guard the failure surfaces later as an opaque OpenCV assertion.
        raise ValueError(f"Could not decode image file: {path!r}")
    gray = to_gray(img)
    den = clahe_and_denoise(gray)
    th = adaptive_thresh(den)
    return img, den, th


In [None]:
from paddleocr import PaddleOCR
import easyocr
import pytesseract

# Build the OCR engines once at module level; run_all_engines() reuses these
# two objects on every call instead of constructing them per image.
# NOTE(review): the captured cell output shows PaddleOCR emitting a warning
# that points at this constructor line — presumably `use_angle_cls` is
# deprecated in the installed release; confirm against its documentation.
paddle = PaddleOCR(use_angle_cls=True, lang='en')
# English-only EasyOCR reader; gpu=False presumably forces CPU inference — TODO confirm.
reader = easyocr.Reader(['en'], gpu=False)

def _is_paddle_line(item):
    """Return True when ``item`` looks like one ``[box, (text, score)]`` entry."""
    return (
        isinstance(item, (list, tuple))
        and len(item) == 2
        and isinstance(item[1], (list, tuple))
        and len(item[1]) == 2
        and isinstance(item[1][0], str)
    )


def _paddle_pairs(paddle_res):
    """Yield ``(text, score)`` pairs from the PaddleOCR result layouts handled here.

    Three layouts are accepted:
      * a flat list of ``[box, (text, score)]`` lines (what the original
        parsing code assumed);
      * a list of pages, each a list of such lines or ``None`` when the page
        has no text;
      * a list of dict-like pages exposing parallel ``'rec_texts'`` and
        ``'rec_scores'`` lists.
    NOTE(review): the last two are believed to be the PaddleOCR 2.x and 3.x
    formats respectively — confirm against the installed version.
    """
    for page in paddle_res or []:
        if not page:
            continue
        if isinstance(page, dict):
            yield from zip(page.get('rec_texts', []), page.get('rec_scores', []))
        elif _is_paddle_line(page):
            yield page[1][0], page[1][1]
        else:
            for line in page:
                _, (text, score) = line
                yield text, score


def run_all_engines(img):
    """Run PaddleOCR, EasyOCR and Tesseract on one image and pool the output.

    Each engine is best-effort: if one fails, a ``RuntimeWarning`` describing
    the failure is emitted and the remaining engines still run. Results an
    engine produced before failing are kept.

    Args:
        img: Image handed unchanged to each OCR engine.

    Returns:
        A list of dicts with keys ``'text'``, ``'score'`` (float; Tesseract's
        0-100 confidence is rescaled by dividing by 100) and ``'engine'``
        (``'paddle'``, ``'easyocr'`` or ``'tesseract'``).
    """
    results = []

    try:
        try:
            paddle_res = paddle.ocr(img, cls=True)
        except TypeError:
            # NOTE(review): newer PaddleOCR releases appear to reject the
            # legacy `cls` keyword; retry without it rather than losing the
            # whole engine.
            paddle_res = paddle.ocr(img)
        for text, score in _paddle_pairs(paddle_res):
            results.append({'text': text, 'score': float(score), 'engine': 'paddle'})
    except Exception as exc:
        warnings.warn(f"paddle OCR failed: {exc!r}", RuntimeWarning, stacklevel=2)

    try:
        for _, text, score in reader.readtext(img):
            results.append({'text': text, 'score': float(score), 'engine': 'easyocr'})
    except Exception as exc:
        warnings.warn(f"easyocr OCR failed: {exc!r}", RuntimeWarning, stacklevel=2)

    try:
        t_data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)
        for raw_text, raw_conf in zip(t_data['text'], t_data['conf']):
            text = raw_text.strip()
            if not text:
                continue
            # Confidence may arrive as str, int or float, and -1 means
            # "no confidence": compare numerically and clamp to zero.
            conf = max(float(raw_conf), 0.0)
            results.append({'text': text, 'score': conf / 100.0, 'engine': 'tesseract'})
    except Exception as exc:
        warnings.warn(f"tesseract OCR failed: {exc!r}", RuntimeWarning, stacklevel=2)

    return results


  paddle = PaddleOCR(use_angle_cls=True, lang='en')
[32mCreating model: ('PP-LCNet_x1_0_doc_ori', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\A\.paddlex\official_models\PP-LCNet_x1_0_doc_ori`.[0m
[32mCreating model: ('UVDoc', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\A\.paddlex\official_models\UVDoc`.[0m
[32mCreating model: ('PP-LCNet_x1_0_textline_ori', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\A\.paddlex\official_models\PP-LCNet_x1_0_textline_ori`.[0m
[32mCreating model: ('PP-OCRv5_server_det', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\A\.paddlex\official_models\PP-OCRv5_server_det`.[0m
[32mCreating model: ('en_PP-OCRv5_mobile_rec', None)[0m
[32mMod

In [None]:
import re
from difflib import SequenceMatcher

# Pattern for the code being searched for: an alphanumeric run, the literal
# "_1_", then a run of alphanumerics/underscores, delimited by word boundaries.
CODE_REGEX = re.compile(r'\b[A-Za-z0-9]+_1_[A-Za-z0-9_]+\b')
# Look-alike characters rewritten to digits: I, l, | and ! become 1; O and o become 0.
CONFUSION_MAP = str.maketrans('Il|!Oo', '111100')

def normalize(text):
    """Canonicalise OCR text before it is matched against the code pattern.

    Surrounding whitespace is stripped, every space character is removed and
    the look-alike characters in ``CONFUSION_MAP`` are replaced by digits.
    The replacement applies to every character of the string, so letters such
    as ``I``, ``l``, ``O`` and ``o`` are rewritten wherever they occur.

    Args:
        text: Raw text returned by an OCR engine.

    Returns:
        The normalised string (possibly empty).
    """
    without_spaces = ''.join(text.strip().split(' '))
    return without_spaces.translate(CONFUSION_MAP)

def is_code(text):
    """Tell whether ``text`` contains a substring matching ``CODE_REGEX``.

    Args:
        text: String to inspect.

    Returns:
        True if the pattern is found anywhere in ``text``, otherwise False.
    """
    found = CODE_REGEX.search(text)
    return found is not None

def similarity(a, b):
    """Score how alike two sequences are using ``difflib.SequenceMatcher``.

    Args:
        a: First sequence (typically a string).
        b: Second sequence.

    Returns:
        The matcher's ``ratio()``: a float in ``[0, 1]`` where 1.0 means the
        sequences are identical.
    """
    matcher = SequenceMatcher(None, a, b)
    score = matcher.ratio()
    return score


In [15]:
import cv2

def _best_candidate(pool):
    """Return the candidate with the highest (score, normalised length).

    ``max`` keeps the earliest candidate on ties, which matches taking the
    first element of a stable descending sort.
    """
    return max(pool, key=lambda cand: (cand['score'], len(cand['norm'])))


def extract_waybill_code(image_path):
    """Run every OCR engine on several image variants and pick the best code.

    The denoised image, the thresholded image and a 1.5x upscaled copy of each
    are passed to ``run_all_engines``. Every non-empty normalised text becomes
    a candidate. Candidates containing a ``CODE_REGEX`` match are preferred;
    otherwise any candidate containing ``'_1'`` or ``'1_'`` is used as a
    fallback.

    Args:
        image_path: Location of the image file.

    Returns:
        One of:
          * ``{'extracted', 'raw', 'engine', 'score'}`` for a strict match,
            where ``'extracted'`` is the matched code itself (not the whole
            OCR line it was found in) and ``'raw'`` is the engine's original
            text;
          * the same keys plus ``'fallback': True`` for a fallback match,
            where ``'extracted'`` is the whole normalised text;
          * ``{'extracted': None, 'reason': 'no_candidate_found'}``.
    """
    _, denoised, binarized = preprocess_for_ocr(image_path)
    variants = [
        denoised,
        binarized,
        cv2.resize(denoised, None, fx=1.5, fy=1.5),
        cv2.resize(binarized, None, fx=1.5, fy=1.5),
    ]

    candidates = []
    for variant in variants:
        for hit in run_all_engines(variant):
            norm = normalize(hit['text'])
            if norm:
                candidates.append({
                    'norm': norm,
                    'raw': hit['text'],
                    'score': hit['score'],
                    'engine': hit['engine'],
                })

    # Strict regex match: remember the matched span so surrounding text that
    # the OCR engine glued onto the code does not leak into the result.
    strict = []
    for cand in candidates:
        found = CODE_REGEX.search(cand['norm'])
        if found:
            strict.append(dict(cand, code=found.group(0)))
    if strict:
        best = _best_candidate(strict)
        return {
            'extracted': best['code'],
            'raw': best['raw'],
            'engine': best['engine'],
            'score': best['score'],
        }

    # Fallback: the text at least contains '_1' or '1_'.
    fallback = [c for c in candidates if '_1' in c['norm'] or '1_' in c['norm']]
    if fallback:
        best = _best_candidate(fallback)
        return {
            'extracted': best['norm'],
            'raw': best['raw'],
            'engine': best['engine'],
            'score': best['score'],
            'fallback': True,
        }

    return {'extracted': None, 'reason': 'no_candidate_found'}


In [None]:
from IPython.display import display
from ipywidgets import FileUpload

# File uploader widget: accepts a single .jpg/.jpeg/.png/.tiff file.
# The widget only renders here; the cell that reads `uploader.value` has to be
# run after a file has been chosen in the rendered control.
uploader = FileUpload(accept='.jpg,.jpeg,.png,.tiff', multiple=False)
display(uploader)


FileUpload(value=(), accept='.jpg,.jpeg,.png,.tiff', description='Upload')

In [20]:
if uploader.value:
    # FileUpload.value is a tuple of uploads in some ipywidgets versions and a
    # dict keyed by file name in others; take the first upload either way.
    # NOTE(review): tuple is believed to be ipywidgets 8 and dict ipywidgets 7 — confirm.
    uploads = uploader.value
    if isinstance(uploads, (tuple, list)):
        uploaded_file = uploads[0]
    else:
        uploaded_file = next(iter(uploads.values()))

    if isinstance(uploaded_file, dict):
        content = uploaded_file.get('content')
        # The dict-style payload is believed to nest the file name under
        # 'metadata' instead of exposing it at the top level.
        filename = uploaded_file.get('name') or uploaded_file.get('metadata', {}).get('name')
    else:
        content = getattr(uploaded_file, 'content', None)
        filename = getattr(uploaded_file, 'name', None)

    if content is not None and filename:
        # The name comes from the browser: keep only its last path component so
        # the upload is always written into the current working directory.
        local_path = os.path.basename(filename)
        with open(local_path, 'wb') as f:
            f.write(content)

        result = extract_waybill_code(local_path)
        print(result)
    else:
        print("Cannot read uploaded file, check ipywidgets version.")
else:
    print("No file uploaded.")


  paddle_res = paddle.ocr(img, cls=True)


{'extracted': '163629705512179520_1_1ps', 'raw': '163629705512179520_1_Ips', 'engine': 'easyocr', 'score': 0.9669298815120313}
