In [1]:
# ✅ STEP 0: IMPORTS AND CONFIG
import os
import random
import shutil
from PIL import Image
import numpy as np
import cv2
import matplotlib.pyplot as plt
from ultralytics import YOLO

# === CONFIG ===
# Folder the individual glyph PNGs are read from.
INPUT_DIR = r"C:\Users\franc\Desktop\SUMERI\glyph_images"
# Folders the generated sentence images and their label files are written to.
SYNTH_IMG_DIR = r"C:\Users\franc\Desktop\SUMERI\synthetic_sentences"
SYNTH_LBL_DIR = r"C:\Users\franc\Desktop\SUMERI\synthetic_sentences_labels"
# Root of the YOLO dataset tree (images/<split>, labels/<split>).
YOLO_DATA_DIR = r"C:\Users\franc\Desktop\SUMERI\yolo_dataset"
# Canvas size in pixels as (width, height), the order PIL's Image.new expects.
CANVAS_SIZE = (1500, 150)
GLYPHS_PER_SENTENCE = 10
NUM_SENTENCES = 200

# === PREPARE FOLDERS ===
# Create every output folder up front; folders that already exist are left alone.
for _out_dir in (
    SYNTH_IMG_DIR,
    SYNTH_LBL_DIR,
    os.path.join(YOLO_DATA_DIR, "images", "train"),
    os.path.join(YOLO_DATA_DIR, "labels", "train"),
):
    os.makedirs(_out_dir, exist_ok=True)

# === STEP 1: SYNTHETIC SENTENCE GENERATION ===
def _load_cropped_glyph(glyph_path, max_height):
    """Load one glyph, trim it to its ink, and shrink it to fit ``max_height``.

    Args:
        glyph_path: Path of the glyph image file.
        max_height: Maximum allowed height in pixels after cropping.

    Returns:
        The prepared grayscale ``PIL.Image``, or ``None`` when the glyph has no
        pixel darker than 250 (i.e. it is effectively blank).
    """
    # convert() returns a fully loaded copy, so the file handle can be closed.
    with Image.open(glyph_path) as raw:
        glyph = raw.convert('L')

    mask = np.array(glyph) < 250
    if not np.any(mask):
        return None

    # Tight bounding box of the ink; argwhere yields (row, col) = (y, x).
    coords = np.argwhere(mask)
    y0, x0 = coords.min(axis=0)
    y1, x1 = coords.max(axis=0) + 1
    cropped = glyph.crop((int(x0), int(y0), int(x1), int(y1)))

    if cropped.height > max_height:
        ratio = max_height / cropped.height
        # Clamp to 1 px so a very thin glyph never gets a zero-width resize.
        new_size = (max(1, int(cropped.width * ratio)), max_height)
        cropped = cropped.resize(new_size, resample=Image.BILINEAR)
    return cropped


# The class id of a glyph is its index in the sorted file list.
glyph_files = sorted(
    f for f in os.listdir(INPUT_DIR) if f.endswith(".png")
)[:GLYPHS_PER_SENTENCE]
if not glyph_files:
    raise FileNotFoundError(f"No .png glyph images found in {INPUT_DIR}")
glyph_classes = {f: idx for idx, f in enumerate(glyph_files)}

# Cropping/resizing does not depend on the sentence, so do it once per glyph
# instead of once per glyph per sentence. Blank glyphs are left out.
max_glyph_height = CANVAS_SIZE[1] - 10
prepared_glyphs = {}
for glyph_name in glyph_files:
    prepared = _load_cropped_glyph(os.path.join(INPUT_DIR, glyph_name), max_glyph_height)
    if prepared is not None:
        prepared_glyphs[glyph_name] = prepared

for i in range(NUM_SENTENCES):
    canvas = Image.new('L', CANVAS_SIZE, color=255)
    x_offset = 5
    bboxes = []

    # random.sample gives a fresh random order without mutating glyph_files.
    for glyph_name in random.sample(glyph_files, len(glyph_files)):
        cropped = prepared_glyphs.get(glyph_name)
        if cropped is None:
            continue

        # Skip glyphs that would run past the right edge: PIL would clip the
        # paste while the label would still describe the unclipped box,
        # giving normalized coordinates greater than 1.
        if x_offset + cropped.width > CANVAS_SIZE[0]:
            continue

        class_id = glyph_classes[glyph_name]
        y_offset = (CANVAS_SIZE[1] - cropped.height) // 2
        canvas.paste(cropped, (x_offset, y_offset))

        # Box as (x_center, y_center, width, height), each divided by the
        # canvas width or height so the values are fractions of the image.
        bbox_x_center = (x_offset + cropped.width / 2) / CANVAS_SIZE[0]
        bbox_y_center = (y_offset + cropped.height / 2) / CANVAS_SIZE[1]
        bbox_width = cropped.width / CANVAS_SIZE[0]
        bbox_height = cropped.height / CANVAS_SIZE[1]
        bboxes.append((class_id, bbox_x_center, bbox_y_center, bbox_width, bbox_height))

        x_offset += cropped.width + 15

    img_path = os.path.join(SYNTH_IMG_DIR, f"sentence_{i}.png")
    canvas.convert("RGB").save(img_path)
    label_path = os.path.join(SYNTH_LBL_DIR, f"sentence_{i}.txt")
    with open(label_path, 'w') as f:
        for bbox in bboxes:
            f.write(f"{bbox[0]} {bbox[1]:.6f} {bbox[2]:.6f} {bbox[3]:.6f} {bbox[4]:.6f}\n")

print("✅ Synthetic sentences generated.")





✅ Synthetic sentences generated.


In [6]:
import os
import shutil

# Dataset root, plus the folders holding the generated sentence images/labels.
YOLO_DATA_DIR = r"C:\Users\franc\Desktop\SUMERI\yolo_dataset"
SENT_IMG_DIR = r"C:\Users\franc\Desktop\SUMERI\synthetic_sentences"
SENT_LBL_DIR = r"C:\Users\franc\Desktop\SUMERI\synthetic_sentences_labels"

# Make images/<split> and labels/<split> for both splits. Only "train" is
# filled below; "val" is created so it can be populated later.
for split in ["train", "val"]:
    for kind in ("images", "labels"):
        os.makedirs(os.path.join(YOLO_DATA_DIR, kind, split), exist_ok=True)

# Copy every sentence image, then every label file, into the train split.
for src_dir, suffix, kind in (
    (SENT_IMG_DIR, ".png", "images"),
    (SENT_LBL_DIR, ".txt", "labels"),
):
    dst_dir = os.path.join(YOLO_DATA_DIR, kind, "train")
    for fn in os.listdir(src_dir):
        if fn.endswith(suffix):
            shutil.copy(os.path.join(src_dir, fn), os.path.join(dst_dir, fn))

print("✅ YOLO dataset folders populated.")


✅ YOLO dataset folders populated.


In [10]:
# Copy into YOLO folder: sentence images first, then their label files.
# Files already present in the train split are simply overwritten.
for source_dir, extension, subset in (
    (SENT_IMG_DIR, ".png", "images"),
    (SENT_LBL_DIR, ".txt", "labels"),
):
    target_dir = os.path.join(YOLO_DATA_DIR, subset, "train")
    matching = [name for name in os.listdir(source_dir) if name.endswith(extension)]
    for name in matching:
        shutil.copy(os.path.join(source_dir, name), os.path.join(target_dir, name))

# ─── WRITE DATASET.YAML ───────────────────────────────────────────────────────
# NOTE(review): `val` points at the same folder as `train`, so the validation
# metrics are computed on the training images themselves.
yaml_header = f"""
path: {YOLO_DATA_DIR}
train: images/train
val: images/train

names:
"""
# One "  <id>: Glyph_<id>" entry per class id, appended under `names:`.
yaml_txt = yaml_header + "".join(
    f"  {cid}: Glyph_{cid}\n" for cid in range(GLYPHS_PER_SENTENCE)
)

yaml_path = os.path.join(YOLO_DATA_DIR, "glyphs-seg.yaml")
with open(yaml_path, "w") as yaml_file:
    yaml_file.write(yaml_txt)

# ─── TRAIN DETECTION MODEL ────────────────────────────────────────────────────
# "yolov8n.pt" is a detection checkpoint and the label files hold one bounding
# box per line (class + 4 numbers), so this run is object detection. The
# former `task='segment'` argument had no effect: the recorded training log
# reports `task=detect` and writes to runs\detect. It is dropped so the code
# says what actually happens.
model = YOLO("yolov8n.pt")

model.train(
    data=os.path.join(YOLO_DATA_DIR, "glyphs-seg.yaml"),
    epochs=5,
    imgsz=640,
    batch=8,
    # patience is larger than epochs, so early stopping cannot end this run early.
    patience=10,
)

print("✅ Done: detection training on synthetic sentences complete!")

Ultralytics 8.3.120  Python-3.11.4 torch-2.1.0+cpu CPU (Intel Core(TM) i5-10210U 1.60GHz)
[34m[1mengine\trainer: [0mtask=detect, mode=train, model=yolov8n.pt, data=C:\Users\franc\Desktop\SUMERI\yolo_dataset\glyphs-seg.yaml, epochs=5, time=None, patience=10, batch=8, imgsz=640, save=True, save_period=-1, cache=False, device=None, workers=8, project=None, name=train2, exist_ok=False, pretrained=True, optimizer=auto, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, freeze=None, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, vid_stride=1, stream_buffer=False, visualize=False, augment=False, agnostic_nms=False, classes=None, retina_masks=False, embed=None, show=False, save_frames=False, save_txt=False, save_conf=False, save_crop=False, show_labels=

[34m[1mtrain: [0mScanning C:\Users\franc\Desktop\SUMERI\yolo_dataset\labels\train.cache... 200 images, 0 backgrounds, 0 corrupt: 100%|██████████| 200/200 [00:00<?, ?it/s]

[34m[1mval: [0mFast image access  (ping: 0.20.0 ms, read: 3.30.4 MB/s, size: 44.3 KB)



[34m[1mval: [0mScanning C:\Users\franc\Desktop\SUMERI\yolo_dataset\labels\train.cache... 200 images, 0 backgrounds, 0 corrupt: 100%|██████████| 200/200 [00:00<?, ?it/s]


Plotting labels to runs\detect\train2\labels.jpg... 
[34m[1moptimizer:[0m 'optimizer=auto' found, ignoring 'lr0=0.01' and 'momentum=0.937' and determining best 'optimizer', 'lr0' and 'momentum' automatically... 
[34m[1moptimizer:[0m AdamW(lr=0.000714, momentum=0.9) with parameter groups 57 weight(decay=0.0), 64 weight(decay=0.0005), 63 bias(decay=0.0)
Image sizes 640 train, 640 val
Using 0 dataloader workers
Logging results to [1mruns\detect\train2[0m
Starting training for 5 epochs...

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


        1/5         0G      1.335      4.089      1.096        183        640: 100%|██████████| 25/25 [01:39<00:00,  3.97s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 13/13 [00:07<00:00,  1.85it/s]

                   all        200       2000     0.0311      0.878      0.152      0.112






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


        2/5         0G     0.6482      3.114     0.8438        126        640: 100%|██████████| 25/25 [01:33<00:00,  3.75s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 13/13 [00:07<00:00,  1.84it/s]

                   all        200       2000     0.0338      0.985      0.284       0.25






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


        3/5         0G     0.5659      2.464     0.8235        143        640: 100%|██████████| 25/25 [01:35<00:00,  3.83s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 13/13 [00:07<00:00,  1.84it/s]

                   all        200       2000      0.788       0.38       0.64      0.547






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


        4/5         0G     0.4903      2.017     0.8183        143        640: 100%|██████████| 25/25 [01:33<00:00,  3.76s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 13/13 [00:07<00:00,  1.78it/s]

                   all        200       2000      0.817      0.754      0.856      0.816






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


        5/5         0G     0.4334      1.693     0.8071        148        640: 100%|██████████| 25/25 [01:34<00:00,  3.76s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 13/13 [00:07<00:00,  1.82it/s]

                   all        200       2000      0.807      0.904      0.919      0.879






5 epochs completed in 0.143 hours.
Optimizer stripped from runs\detect\train2\weights\last.pt, 6.2MB
Optimizer stripped from runs\detect\train2\weights\best.pt, 6.2MB

Validating runs\detect\train2\weights\best.pt...
Ultralytics 8.3.120  Python-3.11.4 torch-2.1.0+cpu CPU (Intel Core(TM) i5-10210U 1.60GHz)
Model summary (fused): 72 layers, 3,007,598 parameters, 0 gradients, 8.1 GFLOPs


                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 13/13 [00:07<00:00,  1.82it/s]


                   all        200       2000      0.807      0.904      0.919      0.879
               Glyph_0        200        200          1          1      0.995      0.963
               Glyph_1        200        200      0.956      0.976      0.984      0.975
               Glyph_2        200        200      0.931      0.338      0.836      0.832
               Glyph_3        200        200      0.891          1      0.995      0.806
               Glyph_4        200        200      0.786      0.775      0.899      0.866
               Glyph_5        200        200      0.505      0.975      0.925      0.919
               Glyph_6        200        200      0.528          1      0.925      0.925
               Glyph_7        200        200      0.999          1      0.995      0.887
               Glyph_8        200        200      0.501       0.98      0.636      0.635
               Glyph_9        200        200      0.978          1      0.995      0.982
Speed: 0.3ms preproce

In [None]:
import cv2
import numpy as np
from PIL import Image

# ─── PARAMETERS ────────────────────────────────────────────────────────────────
# NOTE(review): this image comes from the training split, so this is a smoke
# test of the pipeline, not a measure of generalization.
TEST_IMG_PATH = r"C:\Users\franc\Desktop\SUMERI\yolo_dataset\images\train\sentence_1.png"  # Path to a single test image

# ─── TEST PREDICTIONS ON SINGLE IMAGE ─────────────────────────────────────────
def test_model_on_image(img_path):
    """Run the trained model on one image and display the predicted boxes inline.

    Uses the notebook-level ``model`` object. Each detection is drawn as a
    green rectangle with a ``Glyph_<class id>: <confidence>`` label, and the
    annotated image is rendered with matplotlib. ``cv2.imshow`` followed by
    ``cv2.waitKey(0)`` is avoided because it opens a native window and blocks
    the kernel until a key is pressed in that window.

    Args:
        img_path: Path of the image file to run the model on.

    Returns:
        None. The annotated image is shown as a matplotlib figure.
    """
    # Imported here so the function works even if only this cell's imports ran.
    import matplotlib.pyplot as plt

    # Force 3-channel RGB (the array below is treated as such) and release
    # the file handle; convert() returns a fully loaded copy.
    with Image.open(img_path) as src:
        img = src.convert("RGB")

    # A single input image yields a single result object.
    result = model(img)[0]

    boxes = result.boxes.xyxy.cpu().numpy()        # corner format: x1, y1, x2, y2
    confidences = result.boxes.conf.cpu().numpy()  # confidence scores
    class_ids = result.boxes.cls.cpu().numpy()     # class ids

    # Draw directly on an RGB copy; pure green is the same in RGB and BGR,
    # so no channel swap is needed for matplotlib.
    annotated = np.array(img)
    color = (0, 255, 0)
    font, scale, thickness = cv2.FONT_HERSHEY_SIMPLEX, 0.5, 2

    for box, conf, class_id in zip(boxes, confidences, class_ids):
        x1, y1, x2, y2 = map(int, box)
        cv2.rectangle(annotated, (x1, y1), (x2, y2), color, 2)

        label = f"Glyph_{int(class_id)}: {conf:.2f}"
        (_, text_h), _ = cv2.getTextSize(label, font, scale, thickness)
        # Prefer the label above the box; if that would fall off the top of
        # the image, put it just inside the box instead.
        text_y = y1 - 10
        if text_y < text_h:
            text_y = min(y1 + text_h + 4, annotated.shape[0] - 2)
        cv2.putText(annotated, label, (x1, text_y), font, scale, color, thickness)

    # Wide, short figure to match the sentence-strip aspect ratio.
    fig, ax = plt.subplots(figsize=(15, 2.5))
    ax.imshow(annotated)
    ax.set_title("Test Image with Predictions")
    ax.axis("off")
    plt.show()

# Test the model on the single image
test_model_on_image(TEST_IMG_PATH)

print("✅ Done: Testing complete, results displayed!")



0: 64x640 1 Glyph_0, 1 Glyph_3, 1 Glyph_5, 1 Glyph_7, 1 Glyph_9, 50.8ms
Speed: 1.3ms preprocess, 50.8ms inference, 1.1ms postprocess per image at shape (1, 3, 64, 640)
