<a href="https://colab.research.google.com/github/rbflakt/DL_project/blob/main/DL_project_YOLO%2BCLIP%2BBLIP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## test 데이터 description 작성

In [None]:
import os

# Path configuration
label_dir = "/content/drive/MyDrive/gdy/강다영_yolo버전"
description_file_path = "/content/drive/MyDrive/gdy/강다영_yolo버전/captions_only.txt"
output_dir = "/content/drive/MyDrive/gdy/강다영_yolo버전/output"

# Create the output folder when it does not exist yet
os.makedirs(output_dir, exist_ok=True)

# Load every non-blank caption, stripped of surrounding whitespace
with open(description_file_path, "r", encoding="utf-8") as desc_file:
    descriptions = []
    for raw_caption in desc_file:
        cleaned_caption = raw_caption.strip()
        if cleaned_caption:
            descriptions.append(cleaned_caption)

desc_index = 0  # position of the next caption that has not been used yet

# Process the label files image1.txt .. image14.txt
for image_number in range(1, 15):
    input_file = os.path.join(label_dir, f"image{image_number}.txt")
    output_file = os.path.join(output_dir, f"output_image{image_number}.txt")

    with open(input_file, "r", encoding="utf-8") as infile, \
            open(output_file, "w", encoding="utf-8") as outfile:
        for line in infile:
            parts = line.strip().split()

            # Blank or malformed rows (fewer than five fields) are skipped
            if len(parts) < 5:
                continue

            class_id = parts[0]

            # Class 1 rows are copied through exactly as read
            if class_id == "1":
                outfile.write(line)
                continue

            # Rows of any class other than 0 or 1 are not written at all
            if class_id != "0":
                continue

            # Class 0 rows get the next caption appended; once the captions
            # run out an empty string is appended instead
            if desc_index < len(descriptions):
                description = descriptions[desc_index]
                desc_index += 1
            else:
                description = ""

            outfile.write(" ".join(parts) + " " + description + "\n")


## ultralytics 라이브러리 설치

In [None]:
# Install the Ultralytics package into the notebook kernel's environment.
%pip install ultralytics
import ultralytics
# Run the Ultralytics environment check; the cell output below shows the
# reported package version, Python/torch versions and hardware summary.
ultralytics.checks()

Ultralytics 8.3.156 🚀 Python-3.11.13 torch-2.6.0+cu124 CUDA:0 (Tesla T4, 15095MiB)
Setup complete ✅ (2 CPUs, 12.7 GB RAM, 41.5/235.7 GB disk)


## 드라이브 연동

In [2]:
from google.colab import drive
# Mount Google Drive at /content/drive so the /content/drive/MyDrive/... paths
# used throughout this notebook resolve.
drive.mount('/content/drive')

Mounted at /content/drive


## 이미지 증강_hsv

In [None]:
import cv2
import numpy as np
import os
from PIL import Image
import shutil
from tqdm import tqdm

# ✅ 1. User input: path configuration
# 🔹 Source images (.png) and their label files (.txt)
image_folder = "/content/drive/MyDrive/gdy/train_dataset/train_image"
label_folder = "/content/drive/MyDrive/gdy/train_dataset/train_label"
# 🔹 Destinations for the augmented images and labels
save_image_folder = "/content/drive/MyDrive/gdy/images/train"
save_label_folder = "/content/drive/MyDrive/gdy/labels/train"

# Make sure both destination folders exist before anything is written
for _out_dir in (save_image_folder, save_label_folder):
    os.makedirs(_out_dir, exist_ok=True)

# ✅ 2. HSV augmentation function
def apply_hsv_augmentation(img, h_shift=0.0, s_shift=0.0, v_shift=0.0):
    """Shift the hue, saturation and value channels of an image.

    Args:
        img: Image array that is converted with ``cv2.COLOR_RGB2HSV``, i.e. it
            is expected in RGB channel order (assumed 8-bit, since the result
            is cast back to ``uint8``).
        h_shift: Hue shift as a fraction of the full hue cycle; wraps around.
        s_shift: Saturation shift as a fraction of 255; clipped to [0, 255].
        v_shift: Value (brightness) shift as a fraction of 255; clipped to
            [0, 255].

    Returns:
        The shifted image converted back to RGB as a ``uint8`` array.
    """
    hsv_img = cv2.cvtColor(img, cv2.COLOR_RGB2HSV).astype(np.float32)
    h, s, v = cv2.split(hsv_img)

    # OpenCV stores 8-bit hue as 0..179, which is 180 distinct values, so the
    # wrap-around must be modulo 180. Wrapping modulo 179 folded hue 179 onto
    # 0 even when no shift was requested. Rounding first keeps the result on
    # whole hue steps and strictly inside [0, 179] after the uint8 cast.
    h = np.mod(np.rint(h + h_shift * 180.0), 180.0)
    s = np.clip(s + s_shift * 255, 0, 255)
    v = np.clip(v + v_shift * 255, 0, 255)

    hsv_aug = cv2.merge([h, s, v]).astype(np.uint8)
    return cv2.cvtColor(hsv_aug, cv2.COLOR_HSV2RGB)

# ✅ 3. Independent augmentation parameters (one channel is varied at a time)
# NOTE(review): the 0.0 entry of every sweep is a zero shift, and hue shifts of
# -0.5 and +0.5 differ by one full hue cycle so they wrap to the same result;
# confirm these duplicates are wanted in the training set.
hue_values = [-0.5, -0.25, 0.0, 0.25, 0.5]
sat_values = [-1.0, -0.5, 0.0, 0.5, 1.0]
val_values = [-0.8, -0.4, 0.0, 0.4, 0.8]

# ✅ 4. Augment every image and save it together with a copy of its label
image_files = [f for f in os.listdir(image_folder) if f.endswith(".png")]

# (filename tag, keyword argument of apply_hsv_augmentation, shift values)
hsv_sweeps = [
    ("h", "h_shift", hue_values),
    ("s", "s_shift", sat_values),
    ("v", "v_shift", val_values),
]

for image_name in tqdm(image_files, desc="Augmenting images"):
    base_name = os.path.splitext(image_name)[0]
    image_path = os.path.join(image_folder, image_name)
    label_path = os.path.join(label_folder, base_name + ".txt")

    if not os.path.exists(label_path):
        print(f"⚠️ 라벨 파일 없음: {label_path}")
        continue

    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread returns None instead of raising when a file cannot be
        # decoded; skip it rather than crash in cvtColor below.
        print(f"⚠️ 이미지 로드 실패: {image_path}")
        continue
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # One pass per channel; colour changes do not move any box, so the label
    # file is copied unchanged next to each augmented image.
    for tag, shift_name, shift_values in hsv_sweeps:
        for i, shift in enumerate(shift_values):
            aug = apply_hsv_augmentation(img, **{shift_name: shift})
            stem = f"{base_name}_{tag}{i}"
            Image.fromarray(aug).save(os.path.join(save_image_folder, stem + ".jpg"))
            shutil.copy(label_path, os.path.join(save_label_folder, stem + ".txt"))

print("✅ 모든 이미지 및 라벨 증강 완료 (1장당 15장 생성)")


Augmenting images: 100%|██████████| 90/90 [02:20<00:00,  1.56s/it]

✅ 모든 이미지 및 라벨 증강 완료 (1장당 15장 생성)





## 이미지 증강_geo

In [None]:
import cv2
import numpy as np
import os
from PIL import Image
import shutil
from tqdm import tqdm

# 📌 Path configuration (the part a user is expected to edit)
image_folder = "/content/drive/MyDrive/gdy/train_dataset/train_image"
label_folder = "/content/drive/MyDrive/gdy/train_dataset/train_label"
save_image_folder = "/content/drive/MyDrive/gdy/images/train"
save_label_folder = "/content/drive/MyDrive/gdy/labels/train"

# Make sure both destination folders exist before anything is written
for _out_dir in (save_image_folder, save_label_folder):
    os.makedirs(_out_dir, exist_ok=True)

# ✅ Filename tag used for each augmentation type
type_map = dict(
    rotate="ro",
    translate="tr",
    scale="sc",
    shear="sh",
    perspective="pe",
)

# ✅ Geometric transform helpers
def _build_transform_matrix(aug_type, value, width, height):
    """Return the 3x3 forward (source -> output) matrix for one augmentation.

    Args:
        aug_type: One of "rotate", "translate", "scale", "shear", "perspective".
        value: Rotation/shear angle in degrees, or a fraction of the image size
            for translate/scale/perspective.
        width: Image width in pixels.
        height: Image height in pixels.

    Returns:
        A float64 3x3 matrix; affine types have (0, 0, 1) as their last row.

    Raises:
        ValueError: If ``aug_type`` is not a known augmentation type.
    """
    center = (width / 2, height / 2)

    if aug_type == "rotate":
        affine = cv2.getRotationMatrix2D(center, value, 1.0)
    elif aug_type == "translate":
        tx = int(value * width)
        ty = int(value * height)
        affine = np.float32([[1, 0, tx], [0, 1, ty]])
    elif aug_type == "scale":
        affine = cv2.getRotationMatrix2D(center, 0, 1 + value)
    elif aug_type == "shear":
        shear_factor = np.tan(np.radians(value))
        affine = np.array([[1, shear_factor, 0], [0, 1, 0]], dtype=np.float32)
    elif aug_type == "perspective":
        dx = int(value * width)
        dy = int(value * height)
        pts1 = np.float32([[0, 0], [width, 0], [0, height], [width, height]])
        pts2 = np.float32([[0 + dx, 0 + dy], [width - dx, 0 + dy],
                           [0 + dx, height - dy], [width - dx, height - dy]])
        return cv2.getPerspectiveTransform(pts1, pts2).astype(np.float64)
    else:
        raise ValueError("Unknown augmentation type")

    return np.vstack([affine, [0.0, 0.0, 1.0]]).astype(np.float64)


def apply_geometric_augmentation(img, aug_type, value):
    """Warp ``img`` with the requested geometric augmentation.

    The output keeps the input's width and height; areas that fall outside the
    source are filled by reflecting the image border.

    Args:
        img: Image array with shape (height, width, ...).
        aug_type: One of "rotate", "translate", "scale", "shear", "perspective".
        value: Augmentation strength, interpreted per type.

    Returns:
        The warped image.

    Raises:
        ValueError: If ``aug_type`` is not a known augmentation type.
    """
    height, width = img.shape[:2]
    matrix = _build_transform_matrix(aug_type, value, width, height)

    if aug_type == "perspective":
        return cv2.warpPerspective(img, matrix, (width, height), borderMode=cv2.BORDER_REFLECT)
    return cv2.warpAffine(img, np.ascontiguousarray(matrix[:2]), (width, height),
                          flags=cv2.INTER_LINEAR, borderMode=cv2.BORDER_REFLECT)


def _transform_yolo_labels(label_rows, matrix, width, height, min_visible=0.2):
    """Map YOLO box rows through ``matrix`` so they follow the warped image.

    Assumes each row is ``class cx cy w h`` with coordinates normalised to the
    image size (TODO confirm against the label files); any further fields on a
    row are passed through unchanged. Each box's four corners are transformed
    and replaced by their axis-aligned bounding box clipped to the image.

    Args:
        label_rows: Iterable of label lines (without trailing newlines).
        matrix: 3x3 forward transform, as used to warp the image.
        width: Image width in pixels.
        height: Image height in pixels.
        min_visible: Boxes keeping less than this fraction of their transformed
            area inside the image are dropped (tunable).

    Returns:
        List of transformed label lines; rows with fewer than five fields or
        non-numeric coordinates are skipped.
    """
    kept = []
    for row in label_rows:
        fields = row.split()
        if len(fields) < 5:
            continue
        try:
            cx, cy, bw, bh = (float(tok) for tok in fields[1:5])
        except ValueError:
            continue

        left, right = (cx - bw / 2) * width, (cx + bw / 2) * width
        top, bottom = (cy - bh / 2) * height, (cy + bh / 2) * height
        corners = np.array(
            [[left, top, 1.0], [right, top, 1.0], [right, bottom, 1.0], [left, bottom, 1.0]],
            dtype=np.float64,
        )
        mapped = corners @ matrix.T
        mapped = mapped[:, :2] / mapped[:, 2:3]  # perspective divide

        min_x, min_y = mapped.min(axis=0)
        max_x, max_y = mapped.max(axis=0)
        full_area = (max_x - min_x) * (max_y - min_y)

        vis_left, vis_right = max(min_x, 0.0), min(max_x, float(width))
        vis_top, vis_bottom = max(min_y, 0.0), min(max_y, float(height))
        vis_w, vis_h = vis_right - vis_left, vis_bottom - vis_top
        if vis_w <= 0 or vis_h <= 0:
            continue  # box was moved completely out of the frame
        if vis_w * vis_h < min_visible * full_area:
            continue  # only a sliver of the box is still visible

        new_box = (
            (vis_left + vis_right) / 2 / width,
            (vis_top + vis_bottom) / 2 / height,
            vis_w / width,
            vis_h / height,
        )
        kept.append(" ".join([fields[0], *(f"{v:.6f}" for v in new_box), *fields[5:]]))
    return kept


# ✅ Augmentation settings (index 2 of every list is the identity transform)
augmentation_params = {
    "rotate": [-180, -90, 0, 90, 180],
    "translate": [-0.5, -0.25, 0, 0.25, 0.5],
    "scale": [-0.5, -0.25, 0, 0.25, 0.5],
    "shear": [-10, -5, 0, 5, 10],
    "perspective": [-0.001, -0.0005, 0, 0.0005, 0.001]
}

# ✅ Image list
image_files = sorted([f for f in os.listdir(image_folder) if f.endswith(".png")])

# ✅ Run the augmentation
for idx, image_name in enumerate(tqdm(image_files, desc="Geometric augmentation")):
    base_number = f"{idx:04d}"  # zero-based, four-digit index used in the output names
    image_path = os.path.join(image_folder, image_name)
    label_path = os.path.join(label_folder, os.path.splitext(image_name)[0] + ".txt")

    if not os.path.exists(label_path):
        print(f"⚠️ 라벨 없음: {label_path}")
        continue

    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread returns None instead of raising when a file cannot be
        # decoded; skip it rather than crash in cvtColor below.
        print(f"⚠️ 이미지 로드 실패: {image_path}")
        continue
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    height, width = img.shape[:2]

    with open(label_path, "r", encoding="utf-8") as label_in:
        label_rows = label_in.read().splitlines()

    for aug_type, values in augmentation_params.items():
        prefix = type_map[aug_type]
        for i, val in enumerate(values):
            aug_img = apply_geometric_augmentation(img, aug_type, val)
            filename = f"img_{base_number}_{prefix}{i}"
            img_path = os.path.join(save_image_folder, filename + ".jpg")
            lbl_path = os.path.join(save_label_folder, filename + ".txt")

            Image.fromarray(aug_img).save(img_path)

            # The boxes must go through the same transform as the pixels;
            # copying the source label file verbatim left every box at its
            # pre-warp position.
            # NOTE(review): BORDER_REFLECT can mirror icons into the padded
            # area and those mirrored copies get no label; consider a constant
            # border if that hurts training.
            matrix = _build_transform_matrix(aug_type, val, width, height)
            new_rows = _transform_yolo_labels(label_rows, matrix, width, height)
            with open(lbl_path, "w", encoding="utf-8") as label_out:
                label_out.writelines(row + "\n" for row in new_rows)

print("✅ 모든 증강 이미지 및 라벨 파일 저장 완료 (파일명 숫자 0부터 시작)")


Geometric augmentation: 100%|██████████| 90/90 [02:59<00:00,  1.99s/it]

✅ 모든 증강 이미지 및 라벨 파일 저장 완료 (파일명 숫자 0부터 시작)





## yaml 생성

In [11]:
project_path = '/content/drive/MyDrive'

# Dataset description consumed by the YOLO command line: dataset root, the
# three split folders, and a single class named "icon".
# NOTE(review): the validation log later in this notebook reports
# "Label class 1 exceeds dataset class count 1" for many test labels, so the
# label files appear to contain a second class that `nc: 1` does not cover;
# confirm the intended class list.
data_yaml = """
path: /content/drive/MyDrive/gdy
train: images/train
val: images/val
test: images/test

nc: 1
names: ["icon"]

"""

# Write the config next to the project folder on Drive
yaml_path = project_path + "/data.yaml"
with open(yaml_path, "w") as yaml_file:
    yaml_file.write(data_yaml)

## yolo 검증 (test split)

In [14]:
# Make sure the Ultralytics CLI is installed in this runtime.
!pip install ultralytics
# Run YOLO detection in validation mode (mode=val) with the weights at
# gdy/best.pt, evaluating the `test` split declared in data.yaml.
!yolo task=detect mode=val \
      model="/content/drive/MyDrive/gdy/best.pt" \
      data="/content/drive/MyDrive/data.yaml" \
      split=test

Ultralytics 8.3.158 🚀 Python-3.11.13 torch-2.6.0+cu124 CUDA:0 (Tesla T4, 15095MiB)
Model summary (fused): 72 layers, 3,005,843 parameters, 0 gradients, 8.1 GFLOPs
[34m[1mval: [0mFast image access ✅ (ping: 0.3±0.1 ms, read: 131.2±144.0 MB/s, size: 1136.3 KB)
[34m[1mval: [0mScanning /content/drive/MyDrive/gdy/labels/test.cache... 118 images, 7 backgrounds, 99 corrupt: 100% 125/125 [00:00<?, ?it/s]
[34m[1mval: [0m/content/drive/MyDrive/gdy/images/test/Screenshot 2025-06-09 at 4.55.53 PM.png: ignoring corrupt image/label: Label class 1 exceeds dataset class count 1. Possible class labels are 0-0
[34m[1mval: [0m/content/drive/MyDrive/gdy/images/test/Screenshot 2025-06-10 at 1.40.42 PM.png: ignoring corrupt image/label: Label class 1 exceeds dataset class count 1. Possible class labels are 0-0
[34m[1mval: [0m/content/drive/MyDrive/gdy/images/test/Screenshot 2025-06-10 at 1.41.30 PM.png: ignoring corrupt image/label: Label class 1 exceeds dataset class count 1. Possible class l

## BLIP+CLIP

In [3]:
import json
import os
from PIL import Image
from collections import defaultdict
from torchvision import transforms

import torch
from transformers import CLIPProcessor, CLIPModel
from transformers import BlipProcessor, BlipForConditionalGeneration

# Path configuration
json_path = "/content/drive/MyDrive/gdy/instances_converted_fin.json"
image_folder = "/content/drive/MyDrive/gdy/images/test"

# Device selection: run on the GPU when one is available
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load CLIP (model on the selected device, plus its processor)
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_model = clip_model.to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Load BLIP (model on the selected device, plus its processor)
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = blip_model.to(device)
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")

# Candidate descriptions that CLIP chooses between
candidate_texts = [
    "WhatsApp logo",
    "Instagram icon",
    "Phone symbol",
    "Chat application logo",
    "Social media icon",
    "Facebook logo",
    "Twitter logo",
    "LinkedIn logo",
    "YouTube logo",
    "Adobe Premiere logo",
    "Adobe Illustrator logo",
    "Adobe Photoshop logo",
    "American flag",
    "British flag",
    "German flag",
    "Chinese flag",
    "Japanese flag"
]

# CLIP answers below this confidence are replaced by a BLIP caption
CONFIDENCE_THRESHOLD = 0.85

# Read the COCO-style annotation file
with open(json_path, "r") as coco_file:
    coco = json.load(coco_file)

# image id -> file name
image_id_to_filename = {}
for image_entry in coco['images']:
    image_id_to_filename[image_entry['id']] = image_entry['file_name']

# image id -> list of boxes; each [x, y, w, h] bbox is stored as
# [x1, y1, x2, y2] together with its category id
annotations_by_image = defaultdict(list)
for ann in coco['annotations']:
    left, top, box_w, box_h = ann['bbox']
    annotations_by_image[ann['image_id']].append({
        "bbox": [left, top, left + box_w, top + box_h],
        "category_id": ann['category_id'],
    })

# CLIP prediction function
def generate_clip_caption(crop_img):
    """Pick the candidate text that CLIP scores highest for ``crop_img``.

    Args:
        crop_img: The cropped image to label (passed to ``clip_processor`` as
            ``images``).

    Returns:
        A ``(caption, confidence)`` tuple: the best entry of
        ``candidate_texts`` and its softmax probability over all candidates.
    """
    inputs = clip_processor(text=candidate_texts, images=crop_img, return_tensors="pt", padding=True).to(device)

    # Inference only: without no_grad every call would record an autograd
    # graph for the forward pass, wasting memory for each crop.
    with torch.no_grad():
        outputs = clip_model(**inputs)

    # One image per call, so row 0 holds the probabilities for every candidate
    probs = outputs.logits_per_image.softmax(dim=1)[0]
    best_idx = probs.argmax().item()
    caption = candidate_texts[best_idx]
    confidence = probs[best_idx].item()
    return caption, confidence

# BLIP caption generation
def generate_blip_caption(crop_img):
    """Generate a free-form caption for ``crop_img`` with BLIP.

    Uses beam search (5 beams, early stopping) with a maximum length of 20, a
    repetition penalty of 2.0 and a length penalty of 1.0, then decodes the
    first returned sequence without special tokens.
    """
    generation_options = dict(
        max_length=20,
        num_beams=5,
        repetition_penalty=2.0,
        length_penalty=1.0,
        early_stopping=True,
    )
    model_inputs = blip_processor(images=crop_img, return_tensors="pt").to(device)
    token_ids = blip_model.generate(**model_inputs, **generation_options)
    return blip_processor.decode(token_ids[0], skip_special_tokens=True)

# Walk every annotated image and caption its icon boxes
category_id_to_name = {cat["id"]: cat["name"] for cat in coco["categories"]}

collected_captions = []

for image_id, bbox_list in annotations_by_image.items():
    filename = image_id_to_filename[image_id]
    image_path = os.path.join(image_folder, filename)

    if not os.path.exists(image_path):
        print(f"[경고] 이미지 없음: {filename}")
        continue

    # convert() returns a fully loaded copy, so the file can be closed at once
    with Image.open(image_path) as opened_image:
        img = opened_image.convert("RGB")

    print(f"\n===== {filename} =====")
    for idx, box_data in enumerate(bbox_list):
        x1, y1, x2, y2 = map(int, box_data['bbox'])
        category_id = box_data['category_id']
        category_name = category_id_to_name.get(category_id, "")

        # Only boxes of category "icon" are captioned
        if category_name != "icon":
            print(f"[{idx+1}] 카테고리 '{category_name}' → 캡션 생략")
            continue

        # The plain crop is what gets captioned. A random flip/jitter/rotation
        # pipeline used to be built and run here for every box, but its result
        # was never passed to either model, so it was pure wasted work.
        crop = img.crop((x1, y1, x2, y2))

        clip_caption, clip_conf = generate_clip_caption(crop)

        # Fall back to BLIP when CLIP is not confident enough
        if clip_conf < CONFIDENCE_THRESHOLD:
            caption = generate_blip_caption(crop)
            source = "BLIP"
        else:
            caption = clip_caption
            source = f"CLIP ({clip_conf:.2f})"

        print(f"[{idx+1}] Cat: {category_name} | Box: ({x1}, {y1}, {x2}, {y2})")
        print(f" → Caption: {caption} [{source}]")
        collected_captions.append(caption)

# Save one caption per line, in the order the boxes were processed
save_path = "/content/drive/MyDrive/gdy/captions_only_fin.txt"
with open(save_path, "w", encoding="utf-8") as f:
    for cap in collected_captions:
        f.write(cap + "\n")

print(f"\n✅ 캡션만 저장 완료: {save_path}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
[16] Cat: icon | Box: (1924, 334, 2092, 489)
 → Caption: Twitter logo [CLIP (0.98)]
[17] Cat: icon | Box: (2236, 334, 2409, 502)
 → Caption: Instagram icon [CLIP (0.90)]
[18] Cat: icon | Box: (2447, 334, 2609, 497)
 → Caption: a blue circle with an arrow in the middle [BLIP]
[19] Cat: icon | Box: (2640, 333, 2816, 479)
 → Caption: a blue airplane icon on a white background [BLIP]
[20] Cat: icon | Box: (2640, 517, 2806, 687)
 → Caption: Chat application logo [CLIP (0.92)]
[21] Cat: icon | Box: (2435, 517, 2606, 687)
 → Caption: LinkedIn logo [CLIP (1.00)]
[22] Cat: icon | Box: (2229, 527, 2400, 687)
 → Caption: WhatsApp logo [CLIP (0.97)]
[23] Cat: icon | Box: (501, 514, 672, 690)
 → Caption: Chinese flag [CLIP (0.96)]
[24] Cat: icon | Box: (304, 514, 464, 698)
 → Caption: the flag of spain [BLIP]
[25] Cat: icon | Box: (88, 511, 274, 701)
 → Caption: German flag [CLIP (1.00)]
[26] Cat: icon | Box: (104, 818, 269, 984)
 → Caption: Adobe P