# CLIP score


In [None]:
# !pip install torch torchvision transformers pillow
# !pip install clip-score
# !pip install git+https://github.com/openai/CLIP.git

# ver1

In [6]:
import torch
import clip
from PIL import Image
from transformers import pipeline

# Load the CLIP model (ViT-B/32) and its matching image preprocessing callable.
# The model is placed on the GPU when CUDA is available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

# Load the summarization pipeline (facebook/bart-large-cnn) used to shorten the prompts.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Image path and prompt definitions.
# NOTE(review): absolute Google Drive path — presumably requires Drive to be mounted
# at /content/drive (Colab); confirm before running elsewhere.
image_path = "/content/drive/MyDrive/[빅컴]프로젝트/vlm-main/Data/KFashion_Image/Image_Data/1.jpg"
# "Positive" prompt: image-generation instructions plus the outfit caption.
positive_prompt = {
    "Prompt": {
        "persona": "You are a fashion designer targeting Korean women in their 20s and 30s.",
        "task_description": "Generate a full-body outfit image based on Input and Add_Info.",
        "constraint": "Exclude the face, and the image must be in photo format."
    },
    "Input": {
        "caption": "Street style: Long beige jumper, ankle-length rolled-up blue jeans, white loose T-shirt."
    },
    "Add_Info": "Occasion: Picnic, Season: Spring, Shoes: Mules."
}

# "Negative" prompt: same Input caption and Add_Info as above; only the
# persona / task / constraint section differs.
negative_prompt = {
    "Prompt": {
        "persona": "You are a casual fashion critic reviewing outfits.",
        "task_description": "Criticize the given outfit based on Input and Add_Info.",
        "constraint": "Provide critical feedback and avoid compliments."
    },
    "Input": {
        "caption": "Street style: Long beige jumper, ankle-length rolled-up blue jeans, white loose T-shirt."
    },
    "Add_Info": "Occasion: Picnic, Season: Spring, Shoes: Mules."
}

# Text summarization helper
def summarize_text(prompt, summarizer, max_length=50, min_length=25):
    """Flatten a structured prompt dict into text and return its summary.

    Args:
        prompt: Dict with the keys read below: ``prompt['Prompt']`` holding
            ``'persona'``, ``'task_description'`` and ``'constraint'``;
            ``prompt['Input']['caption']``; and ``prompt['Add_Info']``.
        summarizer: Callable invoked as
            ``summarizer(text, max_length=..., min_length=..., do_sample=False)``
            whose result is indexable and whose first item has a
            ``'summary_text'`` entry.
        max_length: Forwarded to the summarizer as ``max_length``.
        min_length: Requested ``min_length``; defaults to 25, the value that
            was previously hard-coded. It is capped at ``max_length``.

    Returns:
        The ``'summary_text'`` of the first summarizer result.

    Raises:
        KeyError: If ``prompt`` lacks one of the keys listed above.
    """
    # Convert the dictionary into a single labelled, newline-separated string.
    prompt_text = (
        f"Persona: {prompt['Prompt']['persona']}\n"
        f"Task: {prompt['Prompt']['task_description']}\n"
        f"Constraint: {prompt['Prompt']['constraint']}\n"
        f"Input Caption: {prompt['Input']['caption']}\n"
        f"Additional Info: {prompt['Add_Info']}"
    )
    # A minimum above the maximum is an infeasible request, so a caller that
    # lowers max_length below the default minimum gets the minimum lowered too.
    effective_min_length = min(min_length, max_length)
    # Call the summarizer (deterministic decoding: do_sample=False).
    summary = summarizer(
        prompt_text,
        max_length=max_length,
        min_length=effective_min_length,
        do_sample=False,
    )
    return summary[0]['summary_text']

# Summarize both prompts so the text handed to CLIP is short.
# NOTE(review): the outputs recorded for this cell show summaries made of the
# persona/task/constraint sentences with no outfit caption, so the scores
# compare the image with instructions rather than with the outfit description.
# Consider scoring against prompt["Input"]["caption"] directly.
positive_prompt_summary = summarize_text(positive_prompt, summarizer, max_length=50)
negative_prompt_summary = summarize_text(negative_prompt, summarizer, max_length=50)

print(f"Summarized Positive Prompt: {positive_prompt_summary}")
print(f"Summarized Negative Prompt: {negative_prompt_summary}")

# Tokenize the text prompts.
# truncate=True cuts over-long text to CLIP's context length instead of
# raising, so an unexpectedly long summary cannot abort the cell.
text_prompts = [positive_prompt_summary, negative_prompt_summary]
text_tokens = clip.tokenize(text_prompts, truncate=True).to(device)

# Load and preprocess the image; the context manager releases the file handle
# as soon as the pixel data has been turned into a tensor.
with Image.open(image_path) as pil_image:
    image = preprocess(pil_image).unsqueeze(0).to(device)

# Compute image/text similarity with CLIP (no gradients needed).
with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text_tokens)

    # Cosine similarity: L2-normalize both sides, then take dot products.
    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)
    # One image against two texts -> a length-2 vector after squeeze(0).
    similarity = (image_features @ text_features.T).squeeze(0).cpu().numpy()

# Report the results.
print(f"Positive Prompt Similarity: {similarity[0]:.4f}")
print(f"Negative Prompt Similarity: {similarity[1]:.4f}")


Summarized Positive Prompt: Fashion designer targeting Korean women in their 20s and 30s. Generate a full-body outfit image based on Input and Add_Info. Exclude the face, and the image must be in photo format.
Summarized Negative Prompt: Criticize the given outfit based on Input and Add_Info. Aim is to provide critical feedback and avoid compliments.
Positive Prompt Similarity: 0.3010
Negative Prompt Similarity: 0.2422


In [8]:
import torch
import clip
from PIL import Image
from transformers import pipeline

# Load the CLIP model (ViT-B/32) and its matching image preprocessing callable.
# The model is placed on the GPU when CUDA is available, otherwise on the CPU.
# NOTE(review): this repeats the model/summarizer loading already done in the
# "ver1" cell earlier in the notebook.
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

# Load the summarization pipeline (facebook/bart-large-cnn) used to shorten the prompt.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Image path and prompt definition.
# NOTE(review): absolute Google Drive path — presumably requires Drive to be mounted
# at /content/drive (Colab); confirm before running elsewhere.
image_path = "/content/drive/MyDrive/[빅컴]프로젝트/vlm-main/Data/KFashion_Image/Image_Data/1.jpg"
# Single prompt whose caption is a string of attribute lists (style, outer,
# bottom, top) rather than a free-text sentence.
prompt = {
    "Prompt": {
        "persona": "You are a fashion designer for women in their 20s-30s.",
        "task_description": "Generate a dressed woman based on Input.",
        "constraint": "Show a full-body photo, excluding the face."
    },
    "Input": {
        "caption": "Style: ['Modern'], Outer: ['Cardigan'], Bottom: {'length': ['Maxi'], 'color': ['Black'], 'print': ['Leopard'], 'fit': ['Oversized']}, Top: {'color': ['Black'], 'category': ['Knitwear'], 'material': ['Wool/Cashmere'], 'fit': ['Oversized']}"
    },
    "Add_Info": "Occasion: ['Autumn evening'], Shoes: ['Long boots']"
}

# Text summarization helper
def summarize_text(prompt, summarizer, max_length=50, min_length=25):
    """Flatten a structured prompt dict into text and return its summary.

    Args:
        prompt: Dict with the keys read below: ``prompt['Prompt']`` holding
            ``'persona'``, ``'task_description'`` and ``'constraint'``;
            ``prompt['Input']['caption']``; and ``prompt['Add_Info']``.
        summarizer: Callable invoked as
            ``summarizer(text, max_length=..., min_length=..., do_sample=False)``
            whose result is indexable and whose first item has a
            ``'summary_text'`` entry.
        max_length: Forwarded to the summarizer as ``max_length``.
        min_length: Requested ``min_length``; defaults to 25, the value that
            was previously hard-coded. It is capped at ``max_length``.

    Returns:
        The ``'summary_text'`` of the first summarizer result.

    Raises:
        KeyError: If ``prompt`` lacks one of the keys listed above.
    """
    # Convert the dictionary into a single labelled, newline-separated string.
    prompt_text = (
        f"Persona: {prompt['Prompt']['persona']}\n"
        f"Task: {prompt['Prompt']['task_description']}\n"
        f"Constraint: {prompt['Prompt']['constraint']}\n"
        f"Input Caption: {prompt['Input']['caption']}\n"
        f"Additional Info: {prompt['Add_Info']}"
    )
    # A minimum above the maximum is an infeasible request, so a caller that
    # lowers max_length below the default minimum gets the minimum lowered too.
    effective_min_length = min(min_length, max_length)
    # Call the summarizer (deterministic decoding: do_sample=False).
    summary = summarizer(
        prompt_text,
        max_length=max_length,
        min_length=effective_min_length,
        do_sample=False,
    )
    return summary[0]['summary_text']

# Summarize the prompt so the text handed to CLIP is short.
# NOTE(review): the output recorded for this cell shows a summary built from
# the persona/constraint sentences with no garment attributes, so the score
# compares the image with instructions rather than with the outfit description.
# Consider scoring against prompt["Input"]["caption"] directly.
prompt_summary = summarize_text(prompt, summarizer, max_length=50)

print(f"Summarized Prompt: {prompt_summary}")

# Tokenize the text prompt.
# truncate=True cuts over-long text to CLIP's context length instead of
# raising, so an unexpectedly long summary cannot abort the cell.
text_tokens = clip.tokenize([prompt_summary], truncate=True).to(device)

# Load and preprocess the image; the context manager releases the file handle
# as soon as the pixel data has been turned into a tensor.
with Image.open(image_path) as pil_image:
    image = preprocess(pil_image).unsqueeze(0).to(device)

# Compute image/text similarity with CLIP (no gradients needed).
with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text_tokens)

    # Cosine similarity: L2-normalize both sides, then take dot products.
    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)
    # One image against one text -> a length-1 vector after squeeze(0).
    similarity = (image_features @ text_features.T).squeeze(0).cpu().numpy()

# Report the result.
print(f"Prompt Similarity: {similarity[0]:.4f}")


Summarized Prompt: Fashion designer for women in their 20s-30s needs to show a full-body photo, excluding the face. The photo must be of a woman in her 20s or 30s.
Prompt Similarity: 0.2822


# KID

In [None]:
import torch
import torchvision.transforms as transforms
from torchvision.models import inception_v3
from sklearn.metrics.pairwise import polynomial_kernel
import numpy as np
from PIL import Image

# Build the Inception-v3 feature extractor
def load_inception_model(device):
    """Return a pretrained Inception-v3 in eval mode with its classifier removed.

    The final fully connected layer is replaced by ``torch.nn.Identity``, so a
    forward pass yields whatever was fed into that layer instead of class
    logits.

    Args:
        device: Target passed to ``Module.to`` (e.g. ``"cpu"`` or ``"cuda"``).

    Returns:
        The network, switched to evaluation mode and moved to ``device``.
    """
    extractor = inception_v3(pretrained=True, transform_input=False)
    # Drop the classification head: Identity passes its input straight through.
    extractor.fc = torch.nn.Identity()
    # eval() and to() both return the module itself, so they can be chained.
    return extractor.eval().to(device)

# Image preprocessing
def preprocess_image(image_path, image_size=299):
    """Load an image file and convert it into a normalized one-image batch.

    The image is converted to RGB, resized to ``image_size`` x ``image_size``,
    turned into a tensor, normalized per channel with the mean/std constants
    below, and given a leading batch dimension.

    Args:
        image_path: Path (or file object) accepted by ``PIL.Image.open``.
        image_size: Side length, in pixels, of the square the image is resized to.

    Returns:
        Tensor of shape ``(1, 3, image_size, image_size)``.
    """
    transform = transforms.Compose([
        transforms.Resize((image_size, image_size)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    # Open through a context manager so the file handle is closed right away;
    # convert() hands back an in-memory image that does not need the file.
    with Image.open(image_path) as source:
        rgb_image = source.convert("RGB")
    return transform(rgb_image).unsqueeze(0)

# Run the network on an image batch and return its features
def extract_features(model, image, device="cpu"):
    """Run ``model`` on ``image`` without gradients and return a NumPy array.

    Args:
        model: Callable applied to the image tensor.
        image: Tensor that is moved to ``device`` before the forward pass.
        device: Device on which the forward pass runs.

    Returns:
        The model output copied to the CPU as a ``numpy.ndarray``.
    """
    # Everything stays inside no_grad so no autograd graph is recorded.
    with torch.no_grad():
        activations = model(image.to(device))
        as_array = activations.cpu().numpy()
        return as_array

# KID computation (usable with a single image per side)
def calculate_kid_single(real_features, fake_features, unbiased=False):
    """Squared MMD between two feature sets under a cubic polynomial kernel.

    The kernel is ``polynomial_kernel(..., degree=3, gamma=None, coef0=1)``.

    By default this returns the *biased* estimate: every kernel matrix is
    averaged over all of its entries, including the diagonal. That is the only
    form defined when each side holds a single sample, where it reduces to
    ``k(x, x) + k(y, y) - 2 * k(x, y)``. The value usually reported as KID is
    the unbiased estimate, which leaves the diagonal terms out; request it
    with ``unbiased=True`` when both sides hold at least two samples.

    Args:
        real_features: 2-D array of real-image features, one row per image.
        fake_features: 2-D array of generated-image features, one row per image.
        unbiased: When True, exclude the diagonal of the within-set kernel
            matrices and normalize by ``m * (m - 1)`` / ``n * (n - 1)``.

    Returns:
        The squared-MMD estimate as a scalar.

    Raises:
        ValueError: If ``unbiased`` is True and either set has fewer than two rows.
    """
    real_kernel = polynomial_kernel(real_features, real_features, degree=3, gamma=None, coef0=1)
    fake_kernel = polynomial_kernel(fake_features, fake_features, degree=3, gamma=None, coef0=1)
    mixed_kernel = polynomial_kernel(real_features, fake_features, degree=3, gamma=None, coef0=1)

    m = real_kernel.shape[0]
    n = fake_kernel.shape[0]

    if not unbiased:
        # Biased (V-statistic) form: plain means over full kernel matrices.
        kid = real_kernel.sum() / (m * m) + fake_kernel.sum() / (n * n) - 2 * mixed_kernel.sum() / (m * n)
        return kid

    if m < 2 or n < 2:
        raise ValueError(
            "unbiased estimate needs at least two samples per set "
            f"(got {m} real and {n} fake)"
        )

    # Unbiased (U-statistic) form: drop each sample's similarity with itself.
    real_diagonal = sum(real_kernel[i, i] for i in range(m))
    fake_diagonal = sum(fake_kernel[j, j] for j in range(n))
    real_term = (real_kernel.sum() - real_diagonal) / (m * (m - 1))
    fake_term = (fake_kernel.sum() - fake_diagonal) / (n * (n - 1))
    return real_term + fake_term - 2 * mixed_kernel.sum() / (m * n)

# Compute KID between one real image and one generated image
def compute_kid_single(real_image_path, fake_image_path, device="cpu", model=None):
    """Compute the kernel distance between a real and a generated image file.

    Both images go through ``preprocess_image`` and ``extract_features``; the
    two feature arrays are then compared with ``calculate_kid_single``.

    Args:
        real_image_path: Path of the real (reference) image.
        fake_image_path: Path of the generated image.
        device: Device used for the feature-extraction forward passes.
        model: Optional already-loaded feature extractor. When omitted, one is
            created with ``load_inception_model(device)``. Pass a shared
            instance when scoring many pairs so the network is not rebuilt on
            every call.

    Returns:
        The value returned by ``calculate_kid_single`` for the two images.
    """
    if model is None:
        model = load_inception_model(device)

    # extract_features moves each tensor to `device` itself, so no explicit
    # transfer is needed here.
    real_features = extract_features(model, preprocess_image(real_image_path), device=device)
    fake_features = extract_features(model, preprocess_image(fake_image_path), device=device)

    return calculate_kid_single(real_features, fake_features)

# Example: paths of the real (label) image and of the generated image.
# NOTE(review): absolute Google Drive paths — presumably require Drive to be
# mounted at /content/drive (Colab); confirm before running elsewhere.
real_image_path = "/content/drive/MyDrive/[빅컴]프로젝트/vlm-main/Generate_Image_Data/Label_Image/11739.jpg"
fake_image_path = "/content/drive/MyDrive/[빅컴]프로젝트/vlm-main/Generate_Image_Data/Generate_Image/dalle3_11739.jpg"

# Compute the KID for this single real/generated pair, on the GPU when available.
device = "cuda" if torch.cuda.is_available() else "cpu"
kid_value = compute_kid_single(real_image_path, fake_image_path, device=device)
print(f"KID Value: {kid_value:.6f}")


Downloading: "https://download.pytorch.org/models/inception_v3_google-0cc3c7bd.pth" to /root/.cache/torch/hub/checkpoints/inception_v3_google-0cc3c7bd.pth
100%|██████████| 104M/104M [00:01<00:00, 97.2MB/s]


KID Value: 0.695170
