In [None]:
!pip install ftfy regex tqdm
!pip install git+https://github.com/openai/CLIP.git

中文CLIP huggingface demo

In [None]:
from PIL import Image
import requests
import clip
import torch
from transformers import BertForSequenceClassification, BertConfig, BertTokenizer
from transformers import CLIPProcessor, CLIPModel
import numpy as np

device = "cuda" if torch.cuda.is_available() else "cpu"

# Candidate captions scored against the image; replace with any texts.
query_texts = ["一只猫", "一只狗",'两只猫', '两只老虎','一只老虎']
# Load the Taiyi Chinese text encoder and tokenize the captions.
text_tokenizer = BertTokenizer.from_pretrained("IDEA-CCNL/Taiyi-CLIP-Roberta-large-326M-Chinese")
text_encoder = BertForSequenceClassification.from_pretrained("IDEA-CCNL/Taiyi-CLIP-Roberta-large-326M-Chinese").eval().to(device)
# Only the input_ids are kept; no attention_mask is forwarded to the encoder.
text = text_tokenizer(query_texts, return_tensors='pt', padding=True)['input_ids'].to(device)

url = "https://hbimg.huaban.com/e637198ad1a5a0b4347d1a21abdd4a6118bd5accb4a23-etvyBB_fw658"  # any image URL can be substituted here
image_path = "/content/drive/MyDrive/picture_data/宠物猫_2.png"
# Load the CLIP model (used here for its image encoder) and its preprocessor.
clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14", device=device)
# Alternative: score the remote image at `url` instead of the local file.
# image = processor(images=Image.open(requests.get(url, stream=True).raw), return_tensors="pt")
# The context manager releases the file handle once the pixels have been read.
with Image.open(image_path) as pil_image:
    image = processor(images=pil_image, return_tensors="pt").to(device)


with torch.no_grad():
    image_features = clip_model.get_image_features(**image)
    text_features = text_encoder(text).logits
    # L2-normalise both embeddings so their dot product is a cosine similarity.
    image_features = image_features / image_features.norm(dim=1, keepdim=True)
    text_features = text_features / text_features.norm(dim=1, keepdim=True)
    # logit_scale is CLIP's learned scale factor applied before the softmax.
    logit_scale = clip_model.logit_scale.exp()
    logits_per_image = logit_scale * image_features @ text_features.t()
    # Raw (unscaled) cosine similarity between the image and every caption.
    similarity = image_features @ text_features.T
    print(similarity)
    logits_per_text = logits_per_image.t()
    # Softmax over the captions turns the scaled similarities into probabilities.
    probs = logits_per_image.softmax(dim=-1).cpu().numpy()
    print(np.around(probs, 3))

tensor([[ 0.1047,  0.0269,  0.0761, -0.0248,  0.0007]], device='cuda:0')
[[0.946 0.    0.054 0.    0.   ]]


计算类别概率/相似度

In [None]:
def predict(image, text, attention_mask=None):
    """Return softmax probabilities over the candidate texts for the given image.

    Relies on the notebook globals ``clip_model`` (image encoder and logit
    scale) and ``text_encoder`` (text encoder whose ``.logits`` are used as the
    text embedding).

    Args:
        image: Preprocessed image inputs, unpacked as keyword arguments into
            ``clip_model.get_image_features``.
        text: Token ids passed positionally to ``text_encoder``.
        attention_mask: Optional mask forwarded to ``text_encoder``. The default
            ``None`` keeps the previous behaviour of sending only the token ids;
            pass the tokenizer's mask when ``text`` contains padding.

    Returns:
        A NumPy array holding, for each image row, the softmax over the texts of
        the scaled cosine similarities.
    """
    with torch.no_grad():
        image_features = clip_model.get_image_features(**image)
        text_features = text_encoder(text, attention_mask=attention_mask).logits
        # L2-normalise so the matrix product below yields cosine similarities.
        image_features = image_features / image_features.norm(dim=1, keepdim=True)
        text_features = text_features / text_features.norm(dim=1, keepdim=True)
        # Apply CLIP's learned scale factor before the softmax.
        logit_scale = clip_model.logit_scale.exp()
        logits_per_image = logit_scale * image_features @ text_features.t()
        probs = logits_per_image.softmax(dim=-1).cpu().numpy()
        return probs

def cal_similarity(image, text, attention_mask=None):
    """Return the cosine similarity between the image and each candidate text.

    Relies on the notebook globals ``clip_model`` (image encoder) and
    ``text_encoder`` (text encoder whose ``.logits`` are used as the text
    embedding).

    Args:
        image: Preprocessed image inputs, unpacked as keyword arguments into
            ``clip_model.get_image_features``.
        text: Token ids passed positionally to ``text_encoder``.
        attention_mask: Optional mask forwarded to ``text_encoder``. The default
            ``None`` keeps the previous behaviour of sending only the token ids;
            pass the tokenizer's mask when ``text`` contains padding.

    Returns:
        A NumPy array of unscaled cosine similarities, one row per image and
        one column per text.
    """
    with torch.no_grad():
        image_features = clip_model.get_image_features(**image)
        text_features = text_encoder(text, attention_mask=attention_mask).logits
        # L2-normalise so the matrix product below yields cosine similarities.
        image_features = image_features / image_features.norm(dim=1, keepdim=True)
        text_features = text_features / text_features.norm(dim=1, keepdim=True)
        # No logit scale is applied: callers threshold the raw similarity.
        similarity = image_features @ text_features.t()
        return similarity.cpu().numpy()

卡阈值（单类别+others）

In [None]:
from PIL import Image
import requests
import clip
import torch
from transformers import BertForSequenceClassification, BertConfig, BertTokenizer
from transformers import CLIPProcessor, CLIPModel
import numpy as np
import os

device = "cuda" if torch.cuda.is_available() else "cpu"

text_tokenizer = BertTokenizer.from_pretrained("IDEA-CCNL/Taiyi-CLIP-Roberta-large-326M-Chinese")
text_encoder = BertForSequenceClassification.from_pretrained("IDEA-CCNL/Taiyi-CLIP-Roberta-large-326M-Chinese").eval().to(device)

clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14", device=device)

# Categories to evaluate; an image counts as a true member of a category when
# the category name appears in its file name.
categories = ["宠物猫", "番茄", "剪纸", "电脑", "饺子"]

# Per-category confusion counts, filled in by the evaluation loop below.
stats = {category: {"TP": 0, "FP": 0, "FN": 0} for category in categories}

# Folder containing the evaluation images.
image_folder_path = "/content/drive/MyDrive/picture_data"

# Minimum probability of the category (versus "其他") for a positive prediction.
threshold = 0.99

# Running sums used for the macro averages printed at the end.
total_precision = 0
total_recall = 0
total_f1 = 0

# The set of images is the same for every category, so list the folder once.
png_names = [name for name in os.listdir(image_folder_path) if name.endswith(".png")]

# Evaluate every category as a binary "category vs. other" problem.
for category in categories:
    TP = 0
    FP = 0
    FN = 0

    # The prompt pair depends only on the category, so tokenize it once per
    # category instead of once per image.
    text_labels = [category, "其他"]
    text = text_tokenizer(text_labels, return_tensors='pt', padding=True)['input_ids'].to(device)

    for image_name in png_names:
        image_path = os.path.join(image_folder_path, image_name)

        # Preprocess the image; the context manager closes the file handle.
        with Image.open(image_path) as pil_image:
            image = processor(images=pil_image, return_tensors="pt").to(device)

        probs = predict(image, text)

        # Column 0 is the probability assigned to the category itself.
        is_positive_prediction = probs[0, 0] > threshold
        is_actual_positive = category in image_name

        # True negatives are not needed for precision/recall and are skipped.
        if is_positive_prediction and is_actual_positive:
            TP += 1
        elif is_positive_prediction:
            FP += 1
        elif is_actual_positive:
            FN += 1

    stats[category] = {"TP": TP, "FP": FP, "FN": FN}

    # Precision, recall and F1, guarding against empty denominators.
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0


    print(f"Category: {category}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1_score:.4f}")
    total_precision += precision
    total_recall += recall
    total_f1 += f1_score

# Macro averages over all categories.
average_precision = total_precision / len(categories)
average_recall = total_recall / len(categories)
average_f1 = total_f1 / len(categories)

print(f"Average Precision: {average_precision:.4f}, Average Recall: {average_recall:.4f}, Average F1: {average_f1:.4f}")

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Category: 宠物猫, Precision: 0.7866, Recall: 0.9950, F1 Score: 0.8786
Category: 番茄, Precision: 0.5587, Recall: 1.0000, F1 Score: 0.7168
Category: 剪纸, Precision: 0.8032, Recall: 1.0000, F1 Score: 0.8909
Category: 电脑, Precision: 0.6576, Recall: 0.9700, F1 Score: 0.7838
Category: 饺子, Precision: 0.4158, Recall: 1.0000, F1 Score: 0.5874
Average Precision: 0.6444, Average Recall: 0.9930, Average F1: 0.7715


卡阈值（单类别相似度）

In [None]:
from PIL import Image
import requests
import clip
import torch
from transformers import BertForSequenceClassification, BertConfig, BertTokenizer
from transformers import CLIPProcessor, CLIPModel
import numpy as np
import os

device = "cuda" if torch.cuda.is_available() else "cpu"

text_tokenizer = BertTokenizer.from_pretrained("IDEA-CCNL/Taiyi-CLIP-Roberta-large-326M-Chinese")
text_encoder = BertForSequenceClassification.from_pretrained("IDEA-CCNL/Taiyi-CLIP-Roberta-large-326M-Chinese").eval().to(device)

clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14", device=device)

# Categories to evaluate; an image counts as a true member of a category when
# the category name appears in its file name.
categories = ["宠物猫", "番茄", "剪纸", "电脑", "饺子"]

# Per-category confusion counts, filled in by the evaluation loop below.
stats = {category: {"TP": 0, "FP": 0, "FN": 0} for category in categories}

# Folder containing the evaluation images.
image_folder_path = "/content/drive/MyDrive/picture_data"

# Minimum image/text cosine similarity for a positive prediction.
threshold = 0.11

# Running sums used for the macro averages printed at the end.
total_precision = 0
total_recall = 0
total_f1 = 0

# The set of images is the same for every category, so list the folder once.
png_names = [name for name in os.listdir(image_folder_path) if name.endswith(".png")]

# Evaluate every category independently against its single text prompt.
for category in categories:
    TP = 0
    FP = 0
    FN = 0

    # The prompt depends only on the category, so tokenize it once per category
    # instead of once per image.
    text_labels = [category]
    text = text_tokenizer(text_labels, return_tensors='pt', padding=True)['input_ids'].to(device)

    for image_name in png_names:
        image_path = os.path.join(image_folder_path, image_name)

        # Preprocess the image; the context manager closes the file handle.
        with Image.open(image_path) as pil_image:
            image = processor(images=pil_image, return_tensors="pt").to(device)

        similarity = cal_similarity(image, text)

        # One image and one prompt: read the single score explicitly rather
        # than relying on the truthiness of a one-element array.
        is_positive_prediction = similarity[0, 0] > threshold
        is_actual_positive = category in image_name

        # True negatives are not needed for precision/recall and are skipped.
        if is_positive_prediction and is_actual_positive:
            TP += 1
        elif is_positive_prediction:
            FP += 1
        elif is_actual_positive:
            FN += 1

    stats[category] = {"TP": TP, "FP": FP, "FN": FN}

    # Precision, recall and F1, guarding against empty denominators.
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0


    print(f"Category: {category}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1_score:.4f}")
    total_precision += precision
    total_recall += recall
    total_f1 += f1_score

# Macro averages over all categories.
average_precision = total_precision / len(categories)
average_recall = total_recall / len(categories)
average_f1 = total_f1 / len(categories)

print(f"Average Precision: {average_precision:.4f}, Average Recall: {average_recall:.4f}, Average F1: {average_f1:.4f}")



Category: 宠物猫, Precision: 0.9706, Recall: 0.9900, F1 Score: 0.9802
Category: 番茄, Precision: 1.0000, Recall: 0.9600, F1 Score: 0.9796
Category: 剪纸, Precision: 1.0000, Recall: 0.9600, F1 Score: 0.9796
Category: 电脑, Precision: 1.0000, Recall: 0.6250, F1 Score: 0.7692
Category: 饺子, Precision: 0.9213, Recall: 0.9950, F1 Score: 0.9567
Average Precision: 0.9784, Average Recall: 0.9060, Average F1: 0.9331


中英文CLIP类别判断函数

In [None]:
import os
import torch
import clip
import shutil
from PIL import Image
from transformers import BertTokenizer, BertForSequenceClassification, CLIPProcessor, CLIPModel
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Checkpoint identifiers used by the loaders below.
ENGLISH_CLIP_NAME = "ViT-B/32"
TAIYI_TEXT_MODEL_ID = "IDEA-CCNL/Taiyi-CLIP-Roberta-large-326M-Chinese"
CLIP_LARGE_MODEL_ID = "openai/clip-vit-large-patch14"

# English pipeline: OpenAI CLIP model together with its image preprocessing.
model_1, processor_1 = clip.load(ENGLISH_CLIP_NAME, device=device)

# Chinese pipeline: Taiyi tokenizer and text encoder, paired with the
# Hugging Face CLIP model and processor for the image side.
text_tokenizer = BertTokenizer.from_pretrained(TAIYI_TEXT_MODEL_ID)
text_encoder = BertForSequenceClassification.from_pretrained(TAIYI_TEXT_MODEL_ID)
text_encoder = text_encoder.eval().to(device)
model_2 = CLIPModel.from_pretrained(CLIP_LARGE_MODEL_ID).to(device)
processor_2 = CLIPProcessor.from_pretrained(CLIP_LARGE_MODEL_ID, device=device)

def run_english_clip_model(image_name, category, threshold=0.24):
    """Decide whether an image matches an English prompt using ``model_1``.

    Relies on the notebook globals ``image_folder_path``, ``processor_1``,
    ``model_1`` and ``device``.

    Args:
        image_name: File name of the image inside ``image_folder_path``.
        category: English text prompt the image is compared against.
        threshold: Cosine-similarity cut-off for a positive decision. Defaults
            to 0.24, the value that was previously hard-coded.

    Returns:
        The result of ``similarity[0] > threshold``: a boolean tensor with one
        entry per prompt (a single entry, since only ``category`` is tokenized).
    """
    image_path = os.path.join(image_folder_path, image_name)
    # The context manager releases the file handle once preprocessing is done.
    with Image.open(image_path) as pil_image:
        image = processor_1(pil_image).unsqueeze(0).to(device)
    text = clip.tokenize([category]).to(device)

    with torch.no_grad():
        image_features = model_1.encode_image(image)
        text_features = model_1.encode_text(text)

        # L2-normalise so the product below is a cosine similarity.
        image_features /= image_features.norm(dim=-1, keepdim=True)
        text_features /= text_features.norm(dim=-1, keepdim=True)

        similarity = image_features @ text_features.T

        is_positive_prediction = similarity[0] > threshold

    return is_positive_prediction


def run_chinese_clip_model(image_name, category, threshold=0.1):
    """Decide whether an image matches a Chinese prompt using Taiyi + ``model_2``.

    Relies on the notebook globals ``image_folder_path``, ``processor_2``,
    ``model_2``, ``text_tokenizer``, ``text_encoder`` and ``device``.

    Args:
        image_name: File name of the image inside ``image_folder_path``.
        category: Chinese text prompt the image is compared against.
        threshold: Cosine-similarity cut-off for a positive decision. Defaults
            to 0.1, the value that was previously hard-coded.

    Returns:
        The result of ``similarity[0] > threshold``: a boolean tensor with one
        entry per prompt (a single entry, since only ``category`` is tokenized).
    """
    image_path = os.path.join(image_folder_path, image_name)
    # The context manager releases the file handle once preprocessing is done.
    with Image.open(image_path) as pil_image:
        image = processor_2(images=pil_image, return_tensors="pt").to(device)
    text = text_tokenizer([category], return_tensors='pt', padding=True)['input_ids'].to(device)

    with torch.no_grad():
        image_features = model_2.get_image_features(**image)
        text_features = text_encoder(text).logits
        # L2-normalise so the product below is a cosine similarity.
        image_features = image_features / image_features.norm(dim=1, keepdim=True)
        text_features = text_features / text_features.norm(dim=1, keepdim=True)
        # The decision uses the raw similarity, so no logit scale is applied.
        similarity = image_features @ text_features.t()

        is_positive_prediction = similarity[0] > threshold

    return is_positive_prediction

中英文CLIP结果取并集，并复制分类结果

In [None]:
# English prompt -> Chinese keyword. The Chinese keyword doubles as the
# ground-truth marker: an image belongs to a category when its file name
# contains that keyword.
categories = {
    "pet cat": "宠物猫",
    "tomato": "番茄",
    "paper-cut": "剪纸",
    "computer": "电脑",
    "dumpling": "饺子"
}

image_folder_path = "/content/drive/MyDrive/picture_data"
target_root_folder = "/content/drive/MyDrive/target_data3/"

# Running sums used for the macro averages printed at the end.
total_precision = 0
total_recall = 0
total_f1 = 0

# Make sure every category has a destination folder.
for category in categories.keys():
    target_folder = os.path.join(target_root_folder, category)
    os.makedirs(target_folder, exist_ok=True)

# The set of images is the same for every category, so list the folder once.
png_names = [name for name in os.listdir(image_folder_path) if name.endswith(".png")]

for category, keyword in categories.items():
    TP = 0
    FP = 0
    FN = 0
    target_folder = os.path.join(target_root_folder, category)

    for image_name in png_names:
        # Score the image with the English and the Chinese pipeline.
        is_positive_english = run_english_clip_model(image_name, category)
        is_positive_chinese = run_chinese_clip_model(image_name, keyword)

        # Union of both decisions; bool() collapses the one-element results
        # returned by the model helpers into plain Python booleans.
        is_positive = bool(is_positive_english) or bool(is_positive_chinese)
        is_actual_positive = keyword in image_name

        # Count directly from this run's predictions instead of re-listing the
        # target folder, which may still hold files copied by earlier runs.
        if is_positive:
            image_path = os.path.join(image_folder_path, image_name)
            shutil.copy(image_path, target_folder)
            if is_actual_positive:
                TP += 1
            else:
                FP += 1
        elif is_actual_positive:
            FN += 1

    # Precision, recall and F1 for this category. Recall uses the number of
    # positives actually present in the folder rather than a fixed 200.
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    print(f"Category: {category}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1_score:.4f}")

    total_precision += precision
    total_recall += recall
    total_f1 += f1_score

# Macro averages over all categories.
average_precision = total_precision / len(categories)
average_recall = total_recall / len(categories)
average_f1 = total_f1 / len(categories)
print(f"Average Precision: {average_precision:.4f}, Average Recall: {average_recall:.4f}, Average F1: {average_f1:.4f}")


Category: pet cat, Precision: 0.9390, Recall: 1.0000, F1 Score: 0.9685
Category: tomato, Precision: 0.7082, Recall: 0.9950, F1 Score: 0.8274
Category: paper-cut, Precision: 0.8615, Recall: 0.9950, F1 Score: 0.9234
Category: computer, Precision: 0.9635, Recall: 0.9250, F1 Score: 0.9439
Category: dumpling, Precision: 0.4246, Recall: 1.0000, F1 Score: 0.5961
Average Precision: 0.7794, Average Recall: 0.9830, Average F1: 0.8519
