In [7]:
import os
import json
import cv2

In [4]:
# 读取 JSON 文件中的标注信息
def read_annotation(annotation_path):
    with open(annotation_path, 'r') as f:
        annotation = json.load(f)
    return annotation

# 从 JSON 中提取目标信息
def extract_objects(annotations):
    objects = []
    for obj in annotations.get("objects", []):
        class_title = obj.get("classTitle")
        points = obj.get("points", {}).get("exterior", [])

        if class_title and points:
            # 通过外部坐标生成边界框
            xmin = min([p[0] for p in points])
            xmax = max([p[0] for p in points])
            ymin = min([p[1] for p in points])
            ymax = max([p[1] for p in points])

            objects.append({
                "class": class_title,
                "bbox": [xmin, ymin, xmax, ymax]
            })
    return objects

# 预处理图像：读取、调整大小、归一化
def preprocess_image(image_path):
    image = cv2.imread(image_path)
    return image

# 生成数据集
def create_dataset(image_folder, annotation_folder):
    # 获取图像文件夹中所有图像文件路径
    image_paths = [os.path.join(image_folder, f) for f in os.listdir(image_folder) if f.endswith('.jpg') or f.endswith('.png')]
    dataset = []

    for image_path in image_paths:
        # 获取对应的标注文件路径
        image_name = os.path.splitext(os.path.basename(image_path))[0]
        json_path = os.path.join(annotation_folder, image_name + '.jpg.json')  # 正确的标注文件路径

        if not os.path.exists(json_path):
            print(f"警告：找不到标注文件：{json_path}")
            continue

        # 读取标注数据
        annotations = read_annotation(json_path)
        objects = extract_objects(annotations)

        # 预处理图像
        image = preprocess_image(image_path)

        # 获取目标边界框和类别标签
        bboxes = []
        labels = []
        for obj in objects:
            bboxes.append(obj["bbox"])
            labels.append(obj["class"])

        # 将图像、边界框和标签组合成数据
        dataset.append({
            "image": image,
            "bboxes": bboxes,
            "labels": labels
        })

    return dataset

def get_unique_classes(dataset):
    all_classes = set()
    for data in dataset:
        for label in data['labels']:
            all_classes.add(label)
    return list(all_classes)

In [5]:
# 示例使用
train_dir = r"laboro-tomato-DatasetNinja\Train"
test_dir = r"laboro-tomato-DatasetNinja\Test"

train_annotation_path = os.path.join(train_dir, "ann")
train_images_path = os.path.join(train_dir, "img")

val_annotation_path = os.path.join(test_dir, "ann")
val_images_path = os.path.join(test_dir, "img")

# 创建训练和验证数据集
train_dataset = create_dataset(train_images_path, train_annotation_path)
val_dataset = create_dataset(val_images_path, val_annotation_path)

# 所有类别
train_classes = get_unique_classes(train_dataset)
num_classes = len(train_classes)

print(f"训练集类别数量：{num_classes}")
print(f"所有类别：{train_classes}")

print(f"训练集包含 {len(train_dataset)} 个样本")
print(f"第1个样本的图像形状：{train_dataset[0]['image'].shape}")
print(f"第1个样本的边界框数量：{len(train_dataset[0]['bboxes'])}")
print(f"第1个样本的标签数量：{len(train_dataset[0]['labels'])}")

print(f"验证集包含 {len(val_dataset)} 个样本")
print(f"第1个样本的图像形状：{val_dataset[0]['image'].shape}")
print(f"第1个样本的边界框数量：{len(val_dataset[0]['bboxes'])}")

训练集类别数量：6
所有类别：['b_half_ripened', 'b_green', 'b_fully_ripened', 'l_green', 'l_half_ripened', 'l_fully_ripened']
训练集包含 153 个样本
第1个样本的图像形状：(4032, 3024, 3)
第1个样本的边界框数量：6
第1个样本的标签数量：6
验证集包含 59 个样本
第1个样本的图像形状：(4032, 3024, 3)
第1个样本的边界框数量：9


# yolo

In [6]:
# 获取类别 ID
def get_class_id(class_name, class_list):
    if class_name not in class_list:
        class_list.append(class_name)
    return class_list.index(class_name)

# 将目标转换为 YOLO 格式
def convert_to_yolo_format(image, bboxes, labels, class_list):
    img_height, img_width, _ = image.shape
    yolo_bboxes = []

    for bbox, label in zip(bboxes, labels):
        xmin, ymin, xmax, ymax = bbox
        # 计算边界框的中心坐标和宽高，并进行归一化
        x_center = round((xmin + xmax) / 2.0 / img_width,6)
        y_center = round((ymin + ymax) / 2.0 / img_height,6)
        width = round((xmax - xmin) / img_width,6)
        height = round((ymax - ymin) / img_height,6)

        # 获取类别 ID
        class_id = get_class_id(label, class_list)

        # 格式化为 YOLO 格式
        yolo_bboxes.append(f"{class_id} {x_center} {y_center} {width} {height}")

    return yolo_bboxes

# 生成数据集并保存为 YOLO 标注文件
def create_yolo_dataset(image_folder, annotation_folder, output_folder):
    # 获取图像文件夹中所有图像文件路径
    image_paths = [os.path.join(image_folder, f) for f in os.listdir(image_folder) if f.endswith('.jpg') or f.endswith('.png')]
    class_list = []  # 用于存储所有类别
    for image_path in image_paths:
        # 获取对应的标注文件路径
        image_name = os.path.splitext(os.path.basename(image_path))[0]
        json_path = os.path.join(annotation_folder, image_name + '.jpg.json')  # 正确的标注文件路径

        if not os.path.exists(json_path):
            print(f"警告：找不到标注文件：{json_path}")
            continue

        # 读取标注数据
        annotations = read_annotation(json_path)
        objects = extract_objects(annotations)

        # 预处理图像
        image = preprocess_image(image_path)

        # 获取目标边界框和类别标签
        bboxes = []
        labels = []
        for obj in objects:
            bboxes.append(obj["bbox"])
            labels.append(obj["class"])

        # 转换为 YOLO 格式
        yolo_bboxes = convert_to_yolo_format(image, bboxes, labels, class_list)

        # 保存 YOLO 格式的标注到文件
        yolo_txt_path = os.path.join(output_folder, image_name + '.txt')
        with open(yolo_txt_path, 'w') as f:
            for yolo_bbox in yolo_bboxes:
                f.write(yolo_bbox + '\n')

    # 返回所有类别的列表
    return class_list

# 使用示例
image_folder = r'E:\ipynb\project\cv_exp_rcnn\laboro-tomato-DatasetNinja\Test\img'
annotation_folder = r'E:\ipynb\project\cv_exp_rcnn\laboro-tomato-DatasetNinja\Test\ann'
output_folder = r'E:\ipynb\project\cv_exp_rcnn\laboro-tomato-DatasetNinja\Test\img'

create_yolo_dataset(image_folder, annotation_folder, output_folder)

['b_green',
 'b_half_ripened',
 'b_fully_ripened',
 'l_green',
 'l_fully_ripened',
 'l_half_ripened']

# voc

In [9]:
import os
import json
import cv2
from xml.etree.ElementTree import Element, SubElement, ElementTree

# 创建 Pascal VOC 格式的 XML 文件
def create_voc_xml(image_path, bboxes, labels, output_folder):
    image_name = os.path.basename(image_path)
    img = cv2.imread(image_path)
    img_height, img_width, _ = img.shape
    
    # 创建 XML 根元素
    annotation = Element('annotation')
    
    # 添加文件夹、文件名和图像路径
    folder = SubElement(annotation, 'folder')
    folder.text = 'images'
    
    filename = SubElement(annotation, 'filename')
    filename.text = image_name
    
    path = SubElement(annotation, 'path')
    path.text = image_path
    
    # 添加图像尺寸信息
    size = SubElement(annotation, 'size')
    width = SubElement(size, 'width')
    width.text = str(img_width)
    height = SubElement(size, 'height')
    height.text = str(img_height)
    depth = SubElement(size, 'depth')
    depth.text = '3'  # 3 表示 RGB 图像
    
    # 为每个目标添加标注
    for bbox, label in zip(bboxes, labels):
        obj = SubElement(annotation, 'object')
        
        name = SubElement(obj, 'name')
        name.text = label
        
        bndbox = SubElement(obj, 'bndbox')
        xmin = SubElement(bndbox, 'xmin')
        xmin.text = str(bbox[0])
        ymin = SubElement(bndbox, 'ymin')
        ymin.text = str(bbox[1])
        xmax = SubElement(bndbox, 'xmax')
        xmax.text = str(bbox[2])
        ymax = SubElement(bndbox, 'ymax')
        ymax.text = str(bbox[3])
    
    # 生成 XML 文件并保存
    tree = ElementTree(annotation)
    output_path = os.path.join(output_folder, os.path.splitext(image_name)[0] + '.xml')
    tree.write(output_path)

# 生成数据集并保存为 Pascal VOC 标注文件
def create_voc_dataset(image_folder, annotation_folder, output_folder):
    # 获取图像文件夹中所有图像文件路径
    image_paths = [os.path.join(image_folder, f) for f in os.listdir(image_folder) if f.endswith('.jpg') or f.endswith('.png')]
    
    for image_path in image_paths:
        # 获取对应的标注文件路径
        image_name = os.path.splitext(os.path.basename(image_path))[0]
        json_path = os.path.join(annotation_folder, image_name + '.jpg.json')  # 正确的标注文件路径

        if not os.path.exists(json_path):
            print(f"警告：找不到标注文件：{json_path}")
            continue

        # 读取标注数据
        annotations = read_annotation(json_path)
        objects = extract_objects(annotations)

        # 获取目标边界框和类别标签
        bboxes = []
        labels = []
        for obj in objects:
            bboxes.append(obj["bbox"])
            labels.append(obj["class"])

        # 创建 Pascal VOC 格式的 XML 文件
        create_voc_xml(image_path, bboxes, labels, output_folder)

# 使用示例
image_folder = r'E:\ipynb\project\cv_exp_rcnn\laboro-tomato-DatasetNinja\Test\img'
annotation_folder = r'E:\ipynb\project\cv_exp_rcnn\laboro-tomato-DatasetNinja\Test\ann'
output_folder = r'E:\ipynb\project\cv_exp_rcnn\laboro-tomato-DatasetNinja\Test\voc'

create_voc_dataset(image_folder, annotation_folder, output_folder)


In [9]:
import os
import torch
import torchvision
from torchvision import transforms
from PIL import Image
from torch.utils.data import Dataset
import xml.etree.ElementTree as ET

In [10]:
class VOCDataset(Dataset):
    def __init__(self, image_folder, annotation_folder, transform=None):
        self.image_folder = image_folder
        self.annotation_folder = annotation_folder
        self.transform = transform
        self.image_files = [f for f in os.listdir(image_folder) if f.endswith('.jpg') or f.endswith('.png')]

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        # 获取图像路径
        image_name = self.image_files[idx]
        image_path = os.path.join(self.image_folder, image_name)
        annotation_path = os.path.join(self.annotation_folder, os.path.splitext(image_name)[0] + '.xml')

        # 读取图像
        image = Image.open(image_path).convert("RGB")

        # 解析标注文件
        boxes, labels = self.parse_annotation(annotation_path)

        # 如果有需要，应用转换
        if self.transform:
            image = self.transform(image)

        # 目标格式化为字典：boxes 是边界框，labels 是类别标签
        target = {
            'boxes': torch.tensor(boxes, dtype=torch.float32),
            'labels': torch.tensor(labels, dtype=torch.int64)
        }
        return image, target

    def parse_annotation(self, annotation_path):
        tree = ET.parse(annotation_path)
        root = tree.getroot()

        boxes = []
        labels = []
        for obj in root.findall('object'):
            # 获取边界框坐标
            xmin = int(obj.find('bndbox').find('xmin').text)
            ymin = int(obj.find('bndbox').find('ymin').text)
            xmax = int(obj.find('bndbox').find('xmax').text)
            ymax = int(obj.find('bndbox').find('ymax').text)
            boxes.append([xmin, ymin, xmax, ymax])

            # 获取类别标签
            labels.append(self.get_class_id(obj.find('name').text))

        return boxes, labels

    def get_class_id(self, class_name):
        # 你可以根据实际数据设置不同的类别
        class_dict = r"E:\ipynb\project\cv_exp_rcnn\laboro-tomato-DatasetNinja\tomato\class.txt"
        return class_dict.get(class_name, 0)  # 默认类别 0



In [11]:
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize((800, 800)),  # 可根据需要调整大小
])

# 创建数据集实例
train_dataset = VOCDataset(image_folder=r"E:\ipynb\project\cv_exp_rcnn\laboro-tomato-DatasetNinja\Train\img", 
                           annotation_folder=r"E:\ipynb\project\cv_exp_rcnn\laboro-tomato-DatasetNinja\Train\voc", 
                           transform=transform)

# 创建数据加载器
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=4, shuffle=True)


In [12]:
import torchvision.models.detection as detection
import torch.optim as optim

# 加载预训练的 Faster R-CNN 模型
model = detection.fasterrcnn_resnet50_fpn(pretrained=True)

# 将模型设置为训练模式
model.train()

# 修改类别数量，通常默认是 91 类，我们假设只有 4 个类别（背景+3个目标类别）
num_classes = 5  # 4 个目标类别 + 1 背景
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = detection.faster_rcnn.FastRCNNPredictor(in_features, num_classes)

# 定义优化器
params = [p for p in model.parameters() if p.requires_grad]
optimizer = optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)

# 设置学习率调度器
lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)




In [13]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
num_epochs = 10
for epoch in range(num_epochs):
    model.train()  # 训练模式
    running_loss = 0.0
    for images, targets in train_loader:
        # 通过 GPU 训练（如果有的话）
        images = [image.to(device) for image in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        # 前向传播
        loss_dict = model(images, targets)

        # 损失值
        losses = sum(loss for loss in loss_dict.values())

        # 反向传播
        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        running_loss += losses.item()

    # 更新学习率
    lr_scheduler.step()

    # 输出每个 epoch 的损失
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss/len(train_loader)}")


RuntimeError: stack expects each tensor to be equal size, but got [8, 4] at entry 0 and [20, 4] at entry 1

In [14]:
# 保存模型
torch.save(model.state_dict(), "fasterrcnn.pth")

# 加载模型
model.load_state_dict(torch.load("fasterrcnn.pth"))
model.eval()


  model.load_state_dict(torch.load("fasterrcnn.pth"))


FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(