# Mask R-CNN

### При попытке обучить модель я постоянно упиралась в ограничение по памяти, и изменение каких-либо параметров эту проблему не решало. Модель лишь однажды прошла одну эпоху, но сохранить файл тогда не удалось из-за автоматического рестарта среды. Эксперту об этом сообщали, но всё же было решено хотя бы прикрепить ноутбук. Изначально вера в данную архитектуру была сильной.

## установка и импорт необходимых зависимостей

In [None]:
!pip install pycocotools --q
!pip install torch torchvision --q

In [None]:
import os
import json
from pathlib import Path

import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as T
from torchvision.models.detection import MaskRCNN
from torchvision.models.detection.backbone_utils import resnet_fpn_backbone
from pycocotools.coco import COCO
from PIL import Image, ImageDraw

## конвертирование исходного датасета в формат COCO

In [None]:
def convert_to_coco_format(json_folder, image_folder, output_path):
    """Merge per-image JSON annotation files into one COCO-style JSON file.

    Every ``*.json`` file in ``json_folder`` must contain the keys
    ``image_path``, ``image_height`` and ``image_width``.  Any other key that
    matches a known category name is read as a list of
    ``[x_min, y_min, x_max, y_max]`` boxes; keys that are not known categories
    are ignored.  Boxes whose width or height is not positive are reported on
    stdout and skipped.

    The numeric image id is taken from the last ``_``-separated part of the
    image file name (without extension), so that part must be an integer.

    Args:
        json_folder: Directory holding the per-image ``*.json`` files.
        image_folder: Directory holding the images; only used to build the
            image path from which the file name and id are derived.
        output_path: Destination path of the resulting COCO JSON file.

    Raises:
        KeyError: If a JSON file lacks one of the required keys.
        ValueError: If the image id cannot be parsed as an integer or a box
            does not consist of exactly four values.
    """
    category_mapping = {
        "table": 1,
        "title": 2,
        "paragraph": 3,
        "formula": 4,
        "header": 5,
        "footer": 6,
        "footnote": 7,
        "numbered_list": 8,
        "marked_list": 9,
        "table_signature": 10,
        "picture_signature": 11,
        "picture": 12,
    }

    coco_format = {
        "images": [],
        "annotations": [],
        "categories": [
            {"id": category_id, "name": name}
            for name, category_id in category_mapping.items()
        ],
    }

    annotation_id = 1
    # glob() yields files in a filesystem-dependent order; sorting keeps the
    # image order and the assigned annotation ids reproducible between runs.
    for json_file in sorted(Path(json_folder).glob("*.json")):
        with open(json_file, "r", encoding="utf-8") as f:
            data = json.load(f)

        image_path = os.path.join(image_folder, os.path.basename(data["image_path"]))
        file_name = os.path.basename(image_path)
        # The id is the trailing "_<number>" part of the file name stem.
        image_id = int(os.path.splitext(file_name)[0].split("_")[-1])

        coco_format["images"].append({
            "id": image_id,
            "file_name": file_name,
            "height": data["image_height"],
            "width": data["image_width"],
        })

        for category, annotations in data.items():
            # Skip metadata keys, unknown categories and empty box lists.
            if category not in category_mapping or not annotations:
                continue

            for bbox in annotations:
                x_min, y_min, x_max, y_max = bbox
                width = x_max - x_min
                height = y_max - y_min

                if width <= 0 or height <= 0:
                    print(f"Invalid box in {json_file}: {bbox}")
                    continue

                # COCO stores boxes as [x, y, width, height].
                coco_format["annotations"].append({
                    "id": annotation_id,
                    "image_id": image_id,
                    "category_id": category_mapping[category],
                    "bbox": [x_min, y_min, width, height],
                    "area": width * height,
                    "iscrowd": 0,
                })
                annotation_id += 1

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(coco_format, f, indent=4)

In [None]:
# Kaggle input locations of the raw per-image annotations and the images.
json_folder = "/kaggle/input/own-data-caselab/json/json"
image_folder = "/kaggle/input/own-data-caselab/image/image"
# The merged COCO annotation file is written to the working directory.
output_path = "/kaggle/working/coco_annotations.json"

convert_to_coco_format(
    json_folder=json_folder,
    image_folder=image_folder,
    output_path=output_path,
)

## определение класса для создания датасета

In [None]:
class CustomCocoDataset(Dataset):
    """Dataset of ``(image, target)`` pairs read from a COCO annotation file.

    Only bounding boxes are read from the annotations; each instance mask is
    the filled rectangle of its box, drawn on a canvas whose size comes from
    the image's ``width``/``height`` entry in the annotation file.
    """

    def __init__(self, coco_file, image_folder, transforms=None):
        """Load the annotation index.

        Args:
            coco_file: Path to the COCO JSON annotation file.
            image_folder: Directory containing the image files referenced by
                ``file_name`` in the annotation file.
            transforms: Optional callable applied to the PIL image only; the
                target is returned unchanged.
        """
        self.coco = COCO(coco_file)
        self.image_folder = image_folder
        self.transforms = transforms
        self.image_ids = list(self.coco.imgs.keys())

    def __getitem__(self, idx):
        """Return the image at position ``idx`` together with its target dict.

        The target holds ``boxes`` (float32, ``[N, 4]`` as x_min, y_min,
        x_max, y_max), ``labels`` (int64, ``[N]``), ``masks`` (uint8,
        ``[N, H, W]``) and ``image_id``.  Annotations with a non-positive
        width or height are reported on stdout and skipped.
        """
        image_id = self.image_ids[idx]
        image_info = self.coco.loadImgs(image_id)[0]
        image_path = os.path.join(self.image_folder, image_info["file_name"])
        # convert() returns an independent copy, so the file handle can be
        # released as soon as the pixels are decoded.
        with Image.open(image_path) as source:
            image = source.convert("RGB")

        mask_width = int(image_info["width"])
        mask_height = int(image_info["height"])

        boxes = []
        labels = []
        masks = []
        for ann in self.coco.loadAnns(self.coco.getAnnIds(imgIds=image_id)):
            x_min, y_min, width, height = ann["bbox"]
            x_max = x_min + width
            y_max = y_min + height

            if width <= 0 or height <= 0:
                print(f"Invalid box {ann['bbox']} in image {image_info['file_name']}")
                continue

            boxes.append([x_min, y_min, x_max, y_max])
            labels.append(ann["category_id"])

            # The instance mask is simply the filled bounding-box rectangle.
            mask = Image.new('L', (mask_width, mask_height), 0)
            ImageDraw.Draw(mask).rectangle([x_min, y_min, x_max, y_max], fill=1)
            masks.append(np.array(mask))

        if self.transforms:
            image = self.transforms(image)

        if boxes:
            box_tensor = torch.tensor(boxes, dtype=torch.float32)
            # Stack into a single ndarray first: building a tensor straight
            # from a list of ndarrays goes through a very slow code path.
            mask_tensor = torch.as_tensor(np.stack(masks), dtype=torch.uint8)
        else:
            # Keep the [0, 4] / [0, H, W] shapes for images without any valid
            # annotation; a flat empty tensor is rejected by the detection
            # models' target validation.
            box_tensor = torch.zeros((0, 4), dtype=torch.float32)
            mask_tensor = torch.zeros((0, mask_height, mask_width), dtype=torch.uint8)

        target = {
            "boxes": box_tensor,
            "labels": torch.tensor(labels, dtype=torch.int64),
            "masks": mask_tensor,
            "image_id": torch.tensor([image_id]),
        }
        return image, target

    def __len__(self):
        """Return the number of images listed in the annotation file."""
        return len(self.image_ids)

## подготовка данных и модели

In [None]:
def collate_detection_batch(batch):
    """Regroup a list of (image, target) pairs into (images, targets) tuples.

    Images are not stacked into one tensor, so each sample keeps its own size.
    """
    return tuple(zip(*batch))


# Convert PIL images to tensors; no other preprocessing is applied.
transforms = T.Compose([T.ToTensor()])
dataset = CustomCocoDataset(output_path, image_folder, transforms)
data_loader = DataLoader(
    dataset,
    batch_size=8,
    shuffle=True,
    collate_fn=collate_detection_batch,
)

In [None]:
# ResNet-50 feature extractor with a feature pyramid, using pretrained weights.
backbone = resnet_fpn_backbone('resnet50', pretrained=True)
# 13 classes = the 12 layout categories plus the background class.
model = MaskRCNN(backbone=backbone, num_classes=13)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Module.to() moves the parameters in place, so no rebinding is needed.
model.to(device)

In [None]:
# AdamW over all model parameters with a fixed learning rate.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-4)

In [None]:
# Put the model into training mode; in this mode the forward pass used by the
# training loop returns a dict of losses.
model.train()
# Number of full passes over data_loader performed by the training loop.
num_epochs = 10

## цикл обучения модели

In [None]:
# Train for num_epochs passes over data_loader, printing the summed batch loss
# of each epoch and writing a checkpoint after every epoch.
for epoch in range(num_epochs):
    epoch_loss = 0
    for images, targets in data_loader:
        # Move the whole batch (images and every target tensor) to the device.
        images = [img.to(device) for img in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        # In training mode the model returns a dict of individual loss terms;
        # their sum is the quantity that is optimised.
        loss_dict = model(images, targets)
        losses = sum(loss_dict.values())
        epoch_loss += losses.item()

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss:.4f}")

    # Persist the weights after every epoch so that a crash or an automatic
    # restart of the environment does not discard the finished epochs.  The
    # file is written under a temporary name and then renamed, so an
    # interrupted write cannot leave a truncated checkpoint behind.
    checkpoint_tmp = "maskrcnn_custom.pth.tmp"
    torch.save(model.state_dict(), checkpoint_tmp)
    os.replace(checkpoint_tmp, "maskrcnn_custom.pth")

## сохранение модели

In [None]:
# Store only the learned weights (state dict), not the whole model object.
torch.save(obj=model.state_dict(), f="maskrcnn_custom.pth")