In [2]:
import os
import cv2
import torch
import glob
import random
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from torch.utils.data import Dataset, DataLoader
import torchvision
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.transforms import functional as F
import torchvision.transforms as T
from tqdm import tqdm



In [3]:
def parse_seqinfo(seqinfo_path):
    """
    Reads MOT17 seqinfo.ini to get info on frame rate, image size, and number of frames.

    Args:
        seqinfo_path: path to the seqinfo.ini file.

    Returns:
        dict mapping every ``key=value`` entry to its value. Keys and values
        are whitespace-stripped strings; no numeric conversion is performed.
    """
    info = {}
    with open(seqinfo_path, "r") as f:
        for line in f:
            line = line.strip()
            # Blank lines, "[Section]" headers and INI comments carry no entries.
            if not line or line.startswith(("[", "#", ";")):
                continue
            # Split on the FIRST "=" only, so a value that itself contains "="
            # is kept intact instead of breaking the two-name unpacking.
            key, sep, value = line.partition("=")
            if sep:
                info[key.strip()] = value.strip()
    return info

In [4]:
def _parse_number(text):
    """Parse text as an int when possible, otherwise as a float (e.g. '912.5')."""
    try:
        return int(text)
    except ValueError:
        return float(text)


def parse_gt_file(gt_txt_path, keep_classes=(1, 7)):
    """
    Reads gt.txt and returns a dictionary of {frame_index: [list_of_boxes]}.

    Each line is expected to be comma-separated with at least 8 fields:
    frame, object id, x, y, width, height, confidence flag, class id.

    Args:
        gt_txt_path: path to the ground-truth text file.
        keep_classes: container of class ids to keep; every other class is dropped.

    Returns:
        dict mapping frame index (int) to a list of dicts with keys
        "obj_id", "bbox" ([x, y, w, h]) and "class_id". All kept boxes are
        relabelled to class 1 regardless of their original class id.
    """
    frame_dict = {}
    with open(gt_txt_path, "r") as f:
        for line in f:
            line = line.strip()
            # A trailing or blank line would otherwise crash on int("").
            if not line:
                continue
            vals = line.split(",")
            frame_id = int(vals[0])
            obj_id = int(vals[1])
            # Coordinates may be written as floats; keep ints as ints.
            x, y, w, h = (_parse_number(v) for v in vals[2:6])
            cls_id = int(vals[7])   # class label

            if cls_id not in keep_classes:
                continue

            frame_dict.setdefault(frame_id, []).append({
                "obj_id": obj_id,
                "bbox": [x, y, w, h],
                "class_id": 1  # unify pedestrian to class "1"
            })
    return frame_dict


In [5]:
class MOT17PedestrianDataset(Dataset):
    """Detection dataset over the frames of a single MOT17 sequence.

    Each item is ``(image, target)`` where ``target`` holds "boxes"
    (float32, shape (N, 4), [xmin, ymin, xmax, ymax]), "labels" (int64,
    shape (N,)) and "image_id" (the dataset index).
    """

    def __init__(self,
                 images_dir,
                 seqinfo,
                 gt_dict,
                 transforms=None,
                 resize=(640, 360)):
        """
        images_dir: path to 'img1' folder
        seqinfo: dictionary returned by parse_seqinfo
        gt_dict: dictionary {frame: list of boxes}
        transforms: callable taking (image, target) and returning (image, target)
        resize: (width, height) for downscaling images; a falsy value disables resizing
        """
        self.images_dir = images_dir
        self.gt_dict = gt_dict
        self.transforms = transforms
        self.resize = resize

        # seqLength in seqinfo is total frames
        self.num_frames = int(seqinfo.get("seqLength", 0))

        # Frame names are 1-based, 6-digit (e.g. 000001.jpg). Frames whose file
        # is missing on disk are skipped rather than treated as an error.
        self.image_paths = []
        for frame_idx in range(1, self.num_frames + 1):
            full_path = os.path.join(images_dir, f"{frame_idx:06d}.jpg")
            if os.path.exists(full_path):
                self.image_paths.append(full_path)

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        # Load the image once; its shape is needed later to rescale boxes.
        img_path = self.image_paths[idx]
        image = cv2.imread(img_path)
        if image is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise OSError(f"Could not read image: {img_path}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        original_h, original_w = image.shape[:2]

        # Determine frame index from file name
        frame_number = int(os.path.splitext(os.path.basename(img_path))[0])

        # Retrieve bounding boxes from gt_dict and convert xywh -> xyxy
        boxes_info = self.gt_dict.get(frame_number, [])
        boxes = []
        labels = []
        for b in boxes_info:
            x, y, w, h = b["bbox"]
            boxes.append([x, y, x + w, y + h])
            labels.append(b["class_id"])  # 1 for pedestrian

        # reshape(-1, 4) keeps a well-formed (0, 4) tensor for frames without
        # annotations; a bare empty list would yield shape (0,) and break the
        # column indexing below.
        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
        labels = torch.as_tensor(labels, dtype=torch.int64)

        # Resizing for faster training; boxes are rescaled to match.
        if self.resize:
            new_w, new_h = self.resize
            image = cv2.resize(image, self.resize)
            boxes[:, 0::2].mul_(new_w / original_w)
            boxes[:, 1::2].mul_(new_h / original_h)
            img_w, img_h = new_w, new_h
        else:
            img_w, img_h = original_w, original_h

        # Clip boxes to the image and drop the ones left without positive
        # width/height, so every remaining box satisfies
        # 0 <= x1 < x2 <= W and 0 <= y1 < y2 <= H.
        boxes[:, 0::2].clamp_(min=0, max=img_w)
        boxes[:, 1::2].clamp_(min=0, max=img_h)
        keep = (boxes[:, 2] > boxes[:, 0]) & (boxes[:, 3] > boxes[:, 1])
        boxes = boxes[keep]
        labels = labels[keep]

        # Create target dictionary
        target = {
            "boxes": boxes,
            "labels": labels,
            "image_id": torch.tensor([idx]),
        }

        # Convert from NumPy to PIL before torchvision transforms
        image_pil = Image.fromarray(image)

        if self.transforms:
            image_pil, target = self.transforms(image_pil, target)

        return image_pil, target
    
class ToTensorTransform:
    """Callable transform that converts the image to a tensor and passes the target through."""

    def __call__(self, image, target=None):
        """
        Converts image to tensor and keeps target unchanged.

        Returns ``(image, target)`` when a target is supplied, otherwise just
        the converted image.
        """
        image = F.to_tensor(image)  # Convert image to tensor
        # Compare against None rather than truthiness: an empty target dict is
        # still a target and must not change the return arity.
        if target is None:
            return image
        return image, target

def get_transform(train=True):
    """
    Build the (image, target) transform used by the dataset.

    The `train` flag is accepted but not consulted: the same tensor-conversion
    transform is returned in both modes.
    """
    # A single paired transform is enough, so no Compose wrapper is involved.
    return ToTensorTransform()




In [6]:
def get_faster_rcnn_model(num_classes=2, weights="DEFAULT"):
    """
    Build a Faster R-CNN (ResNet-50 FPN) detector with a fresh box-predictor head.

    Args:
        num_classes: number of output classes including background
            (2 = background + pedestrian).
        weights: forwarded to ``fasterrcnn_resnet50_fpn``. The default loads
            the pre-trained detection weights; pass ``None`` to skip loading
            them. This replaces the deprecated ``pretrained=True`` flag.

    Returns:
        The model with ``roi_heads.box_predictor`` replaced by a new
        ``FastRCNNPredictor`` sized for ``num_classes``.
    """
    # Load the detector (pre-trained unless weights=None)
    model = fasterrcnn_resnet50_fpn(weights=weights)
    # Get the number of input features for the classifier
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    # Replace the head with a new one sized for our classes
    model.roi_heads.box_predictor = torchvision.models.detection.faster_rcnn.FastRCNNPredictor(
        in_features, num_classes
    )
    return model

In [7]:

def train_one_epoch(model, optimizer, data_loader, device):
    """
    Run one training pass over data_loader and return the mean batch loss.

    Args:
        model: called as ``model(images, targets)`` in train mode; must return
            a dict of loss tensors.
        optimizer: optimizer stepped once per batch.
        data_loader: iterable yielding ``(images, targets)`` batches, where
            images is a sequence of tensors and targets a sequence of dicts
            of tensors.
        device: device the batch tensors are moved to.

    Returns:
        Sum of per-batch losses divided by the number of batches processed,
        or 0.0 when the loader yields no batches.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0

    # Wrap the data_loader with tqdm for progress bar
    progress_bar = tqdm(data_loader, desc="Training", leave=False)

    for images, targets in progress_bar:
        images = [img.to(device) for img in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        # Forward pass: the individual loss terms are summed into one scalar
        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        # Backpropagation
        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        # Read the scalar once and reuse it for both the total and the display
        batch_loss = losses.item()
        total_loss += batch_loss
        num_batches += 1

        # Update progress bar description with loss
        progress_bar.set_postfix(loss=batch_loss)

    # Count batches ourselves: this avoids dividing by zero on an empty loader
    # and does not require the loader to implement __len__.
    if num_batches == 0:
        return 0.0
    return total_loss / num_batches



In [None]:
# Seed every RNG involved so the train/val split and training are repeatable
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Suppose you have your MOT17 path
data_dir = "mot17/MOT17Det/train/MOT17-02"
seqinfo_path = os.path.join(data_dir, "seqinfo.ini")
gt_txt_path = os.path.join(data_dir, "gt", "gt.txt")
images_dir = os.path.join(data_dir, "img1")

# Parse data
seqinfo = parse_seqinfo(seqinfo_path)
gt_dict = parse_gt_file(gt_txt_path, keep_classes=[1, 7])  # keep pedestrian classes

# Create dataset
dataset = MOT17PedestrianDataset(
    images_dir=images_dir,
    seqinfo=seqinfo,
    gt_dict=gt_dict,
    transforms=get_transform(train=True),
    resize=(640, 360)  # example: scale down images for faster training
)
if len(dataset) == 0:
    # Fail early with a readable message instead of an opaque sampler error.
    raise RuntimeError(f"No frames found in {images_dir}; check data_dir and seqinfo.ini")

# Split into train/val (80/20) on shuffled indices
indices = list(range(len(dataset)))
random.shuffle(indices)
split_at = int(0.8 * len(indices))
train_indices = indices[:split_at]
val_indices   = indices[split_at:]

train_dataset = torch.utils.data.Subset(dataset, train_indices)
val_dataset   = torch.utils.data.Subset(dataset, val_indices)


def detection_collate(batch):
    """Regroup [(image, target), ...] into [images, targets] without stacking.

    Each sample carries its own number of boxes, so the targets cannot be
    stacked into one tensor. A named function (unlike a lambda) can also be
    pickled if worker processes are enabled later.
    """
    return list(zip(*batch))


# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True, collate_fn=detection_collate)
val_loader   = DataLoader(val_dataset, batch_size=2, shuffle=False, collate_fn=detection_collate)

# Pick the best available device: CUDA, then Apple MPS, then CPU.
# getattr guards PyTorch builds that do not ship the MPS backend module.
mps_backend = getattr(torch.backends, "mps", None)
if torch.cuda.is_available():
    device = torch.device("cuda")
elif mps_backend is not None and mps_backend.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Get model
model = get_faster_rcnn_model(num_classes=2)
model.to(device)

# Set up optimizer
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)

# Train for a few epochs
num_epochs = 5
for epoch in range(num_epochs):
    loss_train = train_one_epoch(model, optimizer, train_loader, device)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss_train:.4f}")

    # You could add a validation pass here...


Training:  10%|█         | 24/240 [54:27<10:07:11, 168.66s/it, loss=1.36] 