In [None]:
from zod import ZodFrames
import zod.constants as constants
from zod.constants import AnnotationProject, Anonymization

import pandas as pd
import plotly.express as px
from typing import List

# Root directory of the local ZOD copy, relative to this notebook.
dataset_root = "../data/zod"
# Dataset version string passed to ZodFrames when the dataset is opened.
version = "full" 


In [None]:
# Open the ZOD frames dataset and look up the frame ids of the train / val splits.
zod_frames = ZodFrames(dataset_root=dataset_root, version=version)
training_frame_ids = zod_frames.get_split(constants.TRAIN)
validation_frame_ids = zod_frames.get_split(constants.VAL)

print(
    f"Number of training frames: {len(training_frame_ids)}",
    f"Number of validation frames: {len(validation_frame_ids)}",
    sep="\n",
)

# sorted() accepts any iterable, so no intermediate list is required.
print("The 5 first training frames have the ids:", sorted(training_frame_ids)[:5])

In [None]:
def convert_zod_to_pandas(ids, zod_dataset):
    """Collect per-frame ZOD metadata into a pandas DataFrame.

    Args:
        ids: Iterable of frame ids to look up.
        zod_dataset: Object indexable by frame id whose items expose a
            ``metadata`` attribute (e.g. a ``ZodFrames`` instance).

    Returns:
        A DataFrame with one row per id and the columns ``frame_id``,
        ``country_code``, ``road_condition``, ``road_type``,
        ``scraped_weather``, ``time_of_day`` and ``solar_angle_elevation``.
        The columns are present even when ``ids`` is empty, so downstream
        ``groupby`` calls keep working on an empty split.
    """
    # solar_angle_elevation is included so that plot_categories can draw its
    # histogram; it is appended last to keep the original column order intact.
    columns = [
        "frame_id",
        "country_code",
        "road_condition",
        "road_type",
        "scraped_weather",
        "time_of_day",
        "solar_angle_elevation",
    ]

    records = []
    for frame_id in ids:
        metadata = zod_dataset[frame_id].metadata
        records.append({column: getattr(metadata, column) for column in columns})

    # Passing `columns` explicitly fixes the schema when `records` is empty.
    return pd.DataFrame(records, columns=columns)

In [None]:
# Build one metadata table per split and report how many rows each one holds.
df_train = convert_zod_to_pandas(ids=training_frame_ids, zod_dataset=zod_frames)
print(f"Train dataset size {df_train.shape[0]}")
df_val = convert_zod_to_pandas(ids=validation_frame_ids, zod_dataset=zod_frames)
print(f"Val dataset size {df_val.shape[0]}")

## Visual statistics - all frames

In [None]:
# Preview the first ten rows of the training metadata table.
df_train.head(10)

In [None]:
def plot_categories(df_combined, categorical_fields):
    """Plot how each categorical field is distributed across the splits.

    Args:
        df_combined: DataFrame holding the fields to plot plus a ``dataset``
            column that names the split each row belongs to.
        categorical_fields: Column names to draw a stacked bar chart for.

    One stacked bar chart is shown per field. If ``df_combined`` also has a
    ``solar_angle_elevation`` column, an overlaid histogram of it is shown last.
    """
    for field in categorical_fields:
        # Number of rows per (field value, split) pair.
        counts = (
            df_combined.groupby([field, "dataset"])
            .size()
            .reset_index(name="Count")
        )
        bar_chart = px.bar(
            counts,
            x=field,
            y="Count",
            color="dataset",
            barmode="stack",
            labels={field: field, "Count": "Count"},
            title=f"Distribution of {field} in Train and Validation Data",
        )
        bar_chart.show()

    # The histogram is optional: skip it when the column is absent.
    if "solar_angle_elevation" not in df_combined.columns:
        return

    histogram = px.histogram(
        df_combined,
        x="solar_angle_elevation",
        color="dataset",
        barmode="overlay",
        title="Distribution of Solar Angle Elevation in Train and Validation Data",
        labels={"solar_angle_elevation": "Solar Angle Elevation"},
    )
    histogram.show()

# Tag every row with its split name (this adds a "dataset" column to both
# tables in place), stack the two tables and plot each categorical column.
df_train["dataset"], df_val["dataset"] = "train", "val"
df_combined = pd.concat((df_train, df_val), ignore_index=True)

categorical_fields = [
    "country_code",
    "road_condition",
    "road_type",
    "scraped_weather",
    "time_of_day",
]
plot_categories(df_combined=df_combined, categorical_fields=categorical_fields)

## Annotation filtering

### Modified JSON conversion code

In [None]:
"""Copied and modified from zod/cli/generate_coco_json.py"""
"""Modifications: 
    - filter frames by countries
"""
"""This module will generate a COCO JSON file from the ZOD dataset."""
import json
import os
from functools import partial
from pathlib import Path
from typing import List, Tuple

import typer
from tqdm.contrib.concurrent import process_map

from zod import ZodFrames
from zod.anno.object import OBJECT_CLASSES, ObjectAnnotation
from zod.constants import AnnotationProject, Anonymization
from zod.data_classes.frame import ZodFrame
from zod.utils.utils import str_from_datetime

# Give every object class a category id, numbering from 1 in listed order.
CATEGORY_NAME_TO_ID = {name: idx for idx, name in enumerate(OBJECT_CLASSES, start=1)}
# Dataset landing page; written to the "info" section of the generated JSON.
OPEN_DATASET_URL = "https://www.ai.se/en/data-factory/datasets/data-factory-datasets/zenseact-open-dataset"


def _convert_frame(
    frame: ZodFrame,
    classes: List[str],
    anonymization: Anonymization,
    use_png: bool,
    allowed_country_codes: List[str],
) -> Tuple[dict, List[dict]]:
    """Convert one ZOD frame into a COCO image entry and its annotation entries.

    Args:
        frame: Frame to convert.
        classes: Object class names to keep; objects of other classes are dropped.
        anonymization: Anonymization variant used to pick the key camera frame.
        use_png: If true, ".jpg" in the image path is replaced with ".png".
        allowed_country_codes: Country codes to keep. An empty list disables
            the country filter.

    Returns:
        ``(image_dict, annotation_dicts)``. For a frame whose country code is
        not allowed the result is ``(None, [])``, so callers must be prepared
        for a ``None`` image entry.
    """
    # Country filter: only evaluated when a non-empty allow-list was given.
    if allowed_country_codes:
        if frame.metadata.country_code not in allowed_country_codes:
            return None, []

    objects: List[ObjectAnnotation] = frame.get_annotation(AnnotationProject.OBJECT_DETECTION)
    camera_frame = frame.info.get_key_camera_frame(anonymization=anonymization)

    # Start from the camera frame's path and rewrite it for the requested
    # anonymization variant / file extension.
    file_name = camera_frame.filepath
    if anonymization == Anonymization.ORIGINAL:
        file_name = file_name.replace(Anonymization.BLUR.value, Anonymization.ORIGINAL.value)
    if use_png:
        file_name = file_name.replace(".jpg", ".png")

    image_id = int(frame.info.id)
    metadata = frame.metadata
    image_dict = {
        "id": image_id,
        "license": 1,
        "file_name": file_name,
        "height": camera_frame.height,
        "width": camera_frame.width,
        "date_captured": str_from_datetime(frame.info.keyframe_time),
        "metadata": {
            "country_code": metadata.country_code,
            "latitude": metadata.latitude,
            "longitude": metadata.longitude,
            "road_condition": metadata.road_condition,
            "road_type": metadata.road_type,
            "scraped_weather": metadata.scraped_weather,
            "solar_angle_elevation": metadata.solar_angle_elevation,
            "time_of_day": metadata.time_of_day,
        },
    }

    anno_dicts = []
    for obj_idx, obj in enumerate(objects):
        if obj.name not in classes:
            continue
        box = obj.box2d
        anno_dicts.append(
            {
                # Annotation ids stay unique across frames as long as a frame
                # has fewer than 1000 objects. obj_idx counts all objects of
                # the frame, including the ones filtered out by class.
                "id": image_id * 1000 + obj_idx,
                "image_id": image_id,
                "category_id": CATEGORY_NAME_TO_ID[obj.name],
                "bbox": [round(val, 2) for val in box.xywh.tolist()],
                "area": round(box.area, 2),
                "iscrowd": obj.subclass == "Unclear",
                # NOTE(review): the key is spelled "occusion_level" (sic). It is
                # kept unchanged because it is part of the emitted JSON schema;
                # rename it together with every consumer of the file.
                "occusion_level": obj.occlusion_level,
            }
        )
    return image_dict, anno_dicts


def generate_coco_json(
    dataset: ZodFrames,
    split: str,
    classes: List[str],
    allowed_country_codes: List[str],
    anonymization: Anonymization,
    use_png: bool,
) -> dict:
    """Generate a COCO-style dictionary from one split of the ZOD dataset.

    Args:
        dataset: ZOD frames dataset to read from.
        split: Split to export, either "train" or "val".
        classes: Object class names to include in annotations and categories.
        allowed_country_codes: Country codes to keep; an empty list keeps all
            frames.
        anonymization: Anonymization variant forwarded to ``_convert_frame``.
        use_png: Forwarded to ``_convert_frame`` (switches ".jpg" to ".png").

    Returns:
        A dict with the keys "images", "annotations", "info", "licenses" and
        "categories". Frames rejected by the country filter are left out of
        "images" entirely.

    Raises:
        AssertionError: If ``split`` is not "train" or "val".
    """
    assert split in ["train", "val"], f"Unknown split: {split}"
    frames = [dataset[frame_id] for frame_id in dataset.get_split(split)]
    convert = partial(
        _convert_frame,
        classes=classes,
        allowed_country_codes=allowed_country_codes,
        anonymization=anonymization,
        use_png=use_png,
    )
    results = process_map(
        convert,
        frames,
        desc=f"Converting {split} frames",
        chunksize=50 if dataset._version == "full" else 1,
    )

    # _convert_frame returns (None, []) for frames rejected by the country
    # filter. Drop those here so "images" never contains null entries. The
    # explicit loop also copes with an empty result list, which the previous
    # zip(*results) unpacking could not.
    image_dicts: List[dict] = []
    anno_dicts: List[dict] = []
    for image_dict, annos in results:
        if image_dict is None:
            continue
        image_dicts.append(image_dict)
        anno_dicts.extend(annos)

    coco_json = {
        "images": image_dicts,
        "annotations": anno_dicts,
        "info": {
            "description": "Zenseact Open Dataset",
            "url": OPEN_DATASET_URL,
            "version": dataset._version,  # TODO: add dataset versioning
            "year": 2022,
            "contributor": "ZOD team",
            "date_created": "2022/12/15",
        },
        "licenses": [
            {
                "url": "https://creativecommons.org/licenses/by-sa/4.0/",
                "name": "Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)",
                "id": 1,
            },
        ],
        "categories": [
            {"supercategory": "object", "id": category_id, "name": category_name}
            for category_name, category_id in CATEGORY_NAME_TO_ID.items()
            if category_name in classes
        ],
    }
    return coco_json

### Results

In [None]:
# Cell runnable code (if `zod_frames` variable has been already created)
# Parameters
anonymization = Anonymization.BLUR
classes = ["Vehicle", "Pedestrian", "VulnerableVehicle", "Animal"]
use_png = False
output_dir = os.path.abspath(os.path.join(dataset_root, "coco"))
version = "full"
allowed_country_codes = []  # an empty list keeps frames from every country
allowed_country_codes_str = ",".join(allowed_country_codes)
# NOTE(review): `split` is not read below; both splits are exported explicitly.
split = "train"

# Build the file name from the enum's string value rather than the enum object
# itself, and add the country suffix only when a country filter is active, so
# the name has no dangling "_" when the filter is empty.
base_name = f"zod_{version}_{anonymization.value}"
if allowed_country_codes_str:
    base_name += f"_{allowed_country_codes_str}"
if use_png:
    base_name += "_png"

# open() does not create missing directories, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)

coco_json_train = generate_coco_json(
    zod_frames,
    split="train",
    classes=classes,
    allowed_country_codes=allowed_country_codes,
    anonymization=anonymization,
    use_png=use_png,
)
with open(os.path.join(output_dir, f"{base_name}_train.json"), "w") as f:
    json.dump(coco_json_train, f)

coco_json_val = generate_coco_json(
    zod_frames,
    split="val",
    classes=classes,
    allowed_country_codes=allowed_country_codes,
    anonymization=anonymization,
    use_png=use_png,
)
with open(os.path.join(output_dir, f"{base_name}_val.json"), "w") as f:
    json.dump(coco_json_val, f)

## 

## YOLO conversion

In [None]:
from ultralytics.data.converter import convert_coco

# Run the ultralytics COCO converter on the annotation files in labels_dir.
# NOTE(review): the export cell writes its JSON files to
# os.path.join(dataset_root, "coco"), i.e. "../data/zod/coco", which is not this
# directory. Confirm the files are copied here, or point labels_dir at the
# export directory.
convert_coco(labels_dir="../data/coco/annotations/")

## Other, unfinished

In [None]:
# Classes
classes = ["Vehicle", "VulnerableVehicle", "Pedestrian", "Animal"]
# Class name -> index lookup. This must be a dict (not a set of tuples) because
# it is indexed by obj.name below.
class_ids = {cls_name: idx for idx, cls_name in enumerate(classes)}

# Optional occlusion filter. The valid occlusion level values are not known
# yet, so None means "keep every level"; assign a collection of levels to filter.
occlusion_levels = None

# TODO: also filter on ObjectAnnotation.unclear == False once the available
# annotation fields are confirmed.

for frame_id in training_frame_ids:
    frame = zod_frames[frame_id]
    objs = frame.get_annotation(AnnotationProject.OBJECT_DETECTION)
    # One [class index, box corners] pair per kept object.
    # NOTE(review): assumes the 2D box exposes `xyxy` next to the `xywh` used by
    # the COCO export — confirm against the zod devkit.
    anno_list = [
        [class_ids[obj.name], obj.box2d.xyxy]
        for obj in objs
        if obj.name in classes
        and (occlusion_levels is None or obj.occlusion_level in occlusion_levels)
    ]
    print(frame_id)


## Filter by country

In [None]:
# Group the training metadata rows by country code.
df_train_country = df_train.groupby("country_code")