In [None]:
from zod import ZodFrames
import zod.constants as constants
from zod.constants import AnnotationProject, Anonymization

import pandas as pd
from concurrent.futures import ProcessPoolExecutor, as_completed
import plotly.express as px
from typing import List

import os
import sys

# Make the repository root (the parent of the current working directory)
# importable so that the local `utils` package can be found.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
# Re-running this cell must not stack duplicate entries on sys.path, so any
# existing occurrence is dropped before the directory is placed at the front
# (front position keeps the local package ahead of same-named installed ones).
sys.path[:] = [parent_dir] + [entry for entry in sys.path if entry != parent_dir]

from utils.dataset_transformer import convert_zod_to_pandas

# Notebook configuration: relative path to the ZOD data and the dataset
# version string; both are passed to ZodFrames below.
dataset_root, version = "../data/zod", "full"

In [None]:
# Open the dataset and look up the frame ids belonging to each split.
zod_frames = ZodFrames(dataset_root=dataset_root, version=version)
training_frame_ids = zod_frames.get_split(constants.TRAIN)
validation_frame_ids = zod_frames.get_split(constants.VAL)

# Report the split sizes.
print(f"Number of training frames: {len(training_frame_ids)}")
print(f"Number of validation frames: {len(validation_frame_ids)}")

# Peek at a few ids; sorting makes the sample stable between runs.
print("The 5 first training frames have the ids:", sorted(training_frame_ids)[:5])

In [None]:
# Arguments common to both conversions. load_buffer_if_available=False is
# passed through unchanged (presumably forces a fresh conversion instead of
# reusing a cached buffer - confirm against utils.dataset_transformer).
shared_convert_args = dict(zod_dataset=zod_frames, load_buffer_if_available=False)

# Training split -> DataFrame plus the accompanying objects structure.
df_train, objects_train = convert_zod_to_pandas(
    ids=training_frame_ids, name="train", **shared_convert_args
)
print(f"Train dataset size {len(df_train)}")

# Validation split, converted the same way.
df_val, objects_val = convert_zod_to_pandas(
    ids=validation_frame_ids, name="val", **shared_convert_args
)
print(f"Val dataset size {len(df_val)}")

In [None]:
# Pie chart of how the rows of df_train are distributed over country codes.
field = "country_code"

# One row per distinct value of `field`, with its number of occurrences.
df_count = (
    df_train
    .groupby([field])
    .size()
    .reset_index(name="Count")
)

fig = px.pie(
    df_count,
    names=field,
    values="Count",
    labels={field: field, "Count": "Count"},
    title=f"Distribution of {field} in Train Data",
)
fig.update_layout(height=800)
fig.show()

In [None]:
def plot_categories(df_combined, categorical_fields):
    """Plot per-split distributions of the given columns.

    For every name in ``categorical_fields`` a stacked bar chart is shown with
    the number of rows per (value, dataset) combination. If ``df_combined``
    has a ``solar_angle_elevation`` column, an overlaid histogram of that
    column, coloured by dataset, is shown afterwards.

    Args:
        df_combined: DataFrame containing a ``dataset`` column together with
            every column named in ``categorical_fields``.
        categorical_fields: Iterable of column names to chart.

    Returns:
        None. Figures are displayed via ``fig.show()``.
    """
    split_column = "dataset"

    for column in categorical_fields:
        # Row count for each (column value, split) pair.
        counts = (
            df_combined
            .groupby([column, split_column])
            .size()
            .reset_index(name="Count")
        )
        bar_fig = px.bar(
            counts,
            x=column,
            y="Count",
            color=split_column,
            barmode="stack",
            labels={column: column, "Count": "Count"},
            title=f"Distribution of {column} in Train and Validation Data",
        )
        bar_fig.show()

    solar_column = "solar_angle_elevation"
    # The histogram is optional: nothing more to draw without the column.
    if solar_column not in df_combined.columns:
        return

    hist_fig = px.histogram(
        df_combined,
        x=solar_column,
        color=split_column,
        barmode="overlay",
        title="Distribution of Solar Angle Elevation in Train and Validation Data",
        labels={"solar_angle_elevation": "Solar Angle Elevation"},
    )
    hist_fig.show()

# Tag each frame with the split it came from (adds a "dataset" column to
# df_train and df_val in place), then stack them into a single frame.
df_train["dataset"], df_val["dataset"] = "train", "val"
df_combined = pd.concat([df_train, df_val], ignore_index=True)

# Only country_code is charted for now. Further field names that were listed
# here but are currently disabled:
#   "road_condition", "road_type", "scraped_weather", "time_of_day"
categorical_fields = ["country_code"]
plot_categories(df_combined, categorical_fields)