In [None]:
import json
import pandas as pd

# Load labels
# JSON text is UTF-8 by specification. Passing the encoding explicitly keeps
# the read independent of the platform's locale default, which open() would
# otherwise fall back to.
with open("bdd100k_labels_images_train.json", encoding="utf-8") as f:
    data = json.load(f)

# Flatten into a DataFrame: one row per label, tagged with its image name.
# Missing pieces are tolerated instead of raising part-way through the file:
#   * an image whose "labels" entry is absent or null contributes no rows;
#   * a label whose "attributes" entry is absent or null gets None flags;
#   * a label without "box2d" (or with an incomplete one) gets None coordinates.
record_columns = [
    "image", "category", "occluded", "truncated", "crowd",
    "x1", "y1", "x2", "y2",
]
records = []
for item in data:
    name = item["name"]
    for label in item.get("labels") or []:
        attributes = label.get("attributes") or {}
        box = label.get("box2d") or {}

        records.append({
            "image": name,
            "category": label.get("category"),
            "occluded": attributes.get("occluded"),
            "truncated": attributes.get("truncated"),
            "crowd": attributes.get("crowd"),
            "x1": box.get("x1"),
            "y1": box.get("y1"),
            "x2": box.get("x2"),
            "y2": box.get("y2"),
        })

# Pinning the columns keeps the schema stable even when `records` is empty;
# without it an empty list yields a column-less frame and every later
# df["..."] lookup fails with a KeyError.
df = pd.DataFrame(records, columns=record_columns)
df.head()


In [None]:
# Summary of every column, numeric and non-numeric alike
summary_all = df.describe(include="all")
print(summary_all)

# Class distribution: number of labels per category
category_counts = df["category"].value_counts()
print(category_counts)

# Bounding box width/height, derived from the corner coordinates.
# These two columns are added to `df` itself, so later cells can use them.
df["width"] = df["x2"].sub(df["x1"])
df["height"] = df["y2"].sub(df["y1"])
box_size_stats = df[["width", "height"]].describe()
print(box_size_stats)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Category distribution, bars ordered from the most to the least frequent
category_order = df["category"].value_counts().index
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x="category", order=category_order, ax=ax)
plt.setp(ax.get_xticklabels(), rotation=45)
ax.set_title("Object Category Distribution")
plt.show()

# Bounding box sizes: width and height distributions overlaid on one axis
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(df["width"], bins=50, kde=True, color="blue", label="Width", ax=ax)
sns.histplot(df["height"], bins=50, kde=True, color="red", label="Height", ax=ax)
# Both series share the x-axis, so give it a neutral label; left alone,
# seaborn names the axis after only one of the two columns.
ax.set_xlabel("Bounding box width / height")
ax.legend()
ax.set_title("Distribution of Bounding Box Sizes")
plt.show()

# Correlation heatmap of the box extents and the per-label attribute flags
corr_columns = ["width", "height", "occluded", "truncated", "crowd"]
corr_matrix = df[corr_columns].corr()

fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", ax=ax)
ax.set_title("Feature Correlation Heatmap")
plt.show()


In [None]:
# New features derived from the box extents (added to `df` itself)
df["area"] = df["width"].mul(df["height"])
# The small epsilon in the denominator avoids dividing by zero
df["aspect_ratio"] = df["width"].div(df["height"].add(1e-6))

# Encode categories as one-hot columns.
# NOTE(review): df_encoded is not used by the scaling step below, which
# starts from `df` - confirm whether that is intended.
df_encoded = pd.get_dummies(df, columns=["category"])

# Scale numerical features on a copy, leaving `df` unscaled
from sklearn.preprocessing import StandardScaler
num_cols = ["width", "height", "area", "aspect_ratio"]
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[num_cols])
df_scaled = df.copy()
df_scaled[num_cols] = scaled_values

df_scaled.head()


In [None]:
# Labels per image; count() tallies the non-null category values in each group
obj_count = df.groupby("image")["category"].count()

fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(obj_count, bins=50, kde=False, ax=ax)
ax.set_title("Distribution of Object Counts per Image")
ax.set_xlabel("Objects per Image")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
# The same chart drawn twice: label counts per category, split first by the
# "occluded" flag and then by the "truncated" flag.
category_order = df["category"].value_counts().index
flag_charts = (
    ("occluded", "Occlusion by Category"),
    ("truncated", "Truncation by Category"),
)
for flag, chart_title in flag_charts:
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.countplot(data=df, x="category", hue=flag, order=category_order, ax=ax)
    plt.setp(ax.get_xticklabels(), rotation=45)
    ax.set_title(chart_title)
    plt.show()

In [None]:
# Box area per category, drawn on a logarithmic y-axis
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, x="category", y="area", ax=ax)
ax.set_yscale("log")
plt.setp(ax.get_xticklabels(), rotation=45)
ax.set_title("Bounding Box Area Distribution per Category")
plt.show()

In [None]:
# One density curve per category; common_norm=False normalises each
# category's curve on its own rather than across all categories together.
fig, ax = plt.subplots(figsize=(10, 6))
sns.kdeplot(
    data=df,
    x="aspect_ratio",
    hue="category",
    common_norm=False,
    fill=True,
    alpha=0.3,
    ax=ax,
)
# Restrict the view to aspect ratios between 0 and 5
ax.set_xlim(0, 5)
ax.set_title("Aspect Ratio Distribution by Category")
plt.show()

In [None]:
import numpy as np

# 2-D histogram of where bounding-box top-left corners fall.
# Labels without a box2d carry NaN coordinates, and np.histogram2d raises
# ValueError (its auto-detected range is not finite) on NaN input, so only
# rows that have both coordinates are binned.
corners = df[["x1", "y1"]].dropna()
heatmap, xedges, yedges = np.histogram2d(corners["x1"], corners["y1"], bins=(50,50))

plt.figure(figsize=(8,6))
# histogram2d indexes its counts as [x_bin, y_bin]; transpose so that rows
# run along y. `extent` maps the bin grid onto the actual coordinate range,
# so the axes show positions rather than bin indices 0-49.
# NOTE(review): origin="lower" puts small y at the bottom. If y1 is measured
# from the top edge, as is usual for image pixel coordinates, this view is
# vertically mirrored relative to the image - confirm the wanted orientation.
plt.imshow(
    heatmap.T,
    origin="lower",
    extent=[xedges[0], xedges[-1], yedges[0], yedges[-1]],
    cmap="hot",
    interpolation="nearest",
)
plt.colorbar(label="Frequency")
plt.title("Heatmap of Object Top-Left Corner Locations")
plt.xlabel("X position")
plt.ylabel("Y position")
plt.show()

In [None]:
# Pairplot of the engineered features on a random subset (sampled for speed).
#   * Rows with a missing feature value are dropped first so they do not use
#     up sample slots.
#   * The sample size is capped at the rows available, because
#     DataFrame.sample raises when asked for more rows than exist
#     (sampling without replacement).
#   * A fixed random_state makes every re-run draw the same subset.
feature_rows = df[["width","height","area","aspect_ratio"]].dropna()
pairplot_sample = feature_rows.sample(n=min(2000, len(feature_rows)), random_state=42)
sns.pairplot(pairplot_sample)
plt.suptitle("Pairplot of Engineered Features", y=1.02)
plt.show()

In [None]:
# Width vs height scatter on a random subset of the boxes.
#   * Rows without a width/height cannot be plotted, so they are dropped
#     before sampling.
#   * The sample size is capped at the rows available, because
#     DataFrame.sample raises when asked for more rows than exist
#     (sampling without replacement).
#   * A fixed random_state makes every re-run draw the same subset.
boxed_rows = df.dropna(subset=["width", "height"])
scatter_sample = boxed_rows.sample(n=min(10000, len(boxed_rows)), random_state=42)

plt.figure(figsize=(10,6))
sns.scatterplot(x="width", y="height", hue="category", alpha=0.3, data=scatter_sample)
plt.title("Width vs Height of Bounding Boxes by Category")
plt.show()