In [1]:
import pandas as pd
import os

### Data preparing


In [2]:
LABELS_PATH = (
    "../../../data/open_image_dataset/meta/{}-annotations-human-imagelabels-boxable.csv"
)
labels_df = pd.concat(
    (
        pd.read_csv(LABELS_PATH.format("train")),
        pd.read_csv(LABELS_PATH.format("test")),
        pd.read_csv(LABELS_PATH.format("validation")),
    )
)

In [3]:
LABELS_NAMES_PATH = "../../../data/open_image_dataset/meta/oidv6-class-descriptions.csv"

labels_names_df = pd.read_csv(LABELS_NAMES_PATH)

In [4]:
BOXES_PATH = "../../../data/open_image_dataset/meta/oidv6-{}-annotations-bbox.csv"
boxes_df = pd.concat(
    (
        pd.read_csv(BOXES_PATH.format("train")),
        pd.read_csv(BOXES_PATH.format("test")),
        pd.read_csv(BOXES_PATH.format("validation")),
    )
)

In [5]:
IMAGES_LIST_FILE = "../../../data/open_image_dataset/images.txt"

with open(IMAGES_LIST_FILE, "r") as file:
    data = file.readlines()

data = list(map(lambda x: x.strip(), data))
images_ids = list(map(lambda x: x.split("/")[1], data))

In [6]:
id_name_df = pd.merge(
    labels_df[["ImageID", "LabelName"]], labels_names_df, on="LabelName"
)[["ImageID", "DisplayName"]]

id_name_df = id_name_df[id_name_df["ImageID"].isin(images_ids)]

id_name_df

Unnamed: 0,ImageID,DisplayName
0,000002b66c9c498e,Apple
1,000002b66c9c498e,Human eye
2,000002b66c9c498e,Beer
3,000002b66c9c498e,Bird
4,000002b66c9c498e,Cucumber
...,...,...
10026222,fff149d613bacda0,Toy
10026223,fff149d613bacda0,Human nose
10026224,fff149d613bacda0,Candy
10026225,fff149d613bacda0,Human mouth


In [7]:
df = pd.merge(
    id_name_df,
    boxes_df[["ImageID", "XMin", "XMax", "YMin", "YMax"]],
    on="ImageID",
    how="inner",
)

df = df.rename(columns={"ImageID": "image_id", "DisplayName": "class_name"})
df["bbox"] = df.apply(lambda r: [r["XMin"], r["YMin"], r["XMax"], r["YMax"]], axis=1)
df = df[["image_id", "class_name", "bbox"]]

In [8]:
df

Unnamed: 0,image_id,class_name,bbox
0,000002b66c9c498e,Apple,"[0.0125, 0.148438, 0.195312, 0.5875]"
1,000002b66c9c498e,Apple,"[0.025, 0.714063, 0.276563, 0.948438]"
2,000002b66c9c498e,Apple,"[0.151562, 0.198437, 0.310937, 0.590625]"
3,000002b66c9c498e,Apple,"[0.25625, 0.651563, 0.429688, 0.925]"
4,000002b66c9c498e,Apple,"[0.257812, 0.235938, 0.346875, 0.385938]"
...,...,...,...
1315907,fff149d613bacda0,Candy,"[0.0, 0.0, 0.9985251, 0.99557525]"
1315908,fff149d613bacda0,Human mouth,"[0.0, 0.0, 0.9985251, 0.99557525]"
1315909,fff149d613bacda0,Human mouth,"[0.0, 0.0, 0.9985251, 0.99557525]"
1315910,fff149d613bacda0,Human head,"[0.0, 0.0, 0.9985251, 0.99557525]"


In [9]:
IMAGES_DIR = "../../data/oid/images"

df["image_path"] = df["image_id"].apply(lambda x: os.path.join(IMAGES_DIR, x + ".jpg"))
df = df.drop("image_id", axis=1)

In [None]:
df["class"], _ = pd.factorize(df["class_name"])
df = df[["class", "class_name", "image_path", "bbox"]]

# unique images
df = df.groupby("image_path", as_index=False).first()
df = df[["class", "class_name", "image_path", "bbox"]]

In [11]:
# classes filtering
min_quantile = 0.3
max_quantile = 0.95
save_raw_counters = True
save_filtered_counters = False

counter_df = df["class_name"].value_counts().reset_index()

if save_raw_counters:
    counter_df.to_csv("tmp/all_classes.csv", index=False)

counter_df = counter_df[
    (counter_df["count"] > counter_df["count"].quantile(min_quantile))
    & (counter_df["count"] < counter_df["count"].quantile(max_quantile))
]

if save_filtered_counters:
    counter_df.to_csv("tmp/accepted_classes.csv", index=False)

In [12]:
# after manual filtering
counter_df = pd.read_csv("tmp/accepted_classes.csv")
df = df[df["class_name"].isin(counter_df["class_name"])]

In [13]:
df

Unnamed: 0,class,class_name,image_path,bbox
0,0,Apple,../../data/oid/images/000002b66c9c498e.jpg,"[0.0125, 0.148438, 0.195312, 0.5875]"
1,0,Apple,../../data/oid/images/000002b66c9c498e.jpg,"[0.025, 0.714063, 0.276563, 0.948438]"
2,0,Apple,../../data/oid/images/000002b66c9c498e.jpg,"[0.151562, 0.198437, 0.310937, 0.590625]"
3,0,Apple,../../data/oid/images/000002b66c9c498e.jpg,"[0.25625, 0.651563, 0.429688, 0.925]"
4,0,Apple,../../data/oid/images/000002b66c9c498e.jpg,"[0.257812, 0.235938, 0.346875, 0.385938]"
...,...,...,...,...
1315891,224,Shrimp,../../data/oid/images/ffdf3b4d9237c25d.jpg,"[0.021875, 0.05625, 0.946875, 0.8125]"
1315892,74,Fish,../../data/oid/images/ffdf3b4d9237c25d.jpg,"[0.0203125, 0.05625, 0.9453125, 0.81041664]"
1315893,74,Fish,../../data/oid/images/ffdf3b4d9237c25d.jpg,"[0.021875, 0.05625, 0.946875, 0.8125]"
1315906,118,Candy,../../data/oid/images/fff149d613bacda0.jpg,"[0.0, 0.0, 0.9985251, 0.99557525]"


In [14]:
df.to_parquet("../../../data/open_image_dataset/train.parquet")