In [None]:
# dataset: https://storage.googleapis.com/openimages/web/factsfigures_v7.html

import pandas as pd
from os.path import join

### Params


In [2]:
WORK_DIR = "../.."
LABELS_PATH = join(
    WORK_DIR,
    "data/open_image_dataset/meta/{}-annotations-human-imagelabels-boxable.csv",
)
LABELS_NAMES_PATH = join(
    WORK_DIR, "data/open_image_dataset/meta/oidv6-class-descriptions.csv"
)
BOXES_PATH = join(
    WORK_DIR, "data/open_image_dataset/meta/oidv6-{}-annotations-bbox.csv"
)
IMAGES_LIST_FILE = join(WORK_DIR, "data/open_image_dataset/images.txt")
TRAIN_IMAGES_DIR = join(WORK_DIR, "data/oid/images")
ALL_CLASSES_TABLE = join(WORK_DIR, "data/classes/detection/all_classes.csv")
ACCEPTED_CLASSES_TABLE = join(WORK_DIR, "data/classes/detection/accepted_classes.csv")
FINAL_TABLE = join(WORK_DIR, "data/open_image_dataset/train.parquet")

### Loading


In [3]:
# load labels
labels_df = pd.concat(
    (
        pd.read_csv(LABELS_PATH.format("train")),
        pd.read_csv(LABELS_PATH.format("test")),
        pd.read_csv(LABELS_PATH.format("validation")),
    )
)

In [4]:
# load labels names
labels_names_df = pd.read_csv(LABELS_NAMES_PATH)

In [5]:
# load boxes
boxes_df = pd.concat(
    (
        pd.read_csv(BOXES_PATH.format("train")),
        pd.read_csv(BOXES_PATH.format("test")),
        pd.read_csv(BOXES_PATH.format("validation")),
    )
)

In [6]:
# load images list
with open(IMAGES_LIST_FILE, "r") as file:
    data = file.readlines()

data = list(map(lambda x: x.strip(), data))
images_ids = list(map(lambda x: x.split("/")[1], data))

### Preprocessing


In [7]:
# merge images and names
id_name_df = pd.merge(
    labels_df[["ImageID", "LabelName"]], labels_names_df, on="LabelName"
)[["ImageID", "DisplayName"]]

id_name_df = id_name_df[id_name_df["ImageID"].isin(images_ids)]

id_name_df

Unnamed: 0,ImageID,DisplayName
0,000002b66c9c498e,Apple
1,000002b66c9c498e,Human eye
2,000002b66c9c498e,Beer
3,000002b66c9c498e,Bird
4,000002b66c9c498e,Cucumber
...,...,...
10026222,fff149d613bacda0,Toy
10026223,fff149d613bacda0,Human nose
10026224,fff149d613bacda0,Candy
10026225,fff149d613bacda0,Human mouth


In [8]:
# merge boxes
df = pd.merge(
    id_name_df,
    boxes_df[["ImageID", "XMin", "XMax", "YMin", "YMax"]],
    on="ImageID",
    how="inner",
)

df = df.rename(columns={"ImageID": "image_id", "DisplayName": "class_name"})
df["bbox"] = df.apply(lambda r: [r["XMin"], r["YMin"], r["XMax"], r["YMax"]], axis=1)
df = df[["image_id", "class_name", "bbox"]]

In [9]:
df

Unnamed: 0,image_id,class_name,bbox
0,000002b66c9c498e,Apple,"[0.0125, 0.148438, 0.195312, 0.5875]"
1,000002b66c9c498e,Apple,"[0.025, 0.714063, 0.276563, 0.948438]"
2,000002b66c9c498e,Apple,"[0.151562, 0.198437, 0.310937, 0.590625]"
3,000002b66c9c498e,Apple,"[0.25625, 0.651563, 0.429688, 0.925]"
4,000002b66c9c498e,Apple,"[0.257812, 0.235938, 0.346875, 0.385938]"
...,...,...,...
1315907,fff149d613bacda0,Candy,"[0.0, 0.0, 0.9985251, 0.99557525]"
1315908,fff149d613bacda0,Human mouth,"[0.0, 0.0, 0.9985251, 0.99557525]"
1315909,fff149d613bacda0,Human mouth,"[0.0, 0.0, 0.9985251, 0.99557525]"
1315910,fff149d613bacda0,Human head,"[0.0, 0.0, 0.9985251, 0.99557525]"


In [10]:
# normalize paths
df["image_path"] = df["image_id"].apply(lambda x: join(TRAIN_IMAGES_DIR, x + ".jpg"))
df = df.drop("image_id", axis=1)

In [11]:
# numeric classes
df["class"], _ = pd.factorize(df["class_name"])
df = df[["class", "class_name", "image_path", "bbox"]]

# one box per image
df = df.groupby("image_path", as_index=False).first()
df = df[["class", "class_name", "image_path", "bbox"]]

In [12]:
# classes filtering
min_quantile = 0.3
max_quantile = 0.95
save_raw_counters = True
save_filtered_counters = False

counter_df = df["class_name"].value_counts().reset_index()

if save_raw_counters:
    counter_df.to_csv(ALL_CLASSES_TABLE, index=False)

counter_df = counter_df[
    (counter_df["count"] > counter_df["count"].quantile(min_quantile))
    & (counter_df["count"] < counter_df["count"].quantile(max_quantile))
]

if save_filtered_counters:
    counter_df.to_csv(ACCEPTED_CLASSES_TABLE, index=False)

In [13]:
# post manual filtering
counter_df = pd.read_csv(ACCEPTED_CLASSES_TABLE)
df = df[df["class_name"].isin(counter_df["class_name"])]

In [14]:
df

Unnamed: 0,class,class_name,image_path,bbox
0,0,Apple,../../data/oid/images/000002b66c9c498e.jpg,"[0.0125, 0.148438, 0.195312, 0.5875]"
1,0,Apple,../../data/oid/images/0000a16e4b057580.jpg,"[0.00375, 0.268333, 0.433125, 0.759167]"
13,4,Cucumber,../../data/oid/images/000784072165394b.jpg,"[0.10625, 0.810507, 0.27375, 0.998124]"
14,76,Ice cream,../../data/oid/images/000794c94c6d86ea.jpg,"[0.0044247787, 0.19690265, 0.9941003, 0.99336284]"
22,2,Beer,../../data/oid/images/0009d6b2e2f0c698.jpg,"[0.187617, 0.135084, 0.288931, 0.476548]"
...,...,...,...,...
24085,22,Dessert,../../data/oid/images/ffcce9c7edd86436.jpg,"[0.20915, 0.727124, 0.470588, 0.993464]"
24086,4,Cucumber,../../data/oid/images/ffd3ee74971bda07.jpg,"[0.370313, 0.520833, 0.717188, 0.810417]"
24090,244,Crab,../../data/oid/images/ffdf3b4d9237c25d.jpg,"[0.0203125, 0.05625, 0.9453125, 0.81041664]"
24092,62,Bread,../../data/oid/images/ffe04404f471a3e2.jpg,"[0.0, 0.00907441, 1.0, 1.0]"


In [15]:
df.to_parquet(FINAL_TABLE)  # rename this to train_remote