In [1]:
import os
import json
import cv2
import shutil
import pandas as pd
from sklearn.model_selection import train_test_split

# Remember the directory the notebook was started from and (re)select it as
# the working directory, so the relative paths below resolve against it.
HOME = os.getcwd()
os.chdir(HOME)

# Input locations, relative to HOME: the annotation JSON folder and the
# image folder.
ann_dir, img_dir = "./data/annos", "./data/image"

# Dataset

In [2]:
# Collect one record per annotated item from every annotation file.
# Files are visited in sorted order so the resulting row order is reproducible.
records = []
for ann in sorted(os.listdir(ann_dir)):
    ann_path = os.path.join(ann_dir, ann)
    # Skip sub-folders and hidden entries (e.g. notebook checkpoints, OS
    # metadata); they are not annotation files and would make open/json fail.
    if ann.startswith(".") or not os.path.isfile(ann_path):
        continue
    with open(ann_path) as file:
        data = json.load(file)
    for item in data.values():
        # Only mapping-valued entries can carry item fields; any scalar
        # top-level value cannot be tagged with an id, so it is skipped.
        if not isinstance(item, dict):
            continue
        # Tag each item with the file it came from; the extension is stripped below.
        records.append({**item, "id": ann})

df = pd.DataFrame(records, columns=["id", "category_id", "category_name", "bounding_box"])

# Per-image pixel dimensions, used later to normalise the bounding boxes.
temp = pd.read_csv("FullShot.csv", usecols=["id", "height", "width"])

# Reduce both id columns to the bare file stem so they can be joined.
# splitext removes only the final extension (ids containing dots survive),
# and str() tolerates ids that pandas parsed as numbers.
df["id"] = df["id"].map(lambda name: os.path.splitext(str(name))[0])
temp["id"] = temp["id"].map(lambda name: os.path.splitext(str(name))[0])

# The merge below is an inner join: items without a size row disappear.
# Report them instead of dropping them silently.
unmatched = ~df["id"].isin(temp["id"])
if unmatched.any():
    print(f"{unmatched.sum()} annotated item(s) have no entry in FullShot.csv and are dropped")

df = df.merge(temp, on="id")

In [3]:
# Convert corner-style boxes into YOLO rows (class, x_center, y_center, width,
# height), with coordinates expressed as fractions of the image size.
#
# `df` is only read here, never modified: its "height"/"width" columns keep
# the image dimensions and "category_id" keeps its original value, so this
# cell can be re-run without normalising or shifting the values twice.
bounding_box = df["bounding_box"]
x_min, y_min, x_max, y_max = (
    bounding_box.map(lambda box, pos=pos: box[pos]) for pos in range(4)
)

# Box size in the same (un-normalised) units as the corners.
box_width = x_max - x_min
box_height = y_max - y_min

# New, independent dataframe in the column order required by YOLO.
final_df = pd.DataFrame(
    {
        "id": df["id"],
        # Shift class ids down by one (presumably 1-based in the source
        # annotations, zero-based for YOLO -- confirm against the dataset).
        "category_id": df["category_id"] - 1,
        "center x": (x_min + box_width / 2) / df["width"],
        "center y": (y_min + box_height / 2) / df["height"],
        "width": box_width / df["width"],
        "height": box_height / df["height"],
    }
)

In [4]:
# Preview the first rows of the YOLO-formatted table.
final_df.head(n=5)

Unnamed: 0,id,category_id,center x,center y,width,height
0,1,12,0.530983,0.483173,0.190171,0.360577
1,1,4,0.519231,0.367788,0.188034,0.126603
2,2,12,0.5,0.403045,0.247863,0.421474
3,2,4,0.480769,0.269231,0.235043,0.150641
4,8,11,0.588889,0.59,0.804444,0.67


In [5]:
# Write one label file per image: a space-separated row per annotated item,
# without header or index, with the "id" column removed.
labels_dir = "./data/labels"
os.makedirs(labels_dir, exist_ok=True)

for image_id, rows in final_df.groupby("id"):
    label_path = os.path.join(labels_dir, f"{image_id}.txt")
    # drop() returns a new frame; the group (a slice of final_df) is left
    # untouched, so final_df stays intact and no copy warning is triggered.
    rows.drop(columns="id").to_csv(label_path, header=False, index=False, sep=" ")

In [16]:
# Remove the annotation folder. NOTE: this is destructive -- the JSON files
# cannot be recovered from the notebook afterwards.
# rmtree also copes with nested entries (e.g. checkpoint folders) and leaves
# the parent directory alone.
annos_dir = "./data/annos"
if os.path.isdir(annos_dir):
    shutil.rmtree(annos_dir)

# Rename ./data/image to ./data/images, the folder the following cells read.
# Guarded so that re-running the cell after a successful rename is a no-op
# instead of a FileNotFoundError.
old_images_dir = "./data/image"
new_images_dir = "./data/images"
if os.path.isdir(old_images_dir):
    os.rename(old_images_dir, new_images_dir)

In [17]:
# Read images and labels
images_dir = "./data/images/"
labels_dir = "./data/labels/"

# Only regular files are considered, so split sub-folders created by an
# earlier run are not mistaken for samples.
image_paths = sorted(
    path
    for path in (os.path.join(images_dir, name) for name in os.listdir(images_dir))
    if os.path.isfile(path)
)
label_by_stem = {
    os.path.splitext(name)[0]: os.path.join(labels_dir, name)
    for name in os.listdir(labels_dir)
    if os.path.isfile(os.path.join(labels_dir, name))
}

# Pair every image with the label file that shares its file stem, rather than
# relying on two independently sorted lists lining up position by position.
images, labels = [], []
for image_path in image_paths:
    stem = os.path.splitext(os.path.basename(image_path))[0]
    label_path = label_by_stem.get(stem)
    if label_path is not None:
        images.append(image_path)
        labels.append(label_path)

unlabelled = len(image_paths) - len(images)
if unlabelled:
    print(f"{unlabelled} image(s) without a label file were left out of the split")

# Split the dataset into train-valid-test splits.
# First 80% train / 20% held out, then the held-out part is divided 85% / 15%
# into validation and test (about 17% and 3% of all samples).
# NOTE(review): confirm that such a small test share is intended.
train_images, val_images, train_labels, val_labels = train_test_split(
    images, labels, test_size=0.2, random_state=1
)
val_images, test_images, val_labels, test_labels = train_test_split(
    val_images, val_labels, test_size=0.15, random_state=1
)

In [18]:
def move_files_to_folder(files, final_folder):
    """Move every path in ``files`` into ``final_folder``.

    The destination folder, including any missing parent folders, is created
    when it does not exist yet.

    Args:
        files: Iterable of paths to move.
        final_folder: Directory that receives the files.

    Raises:
        OSError: If a move fails (``shutil.Error`` is a subclass). The
            offending path is printed first and the original exception is
            re-raised, so the real cause stays visible in the traceback.
    """
    os.makedirs(final_folder, exist_ok=True)
    for file in files:
        try:
            shutil.move(file, final_folder)
        except OSError:
            # Show which file failed, then let the actual error propagate.
            print(file)
            raise

# Move each split's images and labels into its own sub-folder.
for split_files, split_folder in (
    (train_images, './data/images/train'),
    (val_images, './data/images/val/'),
    (test_images, './data/images/test/'),
    (train_labels, './data/labels/train/'),
    (val_labels, './data/labels/val/'),
    (test_labels, './data/labels/test/'),
):
    move_files_to_folder(split_files, split_folder)