In [1]:
%pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [2]:
import zipfile
import json
import os
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

### Step 0: Define Variables & Helper Functions

Define variables that are needed thoughout the data preparation pipeline. These include environment specific information like locations for images, training & test data and so on.

In [3]:
PROJECT_DIR = os.path.abspath('..')

# Path for the COCO label files
COCO_LABELS_PATH = os.path.join(PROJECT_DIR, 'labels', 'coco')

# Path for working data (ignored by git)
DATA_PATH = os.path.join(PROJECT_DIR, 'data')

In [4]:
def merge_coco(merged_coco, subset_coco):
    category_id_map = {}
    next_category_id = len(merged_coco["categories"]) + 1
    image_id_offset = len(merged_coco["images"])
    annotation_id_offset = len(merged_coco["annotations"])

    for category in merged_coco["categories"]:
        category_id_map[category["name"]] = category["id"]

    for category in subset_coco["categories"]:
        if category["name"] not in category_id_map.keys():
            category_id_map[category["name"]] = next_category_id
            next_category_id += 1

    for image in subset_coco["images"]:
        image["id"] += image_id_offset
        merged_coco["images"].append(image)

    for annotation in subset_coco["annotations"]:
        annotation["id"] += annotation_id_offset
        annotation["image_id"] += image_id_offset
        annotation["category_id"] = category_id_map[
            subset_coco["categories"][annotation["category_id"] - 1]["name"]
        ]
        merged_coco["annotations"].append(annotation)

    merged_coco["categories"] = [
        {"id": id, "name": name, "supercategory": ""}
        for name, id in category_id_map.items()
    ]

    return merged_coco

### Step 1: Extract Data

Unpack the zipped raw images provided to work with.

In [20]:
import os
import zipfile

# Define the direct path to the zip file
zip_file_path = r"C:\Users\Soham\Downloads\labels_coco-20240612T222844Z-001.zip"

# Define the directory to extract to
extract_path = r"C:\Users\Soham\Downloads\extracted_labels"

# Check if the file exists
if os.path.exists(zip_file_path):
    print(f"The file {zip_file_path} exists.")
    
    # Try extracting the zip file
    try:
        # Create the directory if it doesn't exist
        os.makedirs(extract_path, exist_ok=True)
        
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            zip_ref.extractall(extract_path)
        print(f"Extracted all files to {extract_path}")
    except PermissionError as e:
        print(f"PermissionError: {e}")
    except Exception as e:
        print(f"An error occurred: {e}")
else:
    print(f"The file {zip_file_path} does not exist.")


The file C:\Users\Soham\Downloads\labels_coco-20240612T222844Z-001.zip exists.
Extracted all files to C:\Users\Soham\Downloads\extracted_labels


### Step 2: Merge Labels

Process the subsets of labels provided in CoCo labelling format to merge the labelling done by the different students. The process combines the categories (i.e. labels), images and annotations of the different CoCo files so that we have one set of categories/labels

In [25]:
import os
import json

# Define your paths
COCO_LABELS_PATH = r'C:\Users\Soham\Downloads\extracted_labels\labels_coco'
DATA_PATH = r'C:\Users\Soham\Downloads'

# Initialize coco_merged dictionary
coco_merged = { 'categories': [], 'images': [], 'annotations': [] }

image_count = 0
annotation_count = 0

# Ensure COCO_LABELS_PATH exists and iterate over its folders
if os.path.exists(COCO_LABELS_PATH):
    for subset_folder in os.listdir(COCO_LABELS_PATH):
        label_file_path = os.path.join(COCO_LABELS_PATH, subset_folder, 'instances_default.json')

        # Check if the JSON file exists
        if os.path.exists(label_file_path):
            with open(label_file_path) as f:
                coco_json = json.load(f)
                image_count += len(coco_json['images'])
                annotation_count += len(coco_json['annotations'])
                print(f"Processing {subset_folder} with {len(coco_json['images'])} images and {len(coco_json['annotations'])} annotations")
                # Merge the current coco_json into coco_merged
                coco_merged = merge_coco(coco_merged, coco_json)
        else:
            print(f"File not found: {label_file_path}")

    # Dump coco_merged to a JSON file
    with open(os.path.join(DATA_PATH, 'coco-merged.json'), 'w') as outfile:
        json.dump(coco_merged, outfile)
        
    print(f"Total images: {image_count}, Total annotations: {annotation_count}")
else:
    print(f"Directory not found: {COCO_LABELS_PATH}")


File not found: C:\Users\Soham\Downloads\extracted_labels\labels_coco\coco-merged.json\instances_default.json
Processing labels_01 with 395 images and 601 annotations
Processing labels_02 with 395 images and 346 annotations
Processing labels_03 with 395 images and 346 annotations
Processing labels_04 with 395 images and 494 annotations
Processing labels_05 with 396 images and 633 annotations
Processing labels_06 with 395 images and 386 annotations
Processing labels_07 with 395 images and 423 annotations
Processing labels_08 with 395 images and 435 annotations
Processing labels_09 with 395 images and 398 annotations
Processing labels_10 with 396 images and 467 annotations
Total images: 3952, Total annotations: 4529


In [27]:
import os
import json

# Define your paths
COCO_LABELS_PATH = r'C:\Users\Soham\Downloads\extracted_labels\labels_coco'
DATA_PATH = r'C:\Users\Soham\Downloads'

# Initialize coco_merged dictionary
coco_merged = { 'categories': [], 'images': [], 'annotations': [] }

image_count = 0
annotation_count = 0

# Ensure COCO_LABELS_PATH exists and iterate over its folders
if os.path.exists(COCO_LABELS_PATH):
    for subset_folder in os.listdir(COCO_LABELS_PATH):
        label_file_path = os.path.join(COCO_LABELS_PATH, subset_folder, 'instances_default.json')

        # Check if the JSON file exists
        if os.path.exists(label_file_path):
            with open(label_file_path) as f:
                coco_json = json.load(f)
                image_count += len(coco_json['images'])
                annotation_count += len(coco_json['annotations'])
                print(f"Processing {subset_folder} with {len(coco_json['images'])} images and {len(coco_json['annotations'])} annotations")
                # Merge the current coco_json into coco_merged
                coco_merged = merge_coco(coco_merged, coco_json)
        else:
            print(f"File not found: {label_file_path}")

    # Dump coco_merged to a JSON file after the loop completes
    with open(os.path.join(DATA_PATH, 'coco-merged.json'), 'w') as outfile:
        json.dump(coco_merged, outfile)
        
    print(f"Total images: {image_count}, Total annotations: {annotation_count}")
else:
    print(f"Directory not found: {COCO_LABELS_PATH}")


File not found: C:\Users\Soham\Downloads\extracted_labels\labels_coco\coco-merged.json\instances_default.json
Processing labels_01 with 395 images and 601 annotations
Processing labels_02 with 395 images and 346 annotations
Processing labels_03 with 395 images and 346 annotations
Processing labels_04 with 395 images and 494 annotations
Processing labels_05 with 396 images and 633 annotations
Processing labels_06 with 395 images and 386 annotations
Processing labels_07 with 395 images and 423 annotations
Processing labels_08 with 395 images and 435 annotations
Processing labels_09 with 395 images and 398 annotations
Processing labels_10 with 396 images and 467 annotations
Total images: 3952, Total annotations: 4529


In [40]:
# Run some validation checks & print info
if len(coco_merged['images']) != image_count:
    print(f"Expected {image_count} images but got {len(coco_merged['images'])}")
else:
    print(f"{image_count} images in merged file")
if len(coco_merged['annotations']) != annotation_count:
    print(f"Expected {annotation_count} annotations but got {len(coco_merged['annotations'])}")
else:
    print(f"{annotation_count} annotations in merged file")

for category in coco_merged['categories']:
    print(f"ID: {category['id']}: {category['name']}")

Expected 3952 images but got 3612
Expected 4 annotations but got 4428
ID: 1: construction_work
ID: 2: defective_sign_post
ID: 3: drilling_ret_wall
ID: 4: sagging
ID: 8: deposit
ID: 9: pipeline
ID: 10: drilling


In [37]:
# Initialize a dictionary to store category-wise annotation counts
category_annotation_counts = {category['id']: 0 for category in coco_merged['categories']}

# Count annotations by category
for annotation in coco_merged['annotations']:
    category_id = annotation['category_id']
    if category_id in category_annotation_counts:
        category_annotation_counts[category_id] += 1
    else:
        category_annotation_counts[category_id] = 1

# Print category-wise annotation counts
for category in coco_merged['categories']:
    category_id = category['id']
    category_name = category['name']
    annotation_count = category_annotation_counts[category_id]
    print(f"Category: {category_name} (ID: {category_id}) - Annotations: {annotation_count}")


Category: construction_work (ID: 1) - Annotations: 3050
Category: defective_sign_post (ID: 2) - Annotations: 320
Category: drilling_ret_wall (ID: 3) - Annotations: 52
Category: sagging (ID: 4) - Annotations: 307
Category: deposit (ID: 8) - Annotations: 489
Category: pipeline (ID: 9) - Annotations: 206
Category: drilling (ID: 10) - Annotations: 4


### Step 3: Filter Categories, Annotations, Images

Filter categories, annotations and images based on the removal of certain labels/classes.

In [43]:
import os
import json

DATA_PATH = r'C:\Users\Soham\Downloads\extracted_labels\labels_coco'
FILTER_IDS = [5, 6, 7]

# Load coco_merged from file if not already loaded
coco_merged_path = os.path.join(DATA_PATH, 'coco-merged.json')
if os.path.exists(coco_merged_path):
    with open(coco_merged_path, 'r') as f:
        coco_merged = json.load(f)
else:
    raise FileNotFoundError(f'{coco_merged_path} not found. Please make sure to run the previous merging script.')

# Count categories and annotations before filtering
pre_filter_cat_count = len(coco_merged["categories"])
pre_filter_ann_count = len(coco_merged["annotations"])

# Filter categories and annotations based on FILTER_IDS
coco_merged["categories"] = [
    category
    for category in coco_merged["categories"]
    if category["id"] not in FILTER_IDS
]
coco_merged["annotations"] = [
    annotation
    for annotation in coco_merged["annotations"]
    if annotation["category_id"] not in FILTER_IDS
]

# Count categories and annotations after filtering
post_filter_cat_count = len(coco_merged["categories"])
post_filter_ann_count = len(coco_merged["annotations"])

print(f"Filtered {pre_filter_cat_count - post_filter_cat_count} categories")
print(f"Filtered {pre_filter_ann_count - post_filter_ann_count} annotations")

# Filter images that don't have annotations
pre_filter_img_count = len(coco_merged["images"])

# Create a set of image IDs that have annotations
annotated_image_ids = set(annotation["image_id"] for annotation in coco_merged["annotations"])

# Filter images
coco_merged["images"] = [
    image
    for image in coco_merged["images"]
    if image["id"] in annotated_image_ids
]

# Count images after filtering
post_filter_img_count = len(coco_merged["images"])
print(f"Filtered {pre_filter_img_count - post_filter_img_count} images")

# Save the filtered file
filtered_labels_path = os.path.join(DATA_PATH, "labels", "coco")
os.makedirs(filtered_labels_path, exist_ok=True)

filtered_json_path = os.path.join(filtered_labels_path, "instances_default.json")
with open(filtered_json_path, "w") as f:
    json.dump(coco_merged, f)

print(f"Filtered data saved to {filtered_json_path}")


Filtered 3 categories
Filtered 117 annotations
Filtered 366 images
Filtered data saved to C:\Users\Soham\Downloads\extracted_labels\labels_coco\labels\coco\instances_default.json


In [44]:
if coco_merged is None:
    coco_merged = json.load(open(os.path.join(DATA_PATH, 'coco-merged.json')))

print(f"{len(coco_merged['images'])} images in merged file")
print(f"{len(coco_merged['annotations'])} annotations in merged file")

3982 images in merged file
5016 annotations in merged file


### Step 4: Create Train/Test Split

Create a ~~stratified~~ (currently not stratified) train/test split and store the information.

In [45]:
# TBD: do a proper startified split
images = np.array([image["file_name"] for image in coco_merged['images']])
train, test = train_test_split(images, test_size=0.2, random_state=42)
json.dump({ "train": train.tolist(), "test": test.tolist() }, open(os.path.join(DATA_PATH, 'train_test_split.json'), 'w'))
print(f"Train: {len(train)} images, Test: {len(test)} images")

Train: 3185 images, Test: 797 images
