* Download the individual annotation files that were labelled using Label Studio.
* Some of the labelled images don't contain any object at all, so each annotation file is remapped to account for them.
* In addition, select the images that do contain an object, so that only those images are used for further processing.
* Finally, combine the v1 and v2 annotation files.

In [None]:
!pip -q install pycocotools
!pip -q install albumentations
!pip -q install torch torchvision
!pip -q install matplotlib seaborn imutils opencv-contrib-python scikit-learn
!pip -q install pandas mapcalc boto3
!sudo apt-get update && sudo apt-get install ffmpeg libsm6 libxext6  -y

In [None]:
# Add src folder to the path so that modules stored in ../src can be imported
import sys; sys.path.append("../src")
import os
# NOTE(review): presumably set to make CUDA kernel launches synchronous for
# easier debugging — confirm it is still needed for this notebook.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [None]:
# Import functions from annotator file
import json
from pathlib import Path
from annotator import (
    _read_json_file,
    combine_coco_files,
    combine_two_coco_files,
    remap_coco_annotations,
)

In [None]:
# Location where the merged annotation file is written later on.
combined_coco_path = '../data/annotations/v1_v2.json'

# Remap each exported annotation file, passing label 0 as the only selected label.
coco_data_v1, coco_data_v2 = (
    remap_coco_annotations(Path(annotation_file), selected_labels=[0])
    for annotation_file in (
        "../data/annotations/v1.json",
        "../data/annotations/v2.json",
    )
)

# Merge the two remapped datasets into a single COCO structure.
combined_coco = combine_coco_files([coco_data_v1, coco_data_v2])

In [None]:
# when exporting coco file from label-studio, filename is modified a little bit.
# so rename the filenames in the coco file to actual image names
# Export the final coco file with the modified file names

def extract_text_after_first_dash(s):
    """Return everything that follows the first '-' in ``s``.

    An empty string is returned when ``s`` contains no dash at all.
    """
    # partition() splits on the first occurrence only; the tail is '' when
    # the separator is absent.
    _head, _dash, tail = s.partition('-')
    return tail

def modify_file_names(images):
    """Strip the export prefix from each image's ``file_name``, in place.

    The text after the first '-' of every ``file_name`` replaces the original
    value. Entries without a ``file_name``, and names for which stripping
    would leave nothing (no dash, or nothing after the dash), are left
    untouched so that a file name is never overwritten with an empty string.

    Args:
        images: iterable of dicts (e.g. the ``images`` list of a COCO file).
            The dicts are modified in place; nothing is returned.
    """
    for image in images:
        original_file_name = image.get('file_name')
        if not original_file_name:
            # Missing or empty name: nothing to rename.
            continue

        modified_file_name = extract_text_after_first_dash(original_file_name)

        # An empty result means there was no prefix to strip; keeping the
        # original name avoids losing the reference to the image.
        if modified_file_name:
            image['file_name'] = modified_file_name

# Rename the image entries of the combined COCO data in place.
modify_file_names(combined_coco["images"])

In [None]:
# Persist the combined annotations as JSON at the configured path.
with open(combined_coco_path, "w") as out_file:
    json.dump(combined_coco, out_file)

# Cell output: (number of images, number of annotations) in the combined data.
image_count = len(combined_coco['images'])
annotation_count = len(combined_coco['annotations'])
image_count, annotation_count

In [None]:
# Collect the file name of every image listed in the final coco file
images_with_object = []
for image_entry in combined_coco['images']:
    images_with_object.append(image_entry['file_name'])

# print(images_with_object)

In [None]:
# Move the images that have object in them to a different folder
import os
import shutil

# Replace these with your actual paths
source_dir = '../data/images_v2'
destination_dir = '../data/images_v2_filtered'

# Ensure destination directory exists
os.makedirs(destination_dir, exist_ok=True)

# Move each file
for file_name in images_with_object:
    # An empty name would make os.path.join() return the source directory
    # itself, and shutil.move() would then relocate the whole folder.
    if not file_name:
        print("Skipping image entry with an empty file name")
        continue

    source_path = os.path.join(source_dir, file_name)
    destination_path = os.path.join(destination_dir, file_name)

    # Only regular files are moved; anything else (missing, already moved by
    # an earlier run, or a directory) is reported and skipped.
    if not os.path.isfile(source_path):
        print(f"File not found: {file_name}")
        continue

    # file_name may contain sub-directories, so make sure the target folder
    # exists before moving.
    os.makedirs(os.path.dirname(destination_path), exist_ok=True)
    shutil.move(source_path, destination_path)
    # print(f"Moved: {file_name}")

### Generate a sample annotation file (20% of the images) to use for creating augmented datasets
* A sample of the images is taken so that augmentations can be generated on them. The sample annotation file is generated beforehand so that the same set of samples can be reused when training and testing different types of models.

In [None]:
### SAMPLE ANNOTATION file
import random  # needed for seed()/sample() below

sample_size=0.2
# Load the existing COCO file
# NOTE(review): this path differs from the '../data/annotations/v1_v2.json'
# file written by the combine step of this notebook — confirm the combined
# file was copied/renamed to this location.
with open(Path("../data/annotations_v1_v2/coco_v1_v2.json"), 'r') as f:
    coco_data = json.load(f)

# 20% of images in the data
aug_size = int(len(coco_data['images'])*sample_size)

# Select a subset of image IDs
image_ids = [image['id'] for image in coco_data['images']]

# Fixed seed so that the same sample is drawn on every run
random.seed(42)

# Adjust 'k' to the desired number of samples
aug_image_ids = random.sample(image_ids, k=aug_size)
print(aug_image_ids[:5], "\n")

# Filter images and annotations based on the selected image IDs.
# A set gives constant-time membership tests instead of scanning the list
# once per image/annotation.
selected_ids = set(aug_image_ids)
aug_images = [image for image in coco_data['images'] if image['id'] in selected_ids]
aug_annotations = [
    annotation
    for annotation in coco_data['annotations']
    if annotation['image_id'] in selected_ids
]
print(len(aug_images), len(aug_annotations))

# Create the sample COCO file
aug_coco_data = {
    'images': aug_images,
    'annotations': aug_annotations,
    'categories': coco_data['categories'],
    'info': coco_data['info']
}

# Save the sample COCO file
with open(f"../data/annotations_v1_v2/coco_v1_v2_{sample_size}.json", 'w') as f:
    json.dump(aug_coco_data, f)

In [None]:
# Unzip folder
import zipfile
import shutil

# Zipped folder path
zipped_folder_path = '../data/sample_images.zip'

# Destination folder path
destination_folder_path = '../data/sample_images'

# Open the zipped folder; the context manager closes the archive on exit,
# so no explicit close() call is needed.
with zipfile.ZipFile(zipped_folder_path, 'r') as zip_file:

    # Extract all of the files from the zipped folder into the destination folder
    zip_file.extractall(destination_folder_path)

In [None]:
# zip a folder
import os
import zipfile

def zip_folder(folder_path, output_path):
    """Write every file under ``folder_path`` into a deflate-compressed zip.

    Files are stored under their path relative to ``folder_path``, so the
    archive does not contain the leading directory components. Only files
    are added; empty directories get no entry.

    Args:
        folder_path: directory whose contents are archived (recursively).
        output_path: path of the zip file to create (overwritten if present).
    """
    # The archive is created before the walk starts, so when it lives inside
    # folder_path it would otherwise be picked up and added to itself.
    archive_abs_path = os.path.abspath(output_path)

    with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, _dirs, files in os.walk(folder_path):
            for file in files:
                file_path = os.path.join(root, file)
                if os.path.abspath(file_path) == archive_abs_path:
                    continue
                arcname = os.path.relpath(file_path, folder_path)
                zipf.write(file_path, arcname)

# Example usage: archive everything under ../data/images into ../data/images.zip
zip_folder('../data/images', '../data/images.zip')

In [None]:
# Delete corrupted images
from PIL import Image
import os

def check_and_delete_all_corrupted_images(directory, *, delete=False):
    """Find, and optionally delete, image files in ``directory`` that fail verification.

    Every regular file directly inside ``directory`` is opened with
    ``Image.open`` and checked with ``verify()``; a file is treated as
    corrupted when either step raises. Sub-directories are skipped and the
    scan is not recursive.

    Args:
        directory: folder to scan.
        delete: when True, each corrupted file is removed from disk. The
            default (False) only reports the files.

    Returns:
        List with the paths of the files identified as corrupted.
    """
    corrupted_files = []

    # Scan all files and collect corrupted ones
    for filename in sorted(os.listdir(directory)):
        filepath = os.path.join(directory, filename)
        if not os.path.isfile(filepath):
            # Directories cannot be opened as images and are not corruption.
            continue
        try:
            with Image.open(filepath) as img:
                img.verify()
        except Exception as e:
            # Deliberately broad: any failure to open or verify the file
            # marks it as corrupted.
            print(f"Identified corrupted image: {filepath} — {e}")
            corrupted_files.append(filepath)

    # Delete all corrupted files (opt-in)
    if delete:
        for filepath in corrupted_files:
            try:
                os.remove(filepath)
                print(f"Deleted corrupted image: {filepath}")
            except OSError as e:
                print(f"Failed to delete {filepath}: {e}")

    return corrupted_files


# Scan ../data/images and report the images that fail verification.
check_and_delete_all_corrupted_images("../data/images")