# Creation of COCO Dataset

### Virtual Environment: remote_sensing_v2 
### To create training and validation dataset from tiff and geojson to coco format for training the Instance Segmentation Model

In [1]:
import os
os.chdir(r'C:\Users\shubh\Documents\Analytics\Computer Vision\CB_Analytics_WS\GIS_WS\GIS_Roads_WS')

In [2]:
home_direc = os.getcwd()
print(home_direc)

C:\Users\shubh\Documents\Analytics\Computer Vision\CB_Analytics_WS\GIS_WS\GIS_Roads_WS


### Run this Notebook from here

##### Below Codes are for converting data to COCO dataset

In [3]:
import os
import rasterio
from osgeo import gdal
import numpy as np
from tqdm.auto import tqdm, trange
import geopandas as gpd
from osgeo import ogr
from shapely.geometry import Polygon
import matplotlib.pyplot as plt
import cv2

# Save the raster image
def writeTiff(im_data, save_path, new_transform, crs):
    _, height, width = im_data.shape
    with rasterio.open(save_path,
                       'w',
                       driver='GTiff',
                       height=height,
                       width=width,
                       count=3,
                       dtype=im_data.dtype,
                       crs=crs,
                       transform=new_transform
                       ) as dst:
        dst.write(im_data)
    return

# read the raster image
def readTiff(TifPath):
    dataset_img = rasterio.open(TifPath)
    width = dataset_img.width
    height = dataset_img.height
    crs = dataset_img.crs
    transform = dataset_img.transform # (xres, 0, xcord, 0, yres, ycord)
    img_array = dataset_img.read([1, 2, 3]) # get the data
    return img_array, width, height, crs, transform

# translate Gdal data to opencv format
def GdalData2OpencvData(GdalImg_data):
    OpencvImg_data = np.zeros((GdalImg_data.shape[1],GdalImg_data.shape[2],GdalImg_data.shape[0]),np.uint8)
    for i in range(GdalImg_data.shape[0]):
        OpencvImg_data[:,:,i] = GdalImg_data[GdalImg_data.shape[0]-i-1,:,:]
    return OpencvImg_data

In [None]:
import os
import rasterio
import geopandas as gpd
import numpy as np
from tqdm.auto import notebook_tqdm
import cv2
from sahi.utils.coco import Coco, CocoCategory, CocoImage, CocoAnnotation
from sahi.utils.file import save_json

def read_tiff(file_path):
    with rasterio.open(file_path) as src:
        im_data = src.read()
        width = src.width
        height = src.height
        crs = src.crs
        transform = src.transform
    return im_data, width, height, crs, transform

def gdal_data_to_opencv_data(im_data):
    if len(im_data.shape) == 3:
        return np.transpose(im_data, (1, 2, 0))
    else:
        return im_data

#home_direc = os.path.expanduser('~')
root_dir = os.path.join(home_direc, 'cropped_tiff')
root_geojson_dir = os.path.join(home_direc, 'cropped_geojson')

save_dir = os.path.join(home_direc, 'COCO_dataset_WS')
os.makedirs(save_dir, exist_ok=True)

tif_list = [x for x in os.listdir(root_dir) if x.endswith('.tif')]

# Init Coco object:
coco = Coco()
# Add categories (starting from id 0):
category_name = 'road'
coco.add_category(CocoCategory(id=0, name=category_name))

# Process each TIFF and corresponding GeoJSON file
for tif in notebook_tqdm(tif_list):
    img_path = os.path.join(root_dir, tif)
    im_data, width, height, crs, transform = read_tiff(img_path)
    img_name = os.path.splitext(os.path.split(img_path)[-1])[0]
    geojson_path = os.path.join(root_geojson_dir, img_name + '.geojson')
    
    if os.path.exists(geojson_path):
        annot_df = gpd.read_file(geojson_path)
        
        # Coordinates of the upper left corner 
        x0, y0 = transform[2], transform[5]
        xres, yres = transform[0], transform[4]
        
        # Convert the TIFF file to JPG
        im_data = gdal_data_to_opencv_data(im_data)
        img_save_dir = os.path.join(save_dir, 'images')
        os.makedirs(img_save_dir, exist_ok=True)
        img_save_path = os.path.join(img_save_dir, img_name + '.jpg')
        cv2.imwrite(img_save_path, im_data)
        
        # Create a CocoImage
        coco_image = CocoImage(file_name=os.path.join(img_name + '.jpg'), height=height, width=width)
        
        # Add annotations to CocoImage
        for polygon in annot_df['geometry']:
            seg_list = []
            if polygon.geom_type == 'MultiPolygon':
                for geom in polygon.geoms:
                    segmentation = np.array(geom.exterior.coords)
                    segmentation = np.ravel((segmentation - [x0, y0]) / [xres, yres])
                    seg_list.append(list(segmentation))
            else:
                segmentation = np.array(polygon.exterior.coords)
                segmentation = np.ravel((segmentation - [x0, y0]) / [xres, yres])
                seg_list.append(list(segmentation))
            
            bbox = np.array(polygon.bounds)
            bbox = [bbox[0]-x0, abs(bbox[1]-y0), (bbox[2]-bbox[0]) / xres, abs(bbox[3]-bbox[1]) / yres] # bbox(xmin, ymin, width, height)
            coco_image.add_annotation(
                CocoAnnotation(
                    segmentation=seg_list,
                    bbox=bbox,
                    category_id=0,
                    category_name=category_name
                )
            )
        
        # Add CocoImage to Coco object
        coco.add_image(coco_image)
    else:
        continue

# Convert Coco object to JSON and save
coco_json = coco.json
json_save_path = os.path.join(save_dir, 'coco_dataset.json')
save_json(coco_json, json_save_path)


In [33]:
# convert geojson to json file in COCO format

from unicodedata import category
from sahi.utils.coco import Coco, CocoCategory, CocoImage, CocoAnnotation
from sahi.utils.file import save_json
import os
import rasterio
import geopandas as gpd
from tqdm.auto import tqdm


root_dir = os.path.join(home_direc, 'cropped_tiff') 
root_geojson_dir = os.path.join(home_direc, 'cropped_geojson') 

save_dir = os.path.join(home_direc, 'COCO_dataset_WS')   # path to save folder
os.makedirs(save_dir, exist_ok=True)

tif_list = [x for x in os.listdir(root_dir) if x.endswith('.tif')]

# Init Coco object:
coco = Coco()
# Add categories (starting from id 0):
category_name = 'road'
coco.add_category(CocoCategory(id=0, name=category_name))

# read image and geosjon file
for tif in tqdm(tif_list):
  img_path = os.path.join(root_dir, tif)
  im_data, width, height, crs, transform = readTiff(img_path)
  img_name = os.path.splitext(os.path.split(img_path)[-1])[0]
  geojson_path = os.path.join(root_geojson_dir, img_name + '.geojson')
  # judge whether geojson file exist or not
  if os.path.exists(geojson_path):
    annot_df = gpd.read_file(geojson_path)
    # Coordinates of the upper left corner 
    x0 = transform[2]
    y0 = transform[5]
    xres = transform[0]
    yres = transform[4]
    # convert the tif file to jpg
    # transfer gdal data to opencv
    im_data = GdalData2OpencvData(im_data)
    img_save_dir = os.path.join(save_dir, 'images') # image save folder
    os.makedirs(img_save_dir, exist_ok=True)
    img_save_path = os.path.join(img_save_dir, img_name + '.jpg')
    cv2.imwrite(img_save_path, im_data)
    # create a coco image:
    #coco_image = CocoImage(file_name = os.path.join(root_dir, 'COCO_json/val/image', img_name + '.jpg'), height=height, width=width)
    #coco_image = CocoImage(file_name = os.path.join('THA/thailand_img_dir/processed_data/COCO_json/val/image', img_name + '.jpg'), height=height, width=width)
    coco_image = CocoImage(file_name = os.path.join(img_name + '.jpg'), height=height, width=width)
    #coco_image = CocoImage(file_name = os.path.join(img_save_path), height=height, width=width)
 

  0%|          | 0/156 [00:00<?, ?it/s]

In [34]:
# Add annotations to CocoImage
for polygon in annot_df['geometry']:
    seg_list = []
    if polygon.geom_type == 'MultiPolygon':
        for geom in polygon.geoms:
            segmentation = np.array(geom.exterior.coords)
            segmentation = np.ravel((segmentation - [x0, y0]) / [xres, yres])
            seg_list.append(list(segmentation))
    else:
        segmentation = np.array(polygon.exterior.coords)  # Get global coordinates
        segmentation = np.ravel((segmentation - [x0, y0]) / [xres, yres])  # Convert to local coordinates
        seg_list.append(list(segmentation))
    
    bbox = np.array(polygon.bounds)
    bbox = [bbox[0]-x0, abs(bbox[1]-y0), (bbox[2]-bbox[0]) / xres, abs(bbox[3]-bbox[1]) / yres]  # bbox(xmin, ymin, width, height)
    coco_image.add_annotation(
        CocoAnnotation(
            segmentation=seg_list,
            bbox=bbox,
            category_id=0,
            category_name=category_name
        )
    )

# Add CocoImage to Coco object
coco.add_image(coco_image)


In [35]:
coco_json = coco.json
json_save_path = os.path.join(save_dir, 'coco_dataset' + '.json')
save_json(coco_json, json_save_path)

In [20]:
#IGNORE
#Slice COCO dataset images and annotations into grids:
from sahi.slicing import slice_coco

coco_dict, coco_path = slice_coco(
    coco_annotation_file_path="coco_dataset_ws/coco_dataset.json",
    image_dir="coco_dataset_ws/images",
    slice_height=640,
    slice_width=640,
    overlap_height_ratio=0.2,
    overlap_width_ratio=0.2
)

TypeError: slice_coco() missing 1 required positional argument: 'output_coco_annotation_file_name'

In [10]:
#IGNORE
#Split COCO dataset into train/val:
from sahi.utils.coco import Coco
from sahi.utils.file import save_json

# specify coco dataset path
coco_path = "coco_dataset_ws/coco_dataset.json"

# init Coco object
coco = Coco.from_coco_dict_or_path(coco_path)

# split COCO dataset with a 85% train/15% val split
result = coco.split_coco_as_train_val(
  train_split_rate=0.85
)

# export train val split files
save_json(result["train_coco"].json, "train_split.json")
save_json(result["val_coco"].json, "val_split.json")

indexing coco dataset annotations...


Loading coco annotations: 100%|████████████| 2386/2386 [00:23<00:00, 102.44it/s]


In [12]:
#Filter/Update COCO dataset by categories:
from sahi.utils.coco import Coco
from sahi.utils.file import save_json

# init Coco objects by specifying coco dataset paths and image folder directories
coco = Coco.from_coco_dict_or_path("coco_dataset_ws/coco_dataset.json")

# select only 3 categories; and map them to ids 1, 2 and 3
desired_name2id = {
  "building": 1
}
coco.update_categories(desired_name2id)

# export updated/filtered COCO dataset
save_json(coco.json, "coco_dataset_ws/updated_coco.json")

indexing coco dataset annotations...


Loading coco annotations: 100%|████████████| 7789/7789 [00:40<00:00, 193.50it/s]


In [None]:
#IGNORE
#Filter COCO dataset by annotation area:
from sahi.utils.coco import Coco
from sahi.utils.file import save_json

# init Coco objects by specifying coco dataset paths and image folder directories
coco = Coco.from_coco_dict_or_path("coco.json")

# filter out images that contain annotations with smaller area than 50
area_filtered_coco = coco.get_area_filtered_coco(min=50)
# filter out images that contain annotations with smaller area than 50 and larger area than 10000
area_filtered_coco = coco.get_area_filtered_coco(min=50, max=10000)
# filter out images with seperate area intervals per category
intervals_per_category = {
  "human": {"min": 20, "max": 10000},
  "vehicle": {"min": 50, "max": 15000},
}
area_filtered_coco = coco.get_area_filtered_coco(intervals_per_category=intervals_per_category)

# export filtered COCO dataset
save_json(area_filtered_coco.json, "area_filtered_coco.json")

In [13]:
#Filter out images that does not contain any annotation:
from sahi.utils.coco import Coco

# set ignore_negative_samples as False if you want images without annotations present in json and yolov5 exports
coco = Coco.from_coco_dict_or_path("coco_dataset_ws/coco_dataset.json", ignore_negative_samples=True)


indexing coco dataset annotations...


Loading coco annotations: 100%|████████████| 7789/7789 [00:44<00:00, 174.26it/s]


In [None]:
#IGNORE
#Merge COCO dataset files:
from sahi.utils.coco import Coco
from sahi.utils.file import save_json

# init Coco objects by specifying coco dataset paths and image folder directories
coco_1 = Coco.from_coco_dict_or_path("coco1.json", image_dir="images_1/")
coco_2 = Coco.from_coco_dict_or_path("coco2.json", image_dir="images_2/")

# merge Coco datasets
coco_1.merge(coco_2)

# export merged COCO dataset
save_json(coco_1.json, "merged_coco.json")

In [13]:
#Convert COCO dataset to ultralytics/yolov5 format:
from sahi.utils.coco import Coco

# init Coco object
coco = Coco.from_coco_dict_or_path("coco_dataset_ws/coco_dataset.json", image_dir="coco_dataset_ws/images/")

# export converted YoloV5 formatted dataset into given output_dir with a 85% train/15% val split
coco.export_as_yolov5(
  output_dir="output/folder/dir",
  train_split_rate=0.85
)


indexing coco dataset annotations...


Loading coco annotations: 100%|████████████| 2386/2386 [00:23<00:00, 100.00it/s]
06/11/2023 16:44:48 - INFO - sahi.utils.coco -   generating image symlinks and annotation files for yolov5...
100%|██████████████████████████████████████| 2028/2028 [00:11<00:00, 178.57it/s]
06/11/2023 16:44:59 - INFO - sahi.utils.coco -   generating image symlinks and annotation files for yolov5...
100%|████████████████████████████████████████| 358/358 [00:02<00:00, 172.16it/s]


In [None]:
#IGNORE
#Convert train/val COCO dataset to ultralytics/yolov5 format:
from sahi.utils.coco import Coco, export_coco_as_yolov5

# init Coco object
train_coco = Coco.from_coco_dict_or_path("train_coco.json", image_dir="coco_images/")
val_coco = Coco.from_coco_dict_or_path("val_coco.json", image_dir="coco_images/")

# export converted YoloV5 formatted dataset into given output_dir with given train/val split
data_yml_path = export_coco_as_yolov5(
  output_dir="output/folder/dir",
  train_coco=train_coco,
  val_coco=val_coco
)


In [14]:
#Get dataset stats:
from sahi.utils.coco import Coco

# init Coco object
coco = Coco.from_coco_dict_or_path("coco_dataset_ws/coco_dataset.json")

# get dataset stats
coco.stats

indexing coco dataset annotations...


Loading coco annotations: 100%|████████████| 7789/7789 [00:42<00:00, 182.67it/s]


{'num_images': 7789,
 'num_annotations': 446692,
 'num_categories': 1,
 'num_negative_images': 0,
 'num_images_per_category': {'building': 7789},
 'num_annotations_per_category': {'building': 446692},
 'min_num_annotations_in_image': 1,
 'max_num_annotations_in_image': 398,
 'avg_num_annotations_in_image': 57.34908203877263,
 'min_annotation_area': 0,
 'max_annotation_area': 352745,
 'avg_annotation_area': 2407.161675158722,
 'min_annotation_area_per_category': {'building': 0},
 'max_annotation_area_per_category': {'building': 352745}}

In [15]:
#Get dataset stats:
from sahi.utils.coco import Coco

# init Coco object
coco = Coco.from_coco_dict_or_path("coco_dataset_ws/updated_coco.json")

# get dataset stats
coco.stats

indexing coco dataset annotations...


Loading coco annotations: 100%|████████████| 7789/7789 [00:40<00:00, 191.29it/s]


{'num_images': 7789,
 'num_annotations': 446692,
 'num_categories': 1,
 'num_negative_images': 0,
 'num_images_per_category': {'building': 7789},
 'num_annotations_per_category': {'building': 446692},
 'min_num_annotations_in_image': 1,
 'max_num_annotations_in_image': 398,
 'avg_num_annotations_in_image': 57.34908203877263,
 'min_annotation_area': 0,
 'max_annotation_area': 352745,
 'avg_annotation_area': 2407.161675158722,
 'min_annotation_area_per_category': {'building': 0},
 'max_annotation_area_per_category': {'building': 352745}}

In [None]:
#IGNORE
#Remove invalid coco results:
from sahi.utils.file import save_json
from sahi.utils.coco import remove_invalid_coco_results

# remove invalid predictions from COCO results JSON
coco_results = remove_invalid_coco_results("coco_dataset_ws/coco_dataset.json")

# export processed COCO results
save_json(coco_results, "coco_dataset_ws/fixed_coco_result.json")

# bonus: remove invalid predictions from COCO results JSON by giving COCO
# dataset path to also filter out bbox results exceeding image height&width
#coco_results = remove_invalid_coco_results("coco_dataset_ws/coco_result.json", "coco_dataset_ws/coco_dataset.json")

In [None]:
#IGNORE
#Get COCO with clipped bounding boxes:
from sahi.utils.coco import Coco
from sahi.utils.file import save_json

# Clip overflowing bounding boxes to image width & height
coco = Coco.from_coco_dict_or_path(coco_path, clip_bboxes_to_img_dims=True)

#OR

# apply to your already created coco object
coco = coco.get_coco_with_clipped_bboxes()

In [None]:
#IGNORE
#Export your clipped_bboxed_coco:
save_json(coco.json, "coco.json")