In [1]:
import json
import pathlib
from pprint import pprint

import matplotlib.pyplot as plt
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
import polars as pl

import srcdir

# Presumably the directory holding the training images that the annotations
# refer to; it is not referenced by the cells visible here.
TRAIN_IMAGE_DIR = pathlib.Path('../data/train_bbox_images')
# JSON annotation file; its top-level 'images' list is what gets loaded and
# flattened into the bounding-box table.
TRAIN_ANNOTATIONS = pathlib.Path('../data/train_bbox_annotations.json')

/home/tomo/workspace/src is added into sys.path


In [2]:
# Read the per-image annotation records from the annotations JSON.
with open(TRAIN_ANNOTATIONS, 'r') as f:
    annotations = json.load(f)['images']

# Flatten to one record per bounding box. The image-level fields are built
# once per image and repeated on each of its boxes; an image without any
# boxes contributes no rows.
bboxes = []
for image in annotations:
    image_fields = {
        # 'train_12.tif' -> 12: numeric suffix of the file stem.
        'image_id': int(image['file_name'].split('.')[0].split('_')[-1]),
        'file_name': image['file_name'],
        'image_width': image['width'],
        'image_height': image['height'],
    }
    bboxes.extend(
        {
            **image_fields,
            'bbox_id': box_idx,  # position of the box within its image
            'class': box['class'],
            'x_min': box['bbox'][0],
            'y_min': box['bbox'][1],
            'bbox_width': box['bbox'][2],
            'bbox_height': box['bbox'][3],
            'bbox_area': box['bbox'][2] * box['bbox'][3],
        }
        for box_idx, box in enumerate(image['annotations'])
    )

bbox_df = pl.DataFrame(bboxes).sort(by=['image_id', 'bbox_id'])
bbox_df

image_id,file_name,image_width,image_height,bbox_id,class,x_min,y_min,bbox_width,bbox_height,bbox_area
i64,str,i64,i64,i64,str,f64,f64,f64,f64,f64
0,"""train_0.tif""",500,375,0,"""vacant_lot""",311.0,36.246,35.0,45.146,1580.11
0,"""train_0.tif""",500,375,1,"""vacant_lot""",121.0,156.0,203.0,144.0,29232.0
1,"""train_1.tif""",500,375,0,"""vacant_lot""",198.0,231.0,56.0,34.0,1904.0
2,"""train_2.tif""",500,375,0,"""vacant_lot""",61.501,168.404,47.705,45.298,2160.94109
3,"""train_3.tif""",500,375,0,"""vacant_lot""",327.0,196.0,33.0,57.0,1881.0
…,…,…,…,…,…,…,…,…,…,…
598,"""train_598.tif""",500,375,11,"""vacant_lot""",470.0,162.0,27.0,28.0,756.0
599,"""train_599.tif""",500,375,0,"""vacant_lot""",71.509,259.731,53.675,67.836,3641.0973
599,"""train_599.tif""",500,375,1,"""vacant_lot""",223.0,136.0,44.0,44.0,1936.0
599,"""train_599.tif""",500,375,2,"""vacant_lot""",217.0,185.0,43.0,45.0,1935.0


In [3]:
# Per-image summary: number of boxes (bbox_id is 0-based within an image,
# hence max + 1) and the summed area of all boxes.
img_df = (
    bbox_df
    .group_by('image_id')
    .agg(
        bbox_count=pl.col('bbox_id').max() + 1,
        total_area=pl.col('bbox_area').sum(),
    )
    .sort(by='image_id')
)

# Coarse ordinal bins of both summaries, used later as stratification labels.
count_col = pl.col('bbox_count')
area_col = pl.col('total_area')

img_df = img_df.with_columns(
    # 0: fewer than 5 boxes, 1: 5-9 boxes, 2: 10 or more.
    bin_bbox_count=(
        pl.when(count_col < 5).then(0)
        .when(count_col < 10).then(1)
        .otherwise(2)
    ),
    # Bin edges at 2000 / 5000 / 20000 / 50000 (same units as bbox_area).
    bin_total_area=(
        pl.when(area_col < 2000).then(0)
        .when(area_col < 5000).then(1)
        .when(area_col < 20000).then(2)
        .when(area_col < 50000).then(3)
        .otherwise(4)
    ),
)

img_df

image_id,bbox_count,total_area,bin_bbox_count,bin_total_area
i64,i64,f64,i32,i32
0,2,30812.11,0,3
1,1,1904.0,0,0
2,1,2160.94109,0,1
3,9,8434.0,1,2
4,3,3036.638,0,1
…,…,…,…,…
595,2,21063.0,0,3
596,4,9910.0,0,2
597,3,11522.248463,0,2
598,12,6323.054347,2,2


In [4]:
# Build 5 stratified folds over the images and persist them to ../data/cv.json.
#
# MultilabelStratifiedKFold expects a binary multilabel-indicator matrix and
# coerces `y` to bool. Passing the raw ordinal bin ids would therefore collapse
# every non-zero bin into a single "True" label (bin 0 vs. everything else).
# One-hot encoding the bins first gives one indicator column per
# (feature, bin) pair, so every bin takes part in the stratification.
strat_labels = (
    img_df
    .select('bin_bbox_count', 'bin_total_area')
    .to_dummies()
    .to_numpy()
)

kfold = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_splits = kfold.split(
    X=img_df['image_id'].to_numpy(),
    y=strat_labels,
)

# NOTE(review): split() yields row positions into img_df (sorted by image_id),
# not image_id values. The two only coincide when every id 0..N-1 is present
# in img_df — confirm that whatever consumes cv.json expects row positions.
folds = {
    fold_idx: {
        'train': train_indices.tolist(),
        'valid': test_indices.tolist(),
    }
    for fold_idx, (train_indices, test_indices) in enumerate(cv_splits)
}

# The integer fold keys are written as strings ("0".."4") by json.dump.
with open('../data/cv.json', 'w') as f:
    json.dump(folds, f, indent=2)