In [1]:
import json
import pandas as pd
from collections import defaultdict
from pycocotools.coco import COCO
from sklearn.model_selection import StratifiedGroupKFold

f = open('./open/classes.txt', 'r')
lines = [list(line.strip().split(',')) for line in f.readlines()]
f.close()

encode = {}
decode = {}
for cls_num, cls_name in lines:
    encode[int(cls_num)] = cls_name
    decode[cls_name] = int(cls_num)

print(encode)
print(decode)

{0: 'chevrolet_malibu_sedan_2012_2016', 1: 'chevrolet_malibu_sedan_2017_2019', 2: 'chevrolet_spark_hatchback_2016_2021', 3: 'chevrolet_trailblazer_suv_2021_', 4: 'chevrolet_trax_suv_2017_2019', 5: 'genesis_g80_sedan_2016_2020', 6: 'genesis_g80_sedan_2021_', 7: 'genesis_gv80_suv_2020_', 8: 'hyundai_avante_sedan_2011_2015', 9: 'hyundai_avante_sedan_2020_', 10: 'hyundai_grandeur_sedan_2011_2016', 11: 'hyundai_grandstarex_van_2018_2020', 12: 'hyundai_ioniq_hatchback_2016_2019', 13: 'hyundai_sonata_sedan_2004_2009', 14: 'hyundai_sonata_sedan_2010_2014', 15: 'hyundai_sonata_sedan_2019_2020', 16: 'kia_carnival_van_2015_2020', 17: 'kia_carnival_van_2021_', 18: 'kia_k5_sedan_2010_2015', 19: 'kia_k5_sedan_2020_', 20: 'kia_k7_sedan_2016_2020', 21: 'kia_mohave_suv_2020_', 22: 'kia_morning_hatchback_2004_2010', 23: 'kia_morning_hatchback_2011_2016', 24: 'kia_ray_hatchback_2012_2017', 25: 'kia_sorrento_suv_2015_2019', 26: 'kia_sorrento_suv_2020_', 27: 'kia_soul_suv_2014_2018', 28: 'kia_sportage_suv_

In [2]:
base_dir = './open/train'
anno_dir = './open/train.json'
target_json = json.load(open(anno_dir))
tangent_dict = defaultdict(list)

# 클래스별 bbox의 tangent 값 저장 & 클래스별 bbox 개수 출력
for info in target_json['annotations']:
    category_id = info['category_id']
    areas = info['area']
    xmin, ymin, xmax, ymax = info['bbox']
    w = xmax - xmin
    h = ymax - ymin
    tan = h / w
    tangent_dict[category_id].append(tan)
    
for k,v in tangent_dict.items():
    print(encode[k] , '->', len(v))


chevrolet_malibu_sedan_2012_2016 -> 500
hyundai_sonata_sedan_2019_2020 -> 500
kia_ray_hatchback_2012_2017 -> 500
kia_stonic_suv_2017_2019 -> 500
kia_sorrento_suv_2015_2019 -> 500
chevrolet_trailblazer_suv_2021_ -> 500
chevrolet_spark_hatchback_2016_2021 -> 500
hyundai_ioniq_hatchback_2016_2019 -> 500
kia_carnival_van_2021_ -> 500
kia_soul_suv_2014_2018 -> 500
ssangyong_tivoli_suv_2016_2020 -> 500
genesis_gv80_suv_2020_ -> 500
kia_morning_hatchback_2004_2010 -> 500
hyundai_avante_sedan_2011_2015 -> 500
kia_k5_sedan_2020_ -> 500
renault_sm3_sedan_2015_2018 -> 500
kia_k7_sedan_2016_2020 -> 500
hyundai_sonata_sedan_2004_2009 -> 500
kia_sorrento_suv_2020_ -> 500
kia_sportage_suv_2016_2020 -> 500
genesis_g80_sedan_2016_2020 -> 500
hyundai_grandstarex_van_2018_2020 -> 500
hyundai_grandeur_sedan_2011_2016 -> 500
kia_mohave_suv_2020_ -> 500
kia_morning_hatchback_2011_2016 -> 500
hyundai_avante_sedan_2020_ -> 500
chevrolet_trax_suv_2017_2019 -> 500
hyundai_sonata_sedan_2010_2014 -> 500
renault_x

In [3]:
# 클래스별 tanget 값의 0.3분위수 구하기
quantiles_dict = defaultdict(list)
for k,v in tangent_dict.items():
    quantiles_dict[encode[k]] = [round(q, 2) for q in pd.Series(v).quantile([0.3, 0.7]).tolist()]

for k,v in quantiles_dict.items():
    print(k, '->', v)


chevrolet_malibu_sedan_2012_2016 -> [0.93, 1.04]
hyundai_sonata_sedan_2019_2020 -> [0.93, 1.02]
kia_ray_hatchback_2012_2017 -> [1.16, 1.3]
kia_stonic_suv_2017_2019 -> [0.99, 1.12]
kia_sorrento_suv_2015_2019 -> [1.04, 1.19]
chevrolet_trailblazer_suv_2021_ -> [1.03, 1.1]
chevrolet_spark_hatchback_2016_2021 -> [1.1, 1.17]
hyundai_ioniq_hatchback_2016_2019 -> [0.94, 1.03]
kia_carnival_van_2021_ -> [1.03, 1.16]
kia_soul_suv_2014_2018 -> [1.04, 1.17]
ssangyong_tivoli_suv_2016_2020 -> [1.01, 1.13]
genesis_gv80_suv_2020_ -> [1.03, 1.16]
kia_morning_hatchback_2004_2010 -> [1.06, 1.18]
hyundai_avante_sedan_2011_2015 -> [0.94, 1.04]
kia_k5_sedan_2020_ -> [0.93, 1.03]
renault_sm3_sedan_2015_2018 -> [0.96, 1.07]
kia_k7_sedan_2016_2020 -> [0.97, 1.07]
hyundai_sonata_sedan_2004_2009 -> [0.92, 1.04]
kia_sorrento_suv_2020_ -> [1.05, 1.18]
kia_sportage_suv_2016_2020 -> [1.02, 1.14]
genesis_g80_sedan_2016_2020 -> [0.93, 1.03]
hyundai_grandstarex_van_2018_2020 -> [1.16, 1.31]
hyundai_grandeur_sedan_2011_2

In [4]:
# FOLD 나누기
# 그룹 생성: Y를 statify하게 나누고, 같은 GROUP(같은 이미지의 어노테이션)의 이미지는 같은 fold에 들어가도록
# 그룹 = 이미지 이름
# 라벨 = 클래스 & tan 값으로 총 34 * 3 = 102개
coco = COCO(anno_dir)
class_labels = [category['name'] for _, category in coco.cats.items()]
num_classes = len(class_labels)

X = [] # 입력데이터(이미지 이름) -> GROUP과 동일
Y = [] # 라벨(클래스)
GROUP = [] # 그룹(이미지 이름)
image_ids = coco.getImgIds() # 이미지 index
ann_cnt = 0 # 결과 비교할 어노테이션 개수 카운트
for image_id in image_ids:
    img_info = coco.loadImgs(image_id)[0]
    
    ann_ids = coco.getAnnIds(imgIds=image_id)
    anns = coco.loadAnns(ann_ids)
    for ann in anns:
        category_id = ann['category_id']
        xmin, ymin, xmax, ymax = ann['bbox']
        w = xmax - xmin
        h = ymax - ymin
        tan = h / w
        
        # 삼분위수 기준으로 그룹 생성
        if quantiles_dict[encode[category_id]][0] < tan < quantiles_dict[encode[category_id]][1]:
            category_id += num_classes
        elif quantiles_dict[encode[category_id]][1] <= tan:
            category_id += num_classes * 2

        X.append(img_info['file_name'])
        Y.append(category_id)
        GROUP.append(img_info['file_name'])
        ann_cnt += 1

print(f'X={len(X)}개, Y={len(Y)}개, G={len(GROUP)}개')
print(ann_cnt == len(X) == len(Y) == len(GROUP))
print(f'X={X[:5]}, min={min(X)}, max={max(X)}')
print(f'Y={Y[:5]}, min={min(Y)}, max={max(Y)}')
print(f'G={GROUP[:5]}, min={min(GROUP)}, max={max(GROUP)}')

loading annotations into memory...
Done (t=0.07s)
creating index...
index created!
X=17000개, Y=17000개, G=17000개
True
X=['syn_00710.png', 'syn_00710.png', 'syn_00710.png', 'syn_06361.png', 'syn_06361.png'], min=syn_00000.png, max=syn_06480.png
Y=[0, 49, 58, 63, 93], min=0, max=101
G=['syn_00710.png', 'syn_00710.png', 'syn_00710.png', 'syn_06361.png', 'syn_06361.png'], min=syn_00000.png, max=syn_06480.png


In [5]:
# 그룹으로 fold 나누기
n_splits = 5
fold_results = dict()
print()

for i, (train_idx, test_idx) in enumerate(StratifiedGroupKFold(n_splits=n_splits).split(X, Y, GROUP)):
    fold_results[i] = {
        'train': sorted(list(set([X[idx] for idx in train_idx]))), # annotation_id를 이미지 이름으로 변경
        'valid': sorted(list(set([X[idx] for idx in test_idx]))), # annotation_id를 이미지 이름으로 변경
    }
    print(f'Fold {i} -> train: {len(fold_results[i]["train"])}, valid: {len(fold_results[i]["valid"])}, total = {len(fold_results[i]["train"]) + len(fold_results[i]["valid"])}', end=' -> ')
    # train과 valid에 중복되는 이미지가 있는지 확인 & train + valid = 전체 어노테이션 개수 확인
    print('Test Passed' if len(set(fold_results[i]['train']) & set(fold_results[i]['valid'])) == 0 and len(set(fold_results[i]['train']) | set(fold_results[i]['valid'])) == len(coco.getImgIds()) else 'Test Failed')
    assert len(set(fold_results[i]['train']) & set(fold_results[i]['valid'])) == 0 and len(set(fold_results[i]['train']) | set(fold_results[i]['valid'])) == len(coco.getImgIds()), 'Test Failed'
    # fold_results[i]['train'] = sorted(list(set(fold_results[i]['train'])))
    # fold_results[i]['valid'] = sorted(list(set(fold_results[i]['valid'])))
    fold_results[i]['train'] = list(set(fold_results[i]['train']))
    fold_results[i]['valid'] = list(set(fold_results[i]['valid']))


Fold 0 -> train: 5186, valid: 1295, total = 6481 -> Test Passed
Fold 1 -> train: 5186, valid: 1295, total = 6481 -> Test Passed
Fold 2 -> train: 5185, valid: 1296, total = 6481 -> Test Passed
Fold 3 -> train: 5183, valid: 1298, total = 6481 -> Test Passed
Fold 4 -> train: 5184, valid: 1297, total = 6481 -> Test Passed


In [6]:
print(fold_results[0]['train'][:5])
print(fold_results[0]['valid'][:5])

['syn_01929.png', 'syn_04212.png', 'syn_00182.png', 'syn_00583.png', 'syn_06149.png']
['syn_04397.png', 'syn_04988.png', 'syn_05253.png', 'syn_04293.png', 'syn_01504.png']


In [7]:
# 이미지 이름으로 annotation 가져올 때 필요한 dict 생성
def get_coco_dict(coco):
    coco_dict = dict()
    for image_id in coco.getImgIds():
        img_info = coco.loadImgs(image_id)[0]
        ann_ids = coco.getAnnIds(imgIds=image_id)
        anns = coco.loadAnns(ann_ids)
        coco_dict[img_info['file_name']] = [img_info, anns]
    return coco_dict

coco_dict = get_coco_dict(coco)
for k,v in coco_dict.items():
    print(k, v)
    break

syn_00710.png [{'width': 1920, 'height': 1040, 'file_name': 'syn_00710.png', 'license': 0, 'flickr_url': 'null', 'coco_url': 'null', 'date_captured': 'null', 'id': 0}, [{'image_id': 0, 'category_id': 0, 'area': 68172, 'bbox': [1000, 167, 1276, 414], 'iscrowd': 0, 'id': 0}, {'image_id': 0, 'category_id': 15, 'area': 94248, 'bbox': [794, 488, 1102, 794], 'iscrowd': 0, 'id': 1}, {'image_id': 0, 'category_id': 24, 'area': 61476, 'bbox': [311, 163, 529, 445], 'iscrowd': 0, 'id': 2}]]


In [8]:
# original_json의 포멧인 빈 coco json 객체 생성 함수
def get_coco_json(original_json):
    coco_json = {
        'info': original_json['info'],
        'licenses': original_json['licenses'],
        'categories': original_json['categories'],
        'images': [],
        'annotations': [],
    }
    return coco_json

import os
base_dir = f'./open/{n_splits}fold'
if not os.path.exists(base_dir):
    os.makedirs(base_dir)

for i in fold_results.keys():
    # train fold json 생성
    train_fold_json = get_coco_json(target_json)
    for img_name in fold_results[i]['train']:
        img_info, anns = coco_dict[img_name]
        train_fold_json['images'].append(img_info)
        train_fold_json['annotations'] += anns[:]
    
    with open(os.path.join(base_dir,f'train{i}.json'), 'w') as outfile:
        json.dump(train_fold_json, outfile, indent=2)
    
    print(f'train{i}.json saved')

    # valid fold json 생성
    valid_fold_json = get_coco_json(target_json)
    for img_idx in fold_results[i]['valid']:
        img_info, anns = coco_dict[img_name]
        valid_fold_json['images'].append(img_info)
        valid_fold_json['annotations'] += anns[:]

    with open(os.path.join(base_dir,f'valid{i}.json'), 'w') as outfile:
        json.dump(valid_fold_json, outfile, indent=2)
    
    print(f'valid{i}.json saved')
print()

# coco format 확인
for i in fold_results.keys():
    dir = os.path.join(base_dir,f'train{i}.json')
    cocoformattest = COCO(dir)
    print(f'!!!train{i}.json coco format test passed!!!')
    print()

    dir = os.path.join(base_dir,f'valid{i}.json')
    cocoformattest = COCO(dir)
    print(f'!!!valid{i}.json coco format test passed!!!')
    print()

train0.json saved
valid0.json saved
train1.json saved
valid1.json saved
train2.json saved
valid2.json saved
train3.json saved
valid3.json saved
train4.json saved
valid4.json saved

loading annotations into memory...
Done (t=0.02s)
creating index...
index created!
!!!train0.json coco format test passed!!!

loading annotations into memory...
Done (t=0.01s)
creating index...
index created!
!!!valid0.json coco format test passed!!!

loading annotations into memory...
Done (t=0.07s)
creating index...
index created!
!!!train1.json coco format test passed!!!

loading annotations into memory...
Done (t=0.01s)
creating index...
index created!
!!!valid1.json coco format test passed!!!

loading annotations into memory...
Done (t=0.02s)
creating index...
index created!
!!!train2.json coco format test passed!!!

loading annotations into memory...
Done (t=0.01s)
creating index...
index created!
!!!valid2.json coco format test passed!!!

loading annotations into memory...
Done (t=0.02s)
creating inde