In [8]:
from pathlib import Path

In [38]:
def read_summary_file(path):
    """Read a summary (image-set) text file into a list of lines.

    Args:
        path: Location of the text file; anything accepted by ``open``.

    Returns:
        list[str]: One entry per line of the file, in file order, with the
        terminating newline of each line removed. Blank lines are kept as
        empty strings and no other whitespace is stripped.
    """
    with open(path, 'r') as handle:
        # Every line yielded by the file iterator carries at most one
        # trailing '\n' (the last line may have none); drop it when present.
        return [
            raw[:-1] if raw.endswith('\n') else raw
            for raw in handle
        ]

def write_summary_file(path, lines):
    """Write ``lines`` to a summary text file, one entry per line.

    Entries are separated by a single newline and no newline is written
    after the last entry, so a file produced here reads back unchanged
    through ``read_summary_file``.

    Args:
        path: Destination file; anything accepted by ``open``. An existing
            file is overwritten.
        lines: Iterable of strings to write. An empty iterable produces an
            empty file (indexing the last element used to raise
            ``IndexError`` in that case).
    """
    with open(path, 'w') as f:
        # str.join emits exactly the "separator between entries, none at the
        # end" layout and copes with empty input and arbitrary iterables.
        f.write('\n'.join(lines))

In [10]:
# Root directory under which both datasets used in this notebook live.
dataset_base_path = Path('/home/shlll/Dataset/Teeth/')

# Existing "teeth" dataset: split files, source images and ground-truth images.
teeth_dataset_base_path = dataset_base_path.joinpath('teeth')
teeth_summary_path = teeth_dataset_base_path.joinpath('ImageSets', 'Segmentation')
teeth_train_summary_path = teeth_summary_path.joinpath('train.txt')
teeth_trainval_summary_path = teeth_summary_path.joinpath('trainval.txt')
teeth_val_summary_path = teeth_summary_path.joinpath('val.txt')
teeth_src_image_path = teeth_dataset_base_path.joinpath('JPEGImages')
teeth_gt_image_path = teeth_dataset_base_path.joinpath('SegmentationClass')

# "swfaug" dataset: one sub-directory per subset (A, B and C).
swfaug_dataset_base_path = dataset_base_path.joinpath('swfaug')
swfaug_dataset_subpaths = [
    swfaug_dataset_base_path.joinpath(subset)
    for subset in ('A', 'B', 'C')
]

In [15]:
# Sample names listed in each of the existing split files.
teeth_train_list = read_summary_file(teeth_train_summary_path)
teeth_trainval_list = read_summary_file(teeth_trainval_summary_path)
teeth_val_list = read_summary_file(teeth_val_summary_path)

# File stems (names without extension) present in the source-image and
# ground-truth directories; despite the "_list" suffix these are sets.
teeth_src_image_list = {entry.stem for entry in teeth_src_image_path.glob('*')}
teeth_gt_image_list = {entry.stem for entry in teeth_gt_image_path.glob('*')}

In [31]:
# Number of distinct file stems found in the teeth source-image directory.
len(teeth_src_image_list)

1967

In [33]:
# Number of distinct file stems found in the teeth ground-truth directory.
len(teeth_gt_image_list)

1967

In [34]:
# Number of entries read from the existing train.txt.
len(teeth_train_list)

1578

In [27]:
# Ratio of validation-list size to training-list size in the existing split.
len(teeth_val_list) / len(teeth_train_list)

0.25918884664131814

In [35]:
# Combined number of entries in the train and val lists.
# NOTE(review): the recorded result (1987) is larger than the recorded length
# of teeth_trainval_list (1963), so the two lists may overlap or contain
# duplicate entries - TODO confirm before trusting the existing split.
len(teeth_train_list) + len(teeth_val_list)

1987

In [18]:
# Number of entries read from the existing trainval.txt.
len(teeth_trainval_list)

1963

### 处理swfaug数据集

In [62]:
import os
import shutil
import random
from itertools import chain

from PIL import Image
from tqdm.auto import tqdm
import numpy as np

In [17]:
# Rename the dataset files: shorten each long '_'-separated file name to a
# two-field slice of itself.
def _shorten_image_names(directory, first_field, last_field):
    """Rename files in ``directory`` to a slice of their '_'-separated name.

    Args:
        directory: ``Path`` of the directory whose entries are renamed in place.
        first_field: Index of the first '_'-separated field to keep.
        last_field: Index one past the last field to keep.

    Names with at most two fields are left alone, which makes the cell safe
    to re-run after the files have already been shortened.
    """
    # Snapshot the listing before renaming so the loop does not depend on how
    # the directory iterator reacts to entries changing underneath it (this
    # also gives the progress bar a total).
    for image in tqdm(list(directory.glob('*'))):
        fields = image.name.split('_')
        if len(fields) <= 2:
            continue
        new_name = '_'.join(fields[first_field:last_field])
        if not new_name:
            # Too few fields for the requested slice: the target would be the
            # directory itself and the rename would fail, so skip the file.
            continue
        image.rename(directory / new_name)


for swf_subpath in swfaug_dataset_subpaths:
    # Ground-truth names keep fields 4-5; original-image names keep fields 2-3.
    _shorten_image_names(swf_subpath / 'gt', 4, 6)
    _shorten_image_names(swf_subpath / 'ori', 2, 4)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [34]:
# First create a directory layout that follows the VOC convention.
src_image_path = swfaug_dataset_base_path / 'JPEGImages'
gt_image_path = swfaug_dataset_base_path / 'SegmentationClass'

# Start from empty output directories. The existence check lets the cell run
# on a first pass, when the directories have not been created yet (an
# unconditional rmtree raises FileNotFoundError in that case).
for out_dir in (src_image_path, gt_image_path):
    if out_dir.exists():
        shutil.rmtree(out_dir)
    out_dir.mkdir(parents=False, exist_ok=True)

# Stems of every sample that ends up in the merged dataset.
file_lists = []

# Hard-link the swfaug subsets into the corresponding directories.
for swf_subpath in swfaug_dataset_subpaths:
    prefix = swf_subpath.stem
    gt_path = swf_subpath / 'gt'
    ori_path = swf_subpath / 'ori'
    gt_images = {entry.name for entry in gt_path.glob('*')}
    ori_images = {entry.name for entry in ori_path.glob('*')}
    # Only keep samples that have both a ground-truth and an original image.
    images = gt_images & ori_images

    for image_name in tqdm(images):
        # Split on the last dot only, so names containing extra dots (or no
        # extension at all) still map to a well-formed destination name.
        stem, suffix = os.path.splitext(image_name)
        # Tag the stem with the subset name so samples from different subsets
        # cannot collide inside the shared output directories.
        tagged_stem = f'{stem}_{prefix}_aug'
        file_lists.append(tagged_stem)
        os.link(gt_path / image_name, gt_image_path / (tagged_stem + suffix))
        os.link(ori_path / image_name, src_image_path / (tagged_stem + suffix))

# Merge in the original teeth dataset.
gt_images = {entry.stem for entry in teeth_gt_image_path.glob('*')}
ori_images = {entry.stem for entry in teeth_src_image_path.glob('*')}
images = list(gt_images & ori_images)
file_lists.extend(images)

# NOTE(review): every file in the two teeth directories is linked, while only
# stems present in both are added to file_lists - confirm the directories
# always match.
for path in tqdm(teeth_src_image_path.glob('*')):
    dst_path = src_image_path / path.name
    os.link(path, dst_path)

for path in tqdm(teeth_gt_image_path.glob('*')):
    dst_path = gt_image_path / path.name
    os.link(path, dst_path)

HBox(children=(FloatProgress(value=0.0, max=743.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=339.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=308.0), HTML(value='')))




In [36]:
# Total number of sample names collected for the merged dataset.
len(file_lists)

3357

In [43]:
# Location of the split files for the merged dataset.
summary_path = swfaug_dataset_base_path / "ImageSets/Segmentation"
summary_path.mkdir(parents=True, exist_ok=True)

train_summary_path = summary_path / 'train.txt'
trainval_summary_path = summary_path / 'trainval.txt'
val_summary_path = summary_path / 'val.txt'

# Make the split reproducible: file_lists arrives in set/glob iteration order,
# which can differ between runs, so sort it first and shuffle a copy with a
# dedicated, seeded generator instead of the global random state.
split_seed = 42
file_lists.sort()
shuffled_names = list(file_lists)
random.Random(split_seed).shuffle(shuffled_names)

# The first 20% of the shuffled names form the validation set, the rest train.
split_point = int(0.2 * len(shuffled_names))
val_lists = sorted(shuffled_names[:split_point])
train_lists = sorted(shuffled_names[split_point:])

write_summary_file(trainval_summary_path, file_lists)
write_summary_file(train_summary_path, train_lists)
write_summary_file(val_summary_path, val_lists)

In [64]:
# Re-save every '.jpg' file in the ground-truth directory as a '.png' file
# with the same stem, next to the original.
# The listing is materialized up front so the progress bar has a total and
# the loop is unaffected by the new files appearing in the same directory.
for path in tqdm(sorted(gt_image_path.glob('*.jpg'))):
    dst_path = path.with_suffix('.png')
    # The context manager closes the underlying file once the copy is saved,
    # instead of leaving one open handle per converted image.
    with Image.open(path) as img:
        img.save(dst_path)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [65]:
# Delete the '.jpg' files from the ground-truth directory and the '.png'
# files from the source-image directory.
# NOTE(review): the visible cells convert ground-truth JPEGs to PNG but never
# convert source PNGs, so the source-side deletion drops files without a
# replacement - confirm this is intended.
obsolete_files = chain(
    gt_image_path.glob('*.jpg'),
    src_image_path.glob('*.png'),
)
for path in tqdm(obsolete_files):
    path.unlink()

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [93]:
# Collect the distinct sets of pixel values that occur in the ground-truth
# images whose stem does not have exactly two '_'-separated fields; the
# resulting set of frozensets is displayed as the cell output.
aut_res = set()

for path in tqdm(gt_image_path.glob('*')):
    if len(path.stem.split('_')) == 2:
        # Presumably an original teeth image (augmented stems carry the extra
        # '_<subset>_aug' suffix); skip it without decoding the file.
        continue
    # Close each image as soon as its pixel values have been extracted.
    with Image.open(path) as img:
        res = frozenset(np.unique(np.asarray(img)).tolist())
    aut_res.add(res)

aut_res

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




{frozenset({0}), frozenset({0, 1}), frozenset({0, 1, 2})}

In [92]:
# Remove every ground-truth image that contains more than three distinct
# pixel values, together with its '.jpg' source image.
# The listing is materialized first because files are deleted from the same
# directory while looping.
for path in tqdm(list(gt_image_path.glob('*'))):
    # Read the pixel values and close the file before it may be deleted.
    with Image.open(path) as img:
        res = np.unique(np.asarray(img)).tolist()
    if len(res) > 3:
        ori_path = src_image_path / (path.stem + '.jpg')
        if ori_path.exists():
            ori_path.unlink()
        else:
            # Report a ground-truth image that has no matching source image.
            print(ori_path)
        path.unlink()

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [94]:
# Rebuild the sample list from the ground-truth files that are still present.
# Sorting removes the dependence on directory iteration order.
file_lists = sorted(entry.stem for entry in gt_image_path.glob('*'))

summary_path = swfaug_dataset_base_path / "ImageSets/Segmentation"
summary_path.mkdir(parents=True, exist_ok=True)

train_summary_path = summary_path / 'train.txt'
trainval_summary_path = summary_path / 'trainval.txt'
val_summary_path = summary_path / 'val.txt'

# Make the split reproducible: shuffle a copy of the sorted names with a
# dedicated, seeded generator instead of the global random state.
split_seed = 42
shuffled_names = list(file_lists)
random.Random(split_seed).shuffle(shuffled_names)

# The first 20% of the shuffled names form the validation set, the rest train.
split_point = int(0.2 * len(shuffled_names))
val_lists = sorted(shuffled_names[:split_point])
train_lists = sorted(shuffled_names[split_point:])

write_summary_file(trainval_summary_path, file_lists)
write_summary_file(train_summary_path, train_lists)
write_summary_file(val_summary_path, val_lists)