# データを学習用と検証用に分割して整理する

`README.md`の項目「」

In [19]:
# 画像を扱う機械学習のためのデータセットまとめ
# https://qiita.com/leetmikeal/items/7c0d23e39bf38ab8be23

# Visual Object Classes Challenge 2008 (VOC2008)
# http://host.robots.ox.ac.uk/pascal/VOC/voc2008/index.html

# Dockerコンテナで利用できるリソースや権限を制限する（Dockerの最新機能を使ってみよう：第3回）
# dockerコマンドオプション --shm-size
# https://knowledge.sakura.ad.jp/5118/


from glob import glob
import os
import shutil
from typing import List

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm


# Root directory of the VOC2008 dataset, relative to the working directory.
parent_dir: str = "../VOCdevkit/VOC2008"

# 1. Check files

In [8]:
# Load the "train.txt" listing from the dataset's ImageSets/Main directory.
target_dir: str = "ImageSets/Main"
target_file_name: str = "train.txt"

# Directory holding the listing files, and the full path of the one to read.
dir_name: str = os.path.join(parent_dir, target_dir)
train_list_path: str = os.path.join(dir_name, target_file_name)

# No row is treated as a header, so the columns get integer labels.
df = pd.read_csv(train_list_path, header=None)
df

Unnamed: 0,0
0,2008_000008
1,2008_000015
2,2008_000019
3,2008_000023
4,2008_000028
...,...
2106,2008_008748
2107,2008_008749
2108,2008_008757
2109,2008_008770


In [9]:
# Read the remaining listing files from `dir_name`; each one is parsed as
# space-separated values with no header row.
listing_names = (
    "val.txt",
    "test.txt",
    "trainval.txt",
    "train_train.txt",
    "train_val.txt",
    "train_trainval.txt",
    "train_test.txt",
)
listings = []
for target_file_name in listing_names:
    listing_path = os.path.join(dir_name, target_file_name)
    listings.append(pd.read_csv(listing_path, header=None, sep=" "))

# Bind every listing to its own name, in the same order as `listing_names`.
(
    df_val,
    df_test,
    df_trval,
    df_tr_tr,
    df_tr_val,
    df_tr_trval,
    df_tr_test,
) = listings

# Row count of every listing, in the order they were read.
tuple(len(listing) for listing in listings)

(2221, 4133, 4332, 2111, 2221, 4332, 4133)

In [10]:
# Stack the two listings vertically and renumber the rows from 0.
frames_to_stack = [df_tr_trval, df_tr_test]
df_unioned = pd.concat(frames_to_stack, ignore_index=True)
df_unioned

Unnamed: 0,0,1
0,2008_000002,-1
1,2008_000003,1
2,2008_000007,-1
3,2008_000008,-1
4,2008_000009,-1
...,...,...
8460,2008_008769,0
8461,2008_008771,0
8462,2008_008774,0
8463,2008_008775,0


In [38]:
# Drop rows whose column-0 value repeats, then order what is left by column 0.
deduplicated = df_unioned.drop_duplicates(subset=[0])
deduplicated.sort_values(0)

Unnamed: 0,0,1
4332,2008_000001,0
0,2008_000002,-1
1,2008_000003,1
4333,2008_000004,0
4334,2008_000005,0
...,...,...
4330,2008_008772,-1
4331,2008_008773,-1
8462,2008_008774,0
8463,2008_008775,0


In [11]:
# Count every entry matched by "*" directly inside the JPEGImages directory.
target_dir: str = "JPEGImages"
image_glob_pattern: str = os.path.join(parent_dir, target_dir, "*")
image_paths: List[str] = glob(image_glob_pattern)
len(image_paths)

5096

# 2. copy images

In [24]:
# Location of the listing files inside the VOC2008 dataset.
parent_dir: str = "../VOCdevkit/VOC2008"
target_dir: str = "ImageSets/Main"
dir_name: str = os.path.join(parent_dir, target_dir)

# Load the two listings (space-separated, no header row) from that directory.
target_file_name: str = "train_train.txt"
train_listing_path: str = os.path.join(dir_name, target_file_name)
df_tr_tr = pd.read_csv(train_listing_path, header=None, sep=" ")

target_file_name: str = "train_val.txt"
val_listing_path: str = os.path.join(dir_name, target_file_name)
df_tr_val = pd.read_csv(val_listing_path, header=None, sep=" ")


In [25]:
def extract_filenames(df: pd.DataFrame, ext: str = '.jpg') -> List[str]:
    """Return the entries of column ``0`` of ``df`` with ``ext`` appended.

    Args:
        df: Frame whose column labelled ``0`` holds the base names.
        ext: Suffix concatenated onto every base name.

    Returns:
        One ``base name + ext`` string per row, in row order.
    """
    stems: np.ndarray = df[0].to_numpy()
    result: List[str] = []
    for stem in stems:
        result.append(stem + ext)
    return result

# Image file names for each split, using the '.jpg' extension.
image_ext: str = '.jpg'
train_filenames: List[str] = extract_filenames(df_tr_tr, ext=image_ext)
val_filenames: List[str] = extract_filenames(df_tr_val, ext=image_ext)

# Preview the first five validation file names.
val_filenames[0:5]

['2008_000002.jpg',
 '2008_000003.jpg',
 '2008_000007.jpg',
 '2008_000009.jpg',
 '2008_000016.jpg']

In [26]:
def copy_files(fname_list: List[str], origin_dir: str, target_dir: str) -> None:
    """Copy the named files from ``origin_dir`` into the directory ``target_dir``.

    Args:
        fname_list: File names, relative to ``origin_dir``, to copy.
        origin_dir: Directory that contains the source files.
        target_dir: Destination directory; it is created if it does not exist.

    Raises:
        OSError: Propagated from ``os.makedirs`` or ``shutil.copy``, for
            example when a source file is missing.
    """
    # shutil.copy treats a non-existent destination as a *file* path, so without
    # this call every source would be written over a single file named
    # `target_dir` instead of landing inside a directory.
    os.makedirs(target_dir, exist_ok=True)

    for fname in tqdm(fname_list):
        shutil.copy(os.path.join(origin_dir, fname), target_dir)


# Source directory of the JPEG images and the per-split destination directories.
origin_dir: str = os.path.join(parent_dir, "JPEGImages")
image_train_target_dir: str = "../tr_data/images/train"
image_val_target_dir: str = "../tr_data/images/val"

# Make sure both destinations exist before anything is copied into them.
for destination in (image_train_target_dir, image_val_target_dir):
    os.makedirs(destination, exist_ok=True)

# Copy each split's images into its own destination directory.
for split_filenames, destination in (
    (train_filenames, image_train_target_dir),
    (val_filenames, image_val_target_dir),
):
    copy_files(split_filenames, origin_dir, target_dir=destination)

  0%|          | 0/2111 [00:00<?, ?it/s]

  0%|          | 0/2221 [00:00<?, ?it/s]

# 3. copy annotation files (yolo formatted)

In [29]:
# Annotation file names for each split, using the '.txt' extension.
annotation_ext: str = '.txt'
train_annotation_filenames: List[str] = extract_filenames(df_tr_tr, ext=annotation_ext)
val_annotation_filenames: List[str] = extract_filenames(df_tr_val, ext=annotation_ext)

# Preview the first five training annotation file names.
train_annotation_filenames[0:5]


['2008_000008.txt',
 '2008_000015.txt',
 '2008_000019.txt',
 '2008_000023.txt',
 '2008_000028.txt']

In [30]:
# Annotation files are read from, and written under, the ../tr_data tree.
parent_dir: str = "../tr_data"
origin_dir: str = os.path.join(parent_dir, "format4yolo")
annotation_train_target_dir: str = os.path.join(parent_dir, "labels/train")
annotation_val_target_dir: str = os.path.join(parent_dir, "labels/val")

# Make sure both destinations exist before anything is copied into them.
for destination in (annotation_train_target_dir, annotation_val_target_dir):
    os.makedirs(destination, exist_ok=True)

# Copy each split's annotation files into its own destination directory.
for split_filenames, destination in (
    (train_annotation_filenames, annotation_train_target_dir),
    (val_annotation_filenames, annotation_val_target_dir),
):
    copy_files(split_filenames, origin_dir, target_dir=destination)

  0%|          | 0/2111 [00:00<?, ?it/s]

  0%|          | 0/2221 [00:00<?, ?it/s]