In [45]:
import os
import shutil
from collections import defaultdict

import pandas as pd
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit

In [46]:
# label_stats folders of the numbered label sets 1 through 6; each is scanned
# for "<class>.txt" files by collect_classes and build_multilabel_df.
BASE_LABEL_DIRS = [
    f"/home/jaehyeonpark/data_labels/{set_no}/label_stats"
    for set_no in range(1, 7)
]

In [47]:
# Root of the source dataset; copy_images_and_make_txt reads images from its
# T1/, T2/ and GT/ subfolders.
SRC_ROOT = "/home/jaehyeonpark/Downloads/slope_cd"

In [48]:
# Output root: the notebook creates {split}/T1, T2, GT and label_stats under it,
# and copy_images_and_make_txt writes {split}.txt directly inside it.
DST_ROOT = "/home/jaehyeonpark/Downloads/preprocessed_cd"

In [49]:
# Fraction of images assigned to the train split; the remainder goes to test.
TRAIN_RATIO = 0.8
# Seed passed as random_state to the stratified splitter.
RND_STATE   = 42

In [50]:
def collect_classes(label_dirs):
    """Gather the class names found across every label_stats folder.

    A class name is the stem of a ``*.txt`` file (the extension is matched
    case-insensitively). Returns the unique names as a sorted list.
    """
    return sorted({
        os.path.splitext(entry)[0]
        for folder in label_dirs
        for entry in os.listdir(folder)
        if entry.lower().endswith(".txt")
    })


def build_multilabel_df(label_dirs, classes):
    """Build a one-row-per-image multi-label indicator table.

    Each ``<class>.txt`` file inside a label_stats folder is read as a list of
    image file names, one per line (blank lines are skipped). An image gets a
    1 in a class column when its name appears in that class's file in any of
    the folders.

    Args:
        label_dirs: iterable of paths to ".../label_stats" folders.
        classes: class names that become the 0/1 columns, in this order.

    Returns:
        DataFrame with a "filename" column followed by one 0/1 column per
        class. Rows are sorted by file name so the result (and any seeded
        split built on it) does not depend on directory listing order.
        When no image is listed at all, an empty frame that still carries
        these columns is returned.
    """
    file2cls = defaultdict(set)
    for d in label_dirs:
        for fn in os.listdir(d):
            if not fn.lower().endswith(".txt"):
                continue
            cls = os.path.splitext(fn)[0]
            with open(os.path.join(d, fn), encoding="utf-8") as f:
                for line in f:
                    img = line.strip()
                    if img:
                        file2cls[img].add(cls)

    # os.listdir returns entries in arbitrary order, so sort the image names
    # to get a deterministic row order.
    rows = []
    for img in sorted(file2cls):
        cls_set = file2cls[img]
        row = {"filename": img}
        for cls in classes:
            row[cls] = 1 if cls in cls_set else 0
        rows.append(row)

    if not rows:
        # pd.DataFrame([]) would have no columns at all; keep the schema so
        # callers can still select "filename" and the class columns.
        return pd.DataFrame(columns=["filename", *classes])
    return pd.DataFrame(rows)

def stratified_split(df, classes):
    """Split ``df`` into train and test frames with multi-label stratification.

    A single MultilabelStratifiedShuffleSplit draw is made, sized by
    TRAIN_RATIO and seeded with RND_STATE, stratifying on the ``classes``
    columns of ``df``. Returns ``(train_df, test_df)``, each re-indexed
    from 0.
    """
    splitter = MultilabelStratifiedShuffleSplit(
        n_splits=1,
        train_size=TRAIN_RATIO,
        test_size=1.0 - TRAIN_RATIO,
        random_state=RND_STATE
    )
    filenames = df["filename"].values
    label_matrix = df[classes].values
    train_rows, test_rows = next(splitter.split(filenames, label_matrix))

    return tuple(
        df.iloc[row_ids].reset_index(drop=True)
        for row_ids in (train_rows, test_rows)
    )

def print_dist(df, classes, name):
    """Print how many rows of ``df`` carry each class label.

    Args:
        df: DataFrame with one 0/1 column per entry of ``classes``.
        classes: class column names to report, in print order.
        name: label for the header line (e.g. "Train").

    For every class the count and its percentage of ``len(df)`` are printed.
    An empty frame is reported with 0.00% for each class instead of raising
    ZeroDivisionError.
    """
    total = len(df)
    print(f"\n-- {name} 분포: 총 {total}장 --")
    for cls in classes:
        cnt = int(df[cls].sum())
        # Guard the empty case: cnt / 0 would raise ZeroDivisionError.
        pct = cnt / total * 100 if total else 0.0
        print(f" {cls:8s}: {cnt:5d}장 ({pct:5.2f}%)")


def make_split_dirs(dst_root, split):
    """Create the T1, T2, GT and label_stats folders under dst_root/{split}.

    Folders that already exist are left untouched.
    """
    split_root = os.path.join(dst_root, split)
    for folder_name in ("T1", "T2", "GT", "label_stats"):
        os.makedirs(os.path.join(split_root, folder_name), exist_ok=True)


def copy_images_and_make_txt(df_split, classes, split):
    """Materialize one split under DST_ROOT: list file, images, label stats.

    Steps:
      1. Write DST_ROOT/{split}.txt with one extension-less image name per
         line.
      2. For each image, copy its T1 and T2 ".png" files from SRC_ROOT into
         DST_ROOT/{split}/T1 and .../T2, and its GT image into .../GT. The
         GT file is looked up as .png, then .jpg, then .jpeg, and keeps
         whichever extension was found.
      3. Write DST_ROOT/{split}/label_stats/{cls}.txt for each class, listing
         the file names (as stored in the "filename" column) of the rows
         where that class column equals 1.

    Args:
        df_split: DataFrame with a "filename" column and one 0/1 column per
            class.
        classes: class column names to export.
        split: split name used for the folder and list-file names.

    Raises:
        FileNotFoundError: if a T1/T2 image, or every GT candidate, is
            missing for some row. Files copied before the failure are left
            in place.
    """
    # Create the destination tree here so this function does not silently
    # depend on make_split_dirs having been run in an earlier cell.
    make_split_dirs(DST_ROOT, split)

    # Extension-less names are needed by both the list file and the copy loop.
    basenames = [os.path.splitext(fn)[0] for fn in df_split["filename"]]

    # 1) train.txt or test.txt
    list_path = os.path.join(DST_ROOT, f"{split}.txt")
    with open(list_path, "w", encoding="utf-8") as lf:
        for basename in basenames:
            lf.write(basename + "\n")

    # 2) copy the images
    for basename in basenames:
        # T1 and T2 are always looked up as .png
        for sub in ("T1", "T2"):
            src = os.path.join(SRC_ROOT, sub, basename + ".png")
            if not os.path.isfile(src):
                raise FileNotFoundError(f"{src}가 없습니다.")
            dst = os.path.join(DST_ROOT, split, sub, basename + ".png")
            shutil.copy(src, dst)

        # GT: try .png, then .jpg, then .jpeg, and copy the first match
        gt_src = None
        for ext in (".png", ".jpg", ".jpeg"):
            candidate = os.path.join(SRC_ROOT, "GT", basename + ext)
            if os.path.isfile(candidate):
                gt_src = candidate
                break
        if gt_src is None:
            raise FileNotFoundError(f"GT 이미지({basename}.png/.jpg/.jpeg) 중 하나도 찾을 수 없습니다.")
        gt_dst = os.path.join(DST_ROOT, split, "GT", os.path.basename(gt_src))
        shutil.copy(gt_src, gt_dst)

    # 3) write label_stats/*.txt
    for cls in classes:
        out_path = os.path.join(DST_ROOT, split, "label_stats", f"{cls}.txt")
        with open(out_path, "w", encoding="utf-8") as cf:
            for fn in df_split[df_split[cls] == 1]["filename"]:
                cf.write(fn + "\n")


In [51]:
 # Collect the class list from every label_stats folder.
classes = collect_classes(BASE_LABEL_DIRS)
print("Classes:", classes)


Classes: ['나무', '돌', '물', '변화없음', '사람', '흙']


In [52]:
# Build the per-image 0/1 class table from the label_stats listings.
df = build_multilabel_df(BASE_LABEL_DIRS, classes)
print(f"총 고유 이미지: {len(df)}장")

총 고유 이미지: 6639장


In [53]:
# Split into train/test and print the per-class counts of each split.
train_df, test_df = stratified_split(df, classes)
print_dist(train_df, classes, "Train")
print_dist(test_df,  classes, "Test")


-- Train 분포: 총 5304장 --
 나무    :   150장 ( 2.83%)
 돌     :  1027장 (19.36%)
 물     :   474장 ( 8.94%)
 변화없음:   190장 ( 3.58%)
 사람   :  4550장 (85.78%)
 흙     :   636장 (11.99%)

-- Test 분포: 총 1335장 --
 나무    :    37장 ( 2.77%)
 돌     :   257장 (19.25%)
 물     :   119장 ( 8.91%)
 변화없음:    47장 ( 3.52%)
 사람   :  1138장 (85.24%)
 흙     :   159장 (11.91%)


In [54]:
# Create the output folder tree for both splits under DST_ROOT.
for split in ("train","test"):
    make_split_dirs(DST_ROOT, split)

In [55]:
# Write the list files, copy the images and export label stats for both splits.
copy_images_and_make_txt(train_df, classes, "train")
copy_images_and_make_txt(test_df,  classes, "test")

print("\n✅ 데이터셋 준비 완료!", DST_ROOT)


✅ 데이터셋 준비 완료! /home/jaehyeonpark/Downloads/preprocessed_cd
