# Train / Test data split

Of the 1200 samples in iris_extended.csv, 900 are used for training and 300 for testing.

# Setting output directories

In [4]:
import numpy as np
# Root folder that holds one sub-directory per training run.
base_dir = "./iris_runs"

# One row per run: (run directory name, display label of the model).
trial_names = np.array([
    ("Mar19_15_29_21_IRIS_GAN_sd50.0_code2_hidden6", "GAN"),
    ("Mar19_16_00_29_IRIS_InfoGAN_sd50.0_code2_hidden6", "InfoGAN"),
    ("Mar18_16_29_30_IRIS_QGAN_sd1.0_nq5_nl20(MC)", "QGAN(with mode collapse)"),
    ("Mar18_15_46_35_IRIS_QGAN_sd1.0_nq5_nl20(noMC)", "QGAN(without mode collapse)"),
    ("Mar18_16_30_05_IRIS_InfoQGAN_sd1.0_nq5_nl20", "InfoQGAN1"),
    # Disabled run: ("Mar18_15_46_20_IRIS_InfoQGAN_sd1.0_nq5_nl20", "InfoQGAN2")
])


In [3]:
def calculate_max_matching_accuracy(pair_counts, species=('setosa', 'versicolor', 'virginica')):
    """Find the one-to-one species -> code assignment covering the most samples.

    Args:
        pair_counts: Mapping from ``(species_label, code)`` to the number of
            times that pair was observed (e.g. a ``collections.Counter``).
        species: Labels to be matched to codes. Defaults to the three iris
            species.

    Returns:
        ``(accuracy, best_keys)`` where ``accuracy`` is the count covered by
        the best assignment divided by the total count, and ``best_keys`` is
        the list of ``(species_label, code)`` pairs of that assignment, in
        ``species`` order. ``best_keys`` is empty when no assignment covers any
        sample, and ``(0.0, [])`` is returned for empty / all-zero input.

    Notes:
        * Every distinct code is considered, not just three of them. The search
          is brute force over ordered selections of codes, so the cost grows
          roughly as ``n_codes ** len(species)``.
        * When fewer distinct codes than species are present, the surplus
          species are left unmatched and do not appear in ``best_keys``.
    """
    from itertools import permutations

    total = sum(pair_counts.values())
    if total == 0:
        # Nothing to match; also avoids a division by zero below.
        return 0.0, []

    species = list(species)
    # dict.fromkeys de-duplicates while keeping first-seen order, so ties are
    # broken deterministically (a plain set has arbitrary iteration order).
    codes = list(dict.fromkeys(key[1] for key in pair_counts))

    # Pad with a sentinel so that a species can stay unmatched when there are
    # fewer codes than species.
    unassigned = object()
    slots = codes + [unassigned] * max(0, len(species) - len(codes))

    max_sum = 0
    best_keys = []
    for assignment in permutations(slots, len(species)):
        selected_keys = [
            (label, code)
            for label, code in zip(species, assignment)
            if code is not unassigned
        ]
        cur = sum(pair_counts.get(key, 0) for key in selected_keys)
        if cur > max_sum:
            max_sum = cur
            best_keys = selected_keys
    return max_sum / total, best_keys

# 모델의 일관성 평가

In [5]:
import numpy as np
import pandas as pd
import os
import kagglehub
from sklearn.tree import DecisionTreeClassifier
from collections import Counter
import warnings
warnings.filterwarnings("ignore", message="X does not have valid feature names, but DecisionTreeClassifier was fitted with feature names")


# Root folder of the per-trial outputs read by the loop below.
base_dir = "./정리/Scientific Reports/IRIS"

# Training CSV columns: Id, SepalLengthCm, SepalWidthCm, PetalLengthCm, PetalWidthCm, Species
csv_file = os.path.join("data/IRIS", "iris_train_1.csv")
raw_data_df = pd.read_csv(csv_file)

# Split into target and features (the Id column is left out).
target = raw_data_df["Species"]
features = raw_data_df[["SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"]]

# Fit the reference decision tree on the real training data.
dt_clf = DecisionTreeClassifier(random_state=42).fit(features, target)

# For every trial: pick the best epoch, classify its generated samples with the
# decision tree, and measure how consistently each latent code maps to one species.
for trial, model in trial_names:
    trial_path = os.path.join(base_dir, trial)
    scalars_path = os.path.join(trial_path, "scalars.csv")
    df = pd.read_csv(scalars_path)

    # Select the epoch whose four per-feature p-values have the largest sum.
    # scalars.csv is expected to hold an 'epoch' column and the four *_p_value columns.
    df['p_sum'] = (df['SepalLengthCm_p_value'] +
                   df['SepalWidthCm_p_value'] +
                   df['PetalLengthCm_p_value'] +
                   df['PetalWidthCm_p_value'])
    best_epoch_row = df.loc[df['p_sum'].idxmax()]
    best_epoch = int(best_epoch_row['epoch'])
    SepalLengthCm_p_value = best_epoch_row['SepalLengthCm_p_value']
    SepalWidthCm_p_value = best_epoch_row['SepalWidthCm_p_value']
    PetalLengthCm_p_value = best_epoch_row['PetalLengthCm_p_value']
    PetalWidthCm_p_value = best_epoch_row['PetalWidthCm_p_value']
    p_sum = best_epoch_row['p_sum']
    print(f"\n[{model} | {trial}] 선택된 epoch: {best_epoch} p_sum: {p_sum} SepalLengthCm_p_value: {SepalLengthCm_p_value} SepalWidthCm_p_value: {SepalWidthCm_p_value} PetalLengthCm_p_value: {PetalLengthCm_p_value} PetalWidthCm_p_value: {PetalWidthCm_p_value}")

    # Load the generated samples of that epoch.
    # ndmin=2 keeps the array two-dimensional even when the file has a single
    # row or a single column, so predict() and the [:, 0] slice below stay valid.
    numpy_dir = os.path.join(base_dir, trial, "numpy")
    gen_outputs_file = os.path.join(numpy_dir, f"gen_outputs_epoch_{best_epoch}.txt")
    gen_outputs = np.loadtxt(gen_outputs_file, ndmin=2)

    # Load the latent codes of that epoch.
    gen_codes_file = os.path.join(numpy_dir, f"gen_codes_epoch_{best_epoch}.txt")
    gen_codes = np.loadtxt(gen_codes_file, ndmin=2)

    # Predict a species for every generated sample, then count how often each
    # (predicted species, first latent code) pair occurs.
    gen_outputs_pred = dt_clf.predict(gen_outputs)

    # The first code column is rounded to 4 decimals so equal codes compare equal.
    pairs = list(zip(gen_outputs_pred, np.round(gen_codes[:, 0], 4)))
    pair_counts = Counter(pairs)
    # matching is a list of (species, code) pairs,
    # e.g. [('setosa', 0.8), ('virginica', -0.8), ('versicolor', 0.0)]
    acc, matching = calculate_max_matching_accuracy(pair_counts)

    # Emit one LaTeX table row. Backslashes are escaped explicitly: "\h" is not
    # a valid escape sequence and triggers a warning on current Python versions.
    print(f"{model} {best_epoch} & {SepalLengthCm_p_value:.4f} & {SepalWidthCm_p_value:.4f} & {PetalLengthCm_p_value:.4f} & {PetalWidthCm_p_value:.4f} & {acc:.4f} \\\\ \\hline")
    



[GAN | Mar19_15_29_21_IRIS_GAN_sd50.0_code2_hidden6] 선택된 epoch: 417 p_sum: 0.787201111771817 SepalLengthCm_p_value: 0.0214382736732926 SepalWidthCm_p_value: 0.7657628376405496 PetalLengthCm_p_value: 4.159654441918438e-21 PetalWidthCm_p_value: 4.579746690765983e-10
GAN 417 & 0.0214 & 0.7658 & 0.0000 & 0.0000 & 0.5781 \\ \hline

[InfoGAN | Mar19_16_00_29_IRIS_InfoGAN_sd50.0_code2_hidden6] 선택된 epoch: 303 p_sum: 0.8056954081082027 SepalLengthCm_p_value: 9.394321864441318e-11 SepalWidthCm_p_value: 0.8056954080142044 PetalLengthCm_p_value: 1.030023393182618e-19 PetalWidthCm_p_value: 5.5182405224738704e-14
InfoGAN 303 & 0.0000 & 0.8057 & 0.0000 & 0.0000 & 0.8594 \\ \hline

[QGAN(with mode collapse) | Mar18_16_29_30_IRIS_QGAN_sd1.0_nq5_nl20(MC)] 선택된 epoch: 394 p_sum: 0.8740988723659426 SepalLengthCm_p_value: 0.0008021942582858 SepalWidthCm_p_value: 0.8732966780881438 PetalLengthCm_p_value: 9.122181576617383e-14 PetalWidthCm_p_value: 1.9421816275891825e-11
QGAN(with mode collapse) 394 & 0.0008

  from .autonotebook import tqdm as notebook_tqdm


# 일반화

In [19]:
# Average test accuracy per experiment, keyed by
# (model, trial, classifier type, augmented-sample count, raw-train count).
# Running this cell clears any previously collected results.
results = {}

In [20]:
results

{}

In [21]:
import numpy as np
import pandas as pd
import os
import kagglehub
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from collections import Counter
import warnings

warnings.filterwarnings("ignore", message="X does not have valid feature names, but")

# Classifier factory: builds a fresh, unfitted estimator for the given name.
def get_classifier(clf_type):
    """Return a new classifier instance for ``clf_type``.

    Supported names are "Decision Tree", "k-NN", "SVM" and
    "Logistic Regression".

    Raises:
        ValueError: if ``clf_type`` is none of the supported names.
    """
    # Lazy factories: an estimator is only constructed once its name matches.
    factories = (
        ("Decision Tree", lambda: DecisionTreeClassifier()),
        ("k-NN", lambda: KNeighborsClassifier(n_neighbors=5)),
        ("SVM", lambda: SVC(kernel="linear")),
        ("Logistic Regression", lambda: LogisticRegression(solver="liblinear")),
    )
    for name, build in factories:
        if clf_type == name:
            return build()
    raise ValueError(f"Unsupported classifier type: {clf_type}")

# Classifiers to evaluate. get_classifier also accepts "SVM", which is left out of this run.
classification_model_types = ["Decision Tree", "k-NN", "Logistic Regression"]

# Load the train / test splits, keeping the four measurements and the label.
train_csv = os.path.join("data/IRIS", "iris_train_1.csv")
raw_train_df = pd.read_csv(train_csv)
train_df = raw_train_df[["SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm", "Species"]]

test_csv = os.path.join("data/IRIS", "iris_test_1.csv")
raw_test_df = pd.read_csv(test_csv)
test_df = raw_test_df[["SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm", "Species"]]

# The original training data is assumed to be class-balanced already.
# augment_settings rows: [number of augmented samples, number of original train samples];
# augmented counts go 0, 300, ..., 2700 against a fixed 900 original samples.
augment_settings = np.array([[aug_count, 900] for aug_count in range(0, 3000, 300)])

final = True  # When True, the final exported codes / outputs files are loaded.
expand = 1.1  # Expansion factor of the exported codes / outputs (part of their file names).

# Number of repetitions averaged for every augmentation setting.
num_iterations = 100

# Run the augmentation experiment for every trial.

def _load_generated_samples(trial_path, model, trial):
    """Load the generated feature rows and latent codes of one trial.

    When the module-level ``final`` flag is truthy, the exported
    ``outputs_2700_{expand}.txt`` / ``codes_2700_{expand}.txt`` files are read.
    Otherwise the epoch with the largest summed p-value in ``scalars.csv`` is
    selected and that epoch's ``numpy/gen_*`` dumps are read.

    Returns:
        ``(gen_outputs, gen_codes)`` as 2-D arrays.
    """
    # `not final` (rather than `final is None`) so that final=False really
    # switches to per-epoch selection, as the flag's description states.
    if not final:
        scalars_path = os.path.join(trial_path, "scalars.csv")
        df_scalars = pd.read_csv(scalars_path)
        df_scalars['p_sum'] = (df_scalars['SepalLengthCm_p_value'] +
                               df_scalars['SepalWidthCm_p_value'] +
                               df_scalars['PetalLengthCm_p_value'] +
                               df_scalars['PetalWidthCm_p_value'])
        best_epoch_row = df_scalars.loc[df_scalars['p_sum'].idxmax()]
        selected_epoch = int(best_epoch_row['epoch'])
        print(f"\n[{model} | {trial}] selected epoch: {selected_epoch}")
        gen_outputs_file = os.path.join(trial_path, "numpy", f"gen_outputs_epoch_{selected_epoch}.txt")
        gen_codes_file = os.path.join(trial_path, "numpy", f"gen_codes_epoch_{selected_epoch}.txt")
    else:
        gen_outputs_file = os.path.join(trial_path, f"outputs_2700_{expand}.txt")
        gen_codes_file = os.path.join(trial_path, f"codes_2700_{expand}.txt")

    # ndmin=2 keeps single-row / single-column files two-dimensional so that
    # predict() and the [:, 0] slice in the caller do not fail.
    gen_outputs = np.loadtxt(gen_outputs_file, ndmin=2)
    gen_codes = np.loadtxt(gen_codes_file, ndmin=2)
    return gen_outputs, gen_codes


def _sample_augmented_rows(gen_outputs, rounded_codes, matching, aug_per_class, feature_cols):
    """Draw up to ``aug_per_class`` generated rows per matched code and label them.

    ``matching`` is a list of ``(species, code)`` pairs. For each pair, rows
    whose rounded code equals ``code`` are sampled without replacement (all of
    them are taken when fewer than ``aug_per_class`` exist) and tagged with
    ``species``. Returns one DataFrame with ``feature_cols`` plus "Species".
    """
    mapping = {code: species for species, code in matching}
    augmented_samples = []
    for code_val, species_tag in mapping.items():
        indices = np.where(np.isclose(rounded_codes, code_val))[0]
        if len(indices) < aug_per_class:
            selected_indices = indices
        else:
            selected_indices = np.random.choice(indices, size=aug_per_class, replace=False)
        df_aug = pd.DataFrame(gen_outputs[selected_indices], columns=feature_cols)
        df_aug["Species"] = species_tag
        augmented_samples.append(df_aug)
    if len(augmented_samples) > 0:
        return pd.concat(augmented_samples)
    return pd.DataFrame(columns=feature_cols + ["Species"])


def evaluate_model(classification_model_type):
    """Measure test accuracy of one classifier type under every augmentation setting.

    For each trial in ``trial_names`` and each row of ``augment_settings``, the
    experiment is repeated ``num_iterations`` times:

    1. a classifier fitted on the original training data labels the generated
       samples, and ``calculate_max_matching_accuracy`` maps latent codes to
       species;
    2. generated rows are sampled per matched code and appended to the
       training data;
    3. a second classifier is fitted on the combined data and scored on
       ``test_df``.

    The mean accuracy is stored in the module-level ``results`` dict under
    ``(model, trial, classification_model_type, actual_aug_count, raw_train_count)``.
    Settings whose actual augmented count is a multiple of 900 are also printed.

    Args:
        classification_model_type: Name understood by ``get_classifier``.
    """
    feature_cols = ["SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"]

    # The test split never changes, so slice it once instead of per iteration.
    X_test = test_df[feature_cols]
    y_test = test_df["Species"]

    for trial, model in trial_names:
        trial_path = os.path.join("./정리/Scientific Reports/IRIS", trial)
        gen_outputs, gen_codes = _load_generated_samples(trial_path, model, trial)

        # Round the first code column to 4 decimals so equal codes compare equal.
        rounded_codes = np.round(gen_codes[:, 0], 4)

        # Repeat every augmentation setting num_iterations times and average.
        for setting in augment_settings:
            # base_aug_count: requested augmented samples; raw_train_count: original train samples.
            base_aug_count, raw_train_count = setting
            acc_list = []
            # Number of augmented samples to draw per class.
            aug_per_class = base_aug_count // 3 if base_aug_count > 0 else 0

            for iteration in range(num_iterations):
                # The original training frame is used as loaded from CSV.
                origin_train = train_df.copy()
                augmented_train = origin_train.copy()

                if base_aug_count > 0:
                    # Classifier 1: fitted on the original data only, used to
                    # label the generated samples.
                    clf1 = get_classifier(classification_model_type)
                    clf1.fit(origin_train[feature_cols], origin_train["Species"])

                    gen_outputs_pred = clf1.predict(gen_outputs)
                    pair_counts = Counter(zip(gen_outputs_pred, rounded_codes))
                    acc_temp, matching = calculate_max_matching_accuracy(pair_counts)

                    df_augmented = _sample_augmented_rows(
                        gen_outputs, rounded_codes, matching, aug_per_class, feature_cols
                    )
                    # Original train data plus the selected generated rows.
                    augmented_train = pd.concat([origin_train, df_augmented])

                # Classifier 2: fitted on the (possibly augmented) data and scored on the test split.
                clf2 = get_classifier(classification_model_type)
                clf2.fit(augmented_train[feature_cols], augmented_train["Species"])
                y_pred = clf2.predict(X_test)
                acc_list.append(accuracy_score(y_test, y_pred))

            avg_acc = np.mean(acc_list)
            # Augmented samples actually added = size of the last iteration's
            # training frame minus the original train count.
            actual_aug_count = len(augmented_train) - raw_train_count
            if actual_aug_count % 900 == 0:
                print(f"[{model} | {trial}] [{actual_aug_count}+{raw_train_count}] | {classification_model_type} -> Avg Acc: {avg_acc:.4f}")
            results[(model, trial, classification_model_type, actual_aug_count, raw_train_count)] = avg_acc

# ---------------------------------------------------------------------------
# Run every configured classifier, then print all results from highest to lowest accuracy.
print("=====================================")

for classification_model_type in classification_model_types:
    evaluate_model(classification_model_type)

ranked_results = sorted(results.items(), key=lambda item: item[1], reverse=True)
for (model_name, trial_name, clf_type, aug_count, raw_train_count), acc in ranked_results:
    print(f"[{model_name} | {trial_name} | {clf_type}] augmented: {aug_count}, raw_train: {raw_train_count} -> Avg Acc: {acc:.4f}")


[GAN | Mar19_15_29_21_IRIS_GAN_sd50.0_code2_hidden6] [0+900] | Decision Tree -> Avg Acc: 0.9215
[GAN | Mar19_15_29_21_IRIS_GAN_sd50.0_code2_hidden6] [900+900] | Decision Tree -> Avg Acc: 0.8870
[GAN | Mar19_15_29_21_IRIS_GAN_sd50.0_code2_hidden6] [1800+900] | Decision Tree -> Avg Acc: 0.8677
[GAN | Mar19_15_29_21_IRIS_GAN_sd50.0_code2_hidden6] [2700+900] | Decision Tree -> Avg Acc: 0.8667
[InfoGAN | Mar19_16_00_29_IRIS_InfoGAN_sd50.0_code2_hidden6] [0+900] | Decision Tree -> Avg Acc: 0.9202
[InfoGAN | Mar19_16_00_29_IRIS_InfoGAN_sd50.0_code2_hidden6] [900+900] | Decision Tree -> Avg Acc: 0.9148
[InfoGAN | Mar19_16_00_29_IRIS_InfoGAN_sd50.0_code2_hidden6] [1800+900] | Decision Tree -> Avg Acc: 0.9120
[InfoGAN | Mar19_16_00_29_IRIS_InfoGAN_sd50.0_code2_hidden6] [2700+900] | Decision Tree -> Avg Acc: 0.9148
[QGAN(with mode collapse) | Mar18_16_29_30_IRIS_QGAN_sd1.0_nq5_nl20(MC)] [0+900] | Decision Tree -> Avg Acc: 0.9212
[QGAN(with mode collapse) | Mar18_16_29_30_IRIS_QGAN_sd1.0_nq5_nl20(

# 다 합쳐서 분석하자

In [22]:
import pandas as pd

# Each key of `results` is (model_name, trial_name, classification_model, aug_count, raw_train_count)
# and each value is taken to be the accuracy measured for that setting.
data = []
for (model_name, trial_name, classification_model, aug_count, raw_train_count), acc in results.items():
    data.append(dict(
        model_name=model_name,
        classification_model=classification_model,
        setting=f"{aug_count}",
        accuracy=acc,
    ))

df = pd.DataFrame(data)

# One row per (model_name, classification_model), one column per augmentation setting;
# the explicit column list fixes the numeric left-to-right order of the settings.
df_pivot = (
    df.pivot_table(index=["model_name", "classification_model"],
                   columns="setting",
                   values="accuracy",
                   aggfunc="mean")
    .reset_index()
    .reindex(columns=["model_name", "classification_model", "0", "300", "600", "900", "1200", "1500", "1800", "2100", "2400", "2700"])
)
df_pivot

setting,model_name,classification_model,0,300,600,900,1200,1500,1800,2100,2400,2700
0,GAN,Decision Tree,0.921533,0.905733,0.897,0.886967,0.879767,0.870667,0.867667,0.863333,0.865933,0.866733
1,GAN,Logistic Regression,0.93,0.868567,0.8005,0.736033,0.6907,0.642167,0.627133,0.606533,0.578067,0.566667
2,GAN,k-NN,0.946667,0.924967,0.909967,0.898333,0.893633,0.888433,0.885167,0.882933,0.8809,0.88
3,InfoGAN,Decision Tree,0.920167,0.9202,0.914767,0.914833,0.9123,0.9134,0.912,0.9136,0.915733,0.914767
4,InfoGAN,Logistic Regression,0.93,0.940567,0.935667,0.924067,0.9109,0.898633,0.894533,0.891333,0.886133,0.88
5,InfoGAN,k-NN,0.946667,0.946567,0.945633,0.943367,0.941633,0.939567,0.938567,0.937,0.9353,0.933333
6,InfoQGAN1,Decision Tree,0.921133,0.9282,0.9307,0.934833,0.936633,0.935867,0.936933,0.9371,0.939833,0.947567
7,InfoQGAN1,Logistic Regression,0.93,0.946367,0.950867,0.9545,0.9576,0.958767,0.959033,0.9595,0.959567,0.96
8,InfoQGAN1,k-NN,0.946667,0.9556,0.957533,0.9569,0.9558,0.956767,0.954967,0.9566,0.956667,0.956667
9,QGAN(with mode collapse),Decision Tree,0.921167,0.837867,0.791133,0.7504,0.7322,0.709933,0.692333,0.673067,0.656933,0.642567


In [23]:
# Save the pivoted accuracy table as CSV (row index omitted); the current
# `expand` value is embedded in the file name.
df_pivot.to_csv(f"./정리/Scientific Reports/IRIS/iris_results_2700_{expand}.csv", index=False)