In [26]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

# 讀取資料集
dataset_names = []
X_trains = []
y_trains = []
X_tests = []
for folder_name in os.listdir("./Competition_data"):
    dataset_names.append(folder_name)
    X_trains.append(pd.read_csv(f"./Competition_data/{folder_name}/X_train.csv", header=0))
    y_trains.append(pd.read_csv(f"./Competition_data/{folder_name}/y_train.csv", header=0))
    X_tests.append(pd.read_csv(f"./Competition_data/{folder_name}/X_test.csv", header=0))



In [44]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

# 儲存模型結果
models = []
aucs = []  # 儲存每個資料集的 AUC
y_predict = []
n_permutations = 1000  # 置換次數

# 訓練模型並執行置換檢驗
for i in range(len(dataset_names)):
    # 檢查是否有缺失值，並填充缺失值
    tmp_X_train = X_trains[i].fillna(X_trains[i].mean())  # 用均值填充缺失值
    tmp_X_test = X_tests[i].fillna(X_tests[i].mean())  # 用均值填充缺失值
    tmp_y_train = y_trains[i]
    
    # 特徵標準化
    scaler = StandardScaler()
    tmp_X_train = scaler.fit_transform(tmp_X_train)
    tmp_X_test = scaler.transform(tmp_X_test)
    
    # 訓練-測試資料分割
    tmp_X_train, tmp_X_test, tmp_y_train, tmp_y_test = train_test_split(tmp_X_train, tmp_y_train, test_size=0.4, random_state=42)
    
    # 訓練 KNN 模型
    model = KNeighborsClassifier(n_neighbors=3)
    tmp_y_train = tmp_y_train.values.ravel() 
    model.fit(tmp_X_train, tmp_y_train.squeeze())
    
    # 預測和計算 AUC
    tmp_y_prob = model.predict_proba(tmp_X_test)[:, 1]  # 預測概率
    tmp_y_pred = (tmp_y_prob >= 0.5).astype(int)  # 轉換為 0 或 1，閥值為 0.5
    auc = roc_auc_score(tmp_y_test, tmp_y_prob)  # 使用預測機率來計算 AUC
    aucs.append(auc)
    
    # 執行置換檢驗
    permuted_aucs = []
    for _ in range(n_permutations):
        # 置換標籤
        permuted_y_train = np.random.permutation(tmp_y_train)
        model.fit(tmp_X_train, permuted_y_train)
        permuted_y_prob = model.predict_proba(tmp_X_test)[:, 1]
        permuted_auc = roc_auc_score(tmp_y_test, permuted_y_prob)
        permuted_aucs.append(permuted_auc)
    
    # 計算 p-value：原始 AUC 大於置換 AUC 的比例
    p_value = np.mean(np.array(permuted_aucs) >= auc)
    print(f"Dataset: {dataset_names[i]} - AUC: {auc:.4f}, p-value: {p_value:.4f}")
    
    models.append(model)
    y_predict.append(pd.DataFrame(tmp_y_pred))  # 保存 0 或 1 的預測結果

# 計算所有資料集的 AUC 平均值
average_auc = np.mean(aucs)
print(f"Average AUC across all datasets: {average_auc:.4f}")

# 保存預測結果
for idx, dataset_name in enumerate(dataset_names):
    save_path = f'./Competition_data/{dataset_name}/y_predict.csv'
    os.makedirs(os.path.dirname(save_path), exist_ok=True)  # 若資料夾不存在則創建
    df = y_predict[idx]
    df.to_csv(save_path, index=False, header=True)


Dataset: Dataset_1 - AUC: 0.7260, p-value: 0.0000
Dataset: Dataset_10 - AUC: 0.6853, p-value: 0.0000
Dataset: Dataset_11 - AUC: 0.3824, p-value: 0.9090
Dataset: Dataset_12 - AUC: 0.7803, p-value: 0.0000
Dataset: Dataset_13 - AUC: 0.7977, p-value: 0.0000
Dataset: Dataset_14 - AUC: 0.9131, p-value: 0.0000
Dataset: Dataset_15 - AUC: 0.6768, p-value: 0.0000
Dataset: Dataset_16 - AUC: 0.9766, p-value: 0.0000
Dataset: Dataset_17 - AUC: 0.8689, p-value: 0.0000
Dataset: Dataset_18 - AUC: 1.0000, p-value: 0.0000
Dataset: Dataset_19 - AUC: 0.9939, p-value: 0.0000
Dataset: Dataset_2 - AUC: 0.9766, p-value: 0.0000
Dataset: Dataset_20 - AUC: 0.7671, p-value: 0.0010
Dataset: Dataset_21 - AUC: 0.9087, p-value: 0.0000
Dataset: Dataset_22 - AUC: 0.7812, p-value: 0.0070
Dataset: Dataset_23 - AUC: 0.8972, p-value: 0.0000
Dataset: Dataset_24 - AUC: 0.5897, p-value: 0.0120
Dataset: Dataset_25 - AUC: 0.7870, p-value: 0.0000
Dataset: Dataset_26 - AUC: 0.7864, p-value: 0.0000
Dataset: Dataset_27 - AUC: 0.9994