# Read All Dataset CSVs

In [21]:
import os
import csv
import pandas as pd
import numpy as np

In [22]:
# Load every dataset found under ./Competition_data. Each dataset is expected
# to live in its own sub-folder containing X_train.csv, y_train.csv and
# X_test.csv. The four lists below are index-aligned: position k in each list
# refers to the same dataset.
data_root = "./Competition_data"

dataset_names = []
X_trains = []
y_trains = []
X_tests = []

# os.listdir returns entries in arbitrary order and also yields stray files and
# hidden folders (e.g. .DS_Store, .ipynb_checkpoints). Sort for a reproducible
# dataset order and keep only visible sub-directories.
for folder_name in sorted(os.listdir(data_root)):
    folder_path = os.path.join(data_root, folder_name)
    if folder_name.startswith(".") or not os.path.isdir(folder_path):
        continue
    dataset_names.append(folder_name)
    X_trains.append(pd.read_csv(os.path.join(folder_path, "X_train.csv"), header=0))
    y_trains.append(pd.read_csv(os.path.join(folder_path, "y_train.csv"), header=0))
    X_tests.append(pd.read_csv(os.path.join(folder_path, "X_test.csv"), header=0))

## Data Preprocessing & Feature Engineering

In [23]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer

# Data preprocessing and feature engineering.
# For every dataset: encode non-numeric columns, fill missing values, then
# standardize. All statistics (categories, means, scale) are learned from the
# training split only and re-applied to the test split, so both splits are
# transformed consistently and nothing leaks from the test data.
X_trains_processed = []
X_tests_processed = []

for i in range(len(dataset_names)):
    # 1. Work on copies so the raw frames in X_trains / X_tests stay untouched.
    X_train_df = X_trains[i].copy()
    X_test_df = X_tests[i].copy()

    # 2. Encode non-numeric columns as integer codes.
    #    This has to happen BEFORE imputation: mean imputation rejects
    #    non-numeric input, and once the imputer has returned a float array
    #    there are no object columns left to detect.
    #    The category list comes from the training column; values that are
    #    missing or not present in training receive the code -1 instead of
    #    raising an error.
    non_numeric_cols = X_train_df.select_dtypes(exclude=["number"]).columns
    for col in non_numeric_cols:
        train_categories = pd.Categorical(X_train_df[col]).categories
        X_train_df[col] = pd.Categorical(X_train_df[col], categories=train_categories).codes
        X_test_df[col] = pd.Categorical(X_test_df[col], categories=train_categories).codes

    # 3. Missing-value handling: fill with the per-column mean of the
    #    training data; the same means are reused for the test data.
    imputer = SimpleImputer(strategy="mean")
    X_train_arr = imputer.fit_transform(X_train_df)
    X_test_arr = imputer.transform(X_test_df)

    # 4. Standardize features using the training mean / standard deviation.
    scaler = StandardScaler()
    X_train_arr = scaler.fit_transform(X_train_arr)
    X_test_arr = scaler.transform(X_test_arr)

    # Store the processed arrays, index-aligned with dataset_names.
    X_trains_processed.append(X_train_arr)
    X_tests_processed.append(X_test_arr)


## Train/Test Split & Model Building
You can select an appropriate model and perform corresponding hyperparameter tuning.

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
# from sklearn.metrics import accuracy_score, f1_score, precision_score
from sklearn.metrics import roc_auc_score

In [25]:
from sklearn.pipeline import make_pipeline

# Train one classifier per dataset and estimate its quality on a 20% hold-out.
#
# KNN is distance-based, so features must be on comparable scales. The
# imputation and standardization steps are bundled with the classifier in a
# single pipeline: the pipeline is fitted on the raw training frame and applies
# exactly the same transformation to whatever raw frame is later passed to
# predict_proba. Fitting the scaler inside the pipeline also keeps the hold-out
# rows out of the scaling statistics.
models = []
val_aucs = []  # hold-out ROC AUC per dataset, index-aligned with dataset_names
for i in range(len(dataset_names)):
    tmp_X_train, tmp_X_test, tmp_y_train, tmp_y_test = train_test_split(
        X_trains[i], y_trains[i], test_size=0.2, random_state=42
    )
    model = make_pipeline(
        SimpleImputer(strategy="mean"),
        StandardScaler(),
        KNeighborsClassifier(n_neighbors=3),
    )
    # y is loaded as a one-column DataFrame; flatten it to the 1-D shape that
    # scikit-learn estimators expect.
    model.fit(tmp_X_train, tmp_y_train.to_numpy().ravel())
    # Column 1 of predict_proba is the probability of the second class label.
    tmp_y_prob = model.predict_proba(tmp_X_test)[:, 1]
    auc = roc_auc_score(tmp_y_test, tmp_y_prob)
    val_aucs.append(auc)
    models.append(model)

# One-line summary instead of a line per dataset, to keep the output short.
if val_aucs:
    print(f"mean validation AUC over {len(val_aucs)} datasets: {np.mean(val_aucs):.4f}")



## Inference Model

In [26]:
# Score each dataset's test features with its own trained model and keep the
# probability of the second class (column 1 of predict_proba) in a
# single-column DataFrame named 'y_predict_proba'. The resulting list is
# index-aligned with dataset_names.
y_predicts = [
    pd.DataFrame(
        models[k].predict_proba(X_tests[k])[:, 1],
        columns=['y_predict_proba'],
    )
    for k in range(len(dataset_names))
]
    

## Save result

In [30]:
# Write each dataset's predictions to y_predict.csv inside that dataset's own
# folder, with a header row and without the DataFrame index.
for idx, dataset_name in enumerate(dataset_names):
    output_path = f'./Competition_data/{dataset_name}/y_predict.csv'
    y_predicts[idx].to_csv(output_path, index=False, header=True)