In [1]:
# ==================== Data preparation ====================
import pandas as pd

# File names of the training and test CSVs.
train_path = "task4_train.csv"
test_path = "task4_test.csv"

# Both files are read from the same input directory.
input_dir = "../input/intro-ml-2025-nccu-task-4"
df_train = pd.read_csv(f"{input_dir}/{train_path}")
df_test = pd.read_csv(f"{input_dir}/{test_path}")

In [2]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, cross_val_score, RandomizedSearchCV
from sklearn.metrics import accuracy_score

from sklearn.ensemble import RandomForestClassifier

# Print the first rows of the training frame, then of the test frame.
for frame in (df_train, df_test):
    print(frame.head())

        x_1       x_2       x_3       x_4       x_5       x_6       x_7  \
0  0.333444 -9.336650 -6.888766  3.369977  2.691888 -0.376606 -1.829312   
1  0.248989 -7.643586 -4.437030  4.461310 -0.099799 -0.894289 -2.015378   
2  0.467403  3.160744  7.471960  0.917924 -2.068755 -0.445085  4.735611   
3 -1.759205  1.604205  0.945888 -0.403433  1.381243  0.680195  1.881954   
4  1.520049 -7.401678 -5.664986  6.031521  1.224718 -0.002912 -0.206706   

        x_8       x_9      x_10 label  
0  1.964839 -1.006556 -5.198968     D  
1  1.427611 -5.867180 -5.136508     D  
2 -0.687857 -3.190333  9.219016     B  
3 -0.156426  3.514367 -3.416243     C  
4  3.355252 -3.325993  1.110367     D  
   id       x_1       x_2       x_3       x_4       x_5       x_6       x_7  \
0   1 -4.065582  0.419357 -8.809746  4.553107  3.328161 -0.406100  3.887675   
1   2 -5.644211 -9.238498  8.144450 -1.749083  1.152026  2.469161  2.055010   
2   3 -5.024622 -8.977388  8.052620 -1.838548  1.663175  1.066181  5.778

In [3]:
# Columns excluded from the feature matrix: 'label' always, and 'id' as well
# when the training frame carries one.
drop_cols = ['label'] + (['id'] if 'id' in df_train.columns else [])

# Every remaining training column is a feature, kept in its original order.
feature_cols = [col for col in df_train.columns if col not in drop_cols]

# Training features / target, and the matching feature columns of the test set.
X = df_train[feature_cols]
y = df_train['label']
X_test = df_test[feature_cols]

# Encode the string labels as consecutive integers; `le` is kept so that
# predictions can be decoded back to the original labels later.
le = LabelEncoder()
le.fit(y)
y_enc = le.transform(y)
num_classes = len(le.classes_)

(X.shape, X_test.shape, num_classes)

((8000, 10), (2000, 10), 5)

In [4]:
# Prefer XGBoost when it is installed; otherwise fall back to a random forest.
try:
    from xgboost import XGBClassifier
except ImportError:
    use_xgb = False
else:
    use_xgb = True

if not use_xgb:
    # Fallback baseline: random forest with fully grown trees.
    rf_params = dict(
        n_estimators=500,
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        n_jobs=-1,
        random_state=42,
    )
    base_model = RandomForestClassifier(**rf_params)
else:
    # Baseline gradient-boosted multi-class model (softprob objective,
    # histogram tree method, row/column subsampling of 0.8).
    xgb_params = dict(
        objective="multi:softprob",
        num_class=num_classes,
        eval_metric="mlogloss",
        tree_method="hist",
        learning_rate=0.05,
        max_depth=5,
        n_estimators=500,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
    )
    base_model = XGBClassifier(**xgb_params)

base_model


In [5]:
# Shuffled, stratified 5-fold splitter with a fixed seed; it is defined at
# notebook level so later cells can reuse the same splitter.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold accuracy of the untuned base model.
cv_scores = cross_val_score(base_model, X, y_enc, cv=cv, scoring="accuracy", n_jobs=-1)

mean_acc = cv_scores.mean()
std_acc = cv_scores.std()
print(f"Base model CV accuracy: {mean_acc:.4f} ± {std_acc:.4f}")


Base model CV accuracy: 0.9992 ± 0.0009


In [6]:
# Hyper-parameter search space; which one is used depends on whether the base
# model is XGBoost or the RandomForest fallback.
if use_xgb:
    param_dist = {
        "n_estimators":    [300, 500, 800, 1000],
        "max_depth":       [3, 5, 7, 9],
        "learning_rate":   [0.01, 0.03, 0.05, 0.1],
        "subsample":       [0.7, 0.8, 1.0],
        "colsample_bytree":[0.7, 0.8, 1.0],
        "min_child_weight":[1, 3, 5],
        "gamma":           [0, 0.1, 0.3]
    }
else:
    # Search space for RandomForest (fallback when xgboost is unavailable).
    # NOTE: max_features="auto" was removed in scikit-learn 1.3 (and was an
    # alias of "sqrt" for classifiers before that). Keeping it makes every
    # candidate that samples it fail parameter validation and score NaN, so it
    # is replaced by None (use all features) to keep three distinct options.
    param_dist = {
        "n_estimators":      [300, 500, 800, 1000],
        "max_depth":         [None, 5, 10, 20],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf":  [1, 2, 4],
        "max_features":      ["sqrt", 0.5, None]
    }

# Randomized search over the space above, scored by accuracy on the same
# stratified folds (`cv`) used for the baseline evaluation.
random_search = RandomizedSearchCV(
    base_model,
    param_distributions=param_dist,
    n_iter=25,                 # tune to the time budget; larger = finer search
    scoring="accuracy",
    cv=cv,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

random_search.fit(X, y_enc)

print("Best CV accuracy:", random_search.best_score_)
print("Best params:", random_search.best_params_)

# With the default refit=True, best_estimator_ is already trained on all of
# (X, y_enc) using the best parameters found.
best_model = random_search.best_estimator_


Fitting 5 folds for each of 25 candidates, totalling 125 fits
Best CV accuracy: 0.999625
Best params: {'subsample': 0.8, 'n_estimators': 1000, 'min_child_weight': 5, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.7}


In [7]:
# `random_search` was created with the default refit=True, so `best_model`
# (its best_estimator_) has already been retrained on the full training set
# with the best parameters; fitting it again here would only repeat that work.

# Predict the test set; the predictions are encoded class indices.
test_pred_enc = best_model.predict(X_test)

# Decode the indices back to the original string labels.
test_pred_label = le.inverse_transform(test_pred_enc)


In [8]:
# Use the test set's own id column when it exists; otherwise fall back to the
# 0-based row position (adjust to the competition rules if needed).
id_col = df_test["id"] if "id" in df_test.columns else np.arange(len(df_test))

# One row per test sample: its id and the decoded predicted label.
submission = pd.DataFrame({"id": id_col, "label": test_pred_label})
submission.to_csv("submission.csv", index=False)

submission.head()


Unnamed: 0,id,label
0,1,E
1,2,A
2,3,A
3,4,C
4,5,D
