In [6]:
# ==================== 資料準備 ====================
import pandas as pd
# File names of the training and test splits.
train_path = "task2_train.csv"
test_path = "task2_test.csv"

# Both files live in the same input directory, so read them with one path
# template; the generator is unpacked in (train, test) order.
df_train, df_test = (
    pd.read_csv(f"../input/intro-ml-2025-nccu-task2/{csv_name}")
    for csv_name in (train_path, test_path)
)

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import Ridge
from sklearn.ensemble import GradientBoostingRegressor

# Seed reused by the CV splitter and the estimators in later cells.
RANDOM_STATE = 42

# Preview the first rows of both frames (train first, then test).
print(df_train.head(), df_test.head(), sep="\n")

# Features & target: the only input feature is the column 'x'. Selecting with
# a list keeps X two-dimensional (a DataFrame rather than a Series).
feature_columns = ['x']
X = df_train[feature_columns]
y = df_train['value']

# Features of the rows to predict.
X_test = df_test[feature_columns]

          x     value
0 -1.555703  0.088271
1  5.588857 -0.735226
2  2.876725  0.230378
3  1.223365  0.005941
4 -4.265369 -0.368457
   id         x
0   1  2.569759
1   2 -4.308516
2   3  0.945976
3   4  1.323267
4   5 -0.940780


In [9]:
# Shuffled 5-fold splitter; the fixed seed makes the fold assignment
# reproducible across the grid searches that share this object.
kfold = KFold(
    n_splits=5,
    shuffle=True,
    random_state=RANDOM_STATE,
)

# MSE wrapped as a scorer. greater_is_better=False makes the scorer return the
# negated MSE, so "higher is better" still holds during model selection.
mse_scorer = make_scorer(
    mean_squared_error,
    greater_is_better=False,
)

In [None]:
from sklearn.linear_model import Ridge

# Pipeline: polynomial expansion (no bias column) -> standardisation -> ridge.
poly_ridge_steps = [
    ('poly', PolynomialFeatures(include_bias=False)),
    ('scaler', StandardScaler()),
    ('ridge', Ridge(random_state=RANDOM_STATE)),
]
poly_ridge_pipe = Pipeline(poly_ridge_steps)

# Search grid, keyed as <step name>__<parameter>: 6 degrees x 5 alphas.
# NOTE(review): the saved output of this cell reports degree=6 and alpha=0.01,
# both on the edge of this grid -- consider widening it.
param_grid_poly_ridge = dict(
    poly__degree=list(range(1, 7)),         # polynomial degree, 1 through 6
    ridge__alpha=[0.01, 0.1, 1, 10, 100],   # regularisation strength
)

# Exhaustive search over the grid using the shared folds and scorer.
gs_poly_ridge = GridSearchCV(
    poly_ridge_pipe,
    param_grid_poly_ridge,
    scoring=mse_scorer,
    cv=kfold,
    n_jobs=-1,
    verbose=0,
)
gs_poly_ridge.fit(X, y)

# best_score_ holds the negated MSE produced by the scorer; flip the sign back
# before reporting it.
print("Poly+Ridge 最佳參數：", gs_poly_ridge.best_params_)
print("Poly+Ridge CV MSE：", -gs_poly_ridge.best_score_)

Poly+Ridge 最佳參數： {'poly__degree': 6, 'ridge__alpha': 0.01}
Poly+Ridge CV MSE： 0.00151352064014058


In [None]:
# Gradient-boosted trees as the second candidate; seeded for reproducibility.
gbr = GradientBoostingRegressor(random_state=RANDOM_STATE)

# Search grid: 2 x 3 x 3 x 2 = 36 parameter combinations.
# NOTE(review): the saved output of this cell reports learning_rate=0.01,
# max_depth=4 and n_estimators=500, each on the edge of this grid -- consider
# widening it.
param_grid_gbr = dict(
    n_estimators=[200, 500],
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[2, 3, 4],
    subsample=[0.8, 1.0],
)

# Exhaustive search over the grid using the shared folds and scorer.
gs_gbr = GridSearchCV(
    gbr,
    param_grid_gbr,
    scoring=mse_scorer,
    cv=kfold,
    n_jobs=-1,
    verbose=0,
)
gs_gbr.fit(X, y)

# best_score_ holds the negated MSE produced by the scorer; flip the sign back
# before reporting it.
print("GBR 最佳參數：", gs_gbr.best_params_)
print("GBR CV MSE：", -gs_gbr.best_score_)

GBR 最佳參數： {'learning_rate': 0.01, 'max_depth': 4, 'n_estimators': 500, 'subsample': 0.8}
GBR CV MSE： 0.0019359744782941727


In [None]:
# Recover the positive cross-validated MSE of each search (scores are negated).
best_poly_ridge_mse = -gs_poly_ridge.best_score_
best_gbr_mse = -gs_gbr.best_score_

# Poly+Ridge is chosen when its MSE is lower or equal (ties go to Poly+Ridge);
# otherwise GBR is chosen.
poly_ridge_wins = best_poly_ridge_mse <= best_gbr_mse
chosen_search, chosen_message, chosen_mse = (
    (gs_poly_ridge, "選擇 Poly+Ridge 作為最終模型，CV MSE =", best_poly_ridge_mse)
    if poly_ridge_wins
    else (gs_gbr, "選擇 GBR 作為最終模型，CV MSE =", best_gbr_mse)
)

# Only the winning search's fitted estimator is accessed.
best_model = chosen_search.best_estimator_
print(chosen_message, chosen_mse)

選擇 Poly+Ridge 作為最終模型，CV MSE = 0.00151352064014058


In [None]:
# No explicit re-fit is needed here: both GridSearchCV objects were built with
# the default refit behaviour, so the `best_estimator_` stored in `best_model`
# has already been retrained on the full training set (X, y) with the best
# parameters. Fitting again would only repeat that work.

# Predict the test set.
y_pred = best_model.predict(X_test)

# Build the submission DataFrame; the columns must be named "id" and "value".
submission = pd.DataFrame({
    'id': df_test['id'],
    'value': y_pred
})

# Preview the first rows of the submission.
submission.head()

Unnamed: 0,id,value
0,1,0.239311
1,2,-0.410179
2,3,-0.052896
3,4,0.033483
4,5,-0.056635


In [14]:
# Write the submission file. When /kaggle/working exists the destination is the
# same as before; otherwise fall back to the current directory so the notebook
# does not fail when run outside Kaggle.
kaggle_working_dir = Path("/kaggle/working")
output_dir = kaggle_working_dir if kaggle_working_dir.is_dir() else Path.cwd()
output_path = output_dir / "submission_v2.csv"

# index=False keeps the row index out of the CSV, leaving only "id" and "value".
submission.to_csv(output_path, index=False)

# Derive the reported file name from the path actually written.
print(f"預測結果已輸出到 {output_path.name}")

預測結果已輸出到 submission_v2.csv
