In [None]:

import warnings
warnings.filterwarnings('ignore')

import time
import gc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import display, HTML
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import (
    mean_squared_error, r2_score, mean_absolute_error,
    explained_variance_score, mean_absolute_percentage_error
)
from lightgbm import ##"Model name"##


# -------------------------
# Run configuration.
# Values written as ##"..."## are template placeholders: Python treats them
# as comments, so each one must be replaced with a real value before the
# script can be parsed and run.
# -------------------------
# Seed reused for NumPy, both train/test splits, the KFold splitter,
# the randomized search and every estimator instance.
RANDOM_STATE = 42
# Placeholder: location of the input data file.
DATA_PATH = ##"name.xlsx"##
# Placeholder: column used as the regression target; all remaining columns
# are used as features.
TARGET_COLUMN = ##"name"##
# Estimator class; instantiated as MODEL_CLASS(random_state=..., **params).
MODEL_CLASS = LGBMRegressor
USE_RANDOM_SEARCH = True   # True: RandomizedSearchCV; False: GridSearchCV
# Number of parameter settings sampled by RandomizedSearchCV (n_iter).
N_ITER_SEARCH = 40
# Fold count used both as `cv` for the search and as `n_splits` for KFold.
CV_FOLDS = 5
# Fraction of all rows held out as the test set.
TEST_SIZE = 0.2
VALID_SIZE = 0.1          # fraction of the training portion held out as the eval_set for the final fit
# Passed to the search objects as n_jobs / verbose.
N_JOBS = -1
VERBOSE = 2

# Search space handed to RandomizedSearchCV (USE_RANDOM_SEARCH = True).
# Every value is a placeholder to be filled in.
param_distributions = {
    'n_estimators': ##"  "##,
    'learning_rate': ##"  "##,
    'max_depth': ##"  "##,
    'num_leaves': ##"  "##,
    'subsample': ##"  "##,
    'colsample_bytree': ##"  "##,
    'min_child_samples': ##"  "##
}

# Grid handed to GridSearchCV (USE_RANDOM_SEARCH = False).
# Every value is a placeholder to be filled in.
param_grid_small = {
    'n_estimators': ##"  "##,
    'learning_rate': ##"  "##,
    'max_depth': ##"  "##,
}

# Seed NumPy's global random generator.
np.random.seed(RANDOM_STATE)

# helper: scrollable html display for DataFrame
def show_scrollable_df(df, height=250, title=None):
    """Render ``df`` as an HTML table inside a fixed-height scrollable box.

    Args:
        df: Object exposing ``to_html`` (e.g. a pandas DataFrame); it is
            rendered without its index.
        height: Height of the scroll container in pixels.
        title: Optional heading displayed above the table; skipped when falsy.
    """
    table_markup = df.to_html(index=False)
    if title:
        heading = HTML(f"<h4>{title}</h4>")
        display(heading)
    container_markup = (
        f"<div style='height:{height}px; overflow:auto; "
        f"border:1px solid #ccc; padding:6px'>{table_markup}</div>"
    )
    display(HTML(container_markup))


# Load the dataset configured in DATA_PATH. Nothing earlier in the script
# assigns `data`, so without this read the checks below raise NameError.
data = pd.read_excel(DATA_PATH)

# Count missing values per column once and reuse the result for both the
# test and the report.
missing_counts = data.isnull().sum()
if missing_counts.any():
    print('\n警告：数据含缺失值，建议预处理（填充/删除）。缺失值计数：')
    show_scrollable_df(
        missing_counts.reset_index().rename(columns={'index': 'Column', 0: 'MissingCount'}),
        height=200,
    )
else:
    print('\n未检测到缺失值。')



# Separate the regression target from the feature columns.
y = data[TARGET_COLUMN].copy()
X = data.drop(columns=[TARGET_COLUMN])

# Hold out TEST_SIZE of the rows as the test set; the remainder is the
# "full" training set used for the search and cross-validation.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)
print(f"训练集（全）: {len(X_train_full)} 样本")
print(f"测试集: {len(X_test)} 样本")

# Preview the first five training rows with the target appended as the
# last column.
train_preview = pd.concat(
    [X_train_full.head(5), y_train_full.head(5)],
    axis=1,
).reset_index(drop=True)
show_scrollable_df(train_preview, height=180, title='训练集示例（前5行）')

# %%




base_model = MODEL_CLASS(random_state=RANDOM_STATE)

# Arguments that both search strategies receive unchanged.
shared_search_kwargs = dict(
    estimator=base_model,
    scoring='neg_mean_squared_error',
    cv=CV_FOLDS,
    verbose=VERBOSE,
    n_jobs=N_JOBS,
    refit=True,
)

if not USE_RANDOM_SEARCH:
    # Exhaustive search over the small grid.
    print('使用 GridSearchCV (小网格)')
    searcher = GridSearchCV(param_grid=param_grid_small, **shared_search_kwargs)
else:
    # Sample N_ITER_SEARCH candidates from the distributions.
    print('使用 RandomizedSearchCV')
    searcher = RandomizedSearchCV(
        param_distributions=param_distributions,
        n_iter=N_ITER_SEARCH,
        random_state=RANDOM_STATE,
        **shared_search_kwargs,
    )

# Run the search on the full training portion and report wall-clock time.
start_time = time.time()
searcher.fit(X_train_full, y_train_full)
elapsed_minutes = (time.time() - start_time) / 60
print(f"超参搜索完成，耗时: {elapsed_minutes:.2f} 分钟")

# Keep the winning parameter set for the evaluation and final fit below.
best_params = searcher.best_params_
print('最佳参数:')
print(best_params)
print('最佳 CV 得分 (neg MSE):', searcher.best_score_)


# Re-evaluate the best parameter set with a shuffled KFold over the full
# training portion, recording per-fold metrics and remembering the
# validation targets/predictions of the fold with the highest R2.
kf = KFold(n_splits=CV_FOLDS, shuffle=True, random_state=RANDOM_STATE)
fold_results = []
best_fold_index = None
best_fold_r2 = -np.inf
best_y_val = best_y_pred_val = None

for fold_idx, (train_idx, val_idx) in enumerate(kf.split(X_train_full), start=1):
    # Positional row selection for this fold.
    X_train, y_train = X_train_full.iloc[train_idx], y_train_full.iloc[train_idx]
    X_val, y_val = X_train_full.iloc[val_idx], y_train_full.iloc[val_idx]

    # Fresh estimator per fold so no state carries over between folds.
    model = MODEL_CLASS(random_state=RANDOM_STATE, **best_params)
    model.fit(X_train, y_train)
    y_val_pred = model.predict(X_val)

    # Regression metrics on the held-out fold.
    mse = mean_squared_error(y_val, y_val_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_val, y_val_pred)
    r2 = r2_score(y_val, y_val_pred)
    evs = explained_variance_score(y_val, y_val_pred)
    mape = mean_absolute_percentage_error(y_val, y_val_pred)

    fold_metrics = {
        'Fold': fold_idx,
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'R2': r2,
        'Explained Variance': evs,
        'MAPE': mape,
    }
    fold_results.append(fold_metrics)

    # Strictly greater: on ties the earliest fold stays the best one.
    if r2 > best_fold_r2:
        best_fold_r2, best_fold_index = r2, fold_idx
        best_y_val, best_y_pred_val = y_val, y_val_pred

    print(f"Fold {fold_idx}: R2={r2:.4f}, RMSE={rmse:.4f}, MAE={mae:.4f}, MAPE={mape:.4f}")

# Per-fold table followed by the column-wise mean of the numeric metrics.
cv_results_df = pd.DataFrame(fold_results)
print('\n交叉验证结果:')
show_scrollable_df(cv_results_df.round(4), height=220, title='KFold 每折指标')

print('\n平均指标:')
mean_metrics = cv_results_df.mean(numeric_only=True).round(4)
display(mean_metrics)



# Split a validation subset off the training portion; it is used only as
# the evaluation set for early stopping in the final fit.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=VALID_SIZE, random_state=RANDOM_STATE
)

final_model = MODEL_CLASS(random_state=RANDOM_STATE, **best_params)

# Fit with early stopping, trying progressively simpler fit() signatures:
#   1. callback-based early stopping (current LightGBM API),
#   2. the legacy `early_stopping_rounds` keyword,
#   3. a plain fit when the estimator accepts neither.
try:
    import lightgbm as lgb
    final_model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric='rmse',
        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(100)]
    )
    print('final_model.fit() executed with LightGBM callbacks (4.x API).')
except Exception as callback_error:
    # Deliberately broad so that any incompatibility (missing lightgbm,
    # missing callback helpers, unsupported keyword) triggers the fallback,
    # but report the reason instead of discarding it silently.
    print(
        'Callback-based fit failed '
        f'({type(callback_error).__name__}: {callback_error}); '
        'retrying with early_stopping_rounds.'
    )
    try:
        final_model.fit(X_train, y_train,
                        eval_set=[(X_val, y_val)],
                        early_stopping_rounds=50,
                        verbose=False)
        print('final_model.fit() executed with early_stopping_rounds.')
    except TypeError:
        # fit() rejected the legacy keywords: train without early stopping.
        final_model.fit(X_train, y_train)
        print('final_model.fit() executed without early stopping (not supported by this estimator).')


