## 梯度提升决策树（GBDT）

1 分类任务

In [1]:
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Build a synthetic binary-classification dataset:
# 1000 samples, 10 features, 5 of which are informative.
X, y = make_classification(
    n_samples=1000,
    n_features=10,
    n_informative=5,
    n_classes=2,
    random_state=42,
)

# Hold out 20% of the samples as the test split (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Configure the gradient-boosting classifier and fit it on the training split.
# scikit-learn's fit() returns the estimator, so construction and training
# can be written as a single expression.
gb_classifier = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
).fit(X_train, y_train)

# Predict class labels for the held-out samples.
y_pred = gb_classifier.predict(X_test)

# Report the share of test labels that were predicted correctly.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"GBDT 分类器的准确率: {accuracy:.2f}")

GBDT 分类器的准确率: 0.96


2 回归任务

In [2]:
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Build a synthetic regression dataset:
# 1000 samples, 10 features, 5 of which are informative.
X, y = make_regression(
    n_samples=1000,
    n_features=10,
    n_informative=5,
    random_state=42,
)

# Hold out 20% of the samples as the test split (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Configure the gradient-boosting regressor and fit it on the training split.
# scikit-learn's fit() returns the estimator, so construction and training
# can be written as a single expression.
gb_regressor = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
).fit(X_train, y_train)

# Predict targets for the held-out samples.
y_pred = gb_regressor.predict(X_test)

# Report the mean squared error between true and predicted test targets.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"GBDT 回归器的均方误差: {mse:.2f}")

GBDT 回归器的均方误差: 59.86


## LightGBM

1 分类任务

In [4]:
import lightgbm as lgb
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# 1. Load the breast-cancer dataset bundled with scikit-learn.
data = load_breast_cancer()
X, y = data.data, data.target

# 2. Hold out 20% of the samples as the test split (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 3. Wrap both splits as LightGBM Datasets.
#    The evaluation Dataset names the training Dataset as its reference so it is
#    aligned with the training data, as the LightGBM documentation recommends
#    for validation sets.
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# 4. Training parameters.
params = {
    'objective': 'binary',  # binary classification task
    'metric': 'binary_logloss',  # evaluation metric: binary log loss
    'num_leaves': 31,  # maximum number of leaves per tree
    'learning_rate': 0.05,  # learning rate
    'feature_fraction': 0.9,  # fraction of features sampled
    'bagging_fraction': 0.8,  # fraction of rows sampled
    'bagging_freq': 5,  # perform row sampling every 5 iterations
    'verbose': -1  # suppress detailed log output
}

# 5. Train the model.
#    NOTE: the test split is passed via valid_sets for metric evaluation only;
#    no early stopping is configured in this cell, so it is not used to pick
#    the number of boosting rounds.
num_round = 100
model = lgb.train(params, train_data, num_boost_round=num_round, valid_sets=[test_data])

# 6. Predict. The scores returned by predict() are thresholded at 0.5 with one
#    vectorized comparison (instead of a per-element Python loop) to obtain
#    0/1 class labels.
y_pred = model.predict(X_test)
y_pred_binary = (y_pred > 0.5).astype(int)

# 7. Evaluate the model: overall accuracy plus the per-class report.
accuracy = accuracy_score(y_test, y_pred_binary)
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred_binary))

Accuracy: 0.9737
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.95      0.96        43
           1       0.97      0.99      0.98        71

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114


2 回归任务

In [6]:
import lightgbm as lgb
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# 1. Build a synthetic regression dataset:
#    1000 samples, 10 features, 5 of which are informative, noise=0.1.
X, y = make_regression(
    n_samples=1000,
    n_features=10,
    n_informative=5,
    noise=0.1,
    random_state=42,
)

# 2. Hold out 20% of the samples as the test split (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 3. Wrap both splits as LightGBM Datasets.
#    The evaluation Dataset names the training Dataset as its reference so it is
#    aligned with the training data, as the LightGBM documentation recommends
#    for validation sets.
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# 4. Training parameters.
params = {
    'objective': 'regression',  # regression task
    'metric': 'mse',  # evaluation metric: mean squared error
    'num_leaves': 31,  # maximum number of leaves per tree
    'learning_rate': 0.05,  # learning rate
    'feature_fraction': 0.9,  # fraction of features sampled
    'bagging_fraction': 0.8,  # fraction of rows sampled
    'bagging_freq': 5,  # perform row sampling every 5 iterations
    'verbose': -1  # suppress detailed log output
}

# 5. Train the model.
#    NOTE: the test split is passed via valid_sets for metric evaluation only;
#    no early stopping is configured in this cell, so it is not used to pick
#    the number of boosting rounds.
num_round = 100
model = lgb.train(params, train_data, num_boost_round=num_round, valid_sets=[test_data])

# 6. Predict targets for the held-out samples.
y_pred = model.predict(X_test)

# 7. Evaluate the model with the mean squared error on the test split.
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse:.4f}")

Mean Squared Error: 121.2054
