GBM模型

当前使用数据：
1. 移除了一元特征（只有单一取值的特征）；
2. 对二元特征采用众数填充（GPT 的处理有误）；
3. 对多元离散特征采用众数填充；
4. 对多元连续特征采用均值填充。

当前离散和连续的阈值
continuous_threshold = X_train.shape[0] * 0.05  # 假设连续特征的唯一值个数至少为样本数的5%

LightGBM 只训练了 10 轮（n_estimators=10），尚未实现选取并输出最优模型结果的逻辑。

In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import lightgbm as lgb
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score

# Load the feature matrix and the target matrix (one target column per task).
X = np.load('./data_set/X_train.npy')
Y = np.load('./data_set/y_train.npy')

# Hold out 20% of the rows for evaluation; the seed is fixed so the split
# is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# Column indices grouped by feature type.
binary_features = [
    8, 11, 13, 14, 15, 17, 19, 20, 22, 24, 25, 26, 27, 28, 29, 30,
    38, 39, 40, 41, 42, 47, 48, 49, 50, 52, 53, 54, 55, 56, 57, 58,
    59, 60, 62, 63, 64, 65, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76,
    78, 79, 81, 87, 94, 95, 96, 97, 105, 106, 107, 108, 109, 110,
]
categorical_features = [
    1, 2, 3, 4, 7, 9, 10, 37, 43, 44, 45, 46,
    82, 84, 91, 92, 93, 98, 99, 100, 101, 102, 103, 104,
]
continuous_features = [0, 34, 35, 36, 83, 85, 88, 89, 90]

# Columns to drop from every group:
#   - unary (single-valued) features
#   - features with more than 80% missing values: [6, 87]
#   - features with more than 50% missing values: [33, 34]
# NOTE(review): 87 and 34 are named above as dropped, yet they are not in
# `unary_features` and are still present in `binary_features` /
# `continuous_features` -- confirm which is intended.
unary_features = [5, 6, 12, 16, 18, 21, 23, 31, 51, 61, 66, 77, 80, 86, 32, 33]

# Filter each group against the exclusion list, preserving original order.
excluded = set(unary_features)
binary_features = [idx for idx in binary_features if idx not in excluded]
categorical_features = [idx for idx in categorical_features if idx not in excluded]
continuous_features = [idx for idx in continuous_features if idx not in excluded]

# Per-group preprocessing:
#   binary      -> mode imputation
#   categorical -> mode imputation, then one-hot encoding (unknown
#                  categories seen at predict time are ignored)
#   continuous  -> mean imputation, then standardisation
binary_imputer = SimpleImputer(strategy='most_frequent')
categorical_steps = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])
continuous_steps = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])
preprocessor = ColumnTransformer(
    transformers=[
        ('bin', binary_imputer, binary_features),
        ('cat', categorical_steps, categorical_features),
        ('num', continuous_steps, continuous_features),
    ]
)

# Full pipeline: preprocessing followed by one LightGBM classifier per
# target column (10 boosting rounds each, LightGBM logging silenced).
base_classifier = lgb.LGBMClassifier(n_estimators=10, random_state=42, verbose=-1)
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', MultiOutputClassifier(base_classifier)),
])

# Fit on the training split.
pipeline.fit(X_train, Y_train)

# Predict every task for the held-out split.
Y_pred = pipeline.predict(X_test)

# Accuracy per task: transposing lets us walk the target columns pairwise.
accuracies = [
    accuracy_score(true_col, pred_col)
    for true_col, pred_col in zip(Y_test.T, Y_pred.T)
]
for i, acc in enumerate(accuracies, start=1):
    print(f"Accuracy for task {i}: {acc:.4f}")


Accuracy for task 1: 0.6950
Accuracy for task 2: 0.7850
Accuracy for task 3: 0.8050
Accuracy for task 4: 0.8100
Accuracy for task 5: 0.7850
Accuracy for task 6: 0.7450
Accuracy for task 7: 0.7450
Accuracy for task 8: 0.7850
Accuracy for task 9: 0.6000
Accuracy for task 10: 0.7500
Accuracy for task 11: 0.7300




In [None]:
# 定义特征索引
# binary_features = [8, 11, 13, 14, 15, 17, 19, 20, 22, 24, 25, 26, 27, 28, 29, 30, 32, 38, 39, 40, 41, 42, 47, 48, 49, 50, 52, 53, 54, 55, 56, 57, 58, 59, 60, 62, 63, 64, 65, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 78, 79, 81, 87, 94, 95, 96, 97, 105, 106, 107, 108, 109, 110]
# categorical_features = [1, 2, 3, 4, 5, 7, 9, 10, 33, 37, 43, 44, 45, 46, 82, 84, 91, 92, 93, 98, 99, 100, 101, 102, 103, 104]
# continuous_features = [0, 34, 35, 36, 83, 85, 86, 88, 89, 90]

# 剔除了一元特征值
# unary_features = [6, 12, 16, 18, 21, 23, 31, 51, 61, 66, 77, 80]
