In [1]:
import numpy as np
import pandas as pd
import time
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier,
    BaggingClassifier, ExtraTreesClassifier, VotingClassifier
)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

#下面两个 boosting 的方法 是目前市面上最常用的，但是不再sklearn 的包里，需要单独用命令行 输入 pip install xgboost lightgbm 进行安装。
import xgboost as xgb
import lightgbm as lgb

# 生成假数据
X, y = make_classification(n_samples=100000, n_features=20, n_informative=15, n_classes=2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 定义要比较的算法
models = {
    "Logistic Regression": LogisticRegression(),
    "SVM": SVC(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "AdaBoost": AdaBoostClassifier(),
    "KNN": KNeighborsClassifier(),
    "Bagging": BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=10),
    "Extra Trees": ExtraTreesClassifier(),
    "Voting": VotingClassifier(estimators=[
        ('lr', LogisticRegression()), 
        ('rf', RandomForestClassifier()), 
        ('gnb', DecisionTreeClassifier())
    ], voting='hard'),
    "XGBoost": xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "LightGBM": lgb.LGBMClassifier()
}

# 存储结果
results = []

# 训练和评估每个模型
for name, model in models.items():
    # 记录训练时间
    start_time = time.time()
    model.fit(X_train, y_train)
    train_time = time.time() - start_time

    # 记录预测时间
    start_time = time.time()
    y_pred = model.predict(X_test)
    predict_time = time.time() - start_time

    # 计算评估指标
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # 保存结果
    results.append({
        "Model": name,
        "Train Time": train_time,
        "Predict Time": predict_time,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1
    })

# 转换为 DataFrame 并显示
results_df = pd.DataFrame(results)

# 设置 pandas 显示选项
pd.set_option('display.max_columns', None)  # 显示所有列
pd.set_option('display.width', 1000)        # 设置显示宽度
pd.set_option('display.float_format', '{:.2f}'.format) # 设置小数点精度
print("算法比较结果：")
print(results_df)

Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 40078, number of negative: 39922
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001045 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5100
[LightGBM] [Info] Number of data points in the train set: 80000, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500975 -> initscore=0.003900
[LightGBM] [Info] Start training from score 0.003900
算法比较结果：
                  Model  Train Time  Predict Time  Accuracy  Precision  Recall  F1 Score
0   Logistic Regression        0.18          0.00      0.81       0.82    0.81      0.81
1                   SVM       16.91          5.88      0.98       0.98    0.99      0.98
2         Decision Tree        2.10          0.00      0.90       0.90    0.90      0.90
3         Random Forest       26.20          0.24      0.97       0.97    0.98  