### 1. Voting 思想
- 算法不同
- 数据相同

In [1]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import VotingRegressor

In [2]:
"""
Soft Voting/Majority Rule classifier for unfitted estimators.

把鸡蛋放在多个篮子中，
平行赛马

unfitted 未拟合（主动构建的弱分类器）
    - 主动构建一批弱鸡分类器
    - 集成学习策略

overfit 过拟合（训练大了，入戏太深，书呆子，把训练集上的错误也学习了）
    - 训练集表现非常好
    - 测试集表现非常差
    - 模型被训练废了，不能用了！
    
underfit 欠拟合（训练不够，没有充分学习全部有效知识）
    - 训练集表现不够好
    - 测试集表现也不够好
    - 原因：训练不够！！！
    

"""
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

clf1 = LogisticRegression(multi_class='multinomial', random_state=1)
clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
clf3 = GaussianNB()

X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
y = np.array([1, 1, 1, 2, 2, 2])



"""
    参数说明：
        estimators:评估器
        voting='hard':不加权，完全按出现次数投票
        weights:权重
        n_jobs:多线程
        verbose:打印详细日志
"""
eclf1 = VotingClassifier(estimators=[('lr', clf1), 
                                     ('rf', clf2), 
                                     ('gnb', clf3)], 
                         voting='hard')
eclf1 = eclf1.fit(X, y)
print(eclf1.predict(X))

[1 1 1 2 2 2]




### 2. Bagging思想
- bootstrap aggregating(分组聚合)
- 算法相同
- 数据采样不同

In [3]:
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import BaggingRegressor

In [4]:
"""
A Bagging classifier is an ensemble meta-estimator that fits base
classifiers each on random subsets of the original dataset and then
aggregate their individual predictions (either by voting or by averaging)
to form a final prediction. Such a meta-estimator can typically be used as
a way to reduce the variance of a black-box estimator (e.g., a decision
tree), by introducing randomization into its construction procedure and
then making an ensemble out of it.

Bagging 分类器是适合 base 的集成元评估器分类器分别对原始数据集的随机子集进行分类，
然后汇总他们的个人预测（通过投票或平均）形成最终预测。
这样的元评估器通常可以用作一种减少黑盒评估器方差的方法（例如，决策树），通过将随机化引入其构造过程，以及然后用它做一个合奏。
  """  

        
        

# base estimator
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.datasets import make_classification

"""
参数说明
    n_samples: 基评估器
    n_features: 特征数量
    n_informative: 重要的特征
    n_redundant: 冗余的特征
"""
X, y = make_classification(n_samples=100, 
                           n_features=4,
                           n_informative=2, 
                           n_redundant=0,
                           random_state=0, 
                           shuffle=False)
"""
参数说明
    estimator: 基评估器
    n_estimators=10: 将基分类器深度拷贝出来10份
    random_state=0: 随机采样
"""
clf = BaggingClassifier(estimator=SVC(), n_estimators=10, random_state=0).fit(X, y)
clf.predict([[0, 0, 0, 0]])

array([1])

### 3. Stacking 双阶段思想
- 整合策略
- Voting和Bagging的整合策略不太好，所以把整合策略交给数据本身

In [5]:
"""
Stacked generalization consists in stacking the output of individual
estimator and use a classifier to compute the final prediction. 
"""
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import train_test_split

# 加载数据集
X, y = load_iris(return_X_y=True)

# 定义基础模型
estimators = [
    ('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
    ('svr', make_pipeline(StandardScaler(), LinearSVC(random_state=42)))
]

# 定义最终集成模型
clf = StackingClassifier(
    estimators=estimators, 
    final_estimator=LogisticRegression()
)

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)

# 训练模型并评估
score = clf.fit(X_train, y_train).score(X_test, y_test)
print(f"模型在测试集上的准确率: {score:.2f}")

模型在测试集上的准确率: 0.95


### 4. Boosting 思想
- 错题本
- 吾日三省吾身

In [6]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor

In [7]:
"""
An AdaBoost [1]_ classifier is a meta-estimator that begins by fitting a
classifier on the original dataset and then fits additional copies of the
classifier on the same dataset but where the weights of incorrectly
classified instances are adjusted such that subsequent classifiers focus
more on difficult cases.

"""

'\nAn AdaBoost [1]_ classifier is a meta-estimator that begins by fitting a\nclassifier on the original dataset and then fits additional copies of the\nclassifier on the same dataset but where the weights of incorrectly\nclassified instances are adjusted such that subsequent classifiers focus\nmore on difficult cases.\n\n'

### 5. 核心集成学习算法
- RandomForestXX

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

In [9]:
"""
A random forest is a meta estimator that fits a number of decision tree
classifiers on various sub-samples of the dataset and uses averaging to
improve the predictive accuracy and control over-fitting.
Trees in the forest use the best split strategy, i.e. equivalent to passing
`splitter="best"` to the underlying :class:`~sklearn.tree.DecisionTreeRegressor`.
The sub-sample size is controlled with the `max_samples` parameter if
`bootstrap=True` (default), otherwise the whole dataset is used to build
each tree.

随机森林，不仅对样本随机，同时对特征也做了随机
"""
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=1000, n_features=4,
                           n_informative=2, n_redundant=0,
                           random_state=0, shuffle=False)
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X, y)
print(clf.predict([[0, 0, 0, 0]]))

[1]


In [10]:
from xgboost import XGBClassifier
from xgboost import XGBRegressor

In [11]:
"""
    终极推荐：
        - 训练速度
        - 推理速度
        - 处理数据量
        - 结果层面
        - 综合来看，最优！！！！！！
"""
from lightgbm import LGBMClassifier
from lightgbm import LGBMRegressor

In [12]:
lgb = LGBMClassifier()

lgb.fit(X=X_train, y=y_train)
y_pred = lgb.predict(X=X_test)
lgb_acc = (y_pred == y_test).mean()


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000088 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 85
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 4
[LightGBM] [Info] Start training from score -1.080913
[LightGBM] [Info] Start training from score -1.107581
[LightGBM] [Info] Start training from score -1.107581


