In [1]:
import numpy as np
import pandas as pd

FLAML参数使用请查询https://github.com/microsoft/FLAML

In [2]:
!pip install flaml

Collecting flaml
  Downloading FLAML-0.9.5-py3-none-any.whl (142 kB)
[K     |████████████████████████████████| 142 kB 598 kB/s 
Collecting scikit-learn>=0.24
  Downloading scikit_learn-1.0.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (24.8 MB)
[K     |████████████████████████████████| 24.8 MB 556 kB/s 
Collecting xgboost<=1.3.3,>=0.90
  Downloading xgboost-1.3.3-py3-none-manylinux2010_x86_64.whl (157.5 MB)
[K     |████████████████████████████████| 157.5 MB 66 kB/s 
Installing collected packages: scikit-learn, xgboost, flaml
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.23.2
    Uninstalling scikit-learn-0.23.2:
      Successfully uninstalled scikit-learn-0.23.2
  Attempting uninstall: xgboost
    Found existing installation: xgboost 1.4.2
    Uninstalling xgboost-1.4.2:
      Successfully uninstalled xgboost-1.4.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that ar

In [3]:
from flaml import AutoML

In [4]:
train = pd.read_csv('../input/mercedes-benz-greener-manufacturing/train.csv.zip')
test = pd.read_csv('../input/mercedes-benz-greener-manufacturing/test.csv.zip')
submission = pd.read_csv('../input/mercedes-benz-greener-manufacturing/sample_submission.csv.zip')

In [5]:
print("\ntrain shape",train.shape)
print("\ntest shape", test.shape)
print("\nsubmission", submission.shape)


train shape (4209, 378)

test shape (4209, 377)

submission (4209, 2)


# 使用label encoder进行转换

In [6]:
from sklearn.preprocessing import LabelEncoder
for c in train.columns:
    if train[c].dtype == 'object':
        lbl = LabelEncoder()
        lbl.fit(list(train[c].values) + list(test[c].values))
        train[c] = lbl.transform(list(train[c].values))
        test[c] = lbl.transform(list(test[c].values))

In [7]:
X=train.drop(['y'],axis=1)
y=train['y']
print(X.shape)
print(y.shape,type(y))

(4209, 377)
(4209,) <class 'pandas.core.series.Series'>


In [8]:
from sklearn.model_selection import train_test_split
train_x,test_x,train_y,test_y=train_test_split(X,y,test_size=0.2)
print(train_x.shape)
print(test_x.shape)
print(train_y.shape)
print(test_y.shape)

(3367, 377)
(842, 377)
(3367,)
(842,)


# 调用模型实例，训练模型，进行预测

In [9]:
automl = AutoML()
automl_settings = {
    "time_budget": 10,  # in seconds
    "metric": 'r2',
    "task": 'regression'
}
automl.fit(X_train=train_x, y_train=train_y,
           **automl_settings)
print(automl.predict(train_x).shape)

# Export the best model
print(automl.model)

[flaml.automl: 01-28 08:20:55] {2007} INFO - task = regression
[flaml.automl: 01-28 08:20:55] {2009} INFO - Data split method: uniform
[flaml.automl: 01-28 08:20:55] {2013} INFO - Evaluation method: holdout
[flaml.automl: 01-28 08:20:55] {2113} INFO - Minimizing error metric: 1-r2
[flaml.automl: 01-28 08:20:55] {2170} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'catboost', 'xgboost', 'extra_tree', 'xgb_limitdepth']
[flaml.automl: 01-28 08:20:55] {2437} INFO - iteration 0, current learner lgbm


[flaml.automl: 01-28 08:20:57] {2551} INFO - Estimated sufficient time budget=19365s. Estimated necessary time budget=166s.
[flaml.automl: 01-28 08:20:57] {2603} INFO -  at 2.7s,	estimator lgbm's best error=0.6831,	best estimator lgbm's best error=0.6831
[flaml.automl: 01-28 08:20:57] {2437} INFO - iteration 1, current learner lgbm
[flaml.automl: 01-28 08:20:57] {2603} INFO -  at 2.8s,	estimator lgbm's best error=0.6831,	best estimator lgbm's best error=0.6831
[flaml.automl: 01-28 08:20:57] {2437} INFO - iteration 2, current learner lgbm
[flaml.automl: 01-28 08:20:57] {2603} INFO -  at 2.8s,	estimator lgbm's best error=0.4742,	best estimator lgbm's best error=0.4742
[flaml.automl: 01-28 08:20:57] {2437} INFO - iteration 3, current learner lgbm
[flaml.automl: 01-28 08:20:57] {2603} INFO -  at 2.9s,	estimator lgbm's best error=0.3983,	best estimator lgbm's best error=0.3983
[flaml.automl: 01-28 08:20:57] {2437} INFO - iteration 4, current learner lgbm
[flaml.automl: 01-28 08:20:57] {2603

(3367,)
<flaml.model.LGBMEstimator object at 0x7f7d9e1d3f90>


In [10]:
print('Best ML leaner:', automl.best_estimator)
print('Best hyperparmeter config:', automl.best_config)
print('Best accuracy on validation data: {0:.4g}'.format(1-automl.best_loss))
print('Training duration of best run: {0:.4g} s'.format(automl.best_config_train_time))

Best ML leaner: lgbm
Best hyperparmeter config: {'n_estimators': 27, 'num_leaves': 9, 'min_child_samples': 17, 'learning_rate': 0.13233424079596745, 'log_max_bin': 8, 'colsample_bytree': 1.0, 'reg_alpha': 0.0195153320494843, 'reg_lambda': 1.341231512833295}
Best accuracy on validation data: 0.6076
Training duration of best run: 0.08466 s


In [11]:
test_predict = automl.predict(test)
submission['y']=test_predict
submission.to_csv('submission.csv', index=False)

In [12]:
automl.best_config

{'n_estimators': 27,
 'num_leaves': 9,
 'min_child_samples': 17,
 'learning_rate': 0.13233424079596745,
 'log_max_bin': 8,
 'colsample_bytree': 1.0,
 'reg_alpha': 0.0195153320494843,
 'reg_lambda': 1.341231512833295}

# 讲上述[9]~[11]的代码反复运行，获得多组参数，并将其中最好的参数拿来使用

In [13]:
from mlxtend.regressor import StackingCVRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

In [14]:
lgbm = LGBMRegressor(n_estimators= 40, 
          num_leaves= 46, 
          min_child_samples= 13,
          learning_rate= 0.0955054451824824, 
          log_max_bin= 10, 
          colsample_bytree= 1.0, 
          reg_alpha= 0.005626855491108521, 
          reg_lambda= 11.660799315054836)

In [15]:
xgboost = XGBRegressor(n_estimators= 2,         
                     max_leaves=8,         
                     min_child_weight= 1.888631728256368,         
                     learning_rate=1.0,         
                     subsample= 0.86572740032985,         
                     colsample_bylevel= 1.0,         
                     colsample_bytree= 1.0,         
                     reg_alpha= 0.0014067552771442214,         
                     tree_method='gpu_hist',         
                     num_boost_round= 2,         
                     gpu_id=0,         
                     reg_lambda= 0.0182039394877554)

In [16]:
stack_gen = StackingCVRegressor(regressors=(xgboost,lgbm),
                                meta_regressor=xgboost,
                                use_features_in_secondary=True)

In [17]:
from datetime import datetime

In [18]:
print('进行模型参数训练 START Fit')

print(datetime.now(), '对stack_gen集成器模型进行参数训练')
stack_gen_model = stack_gen.fit(np.array(X), np.array(y))

进行模型参数训练 START Fit
2022-01-28 08:21:07.908174 对stack_gen集成器模型进行参数训练


In [19]:
stacking_predict = stack_gen_model.predict(np.array(test))

In [20]:
xgboost.fit(train_x, train_y)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1.0,
             colsample_bynode=1, colsample_bytree=1.0, gamma=0, gpu_id=0,
             importance_type='gain', interaction_constraints='',
             learning_rate=1.0, max_delta_step=0, max_depth=6, max_leaves=8,
             min_child_weight=1.888631728256368, missing=nan,
             monotone_constraints='()', n_estimators=2, n_jobs=2,
             num_boost_round=2, num_parallel_tree=1, random_state=0,
             reg_alpha=0.0014067552771442214, reg_lambda=0.0182039394877554,
             scale_pos_weight=1, subsample=0.86572740032985,
             tree_method='gpu_hist', validate_parameters=1, verbosity=None)

In [21]:
xgboost_predict = xgboost.predict(test)

In [22]:
lgbm.fit(train_x,train_y)



LGBMRegressor(learning_rate=0.0955054451824824, log_max_bin=10,
              min_child_samples=13, n_estimators=40, num_leaves=46,
              reg_alpha=0.005626855491108521, reg_lambda=11.660799315054836)

In [23]:
lgbm_predict = lgbm.predict(test)

# 以下的三个模型的权重是，通过几次实验“感觉”出来的，出来submission文件的private score在0.55279

In [24]:
submission['y']=0.2*stacking_predict+0.5*xgboost_predict+0.3*lgbm_predict
submission.to_csv('stacking_submission.csv',index=False)