In [None]:
import h2o
from h2o.automl import H2OAutoML
import os

# 1. Start H2O, writing its logs to an explicit directory at INFO level.
h2o.init(log_dir="classifier_fast_logs", log_level="INFO")

# 2. Import the iris CSV. The target is the "class" column; the predictors are
#    every column except the last one.
data = h2o.import_file(
    "https://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris_wheader.csv"
)
y = "class"
x = data.columns[:-1]
# Convert the target to a factor (categorical) column.
data[y] = data[y].asfactor()

# 3. Split into train / test frames using a 0.8 ratio and a fixed seed.
train, test = data.split_frame(ratios=[0.8], seed=1234)

# 4. Train AutoML, capped at 60 seconds of runtime and 5 models.
aml = H2OAutoML(
    max_runtime_secs=60,
    max_models=5,
    seed=1,
    project_name="iris_classification",
)
aml.train(x=x, y=y, training_frame=train)

# 5. Print the full leaderboard (all rows).
lb = aml.leaderboard
print(lb.head(rows=lb.nrows))


# H2O logs go to the log_dir passed to h2o.init above ("classifier_fast_logs").


In [None]:

# Save every model listed on the AutoML leaderboard into one local directory.
model_dir = "h2o_classifier_fast_models"
os.makedirs(model_dir, exist_ok=True)

# Model IDs come from the leaderboard's model_id column.
model_ids = aml.leaderboard['model_id'].as_data_frame().iloc[:, 0].tolist()

for mid in model_ids:
    model = h2o.get_model(mid)
    model_path = h2o.save_model(
        model=model,
        path=model_dir,
        force=True,  # overwrite a previously saved copy
    )
    print(f"Saved model {mid} to {model_path}")

# Evaluate the leader model on the held-out test frame.
print(aml.leaderboard)
perf = aml.leader.model_performance(test_data=test)
print(perf)

# Save the leader model on its own under ./saved_model.
model_path = h2o.save_model(
    model=aml.leader,
    path="./saved_model",
    force=True,
)
print(f"Model saved to: {model_path}")

In [None]:
# Show the AutoML leaderboard as this cell's output.
aml.leaderboard

In [None]:
# Look up the GLM model trained by the current AutoML run.
# AutoML model IDs embed the run's timestamp, so a literal ID copied from an
# earlier session cannot be resolved after a kernel restart. Resolve the ID
# from the live leaderboard instead, and fall back to the leader model if this
# run produced no GLM.
glm_model_id = next(
    (
        candidate
        for candidate in aml.leaderboard["model_id"].as_data_frame().iloc[:, 0]
        if candidate.startswith("GLM_")
    ),
    aml.leader.model_id,
)
h2o.get_model(glm_model_id)

In [None]:
# 1. Take the ID of the top-ranked model: first row, first column (model_id)
#    of the leaderboard.
model_id = lb.as_data_frame().iloc[0, 0]

# 2. Fetch that model from the H2O cluster.
model = h2o.get_model(model_id)
# pandas copy of the test frame, kept for local inspection only.
data1 = test.as_data_frame()
# 3. Predict on the test set. H2O's predict() only accepts an H2OFrame and
#    rejects a pandas DataFrame, so pass `test` itself rather than `data1`.
pred = model.predict(test)

# 4. Show the first rows of the predictions.
print(pred.head())

In [None]:
import mlflow

# Point MLflow at the tracking server and select the experiment to log under.
# NOTE(review): the server address is hard-coded — consider moving it to configuration.
mlflow.set_tracking_uri("http://9.134.212.179:5000")
mlflow.set_experiment("h2o_test_1")

# Log whatever `model` currently refers to in the kernel through MLflow's H2O
# flavor, under the name "my_model", inside a run opened by the `with` block.
with mlflow.start_run():
    mlflow.h2o.log_model(model, "my_model")



In [None]:
# Artifact URI of the "my_model" artifact in one specific, hard-coded MLflow run.
model_uri = "runs:/4b78875831d8428faa510fa276df5719/my_model"

# Load the model back through MLflow's H2O flavor.
model1 = mlflow.h2o.load_model(model_uri)

In [None]:
# Predict on the `test` frame with the model loaded from MLflow; the result is
# shown as this cell's output.
model1.predict(test)

In [None]:
# Display the `test` frame as this cell's output.
test

In [None]:
import h2o
from wedata.automl.h2o import WeDataH2oAutoML
import os

# 1. Start H2O, writing its logs to an explicit directory at INFO level.
h2o.init(log_dir="classifier_fast_logs", log_level="INFO")

# 2. Import the iris CSV. The target is the "class" column; the predictors are
#    every column except the last one.
data = h2o.import_file(
    "https://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris_wheader.csv"
)
y = "class"
x = data.columns[:-1]
# Convert the target to a factor (categorical) column.
data[y] = data[y].asfactor()

# 3. Split into train / test frames using a 0.8 ratio and a fixed seed.
train, test = data.split_frame(ratios=[0.8], seed=1234)

# 4. Train with WeDataH2oAutoML. It is constructed with the same arguments as
#    the plain H2OAutoML call it replaces in this cell.
import mlflow

mlflow.set_tracking_uri("http://9.134.212.179:5000")
mlflow.set_experiment("h2o_test_2")

waml = WeDataH2oAutoML(
    max_runtime_secs=60,
    max_models=5,
    seed=1,
    project_name="iris_classification",
)
waml.train(x=x, y=y, training_frame=train)


# 5. Print the full leaderboard (all rows).
lb = waml.leaderboard
print(lb.head(rows=lb.nrows))


# H2O logs go to the log_dir passed to h2o.init above ("classifier_fast_logs").


In [4]:
import numpy as np
from wedata.ts_automl.flaml import WeDataTimeSeriesAutoML
import mlflow
import flaml
from flaml import AutoML
# Point MLflow at the tracking server used by this notebook.
# NOTE(review): the server address is hard-coded — consider moving it to configuration.
mlflow.set_tracking_uri("http://9.134.212.179:5000")
# Build a toy monthly time series: 96 month timestamps (2014-01 through
# 2021-12) and 84 random target values in [0, 1). There are fewer targets than
# timestamps; the fit call below pairs the targets with the first 84 timestamps.
X_train = np.arange("2014-01", "2022-01", dtype="datetime64[M]")
# Use a seeded generator so that re-running the cell reproduces the same targets.
rng = np.random.default_rng(0)
y_train = rng.random(size=84)
# WeDataTimeSeriesAutoML settings as configured here: a time budget of 10
# (presumably seconds — confirm against the library docs), metric "accuracy",
# one concurrent trial, Spark enabled, and force-cancel disabled.
# NOTE(review): the recorded output of this cell reports "Minimizing error
# metric: mape", so the "accuracy" value may not be the metric actually
# used — confirm.
automl_settings = {
    "time_budget": 10,
    "metric": 'accuracy',
    "n_concurrent_trials": 1,
    "use_spark": True,
    "force_cancel": False,  # Activating the force_cancel option can immediately halt Spark jobs once they exceed the allocated time_budget.
}
# Create the time-series AutoML object from the settings dict and select the
# MLflow experiment to log under.
automl = WeDataTimeSeriesAutoML(settings=automl_settings)
mlflow.set_experiment("wedata_demo")

# Start training. The MLflow experiment name and run name are passed
# explicitly; per the original author's note, training records model
# parameters to experiment tracking and logs the best model artifact.
automl.fit(
    # A single column of timestamps, truncated to the number of targets so X
    # and y stay aligned even if the size of y_train is changed later.
    X_train=X_train[:len(y_train)],
    y_train=y_train,  # value for each timestamp
    period=12,  # time horizon to forecast, e.g., 12 months
    task="ts_forecast",
    log_file_name="ts_forecast.log",
    eval_method="holdout",
    use_spark=True,
    mlflow_experiment_name="wedata_demo",
    mlflow_run_name="test_ts_1"
)


[flaml.automl.logger: 06-26 15:54:12] {1728} INFO - task = ts_forecast
[flaml.automl.logger: 06-26 15:54:12] {1739} INFO - Evaluation method: holdout
[flaml.automl.logger: 06-26 15:54:12] {1838} INFO - Minimizing error metric: mape
[flaml.automl.logger: 06-26 15:54:12] {1955} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'arima', 'sarimax', 'prophet']


[I 2025-06-26 15:54:12,932] A new study created in memory with name: optuna
2025/06/26 15:54:13 INFO mlflow.tracking._tracking_service.client: 🏃 View run get_mlflow_log_latency at: http://9.134.212.179:5000/#/experiments/0/runs/12ed27d27c1e428f99775e63f7dadc25.
2025/06/26 15:54:13 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://9.134.212.179:5000/#/experiments/0.


[flaml.automl.logger: 06-26 15:54:13] {2118} INFO - Estimated mlflow_log_latency: 0.8534200191497803 seconds.


[I 2025-06-26 15:54:13,794] A new study created in memory with name: optuna


[flaml.tune.tune: 06-26 15:54:13] {796} INFO - Number of trials: 1/8, 1 RUNNING, 0 TERMINATED
[flaml.tune.tune: 06-26 15:54:13] {819} INFO - Brief result: {'pred_time': 0.0015996893246968587, 'wall_clock_time': 0.9734647274017334, 'metric_for_logging': {'pred_time': 0.0015996893246968587}, 'val_loss': 2.979735271660923, 'trained_estimator': <flaml.automl.time_series.ts_model.LGBM_TS object at 0x7fa07c1e62b0>}
[flaml.tune.tune: 06-26 15:54:13] {796} INFO - Number of trials: 2/8, 1 RUNNING, 1 TERMINATED


                                                                                

[flaml.tune.tune: 06-26 15:54:17] {819} INFO - Brief result: {'pred_time': 0.01662011941274007, 'wall_clock_time': 4.496348857879639, 'metric_for_logging': {'pred_time': 0.01662011941274007}, 'val_loss': 2.7281412126198443, 'trained_estimator': <flaml.automl.time_series.ts_model.RF_TS object at 0x7fa07c1ee2e0>}
[flaml.tune.tune: 06-26 15:54:17] {796} INFO - Number of trials: 3/8, 1 RUNNING, 2 TERMINATED
[flaml.tune.tune: 06-26 15:54:19] {819} INFO - Brief result: {'pred_time': 0.00552139679590861, 'wall_clock_time': 6.38783860206604, 'metric_for_logging': {'pred_time': 0.00552139679590861}, 'val_loss': 2.7611331939697266, 'trained_estimator': <flaml.automl.time_series.ts_model.XGBoost_TS object at 0x7fa07c160790>}
[flaml.tune.tune: 06-26 15:54:19] {796} INFO - Number of trials: 4/8, 1 RUNNING, 3 TERMINATED
[flaml.tune.tune: 06-26 15:54:21] {819} INFO - Brief result: {'pred_time': 0.01739680767059326, 'wall_clock_time': 8.193345069885254, 'metric_for_logging': {'pred_time': 0.0173968076

15:54:30 - cmdstanpy - INFO - Chain [1] start processing
15:54:30 - cmdstanpy - INFO - Chain [1] done processing


[flaml.tune.tune: 06-26 15:54:31] {819} INFO - Brief result: {'pred_time': 0.0839068094889323, 'wall_clock_time': 18.863497018814087, 'metric_for_logging': {'pred_time': 0.0839068094889323}, 'val_loss': 1.8494102139450683, 'trained_estimator': <flaml.automl.time_series.ts_model.Prophet object at 0x7fa07c121580>}


2025/06/26 15:54:31 INFO mlflow.tracking._tracking_service.client: 🏃 View run test_ts_1_child_0 at: http://9.134.212.179:5000/#/experiments/0/runs/77cbf48c1242422b83a58402b6fe3634.
2025/06/26 15:54:31 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://9.134.212.179:5000/#/experiments/0.
2025/06/26 15:54:31 INFO mlflow.tracking._tracking_service.client: 🏃 View run test_ts_1_child_1 at: http://9.134.212.179:5000/#/experiments/0/runs/4bacf2b574914f37b3b4150a01617f10.
2025/06/26 15:54:31 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://9.134.212.179:5000/#/experiments/0.
2025/06/26 15:54:32 INFO mlflow.tracking._tracking_service.client: 🏃 View run test_ts_1_child_2 at: http://9.134.212.179:5000/#/experiments/0/runs/af25e7919f5c4f428809ea8174d543e9.
2025/06/26 15:54:32 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://9.134.212.179:5000/#/experiments/0.
2025/06/26 15:54:32 INFO mlflow.tracking._tracking_service.cl

[flaml.automl.logger: 06-26 15:54:32] {49} INFO - logging best model sarimax
[flaml.automl.logger: 06-26 15:54:33] {110} INFO - selected model: None
[flaml.automl.logger: 06-26 15:54:37] {245} INFO - retrain sarimax for 4.7s
[flaml.automl.logger: 06-26 15:54:37] {248} INFO - retrained model: <statsmodels.tsa.statespace.sarimax.SARIMAXResultsWrapper object at 0x7fa0e42d6490>
[flaml.automl.logger: 06-26 15:54:37] {250} INFO - Best MLflow run name: test_ts_1_child_6
[flaml.automl.logger: 06-26 15:54:37] {251} INFO - Best MLflow run id: e0ffe4a6559d4fe9b8435aaa942df1dc




[flaml.automl.logger: 06-26 15:54:45] {1985} INFO - fit succeeded
[flaml.automl.logger: 06-26 15:54:45] {1986} INFO - Time taken to find the best model: 17.664713859558105


2025/06/26 15:54:45 INFO mlflow.tracking._tracking_service.client: 🏃 View run test_ts_1 at: http://9.134.212.179:5000/#/experiments/0/runs/685c0895c545438691792838aba0796e.
2025/06/26 15:54:45 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://9.134.212.179:5000/#/experiments/0.


In [None]:
import numpy as np
from wedata.ts_automl.flaml import WeDataTimeSeriesAutoML
import mlflow
import flaml
from flaml import AutoML
# Point MLflow at the tracking server used by this notebook.
# NOTE(review): the server address is hard-coded — consider moving it to configuration.
mlflow.set_tracking_uri("http://9.134.212.179:5000")
# Build a toy monthly time series: 96 month timestamps (2014-01 through
# 2021-12) and 84 random target values in [0, 1). There are fewer targets than
# timestamps, so X_train must be truncated to match before fitting.
X_train = np.arange("2014-01", "2022-01", dtype="datetime64[M]")
# Use a seeded generator so that re-running the cell reproduces the same targets.
rng = np.random.default_rng(0)
y_train = rng.random(size=84)
# WeDataTimeSeriesAutoML settings as configured here: a time budget of 10
# (presumably seconds — confirm against the library docs), metric "accuracy",
# one concurrent trial, Spark enabled, and force-cancel disabled.
# NOTE(review): "accuracy" is an unusual metric for a forecasting task —
# confirm it is the intended value.
automl_settings = {
    "time_budget": 10,
    "metric": 'accuracy',
    "n_concurrent_trials": 1,
    "use_spark": True,
    "force_cancel": False,  # Activating the force_cancel option can immediately halt Spark jobs once they exceed the allocated time_budget.
}
