In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [2]:
# 加载CSV数据
data = pd.read_csv("./new_data.csv", header=None)
data

Unnamed: 0,0,1,2,3,4,5,6
0,date(日期),Q(径流),E(蒸发),P(降水),Year,Month,Day
1,1983/1/1,6.5,0.9,0.3,1983,1,1
2,1983/1/2,6.99,0,0.2,1983,1,2
3,1983/1/3,6.99,2.2,2.6,1983,1,3
4,1983/1/4,7.49,0.1,27.7,1983,1,4
...,...,...,...,...,...,...,...
1822,1987/12/27,4.52,0.1,2.4,1987,12,27
1823,1987/12/28,4.59,0.9,1.1,1987,12,28
1824,1987/12/29,1.38,1,1,1987,12,29
1825,1987/12/30,6.84,1.1,0,1987,12,30


In [3]:
data = data[1:]

In [4]:
# 将数据拆分为特征和标签
X = data.iloc[:, [2, 3, 4, 5, 6]].values
y = data.iloc[:, 1].values

In [5]:
X_float = X.astype(np.float)
y_float = y.astype(np.float)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  """Entry point for launching an IPython kernel.
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  


In [6]:
# 最大最小归一化函数
def min_max_normalize(data):
# 通过求最大值和最小值，计算数据的取值范围
    min_val = min(data)
    max_val = max(data)
# 通过循环将数据缩放到 [0, 1] 的区间内
    normalized_data = []
    for val in data:
        norm_val = (val - min_val) / (max_val - min_val)
        normalized_data.append(norm_val)
# 返回归一化后的数据
    return normalized_data

In [7]:
for i in range(2):
    column = [row[i] for row in X_float]
    normalize = min_max_normalize(column)
    X_float[:, i] = normalize

In [8]:
# 将数据集拆分为训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X_float, y_float, test_size=0.3, random_state=42)

In [9]:
# 定义基学习器
base_rf_lin = [RandomForestRegressor(n_estimators=100, random_state=0), LinearRegression()]

base_rf_mlp = [RandomForestRegressor(n_estimators=100, random_state=0), MLPRegressor(hidden_layer_sizes=(100, 50), activation='relu', solver='adam', max_iter=1000)]

base_rf_svm = [RandomForestRegressor(n_estimators=100, random_state=0), SVR(kernel='rbf', C=100, gamma=0.1, epsilon=.1)]

base_lin_mlp = [LinearRegression(), MLPRegressor(hidden_layer_sizes=(100, 50), activation='relu', solver='adam', max_iter=1000)]

base_lin_svm = [LinearRegression(), SVR(kernel='rbf', C=100, gamma=0.1, epsilon=.1)]

base_mlp_svm = [MLPRegressor(hidden_layer_sizes=(100, 50), activation='relu', solver='adam', max_iter=1000), SVR(kernel='rbf', C=100, gamma=0.1, epsilon=.1)]

In [10]:
base_models = [base_rf_lin, base_rf_mlp, base_rf_svm, base_lin_mlp, base_lin_svm, base_mlp_svm]

In [11]:
# 定义元学习器
meta_lin = LinearRegression()
meta_rf = RandomForestRegressor(n_estimators=100, random_state=0)
meta_svm = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=.1)
meta_mlp = MLPRegressor(hidden_layer_sizes=(100, 50), activation='relu', solver='adam', max_iter=1000)

In [12]:
meta_models = [meta_lin, meta_rf, meta_svm, meta_mlp]

In [13]:
# 相对误差绝对值的平均值（Mean Absolute Percentage Error，MAPE）
def mean_absolute_percentage_error(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [14]:
# 平均绝对误差（MAE）
def mean_absolute_error(y_true, y_pred):
    return np.mean(np.abs(y_true - y_pred))

In [15]:
# 相对均方根误差（Root Mean Square Percentage Error，RMSPE）
def root_mean_square_percentage_error(y_true, y_pred):
    return np.sqrt(np.mean(np.square((y_true - y_pred) / y_true)))

In [16]:
res_nse# 合格率（Accuracy）
def accuracy(y_true, y_pred, tolerance=0.1):
    errors = np.abs((y_true - y_pred) / y_true)
    return np.sum(errors <= tolerance) / len(errors)

In [17]:
# Nash效率系数（Nash-Sutcliffe efficiency coefficient，NSE）
def nash_sutcliffe_efficiency(y_true, y_pred):
    numerator = np.sum((y_true - y_pred) ** 2)
    denominator = np.sum((y_true - np.mean(y_true)) ** 2)
    return 1 - numerator / denominator

#### Evaluate

In [18]:
res_mae = []
res_nse = []

In [19]:
for meta_model in meta_models:
    for base_model in base_models:
        # 定义k-fold交叉验证的参数
        kf = KFold(n_splits=5, shuffle=True, random_state=0)

        # 定义元特征数组和元标签数组
        meta_features = np.zeros((X.shape[0], len(base_model)))
        meta_labels = np.zeros(X.shape[0])

        # 对每个fold进行训练和预测
        for fold, (train_index, test_index) in enumerate(kf.split(X)):
            print(f'Processing fold {fold+1}')

            # 分割训练和测试集
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            # 对每个基学习器进行训练和预测
            for i, model in enumerate(base_model):
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)
                meta_features[test_index, i] = y_pred

            # 对元学习器进行训练
            meta_model.fit(meta_features[train_index], y_train)

            # 对元学习器进行预测
            meta_labels[test_index] = meta_model.predict(meta_features[test_index])

        # 计算集成预测的MAE和R2
        ensemble_mae = mean_absolute_error(y_float, meta_labels)
        ensemble_r2 = np.corrcoef(y_float, meta_labels)[0, 1]**2

        print("=====================================================")
        print("Base model:{}, Meta model:{}".format(base_model, meta_model))
        print(f'Ensemble MAE: {ensemble_mae:.2f}')
        print(f'Ensemble R2: {ensemble_r2:.2f}')


        mape = mean_absolute_percentage_error(y_float, meta_labels)
        mae = mean_absolute_error(y_float, meta_labels)
        nse = nash_sutcliffe_efficiency(y_float, meta_labels)
        rmspe = root_mean_square_percentage_error(y_float, meta_labels)
        acc = accuracy(y_float, meta_labels)
        res_mae.append(mae)
        res_nse.append(nse)
        print('MAPE:', mape)
        print('MAE:', mae)
        print('NSE:', nse)
        print('RMSPE:', rmspe)
        print('Accuracy:', acc)
        print("=====================================================")

Processing fold 1
Processing fold 2
Processing fold 3
Processing fold 4
Processing fold 5
Base model:[RandomForestRegressor(random_state=0), LinearRegression()], Meta model:LinearRegression()
Ensemble MAE: 13.83
Ensemble R2: 0.24
MAPE: 167.8920870462724
MAE: 13.834684862764496
NSE: 0.22627912164290964
RMSPE: 3.67961337106505
Accuracy: 0.0755750273822563
Processing fold 1
Processing fold 2
Processing fold 3
Processing fold 4
Processing fold 5
Base model:[RandomForestRegressor(random_state=0), MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=1000)], Meta model:LinearRegression()
Ensemble MAE: 13.52
Ensemble R2: 0.24
MAPE: 157.35345248859113
MAE: 13.522335558838082
NSE: 0.22511475332761277
RMSPE: 3.521172071960451
Accuracy: 0.07612267250821468
Processing fold 1
Processing fold 2
Processing fold 3
Processing fold 4
Processing fold 5
Base model:[RandomForestRegressor(random_state=0), SVR(C=100, gamma=0.1)], Meta model:LinearRegression()
Ensemble MAE: 14.56
Ensemble R2: 0.25
MAPE: 197.345

In [28]:
def find_minimum_value(arr):
    min_value = min(arr, key=abs)  # 找到列表中的最小值
    min_index = arr.index(min_value)  # 找到最小值在列表中的索引
    return min_value, min_index

In [29]:
res_mae

[13.834684862764496,
 13.522335558838082,
 14.561498737060715,
 16.768789781636393,
 15.223885122706113,
 15.215221671712868,
 12.728002109487045,
 13.073311755273142,
 12.507667197252523,
 16.34983484941619,
 13.923838413843864,
 14.234507753304628,
 11.420088190102051,
 11.712070588183602,
 11.806236377575953,
 15.40959198487946,
 13.114572332534612,
 13.347681692946074,
 11.531170034000375,
 13.34397576343769,
 13.378468958864506,
 16.470285176357876,
 13.210223196612903,
 14.619219190762948]

In [30]:
# 调用函数查找最小值和索引
min_value, min_index = find_minimum_value(res_mae)
# 输出结果
print("最小值:", min_value)
print("最小值的索引:", min_index)

最小值: 11.420088190102051
最小值的索引: 12


In [31]:
res_nse

[0.22627912164290964,
 0.22511475332761277,
 0.23404748302500322,
 0.02377301420636424,
 0.16944134979275283,
 0.1676063087063372,
 0.1358831694842264,
 0.10994269034952153,
 0.16246750559134648,
 -0.0653244577901182,
 0.08719919531846143,
 0.0475402131902245,
 0.17823789473764984,
 0.1688218043190216,
 0.1590623371063291,
 -0.012219093329540742,
 0.0658984529741391,
 0.04284769072517469,
 0.32715473244480264,
 0.2864697535761289,
 0.30455184234942656,
 0.10221141847728787,
 0.26544938110847427,
 0.23983035331185176]

In [32]:
# 调用函数查找最小值和索引
min_value, min_index = find_minimum_value(res_nse)
# 输出结果
print("最小值:", min_value)
print("最小值的索引:", min_index)

最小值: -0.012219093329540742
最小值的索引: 15
