In [1]:
# Mount Google Drive at /content/drive so later cells can read files stored there.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!apt-get install graphviz -y
!pip install pydot

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
graphviz is already the newest version (2.42.2-6ubuntu0.1).
0 upgraded, 0 newly installed, 0 to remove and 2 not upgraded.


In [3]:
import pandas as pd
import numpy as np
import datetime
# Load the training data
features = pd.read_csv("/content/drive/MyDrive/529/train_data.csv")
# From here on `features` holds the raw table.
# Next: convert `startdate` to a datetime type,
# normalising the mixed date formats first.
def fix_date_format(x):
    """Parse one raw ``startdate`` value into a ``pandas.Timestamp``.

    Two textual layouts are recognised:

    * ``YYYY/MM/DD`` - exactly two slashes and a four-character first field;
    * ``MM/DD/YY``   - assumed for every other string.

    Parameters
    ----------
    x : str
        Raw date text. Leading/trailing whitespace is ignored.

    Returns
    -------
    pandas.Timestamp or pandas.NaT
        The parsed date, or ``pd.NaT`` when ``x`` is not a string (for
        example a missing value) or matches neither layout.
    """
    # Missing cells arrive as NaN/None rather than text; treat them as "no date"
    # explicitly instead of relying on an AttributeError being swallowed.
    if not isinstance(x, str):
        return pd.NaT

    text = x.strip()
    fields = text.split('/')
    year_first = len(fields) == 3 and len(fields[0]) == 4
    date_format = '%Y/%m/%d' if year_first else '%m/%d/%y'

    try:
        return pd.to_datetime(text, format=date_format)
    except (ValueError, TypeError):
        # Only parsing failures become NaT; anything else (e.g. KeyboardInterrupt)
        # propagates instead of being hidden by a bare except.
        return pd.NaT

# Parse every raw startdate value; entries that cannot be parsed become NaT.
parsed_start = features['startdate'].apply(fix_date_format)
features['startdate'] = parsed_start
# Expose the calendar parts as three separate numeric columns.
for part in ('day', 'month', 'year'):
    features[part] = getattr(parsed_start.dt, part)
# Remove the Timestamp column (and the 'index' column) from the table.
features = features.drop(columns=['startdate', 'index'])
# Quick look at the result
print(features[['day', 'month', 'year']].head(5))

   day  month  year
0    1      9  2014
1    2      9  2014
2    3      9  2014
3    4      9  2014
4    5      9  2014


In [None]:
# Print the (rows, columns) shape of the prepared table.
print('数据维度：',features.shape)

In [None]:
# Show pandas' summary statistics for the table.
features.describe()

In [4]:
years = features['year']
months = features['month']
days = features['day']

# One datetime per row, built directly from the numeric parts rather than by
# formatting a string and parsing it back with strptime.
# int() turns the pandas/NumPy scalars into plain Python ints for datetime().
dates = [
    datetime.datetime(int(year), int(month), int(day))
    for year, month, day in zip(years, months, days)
]

In [None]:
# Peek at the first five reconstructed dates.
dates[:5]

In [None]:
# Plotting setup
import matplotlib.pyplot as plt

%matplotlib inline

# Apply the 'fivethirtyeight' style sheet to the figures drawn below
plt.style.use('fivethirtyeight')

In [None]:
# Single-panel line plot of the label column against the reconstructed dates.
fig, ax1 = plt.subplots(figsize=(30, 30))
ax1.plot(dates, features['contest-tmp2m-14d__tmp2m'])
ax1.set(xlabel='', ylabel='Temperature', title='Mean Temp')
fig.tight_layout(pad=2)
plt.show()

In [5]:
# One-hot encode the non-numeric columns; dummy columns are stored as int
features = pd.get_dummies(features,dtype = int)
features.head(5)

Unnamed: 0,lat,lon,contest-pevpr-sfc-gauss-14d__pevpr,nmme0-tmp2m-34w__cancm30,nmme0-tmp2m-34w__cancm40,nmme0-tmp2m-34w__ccsm30,nmme0-tmp2m-34w__ccsm40,nmme0-tmp2m-34w__cfsv20,nmme0-tmp2m-34w__gfdlflora0,nmme0-tmp2m-34w__gfdlflorb0,...,climateregions__climateregion_Cfb,climateregions__climateregion_Csa,climateregions__climateregion_Csb,climateregions__climateregion_Dfa,climateregions__climateregion_Dfb,climateregions__climateregion_Dfc,climateregions__climateregion_Dsb,climateregions__climateregion_Dsc,climateregions__climateregion_Dwa,climateregions__climateregion_Dwb
0,0.0,0.833333,237.0,29.02,31.64,29.57,30.73,29.71,31.52,31.68,...,0,0,0,0,0,0,0,0,0,0
1,0.0,0.833333,228.9,29.02,31.64,29.57,30.73,29.71,31.52,31.68,...,0,0,0,0,0,0,0,0,0,0
2,0.0,0.833333,220.69,29.02,31.64,29.57,30.73,29.71,31.52,31.68,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0.833333,225.28,29.02,31.64,29.57,30.73,29.71,31.52,31.68,...,0,0,0,0,0,0,0,0,0,0
4,0.0,0.833333,237.24,29.02,31.64,29.57,30.73,29.71,31.52,31.68,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Shape after encoding, to compare against the shape printed before it.
print('Shape of features after one-hot encoding',features.shape)

In [6]:
import numpy as np

# Column that serves as the prediction label.
target_column = 'contest-tmp2m-14d__tmp2m'

# Pull the label out as a NumPy array, then remove it from the feature table.
label = np.array(features[target_column])
features = features.drop(columns=target_column)

# Keep the column names: the array conversion below discards them.
feature_list = features.columns.tolist()

# Convert the feature table itself to a NumPy array.
features = np.array(features)

In [7]:
# Split the data: 25% held out for evaluation, fixed seed for repeatability
from sklearn.model_selection import train_test_split

# NOTE(review): this is a purely random row split. The table appears to hold
# consecutive days for the same lat/lon, so neighbouring days of one location
# can land on both sides of the split and inflate the test scores. Confirm
# whether a time- or location-aware split is wanted.
train_features,test_features,train_labels,test_labels = train_test_split(features,label,test_size = 0.25,random_state = 42)

In [None]:
# Report the shape of each piece of the train/test split.
split_parts = [
    ('训练集特征：', train_features),
    ('训练集标签：', train_labels),
    ('测试集特征：', test_features),
    ('测试集标签：', test_labels),
]
for caption, part in split_parts:
    print(caption, part.shape)

In [None]:
# Import the estimator
from sklearn.ensemble import RandomForestRegressor
# Build the model
rf = RandomForestRegressor(  n_estimators=300, # number of trees, kept modest to save memory
                max_depth=10, # cap tree depth so memory use stays bounded
                random_state=42, # fixed seed so results are reproducible
                n_jobs=-1 # use every CPU core
              )
# Fit on the training split
rf.fit(train_features,train_labels)

In [None]:
# Predict on the held-out split
predictions = rf.predict(test_features)

# SMAPE (%): mean of 2*|pred - actual| / (|actual| + |pred|).
# When prediction and label are both exactly 0 the denominator is 0; that term
# is a perfect match, so it contributes 0 instead of turning the mean into NaN.
abs_error = np.abs(predictions - test_labels)
scale = np.abs(test_labels) + np.abs(predictions)
safe_scale = np.where(scale == 0, 1.0, scale)
smape = np.mean(2.0 * abs_error / safe_scale) * 100

print("SMAPE:", smape)

In [None]:
import os
import pydot
from sklearn.tree import export_graphviz
from IPython.display import Image

# Export one tree of the fitted forest (index 5) to a Graphviz .dot file
tree = rf.estimators_[5]
export_graphviz(tree, out_file='tree.dot',
                feature_names = feature_list, rounded = True, precision = 1)

# Read the .dot file back and render it to PNG
(graph,) = pydot.graph_from_dot_file('tree.dot')
graph.write_png('tree.png')

# Show the rendered image as the cell output
Image(filename='tree.png')


In [None]:
# Depth actually reached by the exported tree.
print('The depth of this tree',tree.tree_.max_depth)

In [None]:
# Per-feature importances from the fitted forest
importances = list(rf.feature_importances_)
# Pair each feature name with its importance, rounded to 3 decimals for display
feature_importances = [(feature, round(importance, 3)) for feature, importance in zip(feature_list, importances)]
# Most important first
feature_importances = sorted(feature_importances, key=lambda x: x[1], reverse=True)
# Print with a plain loop: printing is a side effect, and a list comprehension
# here would also echo a list of None values as the cell output.
for pair in feature_importances:
    print('Variable:{:20} Importance:{}'.format(*pair))

In [None]:
# Positions of the 20 most important features, strongest first
top_n = 20
indices = np.argsort(importances)[::-1][:top_n]

# Names and importance values in that same order
top_features = [feature_list[idx] for idx in indices]
top_importances = [importances[idx] for idx in indices]
x_values = list(range(top_n))

# Bar chart of the selected importances
plt.figure(figsize=(10, 6))
plt.bar(x_values, top_importances)
plt.xticks(x_values, top_features, rotation=45, ha='right')
plt.title('Top 20 Variable Importances')
plt.xlabel('Variable')
plt.ylabel('Importances')
plt.tight_layout()
plt.show()

累计特征重要性：按重要性从高到低累加，查看前多少个特征的重要性之和首次超过 95%。

In [None]:
# Rank every feature by its unrounded importance, largest first.
# (Summing the 3-decimal display values would drift away from the true total.)
ranked = sorted(zip(feature_list, importances), key=lambda pair: pair[1], reverse=True)
sorted_features = [name for name, _ in ranked]
sorted_importance = [value for _, value in ranked]

# Find where the running total first reaches the threshold, searching ALL
# features so the crossing is reported even when it lies beyond the plotted ones.
threshold = 0.95
cumulative_all = np.cumsum(sorted_importance)
reached = np.flatnonzero(cumulative_all >= threshold)
if reached.size:
    i = int(reached[0])
    print(f"累计重要性在第 {i+1} 个特征 ({sorted_features[i]}) 达到 {cumulative_all[i]:.2f}")
else:
    print(f"累计重要性未达到 {threshold:.2f}")

# The chart shows only the leading features (at most 20).
top_n = min(20, len(sorted_features))
sorted_importance_top = sorted_importance[:top_n]
sorted_features_top = sorted_features[:top_n]
cumulative_importances = cumulative_all[:top_n]

# Line chart of the running total with the threshold drawn as a dashed line
x_values = list(range(top_n))
plt.figure(figsize=(10,6))
plt.plot(x_values, cumulative_importances, 'g-')
plt.hlines(y=threshold, xmin=0, xmax=top_n, color='r', linestyle='dashed')
plt.xticks(x_values, sorted_features_top, rotation='vertical')
plt.xlabel('Variable')
plt.ylabel('Cumulative Importance')
plt.title('Cumulative Importance (Top 20)')
plt.tight_layout()
plt.show()

In [8]:
# Features kept for the reduced model. Per the original note these are the
# ones whose importance exceeded 0.01 in the ranking above.
important_features_new = [
    'nmme-tmp2m-56w__gfdlflora',
    'nmme-tmp2m-34w__cfsv2',
    'nmme-tmp2m-34w__gfdlflora',
    'nmme-tmp2m-56w__cfsv2',
    'nmme-tmp2m-34w__gfdlflorb',
    'contest-pevpr-sfc-gauss-14d__pevpr',
    'contest-prwtr-eatm-14d__prwtr',
    'nmme-tmp2m-56w__nmmemean',
    'contest-wind-h500-14d__wind-hgt-500',
    'contest-wind-h100-14d__wind-hgt-100'
]

# Names absent from feature_list are still skipped, but say so: a silent skip
# would train the reduced model on fewer columns than intended.
missing_features = [col for col in important_features_new if col not in feature_list]
if missing_features:
    print('Warning: features not found in feature_list and skipped:', missing_features)

# Column positions of the requested features inside the NumPy feature arrays
indices = [feature_list.index(col) for col in important_features_new if col in feature_list]
if not indices:
    # With nothing selected the later fit would fail with a less obvious error.
    raise ValueError('None of the requested features exist in feature_list')

# Slice the same columns out of both splits
selected_train_features = train_features[:, indices]
selected_test_features = test_features[:, indices]


减少特征数量后重新训练模型，并重新计算误差（SMAPE）。

In [None]:
# Train a forest on the selected feature columns only,
# using the same hyperparameters as the full model above.
from sklearn.ensemble import RandomForestRegressor
rf_reduced = RandomForestRegressor(
    n_estimators=300,
    max_depth=10,
    random_state=42,
    n_jobs=-1
)

rf_reduced.fit(selected_train_features, train_labels)

In [None]:
# Predict on the held-out split with the reduced-feature model
predictions = rf_reduced.predict(selected_test_features)

# SMAPE (%): mean of 2*|pred - actual| / (|actual| + |pred|).
# When prediction and label are both exactly 0 the denominator is 0; that term
# is a perfect match, so it contributes 0 instead of turning the mean into NaN.
abs_error = np.abs(predictions - test_labels)
scale = np.abs(test_labels) + np.abs(predictions)
safe_scale = np.where(scale == 0, 1.0, scale)
smape = np.mean(2.0 * abs_error / safe_scale) * 100

print("SMAPE:", smape)

调参

In [9]:
# Retrain the reduced-feature model with adjusted hyperparameters:
# deeper trees (max_depth 40) and 'log2' feature sampling per split.
from sklearn.ensemble import RandomForestRegressor
rf_reduced = RandomForestRegressor(
    n_estimators = 300,
    max_depth = 40,
    random_state = 42,
    max_features = 'log2',
    n_jobs = -1

)

rf_reduced.fit(selected_train_features, train_labels)

In [10]:

from sklearn.metrics import r2_score
# Predict on the held-out split with the tuned reduced model
predictions = rf_reduced.predict(selected_test_features)

# SMAPE (%): mean of 2*|pred - actual| / (|actual| + |pred|).
# When prediction and label are both exactly 0 the denominator is 0; that term
# is a perfect match, so it contributes 0 instead of turning the mean into NaN.
abs_error = np.abs(predictions - test_labels)
scale = np.abs(test_labels) + np.abs(predictions)
safe_scale = np.where(scale == 0, 1.0, scale)
smape = np.mean(2.0 * abs_error / safe_scale) * 100

print("SMAPE:", smape)
# Coefficient of determination
r2 = r2_score(test_labels, predictions)
print("R²:", r2)
# Tolerance-based accuracy: share of predictions within `delta` of the label
delta = 1.0 # tolerance threshold, e.g. 1 °C
tolerance_accuracy = np.mean(abs_error <= delta) * 100
print("Tolerance-based Accuracy (%):", tolerance_accuracy)

SMAPE: 13.724208949808597
R²: 0.9920606460394582
Tolerance-based Accuracy (%): 81.81914961568762


在限定的参数范围内进行随机搜索（RandomizedSearchCV），寻找最佳超参数组合。

In [None]:
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
# SMAPE metric used as the search objective
def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent.

    Computes ``mean(2 * |y_pred - y_true| / (|y_true| + |y_pred|)) * 100``.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, same shape as ``y_true``.

    Returns
    -------
    float
        SMAPE in percent. A pair where both values are exactly 0 counts as a
        perfect match (contributes 0) rather than producing NaN from 0/0.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    abs_error = np.abs(y_pred - y_true)
    scale = np.abs(y_true) + np.abs(y_pred)
    # scale == 0 only when both values are 0, in which case abs_error is 0 too,
    # so dividing by 1 yields the intended 0 for that term.
    safe_scale = np.where(scale == 0, 1.0, scale)
    return np.mean(2.0 * abs_error / safe_scale) * 100

# Wrap SMAPE as a scorer. greater_is_better=False because lower SMAPE is
# better; scikit-learn then negates the value internally.
smape_scorer = make_scorer(smape, greater_is_better=False)

# Candidate hyperparameter values
param_dist = {
    'n_estimators': [300],
    'max_features': ['sqrt','log2'],
    'max_depth': [25, 28, 31, 34, 37, 40, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4, 5, 10],
    'bootstrap': [True, False]
}

# Randomized search. The base estimator is seeded as well: random_state on the
# search only fixes which candidates are sampled, not how each forest is grown.
# Note that this rebinds `rf`, replacing the forest fitted earlier.
rf = RandomForestRegressor(random_state=42)
random_grid = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=50,        # sample 50 parameter combinations
    cv=3,             # 3-fold cross-validation
    scoring=smape_scorer,  # custom SMAPE scorer
    n_jobs=-1,        # run fits in parallel
    verbose=2,
    random_state=42
)

random_grid.fit(selected_train_features, train_labels)

print("Best Params:", random_grid.best_params_)
print("Best SMAPE:", -random_grid.best_score_)  # negate back: the scorer stores -SMAPE


Fitting 3 folds for each of 50 candidates, totalling 150 fits


51 fits failed out of a total of 150.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.12/dist-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.12/dist-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.12/dist-packages/sklearn/utils/_param_validation.py", line 98, in validate_parameter_constraints
    raise InvalidParameterError(
skle

Best Params: {'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 40, 'bootstrap': False}
Best SMAPE: 14.243637877995889
