In [1]:
import numpy as np
import pandas as pd
import math
import scipy
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import joblib

In [2]:
def readDatasets(train_path, validation_path, test_path):
    """Read the training, validation and test CSV files into DataFrames.

    Each path is handed to ``pd.read_csv`` with default options; no column
    renaming or other processing happens here.

    Args:
        train_path: Location of the training CSV.
        validation_path: Location of the validation CSV.
        test_path: Location of the test CSV.

    Returns:
        A 3-tuple ``(train, validation, test)`` of DataFrames, in that order.
    """
    csv_paths = (train_path, validation_path, test_path)
    return tuple(pd.read_csv(csv_path) for csv_path in csv_paths)

In [3]:
# Load the three dataset splits from CSV files in the working directory.
prsa_train, prsa_val, prsa_test = readDatasets(
    'PRSA_train.data.csv',
    'PRSA_validation.data.csv',
    'PRSA_test.data.csv',
)
# Drop the 'No' column from every split (a row id that carries no useful information).
prsa_train = prsa_train.drop(columns='No')
prsa_val = prsa_val.drop(columns='No')
prsa_test = prsa_test.drop(columns='No')
# Stack train, validation and test (in that order) into one frame with a fresh 0..n-1 index.
prsa_full = pd.concat([prsa_train, prsa_val, prsa_test], ignore_index=True)
# Move the 'pm2.5' column to position 0 so it is easy to reach.
prsa_full_pm25 = prsa_full.pop('pm2.5')
prsa_full.insert(0, 'pm2.5', prsa_full_pm25)

### 一、数据预处理+特征工程

#### 0. 缺失值处理+属性值编码 
把三个数据集合并，统一进行缺失值处理和离散属性编码。

In [4]:
# Fill every missing pm2.5 value with the rounded mean pm2.5 of the rows that share its month.
# The lookup is label/column-name based, so it does not depend on the frame having a
# 0..n-1 index or on 'pm2.5' sitting at column position 0.
# A month with no observed pm2.5 at all has a NaN mean; its rows simply stay missing
# instead of raising while rounding NaN.
# NOTE(review): the monthly means are taken over the combined train/validation/test frame,
# so test-set values influence the imputation — confirm this is acceptable.
monthly_pm25_mean = prsa_full.groupby('month')['pm2.5'].transform('mean').round()
prsa_full['pm2.5'] = prsa_full['pm2.5'].fillna(monthly_pm25_mean)

In [5]:
# Create the sklearn encoders (the one-hot encoder object is instantiated but not used in this cell).
labelencoder = LabelEncoder()
onehotencoder = OneHotEncoder()
# One-hot encode the categorical 'cbwd' column with pandas (this yields the NE/NW/SE/cv
# indicator columns shown in the output), append the indicators and drop the original column.
cbwd_dummies = pd.get_dummies(prsa_full['cbwd'])
prsa_full = prsa_full.join(cbwd_dummies).drop(columns='cbwd')
# Replace the year/month/day values with integer codes; the same encoder is refit per column,
# so after the loop it holds the classes of 'day' only.
for date_col in ('year', 'month', 'day'):
    prsa_full[date_col] = labelencoder.fit_transform(prsa_full[date_col])
prsa_full

Unnamed: 0,pm2.5,year,month,day,hour,DEWP,TEMP,PRES,Iws,Is,Ir,NE,NW,SE,cv
0,129.0,2,9,6,4,9,10.0,1024.0,0.89,0,0,0,1,0,0
1,13.0,4,9,11,20,-1,13.0,1030.0,29.96,0,0,0,1,0,0
2,16.0,3,5,16,13,16,29.0,1001.0,1.79,0,0,0,0,1,0
3,42.0,4,6,13,18,15,35.0,1001.0,19.67,0,0,0,0,1,0
4,32.0,1,9,2,3,2,4.0,1030.0,7.60,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43819,92.0,3,6,21,3,20,22.0,1005.0,28.14,0,0,0,0,1,0
43820,78.0,3,5,12,17,12,30.0,1006.0,29.95,0,0,0,0,1,0
43821,472.0,4,9,24,20,14,15.0,1012.0,2.68,0,0,0,0,1,0
43822,80.0,2,7,17,17,24,25.0,1006.0,1.79,0,3,1,0,0,0


#### 1. 根据特征的实际意义进行深层语义的特征构建

In [6]:
# 可构造季度等属性

#### 2. 聚类属性

In [7]:
from sklearn.cluster import KMeans
# 选择参与聚类的特征

Unnamed: 0,pm2.5,year,quarter,month,day,hour,DEWP,TEMP,PRES,Iws,...,Centroid_10,Centroid_11,Centroid_12,Centroid_13,Centroid_14,Centroid_15,Centroid_16,Centroid_17,Centroid_18,Centroid_19
0,129.0,2,4,9,6,4,9,10.0,1024.0,0.89,...,6.894765,5.021142,7.248184,3.891536,1.765283,0.939860,6.443485,3.862717,10.550193,7.609183
1,13.0,4,4,9,11,20,-1,13.0,1030.0,29.96,...,6.149272,5.393884,6.936942,4.701253,3.278251,1.102056,6.470777,2.999295,10.116238,7.964671
2,16.0,3,2,5,16,13,16,29.0,1001.0,1.79,...,9.552457,1.043876,4.462437,2.540307,5.495345,5.158206,3.110012,7.526560,8.223385,4.463578
3,42.0,4,3,6,13,18,15,35.0,1001.0,19.67,...,8.728604,1.810770,4.948032,1.833611,4.545866,4.227993,4.073278,6.651041,8.933423,5.471602
4,32.0,1,4,9,2,3,2,4.0,1030.0,7.60,...,6.526168,5.487226,7.197370,4.778386,3.129711,1.164639,6.507107,3.299650,10.265977,7.893892
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43819,92.0,3,3,6,21,3,20,22.0,1005.0,28.14,...,8.335571,1.498667,4.684974,1.163350,3.891469,3.811470,3.736697,6.265915,8.527854,4.962624
43820,78.0,3,2,5,12,17,12,30.0,1006.0,29.95,...,9.035882,0.680230,3.878134,2.173661,5.083858,4.953892,2.775752,7.167398,7.687315,4.026894
43821,472.0,4,4,9,24,20,14,15.0,1012.0,2.68,...,8.611846,6.227063,8.644464,4.544015,2.327125,4.710755,7.841042,6.535422,11.836998,8.058176
43822,80.0,2,3,7,17,17,24,25.0,1006.0,1.79,...,8.354700,2.318072,5.732665,1.365322,3.356335,3.210579,4.697079,5.990953,9.564594,5.918270


#### 3. 互信息

#### 4. scatterplot

#### 5. 对数变换纠正偏度

### 二、构造属性集、标签向量

### 三、多模型调参

In [13]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
# 尽量多用几个模型

In [14]:
# Cross-validation scheme shared by the scoring helpers: 5 shuffled folds with a fixed seed.
# NOTE(review): KFold is not imported in the cells visible here — confirm an earlier cell imports it.
kf = KFold(n_splits=5, shuffle=True, random_state=0)

# Cross-validated mean squared error on the combined train + validation data.
def cv_mse(model):
    """Score ``model`` with cross-validation using the global ``kf`` splitter.

    The global ``X_train``/``X_val`` and ``y_train``/``y_val`` are concatenated
    and passed to ``cross_val_score`` with ``scoring='neg_mean_squared_error'``;
    the sign is flipped so the returned values are positive MSEs.

    Args:
        model: Estimator handed to ``cross_val_score``.

    Returns:
        The negated ``cross_val_score`` result, i.e. one MSE per fold.
    """
    features = pd.concat([X_train, X_val])
    targets = pd.concat([y_train, y_val])
    neg_mse_scores = cross_val_score(
        model, features, targets, scoring='neg_mean_squared_error', cv=kf
    )
    return -1 * neg_mse_scores

# Cross-validated mean squared error for SVR, evaluated on the scaled feature matrices.
def cv_mse_svr(model):
    """Score ``model`` with cross-validation on the scaled features.

    Works like ``cv_mse`` but concatenates the global ``X_train_scaled`` and
    ``X_val_scaled`` instead of the unscaled feature frames; the targets are
    still the global ``y_train`` and ``y_val``.

    Args:
        model: Estimator handed to ``cross_val_score``.

    Returns:
        The negated ``cross_val_score`` result, i.e. one MSE per fold of ``kf``.
    """
    scaled_features = pd.concat([X_train_scaled, X_val_scaled])
    targets = pd.concat([y_train, y_val])
    neg_mse_scores = cross_val_score(
        model, scaled_features, targets, scoring='neg_mean_squared_error', cv=kf
    )
    return -1 * neg_mse_scores

# Mean squared error helper (plain MSE — no square root is taken).
def calc_mse(y_pred, y_true):
    """Return the mean squared error between two sets of values.

    Both arguments are forwarded positionally, in the order given, to
    ``mean_squared_error``.

    Args:
        y_pred: First set of values (forwarded as the first positional argument).
        y_true: Second set of values (forwarded as the second positional argument).

    Returns:
        Whatever ``mean_squared_error`` returns for the two inputs.
    """
    mse = mean_squared_error(y_pred, y_true)
    return mse

# 实例化基模型

### 四、多模型融合

In [None]:
# models = [xxx, xxx, xxx, xxx, xxx]

# # 计算各模型的预测结果
# def calc_preds(models, X_val):


# # 计算blending结果
# def calc_blending_pred(preds, weights):

# # 衡量验证集的blending误差
# weights = [0.05, 0.8, 0.05, 0.05, 0.05]
# val_preds = calc_preds(models, X_val)
# val_blending_mse = calc_mse(y_val, calc_blending_pred(val_preds, weights))
# print(val_blending_mse)

求验证集MSE

In [15]:
# Score a previously saved model on the validation split.
# Training and saving are disabled below, so this cell needs "model_rf.pth" to exist already.
# model_rf.fit(X_train, y_train)
# Save the model
# joblib.dump(model_rf, "model_rf1.pth")
# NOTE(review): the disabled dump above writes "model_rf1.pth" while the load below reads
# "model_rf.pth" — confirm which artifact is the intended one.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code; only load trusted files.
model_rf = joblib.load("model_rf.pth")
rf_val_pred = model_rf.predict(X_val)
# calc_mse names its parameters (y_pred, y_true) but is called here with the targets first;
# squared error is symmetric in its two arguments, so the value is the same either way.
rf_val_mse = calc_mse(y_val, rf_val_pred)
# The trailing figure records the result of an earlier run (n=1000 presumably refers to the
# number of trees — TODO confirm).
print(rf_val_mse)   # 0.02222 n=1000



0.02222408034833288


### 五、预测测试集

In [16]:
# Reload the saved model from "model_rf.pth" and score it on the test split.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code; only load trusted files.
model_rf = joblib.load("model_rf.pth")
rf_test_pred = model_rf.predict(X_test)
# calc_mse names its parameters (y_pred, y_true) but is called here with the targets first;
# squared error is symmetric in its two arguments, so the value is the same either way.
rf_test_mse = calc_mse(y_test, rf_test_pred)
# The trailing figure records the result of an earlier run (n=1000 presumably refers to the
# number of trees — TODO confirm).
print(rf_test_mse)  # 0.02428 n=1000



0.024289418756141927


In [17]:
# Report the MSE on the original data scale and export the test-set predictions.
# Targets and predictions are mapped back with exp(x) - 1; np.expm1 computes exactly that
# expression in one step and keeps precision for values close to zero.
# NOTE(review): this presumes the target was transformed with log(1 + x) in an earlier cell
# that is not visible here — confirm.
# The back-transformed predictions are computed once and reused for both the metric and the CSV.
final_pred = np.expm1(rf_test_pred)
print('最终MSE：', calc_mse(np.expm1(y_test), final_pred))
# Write the predictions as a single 'pm2.5' column, without the index, to the working directory.
test_output = pd.DataFrame({'pm2.5': final_pred})
test_output.to_csv('pred_result.csv', index=False)

最终MSE： 264.73532727131493
