In [1]:
import pandas as pd
import numpy as np
import os 
import time

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor

def data_generate(data_pm25_filled):
    data = []
    label = []
    data_pm25_filled_array = data_pm25_filled.iloc[:,3:].values
    for i in range(data_pm25_filled_array.shape[0] -12 -6):
        data_now = data_pm25_filled_array[i:i+12,:]
        label_now = data_pm25_filled_array[i+12:i+18,:]
        data.append(data_now)
        label.append(label_now)
    data = np.array(data)
    label = np.array(label)
    return data, label

def Evaluation(label, predict):
    MAE = np.mean(np.abs(label - predict))
    RMSE = np.power(np.mean(np.power(label - predict,2)) ,0.5)

    label_grade = label
    label_grade[label_grade < 35] = 1
    label_grade[label_grade > 250] = 6
    label_grade[label_grade > 150] = 5
    label_grade[label_grade > 115] = 4
    label_grade[label_grade > 75] = 3
    label_grade[label_grade > 35] = 2
    
    predict_grade = predict
    predict_grade[predict_grade < 35] = 1
    predict_grade[predict_grade > 250] = 6
    predict_grade[predict_grade > 150] = 5
    predict_grade[predict_grade > 115] = 4
    predict_grade[predict_grade > 75] = 3
    predict_grade[predict_grade > 35] = 2
    
    res = np.zeros(label_grade.shape)
    res[label_grade == predict_grade] = 1
    num_cor = res.sum()
    num_all = res.shape[0] * res.shape[1]
    prec = num_cor/num_all
    return MAE,RMSE,prec

# 读取数据
# 注: 20141231读取不了, 删了; 20151230,1231是空文件, 删了

# data_folder = os.walk(r"data")  
# for path,dir_list,file_list in data_folder:  
#     for file_name in file_list:  
#         path_now = os.path.join(path, file_name)
#         if file_name == "beijing_all_20150101.csv" :
#             data_all = pd.read_csv(path_now)
#             print(path_now)
#         elif file_name[:11] == "beijing_all" :
#             data_now = pd.read_csv(path_now)
#             data_all = pd.concat([data_all, data_now], axis=0)
#             print(path_now)
# data_all.to_csv("data_all.csv")
         



In [4]:
data_all = pd.read_csv("data_all.csv")
# 选取pm2.5数据
data_all_pm25 = data_all[data_all['type']=='PM2.5']
# 删去无用的前三列
data_all_pm25 = data_all_pm25.iloc[:,3:]
# "就近"填充缺失数据
data_all_pm25_filled = data_all_pm25.fillna(method='ffill')
data_all_pm25_filled = data_all_pm25_filled.fillna(method='bfill')
# 划分数据集
data_train_pm25_filled = data_all_pm25_filled[data_all_pm25_filled['date'] < 20200000]
data_train_pm25_filled = data_train_pm25_filled[data_train_pm25_filled['date'] > 20150000]
data_val_pm25_filled = data_all_pm25_filled[data_all_pm25_filled['date'] > 20200000]
data_test_pm25_filled = data_all_pm25_filled[data_all_pm25_filled['date'] < 20150000]
# 制作时序数据集
train_data,train_label = data_generate(data_train_pm25_filled)
val_data,val_label = data_generate(data_val_pm25_filled)
test_data,test_label = data_generate(data_test_pm25_filled)



In [5]:
data_all_pm25 

Unnamed: 0,date,hour,type,万寿西宫,万柳,东四,东四环,东高村,丰台花园,云岗,...,昌平,植物园,榆垡,永乐店,永定门内,琉璃河,西直门北,通州,门头沟,顺义
0,20140101.0,0.0,PM2.5,66.0,57.0,35.0,45.0,21.0,80.0,67.0,...,34.0,17.0,90.0,110.0,66.0,136.0,35.0,62.0,,24.0
5,20140101.0,1.0,PM2.5,72.0,68.0,66.0,46.0,14.0,83.0,77.0,...,26.0,25.0,122.0,116.0,68.0,181.0,56.0,79.0,,22.0
10,20140101.0,2.0,PM2.5,80.0,81.0,70.0,49.0,23.0,94.0,50.0,...,26.0,15.0,135.0,133.0,76.0,235.0,59.0,90.0,,21.0
15,20140101.0,3.0,PM2.5,89.0,95.0,71.0,57.0,34.0,106.0,40.0,...,26.0,13.0,146.0,159.0,89.0,224.0,67.0,100.0,,18.0
20,20140101.0,4.0,PM2.5,92.0,95.0,82.0,86.0,51.0,111.0,39.0,...,28.0,24.0,145.0,138.0,102.0,172.0,61.0,104.0,,22.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
270651,20200509.0,19.0,PM2.5,34.0,33.0,37.0,26.0,29.0,34.0,17.0,...,24.0,,28.0,27.0,31.0,35.0,28.0,32.0,38.0,23.0
270656,20200509.0,20.0,PM2.5,40.0,37.0,38.0,32.0,28.0,45.0,18.0,...,27.0,,32.0,36.0,35.0,48.0,44.0,37.0,45.0,26.0
270661,20200509.0,21.0,PM2.5,46.0,40.0,42.0,40.0,31.0,48.0,19.0,...,31.0,,34.0,44.0,37.0,44.0,49.0,43.0,46.0,29.0
270666,20200509.0,22.0,PM2.5,47.0,42.0,45.0,44.0,42.0,44.0,18.0,...,32.0,,30.0,49.0,34.0,38.0,54.0,47.0,40.0,31.0


In [6]:
# 训练并测试模型
print('training')
models = [DecisionTreeRegressor(),RandomForestRegressor(),AdaBoostRegressor(),GradientBoostingRegressor()]
# models = [AdaBoostRegressor(),GradientBoostingRegressor()]
train_predict_all = []
val_predict_all = []
test_predict_all = []

for model_now in models:
    print(model_now)
    start = time.time()
    train_predict = np.zeros(train_label.shape)
    val_predict = np.zeros(val_label.shape)
    test_predict = np.zeros(test_label.shape)
    
    for ti in range(train_label.shape[1]):
        for lo in range(train_label.shape[2]):
            model_now.fit(train_data[:,:,lo], train_label[:,ti,lo])
            train_predict[:,ti,lo] = model_now.predict(train_data[:,:,lo])
            val_predict[:,ti,lo] = model_now.predict(val_data[:,:,lo])
            test_predict[:,ti,lo] = model_now.predict(test_data[:,:,lo])
            
    train_predict_all.append(train_predict)
    val_predict_all.append(val_predict)
    test_predict_all.append(test_predict)
    end = time.time()
    print('training lasted: '+str(end-start))

training
DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')
training lasted: 144.28638434410095
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators='warn',
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)




training lasted: 838.9243681430817
AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
                  n_estimators=50, random_state=None)
training lasted: 692.5034489631653
GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)
training lasted: 540.5078032016754


In [7]:
# 评价模型性能

for m in range(4):
    print(models[m])
    for i in range(6):
        MAE, RMSE,PREC= Evaluation(val_label[:,i,:], val_predict_all[m][:,i,:])
        print('time:'+str(i+1)+' '+'MAE = '+str(MAE)+' '+'RMSE = '+str(RMSE)+' '+'PREC = '+str(PREC))

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')
time:1 MAE = 9.428295380280987 RMSE = 18.432945397717187 PREC = 0.8084952255515311
time:2 MAE = 14.800220479338345 RMSE = 27.30758642848058 PREC = 0.7220565407592079
time:3 MAE = 19.009155667917724 RMSE = 33.778635991392306 PREC = 0.6596735500258714
time:4 MAE = 22.423136377046102 RMSE = 38.79468007990371 PREC = 0.6169622277623594
time:5 MAE = 25.53921982362203 RMSE = 43.55595816855958 PREC = 0.5819935086316383
time:6 MAE = 28.14038534355595 RMSE = 47.4660112630674 PREC = 0.5559621807234583
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
            

In [8]:
for m in range(4):
    print(models[m])
    for i in range(6):
        MAE, RMSE,PREC= Evaluation(test_label[:,i,:], test_predict_all[m][:,i,:])
        print('time:'+str(i+1)+' '+'MAE = '+str(MAE)+' '+'RMSE = '+str(RMSE)+' '+'PREC = '+str(PREC))

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')
time:1 MAE = 43.80619442286574 RMSE = 3611.4825785582348 PREC = 0.7211174274277642
time:2 MAE = 54.322006205179775 RMSE = 3612.348403952943 PREC = 0.6041609203134986
time:3 MAE = 61.987770334859654 RMSE = 3612.7036031729394 PREC = 0.5320293316290491
time:4 MAE = 68.05765015379619 RMSE = 3613.3749716481775 PREC = 0.48268021124154864
time:5 MAE = 73.29335461585367 RMSE = 3613.2720987642742 PREC = 0.44393016919506206
time:6 MAE = 77.42819478983756 RMSE = 3614.2173727102 PREC = 0.4168118672003767
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
          

In [11]:
models[0].get_depth()

50

In [None]:
# 调整超参数

# 训练并测试模型
print('training')
# models = [DecisionTreeRegressor(),RandomForestRegressor(),AdaBoostRegressor(),GradientBoostingRegressor()]
models = [DecisionTreeRegressor()]
train_predict_all = []
val_predict_all = []
test_predict_all = []

for model_now in models:
    print(model_now)
    start = time.time()
    train_predict = np.zeros(train_label.shape)
    val_predict = np.zeros(val_label.shape)
    test_predict = np.zeros(test_label.shape)
    
    for ti in range(train_label.shape[1]):
        for lo in range(train_label.shape[2]):
            model_now.fit(train_data[:,:,lo], train_label[:,ti,lo])
            train_predict[:,ti,lo] = model_now.predict(train_data[:,:,lo])
            val_predict[:,ti,lo] = model_now.predict(val_data[:,:,lo])
            test_predict[:,ti,lo] = model_now.predict(test_data[:,:,lo])
            
    train_predict_all.append(train_predict)
    val_predict_all.append(val_predict)
    test_predict_all.append(test_predict)
    end = time.time()
    print('training lasted: '+str(end-start))