In [1]:
import pandas as pd
import numpy as np
import os 
import time

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor

from multiprocessing import Pool

def data_generate(data_type_filled):
    time_in = 12
    time_out = 6
    num_sample = data_type_filled['PM2.5'].shape[0] -time_in -time_out
    num_loc = data_type_filled['PM2.5'].shape[1] -3
    num_feature = len(data_type_filled)
    data = []
    label = []
    data_filled_array = {}
    for type_now in data_type_filled:
        data_filled_array[type_now] = data_type_filled[type_now].iloc[:,3:].values
        
    start = time.time()
    for i in range(0,num_sample):
        data_now = np.zeros([time_in, num_feature, num_loc])
        label_now = np.zeros([time_out,num_loc])
        feature_now = -1
        for type_now in data_type_filled:
            feature_now = feature_now +1
            if data_type_filled[type_now].iloc[i+time_in+time_out-1,0] < 20170708 :
                data_now[:,feature_now,:] = data_filled_array[type_now][i:i+time_in,:]
                if type_now == 'PM2.5':
                    label_now = data_filled_array[type_now][i+time_in:i+time_in+time_out,:]
            elif data_type_filled[type_now].iloc[i,0] > 20170708 :
                data_now[:,feature_now,:] = data_filled_array[type_now][i:i+time_in,:]
                if type_now == 'PM2.5':
                    label_now = data_filled_array[type_now][i+time_in:i+time_in+time_out,:]
        
        data_now = data_now.reshape((time_in*num_feature,num_loc))
        data.append(data_now)
        label.append(label_now)
        
    end = time.time()
#     print('data generating lasted: '+str(end-start))
                
    data = np.array(data)
    label = np.array(label)
    return data, label

def Evaluation(label, predict):
    MAE = np.mean(np.abs(label - predict))
    RMSE = np.power(np.mean(np.power(label - predict,2)) ,0.5)

    label_grade = label.copy()
    label_grade[label_grade < 35] = 1
    label_grade[label_grade > 250] = 6
    label_grade[label_grade > 150] = 5
    label_grade[label_grade > 115] = 4
    label_grade[label_grade > 75] = 3
    label_grade[label_grade > 35] = 2
    
    predict_grade = predict.copy()
    predict_grade[predict_grade < 35] = 1
    predict_grade[predict_grade > 250] = 6
    predict_grade[predict_grade > 150] = 5
    predict_grade[predict_grade > 115] = 4
    predict_grade[predict_grade > 75] = 3
    predict_grade[predict_grade > 35] = 2
    
    res = np.zeros(label_grade.shape)
    res[label_grade == predict_grade] = 1
    num_cor = res.sum()
    num_all = res.shape[0] * res.shape[1]
    prec = num_cor/num_all
    return MAE,RMSE,prec

In [2]:
data_all = pd.read_csv("data_all.csv")
# data_extra = pd.read_csv("data_extra.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# 删去无用的前三列
data_all = data_all.iloc[:,3:]
# data_extra = data_extra.iloc[:,3:]

# 删去缺失了一半PM2.5数据的植物园
data_all = data_all.drop(columns=['植物园'])

# 删去20170702-20170708
data_all = data_all.drop(index= range(148393,148456))

# 选取PM2.5数据
data_all = data_all[data_all['type']=='PM2.5']

In [6]:
# 查看全体数据缺失情况
na_count = data_all.isnull().sum().sort_values(ascending=False)
na_rate = na_count / len(data_all)
na_data = pd.concat([na_count,na_rate],axis=1,keys=['count','ratio'])
na_data

Unnamed: 0,count,ratio
南三环,4894,0.090422
琉璃河,4727,0.087336
东四环,4265,0.078801
八达岭,4038,0.074606
榆垡,3692,0.068214
通州,3085,0.056999
前门,2988,0.055207
东高村,2701,0.049904
密云水库,2572,0.047521
西直门北,2474,0.04571


In [4]:
# 划分数据集
data_train = data_all[data_all['date'] < 20200000]
data_train = data_train[data_train['date'] > 20150000]
data_val = data_all[data_all['date'] > 20200000]
data_test = data_all[data_all['date'] < 20150000]

In [5]:
# data_all_type = {}
# for data_type in data_all['type'][0:5]:
#     data_all_type[data_type] = data_all[data_all['type']==data_type]
# for data_type in data_extra['type'][0:7]:
#     data_all_type[data_type] = data_extra[data_extra['type']==data_type]

data_train_type = {}
for data_type in data_train['type'][0:5]:
    data_train_type[data_type] = data_train[data_train['type']==data_type]
data_val_type = {}
for data_type in data_val['type'][0:5]:
    data_val_type[data_type] = data_val[data_val['type']==data_type]
data_test_type = {}
for data_type in data_test['type'][0:5]:
    data_test_type[data_type] = data_test[data_test['type']==data_type]

In [6]:
# for type_now in data_all_type:
#     data_all_type[type_now] = data_all_type[type_now].fillna(method='ffill')
#     data_all_type[type_now] = data_all_type[type_now].fillna(method='bfill')
for type_now in data_train_type:
    data_train_type[type_now] = data_train_type[type_now].fillna(method='ffill')
    data_train_type[type_now] = data_train_type[type_now].fillna(method='bfill')
for type_now in data_val_type:
    data_val_type[type_now] = data_val_type[type_now].fillna(method='ffill')
    data_val_type[type_now] = data_val_type[type_now].fillna(method='bfill')
for type_now in data_test_type:
    data_test_type[type_now] = data_test_type[type_now].fillna(method='ffill')
    data_test_type[type_now] = data_test_type[type_now].fillna(method='bfill')

In [9]:
# 查看数据缺失情况
print('train')
for type_now in data_train_type:
    df = data_train_type[type_now] 
    na_count = df.isnull().sum().sort_values(ascending=False)
    na_rate = na_count / len(df)
    na_data = pd.concat([na_count,na_rate],axis=1,keys=['count','ratio'])
    print(na_data['count'].max())
print('val')
for type_now in data_val_type:
    df = data_val_type[type_now] 
    na_count = df.isnull().sum().sort_values(ascending=False)
    na_rate = na_count / len(df)
    na_data = pd.concat([na_count,na_rate],axis=1,keys=['count','ratio'])
    print(na_data['count'].max())
print('test')
for type_now in data_test_type:
    df = data_test_type[type_now] 
    na_count = df.isnull().sum().sort_values(ascending=False)
    na_rate = na_count / len(df)
    na_data = pd.concat([na_count,na_rate],axis=1,keys=['count','ratio'])
    print(na_data['count'].max())

train
0
val
0
test
0


In [7]:
# 制作时序数据集
train_data,train_label = data_generate(data_train_type)
val_data,val_label = data_generate(data_val_type)
test_data,test_label = data_generate(data_test_type)

In [8]:
def para_pass(args):
    return model_fitting(*args)

def model_fitting(model,train_data,train_label,ti,lo):
    model.fit(train_data[:,:,lo], train_label[:,ti,lo])
    return model

In [9]:
# 训练并测试模型
print('training')
DT = DecisionTreeRegressor(max_depth = 50)
RF = RandomForestRegressor(n_estimators=10,max_depth = 50)
Ada_DT = AdaBoostRegressor(base_estimator = DecisionTreeRegressor(max_depth=50),n_estimators=10)
GBDT = GradientBoostingRegressor(n_estimators=10,max_depth = 50)
models = [DT,RF,Ada_DT,GBDT]
train_predict_all = []
val_predict_all = []
test_predict_all = []

MultiNum = 16
pool = Pool(processes=MultiNum)

for model_now in models:
    print(model_now)
    start = time.time()
    train_predict = np.zeros(train_label.shape)
    val_predict = np.zeros(val_label.shape)
    test_predict = np.zeros(test_label.shape)
    
    num_time = train_label.shape[1]
    num_loc = train_label.shape[2]
    
    para_model = [model_now]*num_loc*num_time
    para_train_data = [train_data]*num_loc*num_time
    para_train_label = [train_label]*num_loc*num_time
    para_loc = [lo for lo in range(num_loc)]*num_time

# 时间和空间都并行
    para_ti = [ti for ti in range(num_time) for lo in range(num_loc)]
    para_list = list(zip(para_model, para_train_data,para_train_label,para_ti,para_loc))
    model_fitted_list = pool.map(para_pass,para_list)
    idex = 0
    for ti in range(num_time):
        for lo in range(num_loc):
            model_fitted = model_fitted_list[idex]
            train_predict[:,ti,lo] = model_fitted.predict(train_data[:,:,lo])
            val_predict[:,ti,lo] = model_fitted.predict(val_data[:,:,lo])
            test_predict[:,ti,lo] = model_fitted.predict(test_data[:,:,lo])
            idex = idex + 1

# 时间串行，空间并行
#     for ti in range(num_time):
#         para_ti = [ti]*num_loc
#         para_list = list(zip(para_model, para_train_data,para_train_label,para_ti,para_loc))
#         model_fitted_list = pool.map(para_pass,para_list)
#         for lo in range(num_loc):
#             model_fitted = model_fitted_list[lo]
#             train_predict[:,ti,lo] = model_fitted.predict(train_data[:,:,lo])
#             val_predict[:,ti,lo] = model_fitted.predict(val_data[:,:,lo])
#             test_predict[:,ti,lo] = model_fitted.predict(test_data[:,:,lo])
            
    train_predict_all.append(train_predict)
    val_predict_all.append(val_predict)
    test_predict_all.append(test_predict)
    end = time.time()
    print('training lasted: '+str(end-start))
        
pool.close()
pool.join()
            

training
DecisionTreeRegressor(criterion='mse', max_depth=50, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')
training lasted: 31.027734756469727
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=50,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)
training lasted: 152.98888301849365
AdaBoostRegressor(base_estimator=DecisionTreeRegressor(criterion='mse',
                               

In [10]:
# 模型平行性能
for m in range(len(models)):
    print(models[m])
    print('train error')
    for i in range(6):
        MAE, RMSE,PREC= Evaluation(train_label[:,i,:], train_predict_all[m][:,i,:])
        print('time:'+str(i+1)+' '+'MAE = '+str(MAE)+' '+'RMSE = '+str(RMSE)+' '+'PREC = '+str(PREC))
    print('val error')
    for i in range(6):
        MAE, RMSE,PREC= Evaluation(val_label[:,i,:], val_predict_all[m][:,i,:])
        print('time:'+str(i+1)+' '+'MAE = '+str(MAE)+' '+'RMSE = '+str(RMSE)+' '+'PREC = '+str(PREC))
    print('test error') 
    for i in range(6):
        MAE, RMSE,PREC= Evaluation(test_label[:,i,:], test_predict_all[m][:,i,:])
        print('time:'+str(i+1)+' '+'MAE = '+str(MAE)+' '+'RMSE = '+str(RMSE)+' '+'PREC = '+str(PREC))      

DecisionTreeRegressor(criterion='mse', max_depth=50, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')
train error
time:1 MAE = 10.093015646137198 RMSE = 23.204849844198197 PREC = 0.8205605098712185
time:2 MAE = 14.855439917638314 RMSE = 38.472430117093 PREC = 0.7589713522400897
time:3 MAE = 20.043438359743483 RMSE = 41.758311599233316 PREC = 0.6926564273891225
time:4 MAE = 22.24432696945207 RMSE = 50.36826597931876 PREC = 0.6770036630745766
time:5 MAE = 26.374350957108273 RMSE = 52.89623181458449 PREC = 0.6296021782087856
time:6 MAE = 27.26983185893459 RMSE = 58.57075113736681 PREC = 0.6309718555844256
val error
time:1 MAE = 9.955956370612606 RMSE = 20.479125569157222 PREC = 0.8001704468418912
time:2 MAE = 15.473110650340553 RMSE = 28.46

In [19]:
# 模型DT平行性能
for m in range(len(models)):
    print(models[m])
    print('train error')
    for i in range(6):
        MAE, RMSE,PREC= Evaluation(train_label[:,i,:], train_predict_all[m][:,i,:])
        print('time:'+str(i+1)+' '+'MAE = '+str(MAE)+' '+'RMSE = '+str(RMSE)+' '+'PREC = '+str(PREC))
    print('val error')
    for i in range(6):
        MAE, RMSE,PREC= Evaluation(val_label[:,i,:], val_predict_all[m][:,i,:])
        print('time:'+str(i+1)+' '+'MAE = '+str(MAE)+' '+'RMSE = '+str(RMSE)+' '+'PREC = '+str(PREC))
    print('test error') 
    for i in range(6):
        MAE, RMSE,PREC= Evaluation(test_label[:,i,:], test_predict_all[m][:,i,:])
        print('time:'+str(i+1)+' '+'MAE = '+str(MAE)+' '+'RMSE = '+str(RMSE)+' '+'PREC = '+str(PREC))      

DecisionTreeRegressor(criterion='mse', max_depth=50, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')
train error
time:1 MAE = 10.07142668707161 RMSE = 23.013029256331066 PREC = 0.8207554866606837
time:2 MAE = 14.87082011233463 RMSE = 38.61957068162249 PREC = 0.7589865631952962
time:3 MAE = 20.039040092577565 RMSE = 41.71598851841619 PREC = 0.6930823341349046
time:4 MAE = 22.278811572321437 RMSE = 50.48133204545503 PREC = 0.6763620373276841
time:5 MAE = 26.35926174703123 RMSE = 52.74363035123608 PREC = 0.6298752839954478
time:6 MAE = 27.23622187762292 RMSE = 58.38047471889756 PREC = 0.6308653788979801
val error
time:1 MAE = 9.988138732149833 RMSE = 20.49556287403834 PREC = 0.7999961262081389
time:2 MAE = 15.454234879498879 RMSE = 28.8042

In [27]:
# 模型RT性能
for m in range(len(models)):
    print(models[m])
    print('train error')
    for i in range(6):
        MAE, RMSE,PREC= Evaluation(train_label[:,i,:], train_predict_all[m][:,i,:])
        print('time:'+str(i+1)+' '+'MAE = '+str(MAE)+' '+'RMSE = '+str(RMSE)+' '+'PREC = '+str(PREC))
    print('val error')
    for i in range(6):
        MAE, RMSE,PREC= Evaluation(val_label[:,i,:], val_predict_all[m][:,i,:])
        print('time:'+str(i+1)+' '+'MAE = '+str(MAE)+' '+'RMSE = '+str(RMSE)+' '+'PREC = '+str(PREC))
    print('test error') 
    for i in range(6):
        MAE, RMSE,PREC= Evaluation(test_label[:,i,:], test_predict_all[m][:,i,:])
        print('time:'+str(i+1)+' '+'MAE = '+str(MAE)+' '+'RMSE = '+str(RMSE)+' '+'PREC = '+str(PREC))      

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=50,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)
time:1 MAE = 3.769045078588411 RMSE = 9.10315268164406 PREC = 0.9247216740900738
time:2 MAE = 5.860892048865113 RMSE = 13.167396292827702 PREC = 0.8863423598829033
time:3 MAE = 7.4616373732417935 RMSE = 15.870358153341577 PREC = 0.8571891814155315
time:4 MAE = 8.757769896245817 RMSE = 18.23490217002666 PREC = 0.8340851232709638
time:5 MAE = 9.860976007698019 RMSE = 19.958764613596486 PREC = 0.8144754363815626
time:6 MAE = 10.793526760250145 RMSE = 21.34063815880051 PREC = 0.798273694865473
RandomForestRegressor(bootstrap=True, criter

In [29]:
# 模型Ada_DT性能
for m in range(len(models)):
    print(models[m])
    print('train error')
    for i in range(6):
        MAE, RMSE,PREC= Evaluation(train_label[:,i,:], train_predict_all[m][:,i,:])
        print('time:'+str(i+1)+' '+'MAE = '+str(MAE)+' '+'RMSE = '+str(RMSE)+' '+'PREC = '+str(PREC))
    print('val error')
    for i in range(6):
        MAE, RMSE,PREC= Evaluation(val_label[:,i,:], val_predict_all[m][:,i,:])
        print('time:'+str(i+1)+' '+'MAE = '+str(MAE)+' '+'RMSE = '+str(RMSE)+' '+'PREC = '+str(PREC))
    print('test error') 
    for i in range(6):
        MAE, RMSE,PREC= Evaluation(test_label[:,i,:], test_predict_all[m][:,i,:])
        print('time:'+str(i+1)+' '+'MAE = '+str(MAE)+' '+'RMSE = '+str(RMSE)+' '+'PREC = '+str(PREC))      

AdaBoostRegressor(base_estimator=DecisionTreeRegressor(criterion='mse',
                                                       max_depth=50,
                                                       max_features=None,
                                                       max_leaf_nodes=None,
                                                       min_impurity_decrease=0.0,
                                                       min_impurity_split=None,
                                                       min_samples_leaf=1,
                                                       min_samples_split=2,
                                                       min_weight_fraction_leaf=0.0,
                                                       presort=False,
                                                       random_state=None,
                                                       splitter='best'),
                  learning_rate=1.0, loss='linear', n_estimators=10,
                  random_

In [None]:
# 模型GBDT性能
for m in range(len(models)):
    print(models[m])
    print('train error')
    for i in range(6):
        MAE, RMSE,PREC= Evaluation(train_label[:,i,:], train_predict_all[m][:,i,:])
        print('time:'+str(i+1)+' '+'MAE = '+str(MAE)+' '+'RMSE = '+str(RMSE)+' '+'PREC = '+str(PREC))
    print('val error')
    for i in range(6):
        MAE, RMSE,PREC= Evaluation(val_label[:,i,:], val_predict_all[m][:,i,:])
        print('time:'+str(i+1)+' '+'MAE = '+str(MAE)+' '+'RMSE = '+str(RMSE)+' '+'PREC = '+str(PREC))
    print('test error') 
    for i in range(6):
        MAE, RMSE,PREC= Evaluation(test_label[:,i,:], test_predict_all[m][:,i,:])
        print('time:'+str(i+1)+' '+'MAE = '+str(MAE)+' '+'RMSE = '+str(RMSE)+' '+'PREC = '+str(PREC))      