# Predicting Particulate Matter (PM2.5) Concentrations in the Air of China

# Implementation

In [7]:
# Import libraries necessary for this project
import csv
import numpy as np
import pandas as pd
from IPython.display import display # Allows the use of display() for DataFrames
from time import time
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
# Data preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from math import sqrt
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
# Import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit


In [8]:
def _load_city_csv(path):
    """Read one cleaned per-city CSV and drop the 'Unnamed: 0' column it carries."""
    return pd.read_csv(path).drop('Unnamed: 0', axis = 1)


# Load the cleaned dataset of each city
Guangzhou = _load_city_csv('dataset_Guangzhou_clean.csv')
Beijing = _load_city_csv('dataset_Beijing_clean.csv')
Chengdu = _load_city_csv('dataset_Chengdu_clean.csv')
Shanghai = _load_city_csv('dataset_Shanghai_clean.csv')
Shenyang = _load_city_csv('dataset_Shenyang_clean.csv')

# Stack all cities into one frame (same city order as before).
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; a single
# pd.concat is the supported equivalent and avoids re-copying the growing
# frame on every step. sort=True keeps the alphabetically sorted column order.
dataset = pd.concat(
    [Beijing, Chengdu, Shanghai, Shenyang, Guangzhou],
    ignore_index=True,
    sort=True,
)

# Total number of records
n_records = len(dataset["PM_US Post"])
print("Number of records for all Chines cities: ", n_records)
print("*********************")
# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in display() (that only adds a stray "None" to the cell output).
dataset.info()
display(dataset.describe())




Number of records for all Chines cities:  117200
*********************
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 117200 entries, 0 to 117199
Data columns (total 23 columns):
DEWP             117200 non-null float64
HUMI             117200 non-null float64
PM_US Post       117200 non-null float64
PRES             117200 non-null float64
TEMP             117200 non-null float64
cbwd_NE          117200 non-null int64
cbwd_NW          117200 non-null int64
cbwd_SE          117200 non-null int64
cbwd_SW          117200 non-null int64
cbwd_cv          117200 non-null int64
day_cos          117200 non-null float64
day_sin          117200 non-null float64
hour_cos         117200 non-null float64
hour_sin         117200 non-null float64
month_cos        117200 non-null float64
month_sin        117200 non-null float64
new_wind         117200 non-null float64
precipitation    117200 non-null float64
season_1         117200 non-null int64
season_2         117200 non-null int64
season_3    

None

Unnamed: 0,DEWP,HUMI,PM_US Post,PRES,TEMP,cbwd_NE,cbwd_NW,cbwd_SE,cbwd_SW,cbwd_cv,...,hour_sin,month_cos,month_sin,new_wind,precipitation,season_1,season_2,season_3,season_4,year
count,117200.0,117200.0,117200.0,117200.0,117200.0,117200.0,117200.0,117200.0,117200.0,117200.0,...,117200.0,117200.0,117200.0,117200.0,117200.0,117200.0,117200.0,117200.0,117200.0,117200.0
mean,8.989047,66.604351,71.029445,1013.568459,16.191724,0.221468,0.246937,0.222082,0.143567,0.165947,...,-0.002445,0.006348879,-0.009274,2.757414,0.124447,0.248857,0.242585,0.255222,0.253336,2014.011314
std,12.318217,22.354439,65.563051,9.906103,10.734711,0.415236,0.431232,0.415648,0.350651,0.372035,...,0.707228,0.7050057,0.709119,1.943324,1.11776,0.432352,0.428648,0.435988,0.434924,0.809783
min,-40.0,2.0,1.0,975.0,-25.0,0.0,0.0,0.0,0.0,0.0,...,-1.0,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,2013.0
25%,1.0,51.56,29.0,1006.0,9.0,0.0,0.0,0.0,0.0,0.0,...,-0.707107,-0.5,-0.866025,1.1,0.0,0.0,0.0,0.0,0.0,2013.0
50%,12.0,70.36,52.0,1013.0,18.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-1.83697e-16,0.0,2.0,0.0,0.0,0.0,0.0,0.0,2014.0
75%,19.0,86.0,90.0,1021.0,24.4,0.0,0.0,0.0,0.0,0.0,...,0.707107,0.8660254,0.5,4.0,0.0,0.0,0.0,1.0,1.0,2015.0
max,28.0,100.0,932.0,1046.0,42.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,20.12,48.6,1.0,1.0,1.0,1.0,2015.0


In [9]:
# Machine learning algorithms declaration
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
# Import the class under an alias: the instance below is bound to the name
# `SVR`, and importing the class under that same name made the assignment
# overwrite the class, so re-running this cell raised
# "TypeError: 'SVR' object is not callable".
from sklearn.svm import SVR as SVRegressor

LR = LinearRegression()
RF = RandomForestRegressor(n_estimators = 96)
ANN = MLPRegressor(hidden_layer_sizes= (128, 256))
SVR = SVRegressor(kernel='rbf')
#SVR_tuned = SVRegressor(kernel='rbf', C = 707, epsilon = 4)

# Short model name -> unfitted estimator; the name is used as the pipeline
# step name and is written to the 'Algorithm' column of the result files.
MLs = {'LR' : LR, 'RF': RF, 'ANN' : ANN, 'SVR' : SVR}
#MLs = {'LR' : LR}


In [10]:


def apply_L5(cityTrainName, dataset, cityTestName, MLname, estimator, f_out, Un_needed_columns):
    """Evaluate one estimator on a shuffled 80/20 hold-out split and log the result.

    The estimator is wrapped in a pipeline behind a StandardScaler, fitted on
    the training part and scored on the held-out part (R2, MSE, MAE, RMSE).
    The metrics are printed and one comma-separated row is appended to f_out.

    Parameters
    ----------
    cityTrainName, cityTestName : labels that are only printed and written to
        the result row; they do not select any data.
    dataset : DataFrame holding the features and the 'PM_US Post' target column.
    MLname : name of the estimator; used as pipeline step name and in the row.
    estimator : regressor placed after the scaler (it is fitted in place).
    f_out : writable text file object receiving the result row.
    Un_needed_columns : column labels removed from dataset to obtain the features.
    """
    from sklearn.model_selection import train_test_split

    # Scaler followed by the estimator under test
    steps = [('standardize', StandardScaler()), (MLname, estimator)]
    model = Pipeline(steps)

    # Features are whatever remains after dropping the unwanted columns
    Features = dataset.drop(Un_needed_columns, axis = 1)
    Target = dataset['PM_US Post']
    feature_names = list(Features.columns)

    display(Features.head(n=2))
    display(Target.head(n=2))

    # Shuffled hold-out split with a fixed seed
    features_train, features_test, target_train, target_test = train_test_split(
        Features, Target, test_size=0.20, random_state=0, shuffle=True
    )

    print(steps)
    print("**Train Split **")

    model.fit(features_train, target_train)
    predicted = model.predict(features_test)

    R2 = r2_score(target_test, predicted)
    MSE = mean_squared_error(target_test, predicted)
    MAE = mean_absolute_error(target_test, predicted)
    RMSE = sqrt(MSE)

    report = [
        ("Train City : ", cityTrainName),
        ("Test City : ", cityTestName),
        ("MSE : ", MSE),
        ("MAE : ", MAE),
        ("R2 : ", R2),
        ("RMSE : ", RMSE),
    ]
    for label, value in report:
        print(label, value)

    print("-----------------------------------------------")

    # One result row: fixed fields separated by ',', then every feature name
    # followed by '&'
    fields = [
        str(cityTrainName),
        str(cityTestName),
        str(MLname),
        'TTS',
        str(abs(MAE)),
        str(abs(MSE)),
        str(RMSE),
        str(R2),
        str(len(feature_names)),
    ]
    feature_field = "".join(name + "&" for name in feature_names)
    f_out.write(",".join(fields) + "," + feature_field + "\n")




In [11]:


def applyCV(cityTrainName, dataset, cityTestName, MLname, estimator, f_out, Un_needed_columns):
    """Cross-validate one estimator with shuffled K-fold and log the mean scores.

    The estimator is wrapped in a pipeline behind a StandardScaler and
    evaluated with K-fold cross validation (K = 3). Mean and standard deviation
    of R2, MAE and MSE over the folds are printed, and one comma-separated row
    with the mean scores is appended to f_out.

    Parameters
    ----------
    cityTrainName, cityTestName : labels that are only written to the result
        row; they do not select any data.
    dataset : DataFrame holding the features and the 'PM_US Post' target column.
    MLname : name of the estimator; used as pipeline step name and in the row.
    estimator : regressor placed after the scaler.
    f_out : writable text file object receiving the result row.
    Un_needed_columns : column labels removed from dataset to obtain the features.
    """
    # Construct the pipeline: standard scaler followed by the given estimator
    estimators = []
    estimators.append(('standardize', StandardScaler()))
    estimators.append((MLname, estimator))
    model = Pipeline(estimators)

    # Split the data into features and target label
    X = dataset.drop(Un_needed_columns, axis = 1)
    y = dataset['PM_US Post']

    # Saving feature names for later use
    features_list = list(X.columns)

    # K-fold cross validation with K = 3: in every round two thirds of the
    # (shuffled) rows train the model and the remaining third scores it; the
    # held-out parts of the K rounds do not overlap.
    K = 3
    kfold = KFold(n_splits=K,  shuffle=True)

    print(estimators)

    print("**cross_val_score + KFold **")

    # Score all three metrics in a single cross-validation run. Calling
    # cross_val_score once per metric fitted every model three times, and
    # because the KFold shuffle is not seeded, each metric was measured on a
    # different set of folds. return_train_score=False skips scoring on the
    # training folds, which is not needed here.
    scores = cross_validate(
        model, X, y, cv=kfold,
        scoring=('r2', 'neg_mean_absolute_error', 'neg_mean_squared_error'),
        return_train_score=False,
    )

    results_R2 = scores['test_r2']
    R2 = np.mean(results_R2)
    print('CV Scoring Result: r2 : mean=',np.mean(results_R2),'std=',np.std(results_R2))
    print("**************")

    # The 'neg_*' scorers return negated errors, so these means are <= 0
    results_MAE = scores['test_neg_mean_absolute_error']
    MAE = np.mean(results_MAE)
    print('CV Scoring Result: MAE : mean=',np.mean(results_MAE),'std=',np.std(results_MAE))
    print("**************")

    results_MSE = scores['test_neg_mean_squared_error']
    MSE = np.mean(results_MSE)
    print('CV Scoring Result: MSE : mean=',np.mean(results_MSE),'std=',np.std(results_MSE))

    # RMSE of the mean MSE; abs() undoes the sign flip of the negated scorer
    RMSE = sqrt(abs(MSE))

    print("-----------------------------------------------")

    # One result row: fixed fields separated by ',', then every feature name
    # followed by '&'
    f_out.write(str(cityTrainName) + ",")
    f_out.write(str(cityTestName) + ",")
    f_out.write(str(MLname) + ",")
    f_out.write('CV(' + str(K) + "),")
    f_out.write(str(abs(MAE)) + ",")
    f_out.write(str(abs(MSE)) + ",")
    f_out.write(str(RMSE) + ",")
    f_out.write(str(R2) + ",")
    f_out.write(str(len(features_list)) + ",")
    for feature in features_list:
        f_out.write(feature + "&")
    f_out.write("\n")




In [12]:
# Evaluate every model on the pooled five-city dataset with 20 features
# (all columns except the target and the day-of-cycle encodings) and collect
# the hold-out and cross-validation results in one CSV file.
# newline='' is what the csv module requires for files it writes to, and
# lineterminator='\n' makes the header end like the rows, which
# apply_L5/applyCV terminate with '\n'.
with open("../China/China_Results/Paper_Results/Level_5_Balanced_Test_20_features.csv", 'w', newline='') as f_out:
    out_colnames = ['Train Site', 'Test Site', 'Algorithm', 'CV', 'MAE', 'MSE', 'RMSE', 'R^2', 'Features_Count', 'Features']
    writer = csv.DictWriter(f_out, fieldnames = out_colnames, lineterminator='\n')
    writer.writeheader()

    # Labels only: the same pooled dataset is used for training and testing
    Train_city_Name = "Beijing & Chengdu & Shanghai & Shenyang & Guangzhou"
    Test_city_Name = "Beijing & Chengdu & Shanghai & Shenyang & Guangzhou"

    # Target column plus the features excluded from this experiment
    Un_needed_columns = ['PM_US Post', 'day_cos', 'day_sin']

    for MLname, ML in MLs.items():
        print(Train_city_Name, " ********************** and *********************  ", Test_city_Name)
        apply_L5(Train_city_Name, dataset, Test_city_Name, MLname, ML, f_out, Un_needed_columns)
        applyCV(Train_city_Name, dataset, Test_city_Name, MLname, ML, f_out, Un_needed_columns)

        

Beijing & Chengdu & Shanghai & Shenyang & Guangzhou  ********************** and *********************   Beijing & Chengdu & Shanghai & Shenyang & Guangzhou


Unnamed: 0,DEWP,HUMI,PRES,TEMP,cbwd_NE,cbwd_NW,cbwd_SE,cbwd_SW,cbwd_cv,hour_cos,hour_sin,month_cos,month_sin,new_wind,precipitation,season_1,season_2,season_3,season_4,year
0,-10.0,67.0,1018.0,-5.0,0,1,0,0,0,1.0,0.0,1.0,0.0,4.02,0.0,0,0,0,1,2013
1,-11.0,73.0,1017.0,-7.0,0,1,0,0,0,0.965926,0.258819,1.0,0.0,4.02,0.0,0,0,0,1,2013


0    31.0
1    32.0
Name: PM_US Post, dtype: float64

[('standardize', StandardScaler(copy=True, with_mean=True, with_std=True)), ('LR', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False))]
**Train Split **
Train City :  Beijing & Chengdu & Shanghai & Shenyang & Guangzhou
Test City :  Beijing & Chengdu & Shanghai & Shenyang & Guangzhou
MSE :  3269.1737233924378
MAE :  38.69939533526204
R2 :  0.2390504862425702
RMSE :  57.176688636125455
-----------------------------------------------
[('standardize', StandardScaler(copy=True, with_mean=True, with_std=True)), ('LR', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False))]
**cross_val_score + KFold **


  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)


CV Scoring Result: r2 : mean= 0.24063429431947328 std= 0.0016314355821459015
**************


  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)


CV Scoring Result: MAE : mean= -38.8573886093727 std= 0.058605315795455394
**************


  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)


CV Scoring Result: MSE : mean= -3263.625406186114 std= 69.33631990954405
-----------------------------------------------
Beijing & Chengdu & Shanghai & Shenyang & Guangzhou  ********************** and *********************   Beijing & Chengdu & Shanghai & Shenyang & Guangzhou


Unnamed: 0,DEWP,HUMI,PRES,TEMP,cbwd_NE,cbwd_NW,cbwd_SE,cbwd_SW,cbwd_cv,hour_cos,hour_sin,month_cos,month_sin,new_wind,precipitation,season_1,season_2,season_3,season_4,year
0,-10.0,67.0,1018.0,-5.0,0,1,0,0,0,1.0,0.0,1.0,0.0,4.02,0.0,0,0,0,1,2013
1,-11.0,73.0,1017.0,-7.0,0,1,0,0,0,0.965926,0.258819,1.0,0.0,4.02,0.0,0,0,0,1,2013


0    31.0
1    32.0
Name: PM_US Post, dtype: float64

[('standardize', StandardScaler(copy=True, with_mean=True, with_std=True)), ('RF', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=96, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False))]
**Train Split **


  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)


Train City :  Beijing & Chengdu & Shanghai & Shenyang & Guangzhou
Test City :  Beijing & Chengdu & Shanghai & Shenyang & Guangzhou
MSE :  1410.3672230010252
MAE :  23.56731802152744
R2 :  0.6717157473514859
RMSE :  37.55485618400136
-----------------------------------------------
[('standardize', StandardScaler(copy=True, with_mean=True, with_std=True)), ('RF', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=96, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False))]
**cross_val_score + KFold **


  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)


CV Scoring Result: r2 : mean= 0.6581701108540972 std= 0.005588854856492287
**************


  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)


CV Scoring Result: MAE : mean= -24.370167984439252 std= 0.14363333613204698
**************


  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)


CV Scoring Result: MSE : mean= -1468.9545356261076 std= 39.04628173610683
-----------------------------------------------
Beijing & Chengdu & Shanghai & Shenyang & Guangzhou  ********************** and *********************   Beijing & Chengdu & Shanghai & Shenyang & Guangzhou


Unnamed: 0,DEWP,HUMI,PRES,TEMP,cbwd_NE,cbwd_NW,cbwd_SE,cbwd_SW,cbwd_cv,hour_cos,hour_sin,month_cos,month_sin,new_wind,precipitation,season_1,season_2,season_3,season_4,year
0,-10.0,67.0,1018.0,-5.0,0,1,0,0,0,1.0,0.0,1.0,0.0,4.02,0.0,0,0,0,1,2013
1,-11.0,73.0,1017.0,-7.0,0,1,0,0,0,0.965926,0.258819,1.0,0.0,4.02,0.0,0,0,0,1,2013


0    31.0
1    32.0
Name: PM_US Post, dtype: float64

[('standardize', StandardScaler(copy=True, with_mean=True, with_std=True)), ('ANN', MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(128, 256), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False))]
**Train Split **


  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)


Train City :  Beijing & Chengdu & Shanghai & Shenyang & Guangzhou
Test City :  Beijing & Chengdu & Shanghai & Shenyang & Guangzhou
MSE :  1864.8408509727903
MAE :  28.621320531114492
R2 :  0.5659301527389675
RMSE :  43.18380310918424
-----------------------------------------------
[('standardize', StandardScaler(copy=True, with_mean=True, with_std=True)), ('ANN', MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(128, 256), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False))]
**cross_val_score + KFold **


  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)


CV Scoring Result: r2 : mean= 0.5552119140208865 std= 0.0026756698346851134
**************


  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)


CV Scoring Result: MAE : mean= -29.419270962408323 std= 0.36245368890341023
**************


  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)


CV Scoring Result: MSE : mean= -1929.4776853561882 std= 36.65673932333148
-----------------------------------------------
Beijing & Chengdu & Shanghai & Shenyang & Guangzhou  ********************** and *********************   Beijing & Chengdu & Shanghai & Shenyang & Guangzhou


Unnamed: 0,DEWP,HUMI,PRES,TEMP,cbwd_NE,cbwd_NW,cbwd_SE,cbwd_SW,cbwd_cv,hour_cos,hour_sin,month_cos,month_sin,new_wind,precipitation,season_1,season_2,season_3,season_4,year
0,-10.0,67.0,1018.0,-5.0,0,1,0,0,0,1.0,0.0,1.0,0.0,4.02,0.0,0,0,0,1,2013
1,-11.0,73.0,1017.0,-7.0,0,1,0,0,0,0.965926,0.258819,1.0,0.0,4.02,0.0,0,0,0,1,2013


0    31.0
1    32.0
Name: PM_US Post, dtype: float64

[('standardize', StandardScaler(copy=True, with_mean=True, with_std=True)), ('SVR', SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False))]
**Train Split **


  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)


Train City :  Beijing & Chengdu & Shanghai & Shenyang & Guangzhou
Test City :  Beijing & Chengdu & Shanghai & Shenyang & Guangzhou
MSE :  3031.90429025787
MAE :  32.77141147227638
R2 :  0.29427852704117685
RMSE :  55.06273050129161
-----------------------------------------------
[('standardize', StandardScaler(copy=True, with_mean=True, with_std=True)), ('SVR', SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False))]
**cross_val_score + KFold **


  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)


CV Scoring Result: r2 : mean= 0.2885361596587062 std= 0.001801561377853273
**************


  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)


CV Scoring Result: MAE : mean= -33.093196367077105 std= 0.2519402488797445
**************


  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)


CV Scoring Result: MSE : mean= -3057.434320572098 std= 49.721066256608474
-----------------------------------------------


In [13]:
# Evaluate every model on the pooled five-city dataset with all 22 features
# and collect the hold-out and cross-validation results in one CSV file.
# newline='' is what the csv module requires for files it writes to, and
# lineterminator='\n' makes the header end like the rows, which
# apply_L5/applyCV terminate with '\n'.
with open("../China/China_Results/Paper_Results/Level_5_Balanced_Test_22_features.csv", 'w', newline='') as f_out:
    out_colnames = ['Train Site', 'Test Site', 'Algorithm', 'CV', 'MAE', 'MSE', 'RMSE', 'R^2', 'Features_Count', 'Features']
    writer = csv.DictWriter(f_out, fieldnames = out_colnames, lineterminator='\n')
    writer.writeheader()

    # Labels only: the same pooled dataset is used for training and testing
    Train_city_Name = "Beijing & Chengdu & Shanghai & Shenyang & Guangzhou"
    Test_city_Name = "Beijing & Chengdu & Shanghai & Shenyang & Guangzhou"

    # Only the target is removed. dataset.info() reports 23 columns, so this
    # leaves the 22 features named in the output file. The previous list also
    # dropped 'day_cos', 'day_sin' and 'precipitation' (19 features), which
    # contradicted both the file name and the feature table recorded for this
    # cell, where day_cos and precipitation are still present.
    Un_needed_columns = ['PM_US Post']

    for MLname, ML in MLs.items():
        print(Train_city_Name, " ********************** and *********************  ", Test_city_Name)
        apply_L5(Train_city_Name, dataset, Test_city_Name, MLname, ML, f_out, Un_needed_columns)
        applyCV(Train_city_Name, dataset, Test_city_Name, MLname, ML, f_out, Un_needed_columns)

        

Beijing & Chengdu & Shanghai & Shenyang & Guangzhou  ********************** and *********************   Beijing & Chengdu & Shanghai & Shenyang & Guangzhou


Unnamed: 0,DEWP,HUMI,PRES,TEMP,cbwd_NE,cbwd_NW,cbwd_SE,cbwd_SW,cbwd_cv,day_cos,...,hour_sin,month_cos,month_sin,new_wind,precipitation,season_1,season_2,season_3,season_4,year
0,-10.0,67.0,1018.0,-5.0,0,1,0,0,0,-0.222521,...,0.0,1.0,0.0,4.02,0.0,0,0,0,1,2013
1,-11.0,73.0,1017.0,-7.0,0,1,0,0,0,-0.222521,...,0.258819,1.0,0.0,4.02,0.0,0,0,0,1,2013


0    31.0
1    32.0
Name: PM_US Post, dtype: float64

[('standardize', StandardScaler(copy=True, with_mean=True, with_std=True)), ('LR', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False))]
**Train Split **
Train City :  Beijing & Chengdu & Shanghai & Shenyang & Guangzhou
Test City :  Beijing & Chengdu & Shanghai & Shenyang & Guangzhou
MSE :  3268.3927187825984
MAE :  38.69712188355777
R2 :  0.23923227685034587
RMSE :  57.16985848139383
-----------------------------------------------
[('standardize', StandardScaler(copy=True, with_mean=True, with_std=True)), ('LR', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False))]
**cross_val_score + KFold **


  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)


CV Scoring Result: r2 : mean= 0.24086734626175055 std= 0.0038175550707402637
**************


  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)


CV Scoring Result: MAE : mean= -38.84013446576253 std= 0.1909324624725233
**************


  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)


CV Scoring Result: MSE : mean= -3263.2801292329054 std= 47.39969671445359
-----------------------------------------------
Beijing & Chengdu & Shanghai & Shenyang & Guangzhou  ********************** and *********************   Beijing & Chengdu & Shanghai & Shenyang & Guangzhou


Unnamed: 0,DEWP,HUMI,PRES,TEMP,cbwd_NE,cbwd_NW,cbwd_SE,cbwd_SW,cbwd_cv,day_cos,...,hour_sin,month_cos,month_sin,new_wind,precipitation,season_1,season_2,season_3,season_4,year
0,-10.0,67.0,1018.0,-5.0,0,1,0,0,0,-0.222521,...,0.0,1.0,0.0,4.02,0.0,0,0,0,1,2013
1,-11.0,73.0,1017.0,-7.0,0,1,0,0,0,-0.222521,...,0.258819,1.0,0.0,4.02,0.0,0,0,0,1,2013


0    31.0
1    32.0
Name: PM_US Post, dtype: float64

[('standardize', StandardScaler(copy=True, with_mean=True, with_std=True)), ('RF', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=96, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False))]
**Train Split **


  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)


Train City :  Beijing & Chengdu & Shanghai & Shenyang & Guangzhou
Test City :  Beijing & Chengdu & Shanghai & Shenyang & Guangzhou
MSE :  1231.1187545792027
MAE :  21.837643541302686
R2 :  0.7134385331157762
RMSE :  35.087301899393786
-----------------------------------------------
[('standardize', StandardScaler(copy=True, with_mean=True, with_std=True)), ('RF', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=96, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False))]
**cross_val_score + KFold **


  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)


CV Scoring Result: r2 : mean= 0.6930458969070848 std= 0.005545990906094368
**************


  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)


CV Scoring Result: MAE : mean= -22.81800843476316 std= 0.08355443733634062
**************


  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)


CV Scoring Result: MSE : mean= -1311.1461461603633 std= 33.020868343922835
-----------------------------------------------
Beijing & Chengdu & Shanghai & Shenyang & Guangzhou  ********************** and *********************   Beijing & Chengdu & Shanghai & Shenyang & Guangzhou


Unnamed: 0,DEWP,HUMI,PRES,TEMP,cbwd_NE,cbwd_NW,cbwd_SE,cbwd_SW,cbwd_cv,day_cos,...,hour_sin,month_cos,month_sin,new_wind,precipitation,season_1,season_2,season_3,season_4,year
0,-10.0,67.0,1018.0,-5.0,0,1,0,0,0,-0.222521,...,0.0,1.0,0.0,4.02,0.0,0,0,0,1,2013
1,-11.0,73.0,1017.0,-7.0,0,1,0,0,0,-0.222521,...,0.258819,1.0,0.0,4.02,0.0,0,0,0,1,2013


0    31.0
1    32.0
Name: PM_US Post, dtype: float64

[('standardize', StandardScaler(copy=True, with_mean=True, with_std=True)), ('ANN', MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(128, 256), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False))]
**Train Split **


  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)


Train City :  Beijing & Chengdu & Shanghai & Shenyang & Guangzhou
Test City :  Beijing & Chengdu & Shanghai & Shenyang & Guangzhou
MSE :  1626.2496180091407
MAE :  27.45176303578029
R2 :  0.6214658623928655
RMSE :  40.32678536666592
-----------------------------------------------
[('standardize', StandardScaler(copy=True, with_mean=True, with_std=True)), ('ANN', MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(128, 256), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False))]
**cross_val_score + KFold **


  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)


CV Scoring Result: r2 : mean= 0.5942757702174272 std= 0.0030182647830263353
**************


  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)


CV Scoring Result: MAE : mean= -28.52629768916832 std= 0.09504534574982612
**************


  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)


CV Scoring Result: MSE : mean= -1732.2611022083054 std= 30.379978605387674
-----------------------------------------------
Beijing & Chengdu & Shanghai & Shenyang & Guangzhou  ********************** and *********************   Beijing & Chengdu & Shanghai & Shenyang & Guangzhou


Unnamed: 0,DEWP,HUMI,PRES,TEMP,cbwd_NE,cbwd_NW,cbwd_SE,cbwd_SW,cbwd_cv,day_cos,...,hour_sin,month_cos,month_sin,new_wind,precipitation,season_1,season_2,season_3,season_4,year
0,-10.0,67.0,1018.0,-5.0,0,1,0,0,0,-0.222521,...,0.0,1.0,0.0,4.02,0.0,0,0,0,1,2013
1,-11.0,73.0,1017.0,-7.0,0,1,0,0,0,-0.222521,...,0.258819,1.0,0.0,4.02,0.0,0,0,0,1,2013


0    31.0
1    32.0
Name: PM_US Post, dtype: float64

[('standardize', StandardScaler(copy=True, with_mean=True, with_std=True)), ('SVR', SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False))]
**Train Split **


  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)


Train City :  Beijing & Chengdu & Shanghai & Shenyang & Guangzhou
Test City :  Beijing & Chengdu & Shanghai & Shenyang & Guangzhou
MSE :  3048.876406796789
MAE :  32.816407850507815
R2 :  0.29032801081889326
RMSE :  55.21663161400548
-----------------------------------------------
[('standardize', StandardScaler(copy=True, with_mean=True, with_std=True)), ('SVR', SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False))]
**cross_val_score + KFold **


  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)


CV Scoring Result: r2 : mean= 0.28436111414054405 std= 0.0005893260031256294
**************


  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)


CV Scoring Result: MAE : mean= -33.15166584940852 std= 0.17161952008791997
**************


  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)


CV Scoring Result: MSE : mean= -3073.495706207343 std= 70.11814797270871
-----------------------------------------------
