# Model Pipeline

This notebook is used for training candidate models for air quality dataset.

In [1]:
MODEL_NAME = 'xgboost'

## Modules

In [2]:
import numpy as np
import pandas as pd

from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import r2_score, mean_squared_error, make_scorer

## Ensuring reproducibility

In [3]:
CUSTOM_SEED = 42
np.random.seed(CUSTOM_SEED)

## Data Preparation

### Load Dataset

In [4]:
path = '../dataset/Air quality/'
dataset = pd.read_csv(path + 'cleansed_air_quality.csv')
dataset = dataset.set_index('date')
dataset

Unnamed: 0_level_0,PM2.5,PM10,SO2,NO2,CO,O3,TEMP,PRES,DEWP,RAIN,wd,WSPM,station
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2013-03-01 00:00:00,7.0,7.0,3.0,2.0,100.0,91.0,-2.3,1020.3,-20.7,0.0,WNW,3.1,Huairou
2013-03-01 00:00:00,4.0,4.0,4.0,7.0,300.0,77.0,-0.7,1023.0,-18.8,0.0,NNW,4.4,Aotizhongxin
2013-03-01 00:00:00,8.0,8.0,6.0,28.0,400.0,52.0,-0.7,1023.0,-18.8,0.0,NNW,4.4,Wanliu
2013-03-01 00:00:00,3.0,6.0,13.0,7.0,300.0,85.0,-2.3,1020.8,-19.7,0.0,E,0.5,Changping
2013-03-01 00:00:00,4.0,4.0,14.0,20.0,300.0,69.0,-0.7,1023.0,-18.8,0.0,NNW,4.4,Guanyuan
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-02-28 23:00:00,15.0,22.0,13.0,34.0,500.0,60.0,7.4,1014.9,-11.9,0.0,N,1.4,Shunyi
2017-02-28 23:00:00,20.0,25.0,6.0,28.0,900.0,54.0,7.0,1009.4,-12.2,0.0,N,1.9,Changping
2017-02-28 23:00:00,10.0,28.0,7.0,48.0,600.0,39.0,8.6,1014.1,-15.9,0.0,NNE,1.3,Nongzhanguan
2017-02-28 23:00:00,15.0,27.0,5.0,53.0,600.0,33.0,8.6,1014.1,-15.9,0.0,NNE,1.3,Guanyuan


### Splitting Dataset

In [5]:
# Split dataset
feat_cols = ["PM10", "SO2", "NO2", "CO", "O3", "TEMP", "PRES", "DEWP", "RAIN", "WSPM"]
excluded_cols = ["wd", "station"]
target = "PM2.5"
X, y = dataset[feat_cols], dataset[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
print ("Ukuran data training {}, data testing {}".format(X_train.shape, X_test.shape))

Ukuran data training (219323, 10), data testing (93996, 10)


## Create Pipeline

### Model Definition

In [6]:
## Hyperparameter
parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
              'objective':['reg:linear'],
              'learning_rate': [.03, 0.05, .07], #so called `eta` value
              'max_depth': [5, 6, 7],
              'min_child_weight': [4],
              'silent': [1],
              'subsample': [0.7],
              'colsample_bytree': [0.7],
              'n_estimators': [500]}

xgb_grid = GridSearchCV(XGBRegressor(),
                        parameters,
                        cv = 3,
                        n_jobs = 5,
                        verbose=True)

In [7]:
# evaluate model with standardized dataset
estimators = []
estimators.append(('pca', PCA(n_components='mle', svd_solver='full')))
estimators.append(('normalizer', MinMaxScaler()))
estimators.append(('xgboost', xgb_grid))
pipeline = Pipeline(estimators)

## Run Pipeline

In [8]:
pipeline.fit(X_train, y_train)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  27 out of  27 | elapsed: 37.8min finished
  if getattr(data, 'base', None) is not None and \


AttributeError: 'Pipeline' object has no attribute 'best_params_'

In [9]:
# Dictionary of best parameters
best_pars = pipeline.named_steps['xgboost'].best_params_
print(best_pars)

{'colsample_bytree': 0.7, 'learning_rate': 0.05, 'max_depth': 7, 'min_child_weight': 4, 'n_estimators': 500, 'nthread': 4, 'objective': 'reg:linear', 'silent': 1, 'subsample': 0.7}


## Evaluation

In [10]:
y_pred = pipeline.predict(X_test)

In [18]:
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('R2 Score : {:.2f}'.format(r2))
print('RMSE Score : {:.2f}'.format(rmse))

R2 Score : 0.91
RMSE Score : 19.71


## Save Model

In [14]:
from sklearn.externals import joblib
joblib.dump(pipeline, 'dumpfiles/' + MODEL_NAME + '.pkl')

['dumpfiles/xgboost.pkl']