In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
import scipy
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pylab as plt
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

# Data exploration

In [3]:
df = pd.read_csv('guangzhou.csv')
df.head(1)

Unnamed: 0.1,Unnamed: 0,year,month,day,hour,season,DEWP,HUMI,PRES,TEMP,cbwd,Iws,precipitation,Iprec,PM_Mean,PM_Class
0,0,2013,1,1,0,4.0,3.7,91.0,1014.2,5.0,3,1.9,0.0,0.0,81.333,2


In [4]:
target_reg = df['PM_Mean'].values# target for regression
target_cls = df['PM_Class'].values # target for classification

In [5]:
df.drop(['Unnamed: 0','PM_Class','PM_Mean'], axis=1, inplace=True)

In [6]:
features = df.columns.values
#find categorical attributes and numerical attributes
cat_feats = ['year', 'month', 'day', 'hour', 'season']

#onehotencoder for categorical attributes
train_cat = OneHotEncoder().fit_transform(df[cat_feats])
df.drop(cat_feats, axis=1, inplace=True)
train = scipy.sparse.hstack((train_cat,df)).A

# Use xgboost  to predict the target_cls

In [29]:
from xgboost import XGBClassifier
from sklearn.model_selection import KFold,StratifiedKFold 
from sklearn.metrics import mean_squared_error, accuracy_score, log_loss, f1_score
from sklearn import metrics

In [30]:
x_train, x_valid, y_train, y_valid = train_test_split(train, target_cls, test_size=0.2, random_state=42)

In [34]:
# use the default parameters of xgboost as baseline
model = XGBClassifier()

In [35]:
fit_model = model.fit(x_train, y_train,
         eval_set=[(x_valid,y_valid)],
        verbose=False)

In [36]:
pred =fit_model.predict(x_valid)
print('accuracy score: {:.3f}'.format(accuracy_score(y_valid, pred)))
print(metrics.classification_report(y_valid, pred))

accuracy score: 0.746
             precision    recall  f1-score   support

          1       0.79      0.90      0.84      2572
          2       0.62      0.55      0.58      1195
          3       0.82      0.07      0.13       205
          4       0.75      0.07      0.14        40
          5       0.50      0.33      0.40         3

avg / total       0.74      0.75      0.72      4015



In [None]:
class CustomGridCV(object):
    def __init__(self, X, y, model, metric, griddata, cv=5):
        self.X = X
        self.y = y
        self.model = model
        self.metric = metric
        self.params = self.gridpoints(griddata)
        self.cv = cv
        self.bestScore = None
        self.bestParams = None
        
    def gridpoints(self, data):
        newparams = [{}]
        for k in data.keys():
            params = newparams
            newparams = []
            for v in data[k]:
                for param in params:
                    item = param.copy()
                    item[k]=v
                    newparams.append(item)           
        return newparams
    
    def GridSearch(self):
        for param in self.params:
            self.model.set_params(**param)
            score = self.KFoldScore()
            if self.bestScore==None or self.bestScore<score:
                self.bestScore = score
                self.bestParams = param
            print("Score: {0:.5f}, Params: {1}".format(score,param))
    
    def KFoldScore(self):
        #kf = KFold(n_splits=5, shuffle=True, random_state=2)
        kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2)
        y_pred = np.zeros(len(self.y))

        for train_index, test_index in kf.split(self.X,self.y):
            train_X, test_X = self.X[train_index], self.X[test_index]
            train_y, test_y = self.y[train_index], self.y[test_index]
            self.model.fit(train_X,train_y)
            #y_pred[test_index] = self.model.predict_proba(test_X)[:,1]
            y_pred[test_index] = self.model.predict(test_X)
            #print y_pred[test_index]

        return self.metric(self.y,y_pred)
    
    def Best(self):
        return self.bestScore, self.bestParams

In [37]:
model

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [None]:
model = XGBClassifier()
griddata = {'max_depth':range(3,8),
            'min_child_weight':range(1,5),
            'learning_rate': np.arange(0.01,0.2,0.02)
            }
GCV = CustomGridCV(train_scale, y, model, accuracy_score, griddata)
GCV.GridSearch()

Score: 0.69328, Params: {'max_depth': 3, 'min_child_weight': 1, 'learning_rate': 0.01}
Score: 0.70190, Params: {'max_depth': 4, 'min_child_weight': 1, 'learning_rate': 0.01}
Score: 0.72576, Params: {'max_depth': 5, 'min_child_weight': 1, 'learning_rate': 0.01}
Score: 0.74106, Params: {'max_depth': 6, 'min_child_weight': 1, 'learning_rate': 0.01}
Score: 0.75939, Params: {'max_depth': 7, 'min_child_weight': 1, 'learning_rate': 0.01}
Score: 0.69343, Params: {'max_depth': 3, 'min_child_weight': 2, 'learning_rate': 0.01}
Score: 0.70185, Params: {'max_depth': 4, 'min_child_weight': 2, 'learning_rate': 0.01}
Score: 0.72567, Params: {'max_depth': 5, 'min_child_weight': 2, 'learning_rate': 0.01}
Score: 0.74066, Params: {'max_depth': 6, 'min_child_weight': 2, 'learning_rate': 0.01}
Score: 0.75655, Params: {'max_depth': 7, 'min_child_weight': 2, 'learning_rate': 0.01}
Score: 0.69348, Params: {'max_depth': 3, 'min_child_weight': 3, 'learning_rate': 0.01}
Score: 0.70215, Params: {'max_depth': 4, 'm

In [None]:
print("Best Params:")
print(GCV.Best())

# Use SVM

In [7]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
train_scale = scaler.fit_transform(train)
y = target_cls

In [19]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.model_selection import  StratifiedKFold, KFold 
from sklearn.metrics import  accuracy_score, log_loss
from sklearn import metrics
#from sklearn import metrics

In [12]:
model_svm = SVC(kernel='rbf', C=100, gamma=1e-8, random_state=0)
y_pred_svm = cross_val_predict(model_svm, train_scale, y, cv=5)

NameError: name 'accuracy_score' is not defined

In [20]:
print(accuracy_score(y, y_pred_svm))
print(metrics.classification_report(y, y_pred_svm))

0.630517086779
             precision    recall  f1-score   support

          1       0.63      1.00      0.77     12657
          2       0.00      0.00      0.00      6051
          3       0.00      0.00      0.00      1106
          4       0.00      0.00      0.00       241
          5       0.00      0.00      0.00        19

avg / total       0.40      0.63      0.49     20074



In [None]:
model_svm = SVC(kernel='linear', C=10, gamma=1e-8, random_state=0)
y_pred_svm = cross_val_predict(model_svm, train, y, cv=5)
#print(metrics.classification_report(y, y_pred_svm))
print(accuracy_score(y, y_pred_svm))

In [23]:
model = SVC(kernel='rbf', gamma=1e-8, random_state=0)
griddata = {'C':[1.0,10.0,100.0,1e3]}
GCV = CustomGridCV(train_scale, y, model, accuracy_score, griddata)
GCV.GridSearch()

Score: 0.63052, Params: {'C': 1.0}
Score: 0.63052, Params: {'C': 10.0}
Score: 0.63052, Params: {'C': 100.0}
Score: 0.63052, Params: {'C': 1000.0}


In [None]:
n_folds = 5
skf = StratifiedKFold(folds, shuffle=True, random_state=99)
for i, (train_index,valid_index) in enumerate(skf.split(train, y)):
    print("=====Round {0}/{1}=====".format(i+1,n_folds))
    x_train, x_valid = train[train_index], train[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]
    model_svm = SVC(kernel='rbf', C=5, gamma=1e-8, random_state=0)
   


In [None]:
  watchlist = [(xgb.DMatrix(x_train,y_train), 'train'), (xgb.DMatrix(x_valid, y_valid), 'valid')]
    model = xgb.train(params, xgb.DMatrix(x_train,y_train), 1000, watchlist, feval=gini_xgb,
                    maximize=True, verbose_eval=100,  early_stopping_rounds=100)

In [259]:
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]

In [260]:
indices

array([76, 74, 75, 77, 79,  3,  2, 81,  0, 30,  1, 78, 35, 73, 72, 29, 20,
        6,  8, 45, 10, 41, 27, 80, 13,  5, 26, 16, 17, 11, 33, 71, 14,  4,
       39, 22, 70, 12,  9, 36, 48, 37, 28, 32, 25, 31, 46, 55, 66, 19, 42,
       56, 24, 21, 60, 38, 49, 40, 34, 43, 54, 65, 23, 69, 15, 18,  7, 67,
       51, 58, 52, 68, 44, 64, 62, 63, 50, 61, 53, 59, 57, 47])

In [230]:
# Print the feature ranking
print("Feature ranking:")

for f in range(len(features)):
    print("%d. %s (%f)" % (f + 1, features[indices[f]], importances[indices[f]]))

Feature ranking:
1. PRES (0.142529)
2. DEWP (0.133333)
3. month (0.128736)
4. day (0.119540)
5. HUMI (0.088998)
6. TEMP (0.078489)
7. Iws (0.072906)
8. hour (0.048604)
9. Iprec (0.040066)
10. cbwd (0.030870)
11. 2015 (0.030542)
12. 4.0 (0.019704)
13. 2.0 (0.014450)
14. 2014 (0.013465)
15. precipitation (0.012479)
16. 2013 (0.010181)
17. 3.0 (0.007882)
18. 1.0 (0.007225)
