# XGBoost Feature importance ranking

In [None]:
###########import packages##########
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf 
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from xgboost import plot_importance
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
%matplotlib
###########wrapping root mean square error for later calls##########
def rmse(predictions, targets):
    """Return the root-mean-square error between predictions and targets.

    Both inputs are converted to float64 NumPy arrays before the arithmetic,
    so plain Python sequences are accepted and unsigned or narrow integer
    arrays cannot wrap around when subtracted or squared. Elements are paired
    by position.

    Args:
        predictions: Array-like of predicted values.
        targets: Array-like of reference values, broadcastable against
            ``predictions``.

    Returns:
        The square root of the mean squared difference, as a NumPy float.
    """
    predictions = np.asarray(predictions, dtype=np.float64)
    targets = np.asarray(targets, dtype=np.float64)
    return np.sqrt(np.mean((predictions - targets) ** 2))
###########loading data##########
# Columns kept from the database: 66 feature columns (indexed 0-65 in the
# trailing comments) followed by the 'qualified' label. The order matters
# because later code turns the frame into a positional NumPy matrix.
selected_columns = [
    'Pt at% in metal element',  # 0
    'Pd at% in metal element',  # 1
    'Au at% in metal element',  # 2
    'Ir at% in metal element',  # 3
    'Ru at% in metal element',  # 4
    'Ni at% in metal element',  # 5
    'Co at% in metal element',  # 6
    'Fe at% in metal element',  # 7
    'Mn at% in metal element',  # 8
    'Zn at% in metal element',  # 9
    'Cu at% in metal element',  # 10
    'Ti at% in metal element',  # 11
    'Cr at% in metal element',  # 12
    'Mo at% in metal element',  # 13
    'Ag at% in metal element',  # 14
    'V at% in metal element',  # 15
    'Sc at% in metal element',  # 16
    'W at% in metal element',  # 17
    'Os at% in metal element',  # 18
    'Ga at% in metal element',  # 19
    'total metal mass ratio wt%',  # 20
    'N at%',  # 21
    'Sn at%',  # 22
    'S at%',  # 23
    'O at%',  # 24
    'Ti at%',  # 25
    'Ta at%',  # 26
    'Nb at%',  # 27
    'Si at%',  # 28
    'W at%',  # 29
    'C wt%',  # 30
    'Particle diameter （nm）',  # 31
    'support BET surface area(m2/g)',  # 32
    'particle',  # 33
    'nanowire',  # 34
    'core-shell',  # 35
    'Reduction Temperature',  # 36
    'Reduction Time/min',  # 37
    'mircrowave(0/1)',  # 38
    'polyhydric alcohols',  # 39
    'heat reduction',  # 40
    'inorganic reducing agent',  # 41
    'organic reducing agent',  # 42
    'pH',  # 43
    'Annealing Temperature',  # 44
    'Annealing Time/h',  # 45
    'Atomosphere h2',  # 46
    'Atomosphere inert',  # 47
    'ECSA m2/g',  # 48
    'Mass Activity mA mg-1',  # 49
    'I/C Ratio(ionomer/catalyst)',  # 50
    'Area cm2',  # 51
    'Cathodic Loading Amount mgPt cm-2',  # 52
    'Anodic Platinum Loading Amount mgPt cm-2',  # 53
    'Anodic catalyst type x wt% Pt/C',  # 54
    'membrane thickness',  # 55
    'Hot Press Temperature',  # 56
    'Hot Press Time min',  # 57
    'Hot Press Pressure Mpa',  # 58
    'Humidity %',  # 59
    'GDE for 1',  # 60
    'celltemp',  # 61
    'Flowing rate of H2 ml min-1',  # 62
    'flowing rate of cathode gas(O2/air)',  # 63
    'Back Pressure Mpa',  # 64
    'Cathode gas oxygen ratio',  # 65
    'qualified',  # the classification target
]
# The CSV is stored with GBK encoding.
fdata = pd.read_csv('database.csv', encoding='gbk')
raw_data = fdata.loc[:, selected_columns]
###########handling missing values##########
# Rows with no 'qualified' label are dropped instead of imputed: a
# median-filled label is fabricated data and, for a 0/1 column, the median can
# even be 0.5, which is neither class.
labelled_data = raw_data.dropna(subset=['qualified'])
# Remaining gaps (features only) are filled with each column's median.
# NOTE(review): the medians are computed before the train/test split below,
# so test-set values contribute to the imputation used for training.
median_raw_data = labelled_data.median()
dict_median_raw_data = median_raw_data.to_dict()
data = labelled_data.fillna(dict_median_raw_data)
###########separating features and target##########
# Select by label name rather than by hard-coded column positions so the
# split stays correct if the column list above changes length.
raw_param = data.drop(columns='qualified')
raw_power = data['qualified']
X = raw_param.values.astype(np.float32)
y = raw_power.values.astype(np.float32)
###########fix random seed for reproducibility##########
seed = 78
# Hold out 15% of the samples as the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.15, random_state=seed)
###########XGBoost gridsearch CV for best hyperparameter##########
model_XGBClassifier = xgb.XGBClassifier(random_state=seed)
###########defining the parameters dictionary##########
# Regularisation is tuned through the scikit-learn wrapper's own
# reg_lambda / reg_alpha arguments. The native aliases 'lambda' / 'alpha' are
# not XGBClassifier arguments and would be routed through **kwargs, which the
# XGBoost documentation states is not guaranteed to interact properly with
# scikit-learn utilities such as GridSearchCV.
param = {
    'max_depth': [3, 5, 7, 9],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [100, 200, 500, 1000],
    'subsample': [0.6, 0.7, 0.8, 0.9],
    'reg_lambda': [0.01, 0.1],
    'reg_alpha': [0.01, 0.1],
}
# Exhaustive search over every combination, scored with 5-fold
# cross-validation on the training split only.
grid = GridSearchCV(model_XGBClassifier, param_grid=param, cv=5)
grid.fit(X_train, y_train)
best_model = grid.best_estimator_
###########reporting classification metrics for the best model##########
print('Best Classifier:', grid.best_params_, 'Best Score:', grid.best_score_)
prediction_train = best_model.predict(X_train)
prediction_test = best_model.predict(X_test)
# Dictionary form of the test-set report; the per-class precision is looked
# up under the string keys '0.0' and '1.0'.
final_result = classification_report(y_test, prediction_test, output_dict=True)
low_MPD_precision, high_MPD_precision = (
    final_result[label]['precision'] for label in ('0.0', '1.0')
)
print(low_MPD_precision, high_MPD_precision)
# Text reports: training split first, then the held-out test split.
for true_labels, predicted_labels in ((y_train, prediction_train), (y_test, prediction_test)):
    print(classification_report(true_labels, predicted_labels))
###########evaluating feature importances##########
print(best_model.feature_importances_)
plot_importance(best_model)
plt.show()


# Random Forest classification model accuracy before feature selection 

In [None]:
from sklearn import ensemble
###########fix random seed for reproducibility##########
# NOTE: X and y are not rebuilt in this cell; it uses whatever the most
# recently executed data-preparation cell left in scope.
seed = 78
# Hold out 15% of the samples as the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.15, random_state=seed)
###########RandomForest gridsearch CV for best hyperparameter##########
model_RandomForestClassifier = ensemble.RandomForestClassifier(random_state=seed)
###########defining the parameters dictionary##########
param = {
    'max_depth': [3, 5, 7, 9],
    'n_estimators': [100, 200, 500, 1000, 2000],
    'criterion': ['gini', 'entropy'],
    # 'auto' is deliberately not listed: for classifiers it was an alias of
    # 'sqrt' (a duplicated grid point), it was deprecated in scikit-learn 1.1
    # and is rejected from 1.3 on, which makes every fit using it fail.
    'max_features': ['sqrt', 'log2', None],
}
# Exhaustive search over every combination, scored with 5-fold
# cross-validation on the training split only.
grid = GridSearchCV(model_RandomForestClassifier, param_grid=param, cv=5)
grid.fit(X_train, y_train)
best_model = grid.best_estimator_
###########reporting classification metrics for the best model##########
print('Best Classifier:', grid.best_params_, 'Best Score:', grid.best_score_)
# Predictions of the refitted best estimator on both splits.
prediction_train = best_model.predict(X_train)
prediction_test = best_model.predict(X_test)
# Dictionary form of the test-set report, used to pull out the precision of
# each class (keys are the class labels rendered as strings).
final_result = classification_report(y_test, prediction_test, output_dict=True)
low_MPD_precision = final_result['0.0']['precision']
high_MPD_precision = final_result['1.0']['precision']
print(low_MPD_precision, high_MPD_precision)
# Full text reports: training split first, then the held-out test split.
train_report_text = classification_report(y_train, prediction_train)
print(train_report_text)
test_report_text = classification_report(y_test, prediction_test)
print(test_report_text)


# Feature importance provided by XGBoost after feature selection (27 features)

In [None]:
###########loading data##########
# Reduced column set: 27 feature columns (indexed 0-26 in the trailing
# comments) followed by the 'qualified' label. The order matters because
# later code turns the frame into a positional NumPy matrix.
selected_columns = [
    'Pt at% in metal element',  # 0
    'Co at% in metal element',  # 1
    'total metal mass ratio wt%',  # 2
    'C wt%',  # 3
    'Particle diameter （nm）',  # 4
    'support BET surface area(m2/g)',  # 5
    'Reduction Temperature',  # 6
    'Reduction Time/min',  # 7
    'Annealing Temperature',  # 8
    'ECSA m2/g',  # 9
    'Mass Activity mA mg-1',  # 10
    'I/C Ratio(ionomer/catalyst)',  # 11
    'Area cm2',  # 12
    'Cathodic Loading Amount mgPt cm-2',  # 13
    'Anodic Platinum Loading Amount mgPt cm-2',  # 14
    'Anodic catalyst type x wt% Pt/C',  # 15
    'membrane thickness',  # 16
    'Hot Press Temperature',  # 17
    'Hot Press Time min',  # 18
    'Hot Press Pressure Mpa',  # 19
    'Humidity %',  # 20
    'GDE for 1',  # 21
    'celltemp',  # 22
    'Flowing rate of H2 ml min-1',  # 23
    'flowing rate of cathode gas(O2/air)',  # 24
    'Back Pressure Mpa',  # 25
    'Cathode gas oxygen ratio',  # 26
    'qualified',  # the classification target
]
# The CSV is stored with GBK encoding.
fdata = pd.read_csv('database.csv', encoding='gbk')
raw_data = fdata.loc[:, selected_columns]
###########handling missing values##########
# Rows with no 'qualified' label are dropped instead of imputed: a
# median-filled label is fabricated data and, for a 0/1 column, the median can
# even be 0.5, which is neither class.
labelled_data = raw_data.dropna(subset=['qualified'])
# Remaining gaps (features only) are filled with each column's median.
# NOTE(review): the medians are computed before the train/test split below,
# so test-set values contribute to the imputation used for training.
median_raw_data = labelled_data.median()
dict_median_raw_data = median_raw_data.to_dict()
data = labelled_data.fillna(dict_median_raw_data)
###########separating features and target##########
# Select by label name rather than by hard-coded column positions so the
# split stays correct if the column list above changes length.
raw_param = data.drop(columns='qualified')
raw_power = data['qualified']
X = raw_param.values.astype(np.float32)
y = raw_power.values.astype(np.float32)
###########fix random seed for reproducibility##########
seed = 78
# Hold out 15% of the samples as the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.15, random_state=seed)
###########XGBoost gridsearch CV for best hyperparameter##########
model_XGBClassifier = xgb.XGBClassifier(random_state=seed)
###########defining the parameters dictionary##########
# Regularisation is tuned through the scikit-learn wrapper's own
# reg_lambda / reg_alpha arguments. The native aliases 'lambda' / 'alpha' are
# not XGBClassifier arguments and would be routed through **kwargs, which the
# XGBoost documentation states is not guaranteed to interact properly with
# scikit-learn utilities such as GridSearchCV.
param = {
    'max_depth': [3, 5, 7, 9],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [100, 200, 500, 1000],
    'subsample': [0.6, 0.7, 0.8, 0.9],
    'reg_lambda': [0.01, 0.1],
    'reg_alpha': [0.01, 0.1],
}
# Exhaustive search over every combination, scored with 5-fold
# cross-validation on the training split only.
grid = GridSearchCV(model_XGBClassifier, param_grid=param, cv=5)
grid.fit(X_train, y_train)
best_model = grid.best_estimator_
###########reporting classification metrics for the best model##########
# The tuned estimator is an XGBClassifier, so the summary line is labelled
# "Classifier" (it previously said "Regressor").
print('Best Classifier:', grid.best_params_, 'Best Score:', grid.best_score_)
prediction_train = best_model.predict(X_train)
prediction_test = best_model.predict(X_test)
# Dictionary form of the test-set report; the per-class precision is looked
# up under the string keys '0.0' and '1.0'.
final_result = classification_report(y_test, prediction_test, output_dict=True)
low_MPD_precision = final_result['0.0']['precision']
high_MPD_precision = final_result['1.0']['precision']
print(low_MPD_precision, high_MPD_precision)
# Full text reports: training split first, then the held-out test split.
print(classification_report(y_train, prediction_train))
print(classification_report(y_test, prediction_test))
###########evaluating feature importances##########
print(best_model.feature_importances_)
plot_importance(best_model)
plt.show()


# Random forest classification model accuracy after feature selection

In [None]:
from sklearn import ensemble
###########fix random seed for reproducibility##########
# NOTE: X and y are not rebuilt in this cell; it uses whatever the most
# recently executed data-preparation cell left in scope.
seed = 78
# Hold out 15% of the samples as the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.15, random_state=seed)
###########RF gridsearch CV for best hyperparameter##########
model_RandomForestClassifier = ensemble.RandomForestClassifier(random_state=seed)
###########defining the parameters dictionary##########
param = {
    'max_depth': [3, 5, 7, 9],
    'n_estimators': [100, 200, 400, 800, 1200, 1400, 1600],
    'criterion': ['gini', 'entropy'],
    # 'auto' is deliberately not listed: for classifiers it was an alias of
    # 'sqrt' (a duplicated grid point), it was deprecated in scikit-learn 1.1
    # and is rejected from 1.3 on, which makes every fit using it fail.
    'max_features': ['sqrt', 'log2', None],
}
# Exhaustive search over every combination, scored with 5-fold
# cross-validation on the training split only.
grid = GridSearchCV(model_RandomForestClassifier, param_grid=param, cv=5)
grid.fit(X_train, y_train)
best_model = grid.best_estimator_
###########reporting classification metrics for the best model##########
# The tuned estimator is a RandomForestClassifier, so the summary line is
# labelled "Classifier" (it previously said "Regressor").
print('Best Classifier:', grid.best_params_, 'Best Score:', grid.best_score_)
prediction_train = best_model.predict(X_train)
prediction_test = best_model.predict(X_test)
# Dictionary form of the test-set report; the per-class precision is looked
# up under the string keys '0.0' and '1.0'.
final_result = classification_report(y_test, prediction_test, output_dict=True)
low_MPD_precision = final_result['0.0']['precision']
high_MPD_precision = final_result['1.0']['precision']
print(low_MPD_precision, high_MPD_precision)
# Echo the seed used for the split and the forest alongside the metrics.
print(seed)
# Full text reports: training split first, then the held-out test split.
print(classification_report(y_train, prediction_train))
print(classification_report(y_test, prediction_test))
