## Baggging and Boosting

In [2]:
import numpy as np
import pandas as pd

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

import graphviz
import pydotplus

In [4]:
df_risk = pd.read_csv('data/ClassifyRisk.csv', usecols=['age','marital_status','income','risk'])
df_risk['risk'].value_counts()

bad loss     123
good risk    123
Name: risk, dtype: int64

In [5]:
df_risk = pd.get_dummies(df_risk,columns=['marital_status'],prefix=['marital_status'])
df_risk.head()

Unnamed: 0,age,income,risk,marital_status_married,marital_status_other,marital_status_single
0,34,28060.7,bad loss,0,1,0
1,37,28009.34,bad loss,0,1,0
2,29,27614.6,bad loss,0,1,0
3,33,27287.18,bad loss,0,1,0
4,39,26954.06,bad loss,0,1,0


In [6]:
#split dataset
X = df_risk[['age','income','marital_status_married','marital_status_other','marital_status_single']]
y = df_risk[['risk']]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.3)

In [7]:
#Tree classifier
clf_CART = DecisionTreeClassifier(criterion='gini')
clf_CART.fit(X_train, y_train.values.ravel())
y_pred_CART = clf_CART.predict(X_test)

In [8]:
#Bagging
bag = BaggingClassifier(n_estimators=5)
bag.fit(X_train, y_train.values.ravel())
y_pred_bag = bag.predict(X_test)

In [9]:
#Boosting
ada = AdaBoostClassifier(n_estimators=5)
ada.fit(X_train, y_train.values.ravel())
y_pred_ada = ada.predict(X_test)

In [26]:
#Comparison with confusion matrices
print ('Confusin matrix for CART decision tree')
print(confusion_matrix(y_test, y_pred_CART))
print('\n')
print('Confusin matrix for Bagging classifier')
print(confusion_matrix(y_test, y_pred_bag))
print('\n')
print ('Confusin matrix for AdaBoost classifier')
print(confusion_matrix(y_test, y_pred_ada))
print('\n')

Confusin matrix for CART decision tree
[[32  3]
 [ 3 36]]


Confusin matrix for Bagging classifier
[[33  2]
 [ 3 36]]


Confusin matrix for AdaBoost classifier
[[31  4]
 [ 3 36]]




In [24]:
#Comparison of the errors of the classifier
print ('Classification report for CART decision tree')
print(classification_report(y_test,y_pred_CART))
print('\n')
print ('Classification report for Bagging classifier')
print(classification_report(y_test,y_pred_bag))
print('\n')
print ('Classification report for AdaBoost classifier')
print(classification_report(y_test,y_pred_ada))
print('\n')

Classification report for CART decision tree
              precision    recall  f1-score   support

           0       0.91      0.91      0.91        35
           1       0.92      0.92      0.92        39

    accuracy                           0.92        74
   macro avg       0.92      0.92      0.92        74
weighted avg       0.92      0.92      0.92        74



Classification report for Bagging classifier
              precision    recall  f1-score   support

           0       0.92      0.94      0.93        35
           1       0.95      0.92      0.94        39

    accuracy                           0.93        74
   macro avg       0.93      0.93      0.93        74
weighted avg       0.93      0.93      0.93        74



Classification report for AdaBoost classifier
              precision    recall  f1-score   support

           0       0.91      0.89      0.90        35
           1       0.90      0.92      0.91        39

    accuracy                           0.9

In [16]:
#Overall error rate, MSE, MAE
print ('CART overall error rate: {:.5f}'.format(1-accuracy_score(y_test,y_pred_CART)))
print ('Bagging overall error rate: {:.5f}'.format(1-accuracy_score(y_test,y_pred_bag)))
print ('Adaboost overall error rate: {:.5f}'.format(1-accuracy_score(y_test,y_pred_ada)))
print('\n')
labels = y_test['risk'].astype('category').cat.categories.tolist()
replace_map = {'risk':{ k: v for k,v in zip(labels,list(range(0,len(labels)+1)))}}
y_test.replace(replace_map, inplace=True)

y_pred_CART = [0 if i=='bad loss' else 1 for i in y_pred_CART]
y_pred_bag = [0 if i=='bad loss' else 1 for i in y_pred_bag]
y_pred_ada = [0 if i=='bad loss' else 1 for i in y_pred_ada]

CART overall error rate: 0.08108
Bagging overall error rate: 0.06757
Adaboost overall error rate: 0.09459




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  regex=regex,


In [17]:
#Overall error rate, MSE, MAE
print('CART MSE: {:.5f}'.format(MSE(y_test,y_pred_CART)))
print('CART MSE: {:.5f}'.format(MSE(y_test,y_pred_bag)))
print('CART MSE: {:.5f}'.format(MSE(y_test,y_pred_ada)))
print('\n')
print('CART MAE: {:.5f}'.format(MAE(y_test,y_pred_CART)))
print('CART MAE: {:.5f}'.format(MAE(y_test,y_pred_bag)))
print('CART MAE: {:.5f}'.format(MAE(y_test,y_pred_ada)))

CART MSE: 0.08108
CART MSE: 0.06757
CART MSE: 0.09459


CART MAE: 0.08108
CART MAE: 0.06757
CART MAE: 0.09459
