# Boosting Algorithms  
Build a machine learning model by using boosting algorithms (Adaptive boosting, gradient boosting & extreme gradient boosting (XGBoost)) to predict whether or not a mushroom is edible.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Download the dataset as a DataFrame straight from the URL.
# NOTE(review): the shortened URL hides the real data source; consider recording
# the full URL and retrieval date for provenance.
data=pd.read_csv('https://tinyurl.com/myboosting')

In [3]:
# Preview the first rows to see the raw single-letter category codes.
data.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [4]:
# libraries needed for modelling and evaluation
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
import time

In [5]:
# (number of rows, number of columns) of the loaded frame.
data.shape

(8124, 23)

In [7]:
# Per-column summary (count, unique, top, freq) of the categorical columns.
data.describe()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
count,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,...,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124
unique,2,6,4,10,2,9,2,2,2,12,...,4,9,9,1,4,3,5,9,6,7
top,e,x,y,n,f,n,f,c,b,b,...,s,w,w,p,w,o,p,w,v,d
freq,4208,3656,3244,2284,4748,3528,7914,6812,5612,1728,...,4936,4464,4384,8124,7924,7488,3968,2388,4040,3148


In [8]:
# Every column (features and target) holds single-letter categorical codes, so
# each one is converted to integer labels before modelling.
# A separate LabelEncoder is fitted per column and kept in `encoders`; a single
# shared encoder would be refit on each column and lose all earlier mappings.
# The stored encoders let the integer codes be translated back later, e.g.
# encoders['class'].classes_ lists the original letters in code order (0, 1, ...).
encoders={}
for col in data.columns:
    encoder=LabelEncoder()
    data[col]=encoder.fit_transform(data[col])
    encoders[col]=encoder

# Preview the encoded frame.
data.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [9]:
# Number of rows per encoded class label, to check how balanced the target is.
print(data.groupby('class').size())

class
0    4208
1    3916
dtype: int64


Poisonous (1), Edible (0)

In [12]:
# Separate the features from the target, then hold out 30% of the rows as the
# test set. The fixed random_state makes the split repeatable.
X=data.drop(columns='class').values
Y=data['class'].values
X_train, X_test, Y_train, Y_test=train_test_split(
    X, Y, test_size=0.3, random_state=21
)

In [13]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-1.2.1-py3-none-macosx_10_13_x86_64.macosx_10_14_x86_64.macosx_10_15_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 10.6 MB/s eta 0:00:01
Installing collected packages: xgboost
Successfully installed xgboost-1.2.1


In [20]:
import xgboost as xgb
from xgboost import XGBClassifier

In [21]:
# Candidate ensemble classifiers to compare, as (short name, estimator) pairs.
# Every estimator gets a fixed random_state (the same seed as the train/test
# split) so that repeated runs of the comparison give the same scores.
ensembles=[
    ('AB', AdaBoostClassifier(random_state=21)),
    ('GBM', GradientBoostingClassifier(random_state=21)),
    ('RF', RandomForestClassifier(random_state=21)),
    ('ET', ExtraTreesClassifier(random_state=21)),
    ('XGB', XGBClassifier(random_state=21)),
]

In [23]:
import warnings

# Compare the candidate models with 10-fold cross-validation on the training
# split only; the test split is not used here.
# shuffle=True is needed for random_state to have any effect: KFold ignores the
# seed when shuffle is False, and recent scikit-learn versions raise a
# ValueError for that combination.
# The splitter does not depend on the model, so it is built once; with an
# integer seed every model is scored on the same folds.
kfold=KFold(n_splits=10, shuffle=True, random_state=21)
results=[]
names=[]
with warnings.catch_warnings():
    # Silence library warnings raised while fitting so the score table stays readable.
    warnings.simplefilter('ignore')
    for name, model in ensembles:
        cv_results=cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
        results.append(cv_results)
        names.append(name)
        # Mean accuracy over the folds, with the standard deviation in brackets.
        msg="%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
        print(msg)

AB: 1.000000 (0.000000)
GBM: 1.000000 (0.000000)
RF: 1.000000 (0.000000)
ET: 1.000000 (0.000000)
XGB: 1.000000 (0.000000)


All the algorithms hit 100% accuracy in 10-fold cross-validation on the training set!

# AdaBoost
- The algorithm first trains a base classifier (such as Decision Tree) and uses it to make predictions on the training set. The algorithm increases the relative weight of misclassified training instances. Then it trains a second classifier, using the updated weights, and so on.

# Gradient Boosting
- Works by sequentially adding predictors to an ensemble, each one correcting its predecessor. This method tries to fit the new predictor to the residual errors made by the previous predictor.



In [24]:
# XGBoost classifier with 100 boosted trees; all other settings are left at their defaults.
model_XGB=XGBClassifier(n_estimators=100)

In [25]:
# Fit the XGBoost classifier on the training split.
model_XGB.fit(X_train, Y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [29]:
# Predict a class label for every row of the held-out test features.
predictions=model_XGB.predict(X_test)
# Bare expression so the notebook displays the predicted array.
predictions

array([0, 0, 1, ..., 0, 0, 1])

In [30]:
# Confusion matrix on the test set: rows are the true classes, columns the predicted ones.
print(confusion_matrix(Y_test, predictions))

[[1268    0]
 [   0 1170]]


There are no errors from this method! This seems impossible!
For all the different boosting algorithms, 100% accuracy was returned. We can predict edible/poisonous mushrooms correctly.