In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.linear_model import LogisticRegression 
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier as KNN 
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score

import warnings
warnings.filterwarnings('ignore')

import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append('../data')
    sys.path.append('../src')
from utils import plot_labeled_decision_regions

SEED = 1

### Read data

In [2]:
df = pd.read_csv(r"../data/wbc.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

### Convert the target into binary

In [3]:
y = df["diagnosis"].replace({"M":1, "B":0})
X = df.drop(["id", "diagnosis", "Unnamed: 32"], axis=1)

### Split data into train and test

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.3,
                                                    stratify=y,
                                                    random_state=SEED)

### AdaBoost

In [5]:
# Instantiate dt
dt = DecisionTreeClassifier(max_depth=1, random_state=SEED)

# Instanciate the Bagging classifier
adb_clf = AdaBoostClassifier(estimator=dt, n_estimators=100)

adb_clf.fit(X_train, y_train)

# Predict the test set probabilities of positive cases
y_pred_proba = adb_clf.predict_proba(X_test)[:,1]

adb_clf_roc_auc_score = roc_auc_score(y_test, y_pred_proba)

print("ROC AUC score: {:.2f}".format(adb_clf_roc_auc_score))

ROC AUC score: 0.99


### Bagging

In [6]:
# Instantiate dt
dt = DecisionTreeClassifier(max_depth=4, min_samples_leaf=0.16, random_state=SEED)

# Instanciate the Bagging classifier
bc = BaggingClassifier(estimator=dt, n_estimators=300, n_jobs=-1)

bc.fit(X_train, y_train)

y_pred = bc.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

print("Accuracy of Bagging Classifier: {:.3f}".format(accuracy))

Accuracy of Bagging Classifier: 0.936


### Out of Bag Evaluation

Evaluate the OOB accuracy of an ensemble classifier by setting the parameter oob_score to True during instantiation. 
After training the classifier, the OOB accuracy can be obtained by accessing the .oob_score_ attribute from the corresponding instance.

In [7]:
# Instantiate dt
dt = DecisionTreeClassifier(max_depth=4, min_samples_leaf=0.16, random_state=SEED)

# Instanciate the Bagging classifier
bc = BaggingClassifier(estimator=dt, 
                       n_estimators=300, 
                       oob_score=True,
                       n_jobs=-1) # oob_score is accuracy for classification and r2 for regressors 

bc.fit(X_train, y_train)
y_pred = bc.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)

oob_accuracy = bc.oob_score_

print("Test set accuracy: {:.3f}".format(test_accuracy))

print("OOB set accuracy: {:.3f}".format(oob_accuracy))


Test set accuracy: 0.930
OOB set accuracy: 0.925
