In [28]:
# Reference: https://medium.com/@datasciencewizards/understanding-the-adaboost-algorithm-2e9344d83d9b

from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression

In [29]:
# Fetch the bundled Breast Cancer dataset, then pull out the feature
# array (X) and the label array (y) for the rest of the notebook.
data = load_breast_cancer()
X, y = data.data, data.target

In [30]:
import pprint

# Shared printer instance; nested containers are indented four spaces per level.
pp = pprint.PrettyPrinter(indent=4)


def pretty_print(_data) -> None:
    """Pretty-print ``_data`` through the shared ``pp`` printer.

    Args:
        _data: Any object; it is handed unchanged to ``pp.pprint``.

    Returns:
        None. The formatted text is written by the printer, not returned.
    """
    pp.pprint(_data)

# List every top-level key of the loaded dataset container, one per line.
for key in data:
    print(key)

# Then show the array stored under the 'data' key.
print(data['data'])


data
target
frame
target_names
DESCR
feature_names
filename
data_module
[[1.799e+01 1.038e+01 1.228e+02 ... 2.654e-01 4.601e-01 1.189e-01]
 [2.057e+01 1.777e+01 1.329e+02 ... 1.860e-01 2.750e-01 8.902e-02]
 [1.969e+01 2.125e+01 1.300e+02 ... 2.430e-01 3.613e-01 8.758e-02]
 ...
 [1.660e+01 2.808e+01 1.083e+02 ... 1.418e-01 2.218e-01 7.820e-02]
 [2.060e+01 2.933e+01 1.401e+02 ... 2.650e-01 4.087e-01 1.240e-01]
 [7.760e+00 2.454e+01 4.792e+01 ... 0.000e+00 2.871e-01 7.039e-02]]


# Preview the first two samples: each feature on its own line, labelled with
# its name, followed by the sample's target label and a separator rule.
for row_idx in (0, 1):
    sample_no = row_idx + 1  # human-friendly, 1-based numbering
    print(f"Data {sample_no}:")
    for col_idx, feature_value in enumerate(X[row_idx]):
        print(f"  {data.feature_names[col_idx]}: {feature_value}")
    print(f"Label {sample_no}: {y[row_idx]}")
    print("-" * 50)

In [37]:
# Show the dataset's descriptive attributes: feature names, target class
# names, and the `frame` attribute.
# NOTE(review): the 'gra' heading looks like a typo for 'frame' — confirm
# before renaming, because the saved cell output carries the same text.
for heading, attribute in (
    ('name of the Features \n', data.feature_names),
    ('name of the target classes \n', data.target_names),
    ('name of the gra \n', data.frame),
):
    print(heading, attribute)

name of the Features 
 ['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']
name of the target classes 
 ['malignant' 'benign']
name of the gra 
 None


### In this article, we use two different models, **Logistic Regression** and the **AdaBoost model**, so that we can compare their results.

In [32]:
# Hold out 20% of the samples as a test set; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [35]:
# Display the training features and labels, separated by a '---' line.
print(X_train, "---", y_train, sep="\n")

[[9.029e+00 1.733e+01 5.879e+01 ... 1.750e-01 4.228e-01 1.175e-01]
 [2.109e+01 2.657e+01 1.427e+02 ... 2.903e-01 4.098e-01 1.284e-01]
 [9.173e+00 1.386e+01 5.920e+01 ... 5.087e-02 3.282e-01 8.490e-02]
 ...
 [1.429e+01 1.682e+01 9.030e+01 ... 3.333e-02 2.458e-01 6.120e-02]
 [1.398e+01 1.962e+01 9.112e+01 ... 1.827e-01 3.179e-01 1.055e-01]
 [1.218e+01 2.052e+01 7.722e+01 ... 7.431e-02 2.694e-01 6.878e-02]]
---
[1 0 1 1 1 0 1 1 1 0 1 0 0 1 1 0 0 0 1 1 1 0 1 1 1 0 1 0 1 1 0 1 0 0 0 1 0
 1 1 1 1 0 0 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 0 0 1 1
 0 1 0 1 1 1 1 0 1 1 0 1 1 1 0 1 0 0 1 1 1 0 1 1 1 1 0 1 1 1 1 1 0 1 0 0 1
 1 0 1 1 1 1 1 1 1 0 0 0 1 1 0 1 1 0 1 0 1 0 1 0 1 1 0 1 1 1 0 1 0 1 0 1 0
 1 1 0 1 1 1 1 0 1 1 1 0 1 1 0 1 1 0 1 1 1 1 1 1 1 0 1 1 1 0 1 0 1 1 1 0 1
 0 0 1 1 0 1 0 0 0 1 1 1 0 1 1 0 1 0 1 1 1 0 1 0 1 1 0 0 1 1 0 1 0 0 1 0 0
 1 1 0 0 0 1 1 1 1 0 1 0 0 0 0 1 1 1 1 1 1 1 1 0 0 1 1 0 1 1 1 1 1 0 1 1 0
 0 1 0 1 0 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 0 1 1 0 1 0 0 0 1 0 

In [24]:
# Use StandardScaler to normalize the data for better convergence.

from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits so the test set never influences the fit.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [25]:
# Baseline model: logistic regression with the library's default settings.
log_reg = LogisticRegression()

# Boosted ensemble: n_estimators=200, seeded so training is repeatable.
ada_boost = AdaBoostClassifier(
    n_estimators=200,
    random_state=42,
)

In [26]:
# Fit both classifiers on the same standardized training data so that the
# later comparison is like-for-like.
log_reg.fit(X=X_train_scaled, y=y_train)
ada_boost.fit(X=X_train_scaled, y=y_train)

In [29]:
# Predict labels for the standardized test set:
# y_pred_1 comes from logistic regression, y_pred_2 from AdaBoost.
y_pred_1, y_pred_2 = (
    log_reg.predict(X_test_scaled),
    ada_boost.predict(X_test_scaled),
)

In [30]:
# Accuracy on the held-out test set for each model
# (index 1 = logistic regression, index 2 = AdaBoost).
accuracy_1, accuracy_2 = (
    accuracy_score(y_test, predictions) for predictions in (y_pred_1, y_pred_2)
)

print("Accuracy of Logistic Regression:", accuracy_1)
print("Accuracy of AdaBoost Model     :", accuracy_2)

Accuracy of Logistic Regression: 0.9736842105263158
Accuracy of AdaBoost Model     : 0.9736842105263158


In [34]:
# Precision on the held-out test set for each model.
# NOTE(review): precision_score is called without pos_label, so the library
# default (label 1) is the positive class — presumably 'benign' given the
# target_names order printed earlier; confirm that is the class of interest.
precision_1, precision_2 = (
    precision_score(y_test, predictions) for predictions in (y_pred_1, y_pred_2)
)

print("Precision of Logistic Regression:", precision_1)
print("Precision of AdaBoost Model     :", precision_2)

Precision of Logistic Regression: 0.9722222222222222
Precision of AdaBoost Model     : 0.9722222222222222


In [35]:
# Recall on the held-out test set for each model.
# NOTE(review): recall_score is called without pos_label, so the library
# default (label 1) is the positive class — presumably 'benign' given the
# target_names order printed earlier; confirm that is the class of interest.
recall_1, recall_2 = (
    recall_score(y_test, predictions) for predictions in (y_pred_1, y_pred_2)
)

print("Recall of Logistic Regression:", recall_1)
print("Recall of AdaBoost Model     :", recall_2)

Recall of Logistic Regression: 0.9859154929577465
Recall of AdaBoost Model     : 0.9859154929577465
