In [1]:
# Reference: https://medium.com/@datasciencewizards/guide-to-advance-ensemble-learning-techniques-f4dd382021b2

In [None]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [3]:
# Load the breast-cancer dataset shipped with scikit-learn, then show the
# feature matrix and the label vector separated by a divider line.

data = load_breast_cancer()
X = data.data
y = data.target
print(X, '---', y, sep='\n')

[[1.799e+01 1.038e+01 1.228e+02 ... 2.654e-01 4.601e-01 1.189e-01]
 [2.057e+01 1.777e+01 1.329e+02 ... 1.860e-01 2.750e-01 8.902e-02]
 [1.969e+01 2.125e+01 1.300e+02 ... 2.430e-01 3.613e-01 8.758e-02]
 ...
 [1.660e+01 2.808e+01 1.083e+02 ... 1.418e-01 2.218e-01 7.820e-02]
 [2.060e+01 2.933e+01 1.401e+02 ... 2.650e-01 4.087e-01 1.240e-01]
 [7.760e+00 2.454e+01 4.792e+01 ... 0.000e+00 2.871e-01 7.039e-02]]
---
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 1 0 1 1 1 1 1 0 0 1 0 0 1 1 1 1 0 1 0 0 1 1 1 1 0 1 0 0
 1 0 1 0 0 1 1 1 0 0 1 0 0 0 1 1 1 0 1 1 0 0 1 1 1 0 0 1 1 1 1 0 1 1 0 1 1
 1 1 1 1 1 1 0 0 0 1 0 0 1 1 1 0 0 1 0 1 0 0 1 0 0 1 1 0 1 1 0 1 1 1 1 0 1
 1 1 1 1 1 1 1 1 0 1 1 1 1 0 0 1 0 1 1 0 0 1 1 0 0 1 1 1 1 0 1 1 0 0 0 1 0
 1 0 1 1 1 0 1 1 0 0 1 0 0 0 0 1 0 0 0 1 0 1 0 1 1 0 1 0 0 0 0 1 1 0 0 1 1
 1 0 1 1 1 1 1 0 0 1 1 0 1 1 0 0 1 0 1 1 1 1 0 1 1 1 1 1 0 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 1 1 1 1 1 0 1 0 1 1 0 1 1 0 1 0 0 1 1 1 1 1 1 

In [9]:
# Print each piece of dataset metadata under its heading, with a "===="
# divider between consecutive sections (but not before the first one).
for position, (heading, value) in enumerate((
    ("Feature Names: \n", data.feature_names),
    ("Target Names: \n", data.target_names),
    ("Data Shape: \n", data.data.shape),
)):
    if position:
        print("====")
    print(heading, value)

Feature Names: 
 ['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']
====
Target Names: 
 ['malignant' 'benign']
====
Data Shape: 
 (569, 30)


In [10]:
# Data preprocessing
# Split the data: 20% of the samples are held out as a test set, and the
# fixed random_state makes the split repeatable.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("Train set shape:", *(part.shape for part in (X_train, y_train)))
print("====")
print("Test set shape:", *(part.shape for part in (X_test, y_test)))

Train set shape: (455, 30) (455,)
====
Test set shape: (114, 30) (114,)


In [30]:
# Standardise the features for better convergence. The scaler is fitted on
# the training split only and then applied to both splits, so the test set
# is transformed with the training statistics.

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [31]:
# Define the three base learners used in the stacking ensemble.
# Every estimator is given the same seed so a fresh run rebuilds the same models.

model1 = LogisticRegression(random_state=42, max_iter=2000)
model2 = RandomForestClassifier(n_estimators=100, random_state=42)
# With probability=True, SVC shuffles the data internally to calibrate its
# probability estimates; random_state pins that shuffling so the estimates
# are reproducible, consistent with the other two models.
model3 = SVC(probability=True, random_state=42)

In [32]:
# Fit every base model on the (scaled) training split.

for base_model in (model1, model2, model3):
    base_model.fit(X_train, y_train)

In [19]:
# Collect each trained base model's class predictions for the test set.

pred1, pred2, pred3 = (
    fitted.predict(X_test) for fitted in (model1, model2, model3)
)

In [20]:
# Train the stacking (meta) model on the base models' predictions.
#
# The meta-model must not be fitted on the test labels: it is scored on those
# same rows afterwards, so fitting on them leaks the answers and inflates the
# reported metrics. Instead, cross_val_predict yields, for every training row,
# a prediction made by a copy of the base model that was fitted without that
# row. Those out-of-fold predictions (one column per base model) are the
# meta-model's training features, paired with y_train.

import numpy as np
from sklearn.model_selection import cross_val_predict

# Out-of-fold base-model predictions on the training split (meta-model training features).
stacking_train_input = np.column_stack(
    [
        cross_val_predict(base_model, X_train, y_train, cv=5)
        for base_model in (model1, model2, model3)
    ]
)

# Base-model predictions on the test split; used only for prediction and evaluation.
stacking_input = np.column_stack((pred1, pred2, pred3))

stacking_model = LogisticRegression(random_state=42)
stacking_model.fit(stacking_train_input, y_train)

In [21]:
# Produce the ensemble's final class labels from the stacked base-model
# predictions for the test set.

stacking_pred = stacking_model.predict(X=stacking_input)

In [23]:
# Score the stacking model's final predictions against the test labels.

stacking_accuracy = accuracy_score(y_true=y_test, y_pred=stacking_pred)
print("Stacking Accuracy:", stacking_accuracy)

Stacking Accuracy: 0.9649122807017544


In [25]:
# Break the stacking model's test-set results down into a confusion matrix.

from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_true=y_test, y_pred=stacking_pred)
print("Confusion Matrix:", cm, sep="\n")

Confusion Matrix:
[[39  4]
 [ 0 71]]


In [10]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

data = {'Age': [25, 40, 35], 'Income': [30000, 70000, 60000]}
df = pd.DataFrame(data)

X = df[['Age', 'Income']]
display(X)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

display(X_scaled)

Unnamed: 0,Age,Income
0,25,30000
1,40,70000
2,35,60000


array([[-1.33630621, -1.37281295],
       [ 1.06904497,  0.98058068],
       [ 0.26726124,  0.39223227]])