In [1]:
# reference: https://medium.com/@datasciencewizards/guide-to-simple-ensemble-learning-techniques-2ac4e2504912

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [2]:
# Let’s load the dataset,
data = load_breast_cancer()
X, y = data.data, data.target

In [3]:
# Let’s have some insights into the data.
# Here we can see the features and target values in the data as well as the shape of the data. 
print("Feature Names: \n", data.feature_names)
print("Target Names: \n", data.target_names)
print("Data Shape: \n", data.data.shape)

Feature Names: 
 ['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']
Target Names: 
 ['malignant' 'benign']
Data Shape: 
 (569, 30)


In [4]:
# Let’s take a look at the few samples of data.
print("First few samples:")
print(data.data[:5])

First few samples:
[[1.799e+01 1.038e+01 1.228e+02 1.001e+03 1.184e-01 2.776e-01 3.001e-01
  1.471e-01 2.419e-01 7.871e-02 1.095e+00 9.053e-01 8.589e+00 1.534e+02
  6.399e-03 4.904e-02 5.373e-02 1.587e-02 3.003e-02 6.193e-03 2.538e+01
  1.733e+01 1.846e+02 2.019e+03 1.622e-01 6.656e-01 7.119e-01 2.654e-01
  4.601e-01 1.189e-01]
 [2.057e+01 1.777e+01 1.329e+02 1.326e+03 8.474e-02 7.864e-02 8.690e-02
  7.017e-02 1.812e-01 5.667e-02 5.435e-01 7.339e-01 3.398e+00 7.408e+01
  5.225e-03 1.308e-02 1.860e-02 1.340e-02 1.389e-02 3.532e-03 2.499e+01
  2.341e+01 1.588e+02 1.956e+03 1.238e-01 1.866e-01 2.416e-01 1.860e-01
  2.750e-01 8.902e-02]
 [1.969e+01 2.125e+01 1.300e+02 1.203e+03 1.096e-01 1.599e-01 1.974e-01
  1.279e-01 2.069e-01 5.999e-02 7.456e-01 7.869e-01 4.585e+00 9.403e+01
  6.150e-03 4.006e-02 3.832e-02 2.058e-02 2.250e-02 4.571e-03 2.357e+01
  2.553e+01 1.525e+02 1.709e+03 1.444e-01 4.245e-01 4.504e-01 2.430e-01
  3.613e-01 8.758e-02]
 [1.142e+01 2.038e+01 7.758e+01 3.861e+02 1.425e

In [5]:
# split the data to train multiple models and test them.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Max Votes

In [6]:
from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [7]:
# Now we can define the individual classifier.
clf1 = KNeighborsClassifier(n_neighbors=5)
clf2 = DecisionTreeClassifier()
clf3 = SVC(probability=True)

In [8]:
# Now the sklearn-provided VotingClassifier module will help us create 
# the voting classifier with majority voting.
voting_clf = VotingClassifier(estimators=[('knn', clf1), ('dt', clf2), ('svm', clf3)], voting='hard')

In [9]:
# This voting classifier can be fit in the above-defined training data and make predictions
voting_clf.fit(X_train, y_train)
y_pred = voting_clf.predict(X_test)

In [10]:
# Let’s evaluate the performance of the model we made using the majority voting system.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.956140350877193


# Averaging

In [17]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

In [18]:
# define 3 different models
clf1 = RandomForestClassifier(random_state=42)
clf2 = GradientBoostingClassifier(random_state=42)
clf3 = LogisticRegression(random_state=42,  max_iter=2000)

In [20]:
# Use StandardScaler to normalize the data for better convergence.
# Without this step, the logistic regression model throws error: 
#
#ConvergenceWarning: lbfgs failed to converge (status=1):
#STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.
#
#Increase the number of iterations (max_iter) or scale the data as shown in:
#    https://scikit-learn.org/stable/modules/preprocessing.html
#Please also refer to the documentation for alternative solver options:
#    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
#  n_iter_i = _check_optimize_result(

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [19]:
# Now let’s train them using the training data.

clf1.fit(X_train, y_train)
clf2.fit(X_train, y_train)
clf3.fit(X_train, y_train)

In [21]:
# Let’s make predictions from each model.

clf1_predictions = clf1.predict(X_test)
clf2_predictions = clf2.predict(X_test)
clf3_predictions = clf3.predict(X_test)

In [23]:
import numpy as np

In [24]:
# Now when we have predictions from each of the models, we need to combine 
# the predictions using NumPy-provided averaging methods.

ensemble_predictions = np.column_stack((clf1_predictions, clf2_predictions, clf3_predictions))
averaged_predictions = np.mean(ensemble_predictions, axis=1).astype(int)

In [26]:
# Let’s evaluate the performance of the ensemble.

ensemble_accuracy = accuracy_score(y_test, averaged_predictions)
print("Ensemble Accuracy:", ensemble_accuracy)

Ensemble Accuracy: 0.956140350877193


# Weighted Averaging

In [27]:
# In this type of ensemble learning, we assign weight to each individual model’s prediction
# based on criteria like the importance or performance of the model. The weights can be 
# determined based on various factors, such as the accuracy, confidence, or reliability of 
# the model. To get the final results, individual models are multiplied by their respective weights,
# and the weighted average of these predictions is calculated.
weights = [0.4, 0.3, 0.3]

In [28]:
ensemble_predictions = np.column_stack((clf1_predictions, clf2_predictions, clf3_predictions))

In [29]:
weighted_predictions = np.average(ensemble_predictions, axis=1, weights=weights).astype(int)

In [30]:
ensemble_accuracy = accuracy_score(y_test, weighted_predictions)
print("Ensemble Accuracy:", ensemble_accuracy)

Ensemble Accuracy: 0.956140350877193
