# Voting Classifiers

In [1]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Create dataset
X, y = make_moons(n_samples=1000, noise=0.3, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Define individual classifiers
log_clf = LogisticRegression()
svm_clf = SVC(probability=False)  # We don't need probabilities for hard voting
tree_clf = DecisionTreeClassifier()

# Create a Hard Voting Classifier
voting_clf = VotingClassifier(
    estimators=[
        ('lr', log_clf),
        ('svc', svm_clf),
        ('tree', tree_clf)
    ],
    voting='hard'  # <- this makes it hard voting
)

# Train ensemble model
voting_clf.fit(X_train, y_train)

# Evaluate
y_pred = voting_clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))


Accuracy: 0.912


In [2]:
for clf in (log_clf, svm_clf, tree_clf, voting_clf):
  clf.fit(X_train, y_train)
  y_pred = clf.predict(X_test)
  print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.852
SVC 0.916
DecisionTreeClassifier 0.892
VotingClassifier 0.908


# Bagging and Pasting in Scikit-Learn

In [3]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500,
    max_samples=100, bootstrap=True, n_jobs=-1, oob_score=True
)
# When we set OOB score = True when creating a BaggingClassifier to request an automatic oob evaluation after training.
bag_clf.fit(X_train, y_train)
bag_clf.oob_score_



0.908

In [4]:
from sklearn.metrics import accuracy_score
y_pred = bag_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.908

In [5]:
bag_clf.oob_decision_function_

array([[0.45168539, 0.54831461],
       [0.54252874, 0.45747126],
       [1.        , 0.        ],
       ...,
       [0.95813953, 0.04186047],
       [0.69196429, 0.30803571],
       [0.05620609, 0.94379391]])

# Random Forests

In [6]:
from sklearn.ensemble import RandomForestClassifier

rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
rnd_clf.fit(X_train, y_train)

y_pred_rf = rnd_clf.predict(X_test)

In [7]:
accuracy_score(y_test, y_pred_rf)

0.924

# Random Forest Equivalent of Bagging

In [8]:
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(splitter="random", max_leaf_nodes=16),
    n_estimators=500, max_samples=1.0, bootstrap=True, n_jobs=-1
)

In [9]:
bag_clf.fit(X_train, y_train)



In [10]:
bag_clf_pred = bag_clf.predict(X_test)

In [11]:
accuracy_score(y_test, bag_clf_pred)

0.916

# Feature Importance

In [12]:
from sklearn.datasets import load_iris
iris = load_iris()
rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1)
rnd_clf.fit(iris["data"], iris["target"])
for name, score in zip(iris["feature_names"], rnd_clf.feature_importances_):
  print(name, score)

sepal length (cm) 0.08836103302022895
sepal width (cm) 0.02103126463282766
petal length (cm) 0.4174837181120607
petal width (cm) 0.4731239842348827


# AdaBoost

In [13]:
from sklearn.ensemble import AdaBoostClassifier

ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), n_estimators=200,
    algorithm="SAMME", learning_rate=0.5
)

ada_clf.fit(X_train, y_train)



In [14]:
y_pred_ada = ada_clf.predict(X_test)
accuracy_score(y_test, y_pred_ada)

0.928

In [15]:
y_pred_train = ada_clf.predict(X_train)
accuracy_score(y_train, y_pred_train)

0.9226666666666666

**Checking how Logistic Regression works with Adaboost**

In [16]:

from sklearn.linear_model import LogisticRegression

base_clf = LogisticRegression(solver='lbfgs', max_iter=1000)

# 3. Define AdaBoost with Logistic Regression
ada_clf = AdaBoostClassifier(
    LogisticRegression(solver='lbfgs', max_iter=1000),
    n_estimators=50,
    learning_rate=1.0,
    algorithm='SAMME',  # SAMME.R supports probability-based models
    random_state=42
)

# 4. Train
ada_clf.fit(X_train, y_train)

# 5. Predict and evaluate
y_pred = ada_clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))




Accuracy: 0.864


# Gradient Boosting

**How it works**

In [17]:
from sklearn.tree import DecisionTreeRegressor

tree_reg1 = DecisionTreeRegressor(max_depth=2)
tree_reg1.fit(X, y)

In [18]:
y2 = y-tree_reg1.predict(X)
tree_reg2 = DecisionTreeRegressor(max_depth=2)
tree_reg2.fit(X, y2)

In [19]:
y3 = y2-tree_reg2.predict(X)
tree_reg3 = DecisionTreeRegressor(max_depth=2)
tree_reg3.fit(X, y3)

In [20]:
y_pred = sum(tree.predict(X_test) for tree in (tree_reg1, tree_reg2, tree_reg3))

In [21]:
from sklearn.metrics import r2_score # import appropriate metric for regression

# ... (your existing code for Gradient Boosting) ...

y_pred = sum(tree.predict(X_test) for tree in (tree_reg1, tree_reg2, tree_reg3))

# Use R-squared for regression instead of accuracy
print("R-squared:", r2_score(y_test, y_pred))

R-squared: 0.701557322588642


**Using sklearn GradientBoostingRegresor**

In [22]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

# 1. Load dataset
data = fetch_california_housing()
X, y = data.data, data.target

# 2. Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# 3. Create and train Gradient Boosting Regressor
gbrt = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42
)

gbrt.fit(X_train, y_train)

# 4. Predict and evaluate
y_pred = gbrt.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

print("Mean Squared Error:", mse)


Mean Squared Error: 0.28952296442116765


**implementing early stopping using staged_predict**

In [23]:
import numpy as np
from sklearn.model_selection import train_test_split

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120)
gbrt.fit(X_train, y_train)

errors = [mean_squared_error(y_test, y_pred) for y_pred in gbrt.staged_predict(X_test)]
bst_n_estimators = np.argmin(errors) + 1

gbrt_best = GradientBoostingRegressor(max_depth=2, n_estimators=bst_n_estimators)
gbrt_best.fit(X_train, y_train)

In [24]:
y_pred = gbrt_best.predict(X_test)
mse = mean_squared_error(y_test, y_pred)


In [25]:
mse

0.3252069149576089

In [26]:
r2_score(y_test, y_pred)

0.7542303013009619

**Using GradientBoosting with warm_start=True**

In [27]:
gbrt = GradientBoostingRegressor(max_depth=2, warm_start=True)

min_val_error = float("inf")
error_going_up = 0
for n_estimators in range(1, 120):
  gbrt.n_estimators = n_estimators
  gbrt.fit(X_train, y_train)
  y_pred = gbrt.predict(X_test)
  val_error = mean_squared_error(y_test, y_pred)
  if val_error < min_val_error:
    min_val_error = val_error
    error_going_up = 0
  else:
    error_going_up += 1
    if error_going_up == 5:
      break

In [28]:
val_error

0.32630697682059434

# Using XGBoost

In [29]:
import xgboost

xgb_reg = xgboost.XGBRegressor()
xgb_reg.fit(X_train, y_train)
y_pred = xgb_reg.predict(X_test)


In [30]:
mse = mean_squared_error(y_test, y_pred)
mse

0.2161409990991321

# Exercie Solution

In [31]:
from sklearn.datasets import fetch_openml

mnist = fetch_openml('mnist_784', version=1, as_frame=False)

X, y = mnist.data, mnist.target

print(X.shape, y.shape)

(70000, 784) (70000,)


In [32]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state=42)

In [33]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.svm import SVC

rnd_clf = RandomForestClassifier(n_estimators=100, random_state=42)
ext_clf = ExtraTreeClassifier(random_state=42)
svm_clf = SVC(random_state=42)

rnd_clf.fit(X_train, y_train)
ext_clf.fit(X_train, y_train)
svm_clf.fit(X_train, y_train)


In [34]:
from sklearn.ensemble import VotingClassifier

voting_clf = VotingClassifier(
    estimators=[
        ('rnd', rnd_clf),
        ('ext', ext_clf),
        ('svm', svm_clf)
    ],
    voting='hard'
)

voting_clf.fit(X_train, y_train)

In [36]:
y_pred_rnd = rnd_clf.predict(X_test)
y_pred_ext = ext_clf.predict(X_test)
y_pred_svm = svm_clf.predict(X_test)

In [37]:
accuracy_score(y_test, y_pred_rnd)

0.9648571428571429

In [38]:
accuracy_score(y_test, y_pred_ext)

0.8257142857142857

In [39]:
accuracy_score(y_test, y_pred_svm)

0.975

In [41]:
accuracy_score(y_test, voting_clf.predict(X_test))

0.9688571428571429