In [1]:
# ============================================
# STEP 1: Import required libraries
# ============================================

# Load the Iris dataset
from sklearn.datasets import load_iris

# Split data into train and test
from sklearn.model_selection import train_test_split

# Base model (Decision Tree)
from sklearn.tree import DecisionTreeClassifier

# Bagging algorithm
from sklearn.ensemble import BaggingClassifier

# To measure performance
from sklearn.metrics import accuracy_score


# ============================================
# STEP 2: Load the Iris dataset
# ============================================

# Iris dataset contains:
# - 150 flowers
# - 4 features (sepal length, sepal width, petal length, petal width)
# - 3 classes (Setosa, Versicolor, Virginica)

# Fetch the bundled Iris data once; the arrays are read off `iris` below.
iris = load_iris()

# X -> feature matrix (the flower measurements)
# y -> target labels (flower type encoded as the numbers 0, 1, 2)
X, y = iris.data, iris.target


# ============================================
# STEP 3: Split the data
# ============================================

# We split data so the model learns from one part
# and is tested on unseen data

# Hold out 30% of the rows for testing; the fixed seed keeps the shuffle
# (and therefore the resulting split) identical on every run.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)

# train_test_split returns the pieces in this order:
# train features, test features, train labels, test labels.
X_train, X_test, y_train, y_test = split_parts


# ============================================
# STEP 4: Define the base model
# ============================================

# This is the model Bagging will COPY many times
# Decision Trees are chosen because:
# - They learn fast
# - They overfit easily
# - Bagging fixes overfitting (high variance)

# Default-configured decision tree that serves as the template for the ensemble.
base_model = DecisionTreeClassifier()


# ============================================
# STEP 5: Create the Bagging model
# ============================================

# All ensemble settings are gathered in one place so each choice is easy
# to read and to tweak.
bagging_settings = {
    # estimator: WHICH model Bagging repeats (here: the decision tree)
    "estimator": base_model,

    # n_estimators: how many trees to build -- think of 50 separate
    # decision-makers
    "n_estimators": 50,

    # bootstrap: True means rows are sampled WITH replacement, so every
    # tree sees a slightly different training set (some rows repeat,
    # some are left out)
    "bootstrap": True,

    # random_state: fixed seed so repeated runs give the same result
    "random_state": 42,
}

bagging_model = BaggingClassifier(**bagging_settings)


# ============================================
# STEP 6: Train the Bagging model
# ============================================

# What happens internally when we call fit():
# 1. 50 bootstrapped datasets are created
# 2. 50 decision trees are trained independently
# 3. All trees are stored inside bagging_model

# Train the ensemble on the training portion of the data.
bagging_model.fit(X=X_train, y=y_train)


# ============================================
# STEP 7: Make predictions
# ============================================

# Every tree in the ensemble scores each test row, and the results are
# combined into ONE label per row.
# Note: because decision trees provide predict_proba, scikit-learn averages
# the trees' class probabilities and picks the most likely class; it only
# falls back to plain majority voting for estimators without predict_proba.

y_pred_bagging = bagging_model.predict(X=X_test)


# ============================================
# STEP 8: Evaluate performance
# ============================================

# Accuracy = share of test rows whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_bagging)

bagging_label = "Bagging Accuracy on Iris Dataset:"
print(bagging_label, accuracy)


# ============================================
# IMPORTANT MEMORY RULE (READ THIS)
# ============================================

# Bagging =
# Same model (Decision Tree)
# + Different random data (bootstrapping)
# + Independent training (each tree can be fit in parallel)
# + Combined prediction (majority vote, or averaged class probabilities)
#
# Purpose: Reduce overfitting (variance)


Bagging Accuracy on Iris Dataset: 1.0


In [3]:
# Baseline: a single decision tree trained and tested on the same split,
# so its accuracy can be compared directly with the bagged ensemble's.
# The fixed seed keeps the tree identical between runs.
single_model = DecisionTreeClassifier(random_state=42)
single_model.fit(X_train, y_train)

y_pred_single = single_model.predict(X_test)

# Label uses the same "<model> Accuracy on Iris Dataset:" pattern as the
# bagging result; the colon previously sat in the middle of the phrase.
print("Single Tree Accuracy on Iris Dataset:", accuracy_score(y_test, y_pred_single))


Single Tree Accuracy:on Iris Dataset 1.0
