<a href="https://colab.research.google.com/github/rajdeepbanerjee-git/JNCLectures_Intro_to_ML/blob/main/Week11/Bagging_2025.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

In [5]:
# Read the housing CSV into a DataFrame. The path is absolute, so the file must
# already exist at /content/housing.csv (presumably uploaded to the Colab
# session -- confirm when running in another environment).
housing_df = pd.read_csv("/content/housing.csv")
# Print the column names, non-null counts and dtypes of the loaded frame.
housing_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [7]:
def preprocess_data(df):
    """Clean the housing frame and return numeric feature/target arrays.

    Rows containing any missing value are dropped, ``ocean_proximity`` is
    one-hot encoded, and ``median_house_value`` is split off as the target.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``ocean_proximity`` and
        ``median_house_value``. The input frame is not modified.

    Returns
    -------
    X : numpy.ndarray of float64
        One row per remaining record; the non-target columns followed by the
        one-hot columns. Raises if a remaining column cannot be cast to float.
    y : numpy.ndarray
        The ``median_house_value`` column for the same rows.
    """
    df = df.dropna()
    # dtype=float: recent pandas versions emit bool dummy columns by default,
    # and a mixed float/bool frame converts to an object-dtype array.
    df = pd.get_dummies(df, columns=['ocean_proximity'], dtype=float)
    X = df.drop(columns='median_house_value').to_numpy(dtype=float)
    y = df['median_house_value'].to_numpy()
    return X, y

# Build the feature matrix and target vector, then report their shapes
# (one shape per line).
X, y = preprocess_data(housing_df)
print(X.shape, y.shape, sep="\n")

(20433, 13)
(20433,)


In [21]:
# Ensemble size and the seed shared by the bootstrap sampler and the trees.
n_trees, random_state = 10, 42
# One seeded generator drives every bootstrap draw, so a re-run reproduces them.
rng = np.random.default_rng(seed=random_state)

def create_bootstrap_samples(X, n_trees, sample_size, rng):
    """Draw bootstrap row indices for each tree plus the rows each draw missed.

    For every tree, ``sample_size`` row indices are drawn with replacement
    from ``range(X.shape[0])`` using ``rng.integers``. Rows that were never
    drawn for a tree form that tree's out-of-bag (OOB) set.

    Returns
    -------
    bootstrap_indices : list
        One array of drawn indices per tree, in draw order (repeats kept).
    oob_indices : list of list of int
        One ascending list of never-drawn row indices per tree.
    """
    total_rows = X.shape[0]
    drawn_per_tree, missed_per_tree = [], []

    for _ in range(n_trees):
        drawn = rng.integers(0, total_rows, size=sample_size)
        # Start with every row flagged as OOB and clear the flag of each drawn
        # row; the rows still flagged afterwards were never sampled.
        still_oob = np.ones(total_rows, dtype=bool)
        still_oob[drawn] = False
        drawn_per_tree.append(drawn)
        missed_per_tree.append(np.flatnonzero(still_oob).tolist())

    return drawn_per_tree, missed_per_tree

# Draw the bootstrap samples: each one holds 80% as many rows as the data set.
sample_size = int(0.8 * X.shape[0])
bootstrap_indices, oob_indices = create_bootstrap_samples(
    X, n_trees=n_trees, sample_size=sample_size, rng=rng
)

In [22]:
def train_trees(X, y, bootstrap_indices, random_state=42):
    """Fit one DecisionTreeRegressor per bootstrap sample.

    Each entry of ``bootstrap_indices`` selects the rows of ``X`` and ``y``
    that one tree is fitted on. Every tree is constructed with the same
    ``random_state``. The fitted trees are returned as a list, in the same
    order as ``bootstrap_indices``.
    """
    def _fit_on(sample_idx):
        # Build a fresh tree and fit it on the rows chosen for this sample.
        model = DecisionTreeRegressor(random_state=random_state)
        model.fit(X[sample_idx], y[sample_idx])
        return model

    return [_fit_on(sample_idx) for sample_idx in bootstrap_indices]

# Fit the ensemble: one tree per bootstrap sample, all sharing the same seed.
trees = train_trees(X, y, bootstrap_indices, random_state=random_state)

In [26]:
def compute_oob_predictions(X, y, trees, oob_indices):
    """Average each sample's out-of-bag (OOB) predictions over the ensemble.

    A sample is OOB for a tree when its row index appears in that tree's
    entry of ``oob_indices``. Each tree predicts all of its OOB rows in one
    batched ``predict`` call instead of one call per row.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix; rows are selected with integer index arrays.
    y : array-like
        Targets aligned with the rows of ``X``.
    trees : sequence
        Fitted models exposing ``predict``; assumed to return one value per
        input row (single-output regression).
    oob_indices : sequence
        One sequence of integer row indices per tree, paired with ``trees``
        by position.

    Returns
    -------
    final_preds : list of float
        Mean OOB prediction of every sample that was OOB for at least one
        tree, in ascending row order.
    actuals : list
        The matching entries of ``y``, in the same order.
    """
    n_samples = X.shape[0]
    oob_sums = np.zeros(n_samples, dtype=float)    # sum of OOB predictions per row
    oob_counts = np.zeros(n_samples, dtype=np.intp)  # number of trees a row was OOB for

    for tree, oob in zip(trees, oob_indices):  # can you tell me why zip is used here?
        oob = np.asarray(oob, dtype=np.intp)
        if oob.size == 0:
            # Nothing was left out for this tree; avoid predicting on zero rows.
            continue
        preds = np.asarray(tree.predict(X[oob]), dtype=float)
        # np.add.at accumulates correctly even if an index is repeated.
        np.add.at(oob_sums, oob, preds)
        np.add.at(oob_counts, oob, 1)

    # Rows that were never OOB have no prediction to average and are skipped.
    was_oob = oob_counts > 0
    final_preds = (oob_sums[was_oob] / oob_counts[was_oob]).tolist()
    actuals = np.asarray(y)[was_oob].tolist()
    return final_preds, actuals

# Averaged OOB predictions, paired with the true targets of the same rows.
final_preds, actuals = compute_oob_predictions(X, y, trees, oob_indices)

# OOB error: mean squared error first, then its square root (RMSE).
oob_mse = mean_squared_error(y_true=actuals, y_pred=final_preds)
oob_rmse = np.sqrt(oob_mse)
print(f"Final OOB MSE and RMSE: {oob_mse}, {oob_rmse}")

Final OOB MSE and RMSE: 2909372302.0077424, 53938.59751613627


In [27]:
# Baseline for comparison with the OOB error: a single decision tree scored on
# a held-out test set (80-20 split). train_test_split is already imported in
# the notebook's import cell, so it is not re-imported here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

# Train a single decision tree on the training set
single_tree = DecisionTreeRegressor(random_state=random_state)
single_tree.fit(X_train, y_train)

# Predict on the test set
y_pred_test = single_tree.predict(X_test)

# Compute test MSE and RMSE for the single decision tree
single_tree_test_mse = mean_squared_error(y_test, y_pred_test)
single_tree_test_rmse = np.sqrt(single_tree_test_mse)
print(f"Single Decision Tree MSE and RMSE on test set: {single_tree_test_mse}, {single_tree_test_rmse} ")

Single Decision Tree MSE and RMSE on test set: 4777874266.550771, 69122.16913950813 


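1. Now re-run the code above with a smaller bootstrap sample for each tree (`len(X) // n` rows instead of `0.8 * len(X)`, for some integer `n > 1`) and see what happens to the OOB error.
2. Now increase the number of trees by the same factor `n` (that is, use `n * n_trees` trees). Does anything change?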