In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

# Import all required models
from sklearn.linear_model import LinearRegression, Ridge 
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor 
from sklearn.ensemble import BaggingRegressor, GradientBoostingRegressor, StackingRegressor


# Summary of the Story 

- Part A: My baseline DecisionTree was too simple (high bias) and underfit the data.


- Part B (Bagging): I tried to fix it with Bagging, but it failed. This proves Bagging is for variance reduction, not bias reduction.
- Part B (Boosting): I then tried Boosting, which is a bias-reduction technique, and it worked spectacularly, cutting the error in half.


- Part C (Stacking): My Stacking model performed almost identically to the Boosting model, as its other components were too weak to add value, proving that a stack is often only as strong as its best base learner.


- Final (Part D): Therefore, the Gradient Boosting Regressor was the best-performing model, as this problem was one of high bias, not high variance.

# Prepare the Data

In [2]:

# --- 1. Data Loading and Preprocessing ---

# Load the hourly dataset
try:
    df = pd.read_csv("hour.csv")
except FileNotFoundError:
    print("Error: 'hour.csv' not found. Please ensure the file is in the same directory.")
    exit()

# Feature Engineering per assignment
df = df.drop(columns=['instant', 'dteday', 'casual', 'registered'])
TARGET = 'cnt'
categorical_features = ['season', 'yr', 'mnth', 'hr', 'holiday', 'weekday', 'workingday', 'weathersit']
numerical_features = ['temp', 'atemp', 'hum', 'windspeed']

X = df.drop(columns=[TARGET])
y = df[TARGET]


In [3]:

# --- 2. Time-Series Train-Test Split ---
test_size = 0.2
test_split_index = int(len(X) * (1 - test_size))

X_train = X.iloc[:test_split_index]
y_train = y.iloc[:test_split_index]
X_test = X.iloc[test_split_index:]
y_test = y.iloc[test_split_index:]

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")


Training set size: 13903
Test set size: 3476


In [4]:

# --- 3. Create Preprocessing Pipeline ---
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# preform one hot encoding on categorical features

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='passthrough'
)


# Baseline Models

In [5]:

# --- 4. Part A: Baseline Models ---
print(f"\n--- Part A: Baseline Models ---")

# Baseline 1: Linear Regression
pipeline_lr = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('model', LinearRegression())
])
pipeline_lr.fit(X_train, y_train)
preds_lr = pipeline_lr.predict(X_test)
rmse_lr = np.sqrt(mean_squared_error(y_test, preds_lr))
print(f"Baseline Linear Regression RMSE: {rmse_lr:.4f}")

# Baseline 2: Decision Tree Regressor
# max_depth is 6 per assignment spec 
pipeline_dt = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('model', DecisionTreeRegressor(max_depth=6, random_state=42))
])
pipeline_dt.fit(X_train, y_train)
preds_dt = pipeline_dt.predict(X_test)
rmse_dt = np.sqrt(mean_squared_error(y_test, preds_dt))
print(f"Baseline Decision Tree (max_depth=6) RMSE: {rmse_dt:.4f}")

if rmse_dt < rmse_lr:
    best_baseline_rmse = rmse_dt
    print(f"Best Baseline Model: Decision Tree (RMSE: {best_baseline_rmse:.4f})")
else:
    best_baseline_rmse = rmse_lr
    print(f"Best Baseline Model: Linear Regression (RMSE: {best_baseline_rmse:.4f})")




--- Part A: Baseline Models ---
Baseline Linear Regression RMSE: 133.8354
Baseline Decision Tree (max_depth=6) RMSE: 158.5538
Best Baseline Model: Linear Regression (RMSE: 133.8354)


# Bagging and Boosting

In [11]:

# --- 5. Part B: Bagging and Boosting ---
print(f"\n--- Part B: Ensemble Models ---")

# Bagging Regressor
# CORRECTED: estimator now uses the baseline DT (max_depth=6) 
pipeline_bagging = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('model', BaggingRegressor(
        estimator=DecisionTreeRegressor(max_depth=6, random_state=42), 
        n_estimators= 250, 
        random_state=42, 
        n_jobs=-1
    ))
])
pipeline_bagging.fit(X_train, y_train)
preds_bagging = pipeline_bagging.predict(X_test)
rmse_bagging = np.sqrt(mean_squared_error(y_test, preds_bagging))
print(f"Bagging Regressor (using max_depth=6 trees) RMSE: {rmse_bagging:.4f}")

# Gradient Boosting Regressor
# TUNED: Improved hyperparameters for a stronger model
pipeline_boosting = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('model', GradientBoostingRegressor(
        n_estimators=300, 
        learning_rate=0.05, 
        max_depth=7, 
        random_state=42
    ))
])
pipeline_boosting.fit(X_train, y_train)
preds_boosting = pipeline_boosting.predict(X_test)
rmse_boosting = np.sqrt(mean_squared_error(y_test, preds_boosting))
print(f"Gradient Boosting (Tuned) Regressor RMSE: {rmse_boosting:.4f}")



--- Part B: Ensemble Models ---
Bagging Regressor (using max_depth=6 trees) RMSE: 154.9973
Gradient Boosting (Tuned) Regressor RMSE: 82.3893


>>Bagging vs. Single Decision Tree (Variance Reduction)

__Baseline Decision Tree (max_depth=6) RMSE: 158__

__Bagging Regressor (with max_depth=6 trees) RMSE: 154__


The bagging technique was not effective in this scenario. The RMSE improved by a negligible amount (158 â†’ 154).

This result perfectly demonstrates the core concept of bagging: it is a variance-reduction technique.  It works by averaging the predictions of many diverse models, which is excellent for smoothing out high-variance models (like deep, overfit decision trees).

However, the assignment required a max_depth=6 tree, which, for this complex dataset, is a high-bias (underfit) model. It's too simple to capture the data's patterns. Bagging a collection of models that are all systematically wrong in the same way (i.e., they all have high bias) does not fix the underlying bias. We simply get an average of many poor predictions. 

__It is highly likely that the bagging models performance would improve if we used overfit descisison trees (depth 10-15 )  as the base models for the bagging classifier__

>> Boosting vs. Baseline & Bagging (Bias Reduction)

__Gradient Boosting Regressor RMSE: 82.3893__

__Best Baseline (Linear Regression) RMSE: 133.8354__

Bagging Regressor RMSE: 155.4459

Thus, the GradientBoostingRegressor (RMSE: 82.39) achieved a dramatically better result than both the best single model (Linear Regression, RMSE: 133.84) and the Bagging ensemble (RMSE: 155.45).

This strongly supports the hypothesis that boosting targets bias reduction. While Bagging trains models independently, Boosting is a sequential process. Each new tree is explicitly trained to correct the errors (the residuals or bias) of the previous trees. It effectively turns a collection of "weak learners" (like the shallow trees) into a single, powerful "strong learner" by systematically attacking the model's bias. This is why it performed so well on this high-bias problem.

# Stacking 

In [None]:


# --- 6. Part C: Stacking ---
print(f"\n--- Part C: Stacking Model ---")

# Define the Level-0 (Base) Learners
# We must create a separate pipeline for KNN to include preprocessing
pipeline_knn = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('model', KNeighborsRegressor(n_neighbors= 10)) # Using k=10 as a default
])

# Note: The Bagging and Boosting pipelines already include preprocessing.
# When stacking pipelines, StackingRegressor will call fit/predict on them,
# so the base learners *are* the full pipelines.
base_learners = [
    ('knn', pipeline_knn),
    ('bagging', pipeline_bagging),
    ('boosting', pipeline_boosting)
]

# Define the Level-1 (Meta) Learner
meta_learner = Ridge()

# Create the Stacking Regressor
# Because the base learners are now pipelines,I  didn't  put the stacking regressor inside another pipeline & 
# set passthrough=True so the meta-learner gets the predictions from the base pipelines directly.
stacking_regressor = StackingRegressor(
    estimators=base_learners,
    final_estimator=meta_learner,
    cv=5,
    n_jobs=-1,
    passthrough=True 
)

# Train and evaluate Stacking
# fit the StackingRegressor directly on the raw X_train, y_train
# It will handle preprocessing internally via its base-learner pipelines
stacking_regressor.fit(X_train, y_train)
preds_stacking = stacking_regressor.predict(X_test)
rmse_stacking = np.sqrt(mean_squared_error(y_test, preds_stacking))
print(f"Stacking Regressor RMSE: {rmse_stacking:.4f}")




--- Part C: Stacking Model ---
Stacking Regressor RMSE: 83.0934


>> Principle of Stacking 

Stacking (or Stacked Generalization) is a two-level ensemble method designed to combine the strengths of multiple diverse models. 


- Level 0 (Base Learners): A set of different models (e.g., KNN, Bagging, Boosting, as required by the assignment PS) are trained on the main training dataset

- Level 1 (Meta-Learner): Instead of using the original data, a new model (the "meta-learner," in this case, Ridge Regression ) is trained. Its input features are the predictions generated by the Level 0 models

# Final Analysis

The meta-learner's job is to learn the optimal way to combine the base learners' predictions. It learns from the data which models to "trust" in which situations. For example, it might learn that the Boosting model is highly accurate but tends to over-predict at peak hours, and it can use the KNN model's prediction to help correct for that. By learning the strengths, weaknesses, and correlations between its base learners, the meta-learner creates a final prediction that is (ideally) more accurate than any single one of its components

In [18]:
# --- 7. Part D: Final Analysis (Store results) ---
print( "\n--- Part D: Final Results ---")
print("Model RMSE Comparison:")
print(f"1. Baseline (Linear Regression): {rmse_lr:.4f}" )
print(f"2. Baseline (Decision Tree):      {rmse_dt:.4f}")
print(f"3. Bagging Regressor:             {rmse_bagging:.4f}")
print(f"4. Gradient Boosting Regressor:   {rmse_boosting:.4f}")
print(f"5. Stacking Regressor:            {rmse_stacking:.4f}")




--- Part D: Final Results ---
Model RMSE Comparison:
1. Baseline (Linear Regression): 133.8354
2. Baseline (Decision Tree):      158.5538
3. Bagging Regressor:             154.9973
4. Gradient Boosting Regressor:   82.3893
5. Stacking Regressor:            83.0934


In [None]:
# --- Save results to CSV ---
try:
    results = {
        "model": [
            "Baseline (Linear Regression)",
            "Baseline (Decision Tree)",
            "Bagging Regressor",
            "Gradient Boosting Regressor",
            "Stacking Regressor",
        ],
        "rmse": [rmse_lr, rmse_dt, rmse_bagging, rmse_boosting, rmse_stacking],
    }
    df_results = pd.DataFrame(results)
    # round RMSE to 4 decimals for readability
    df_results["rmse"] = df_results["rmse"].map(lambda x: round(float(x), 4))

    output_path = "model_rmse_results.csv"
    df_results.to_csv(output_path, index=False)
    print(f"Saved RMSE results to '{output_path}'")
except NameError as e:
    print("Could not save RMSE results: some RMSE variable is not defined:", e)


Saved RMSE results to 'model_rmse_results.csv'


>> Best Model and Final Conclusion

__Best-Performing Model: The Gradient Boosting Regressor was the best-performing model, with the lowest RMSE of 82.3893. The Stacking Regressor (RMSE: 83.09) performed almost identically__

Explanation (Bias-Variance & Diversity):

The best ensemble (Gradient Boosting) massively outperformed the single model baseline (Linear Regression, RMSE: 133.84). This success is a clear lesson in the bias-variance trade-off.

1. __The Problem was High Bias__: The baseline models (Linear Regression and the shallow max_depth=6 Decision Tree) were both high-bias models. They were too simple to capture the complex, non-linear relationships in the hourly bike data. This is proven by their very high RMSE.

2. __Boosting Solved the Bias__: The GradientBoostingRegressor won because it is a bias-reduction technique. It directly and aggressively corrected the errors of the weak baseline models, resulting in a low-bias, low-variance final model.

3. __Why Stacking Didn't Win__: The Stacking model's performance was almost identical to the Boosting model's because its strongest component was the Boosting model. Its other two base learners (KNN and the high-bias Bagging model) were providing much weaker signals. The Ridge meta-learner likely learned that the safest and most accurate bet was to simply "trust" the Gradient Boosting model's prediction almost exclusively. It couldn't find a better combination, so it essentially just reproduced the result of its best member.