# General Preamble Code

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import mglearn
from IPython.display import display

# Additional Import Code for dataset C (PART 1)

In [4]:
from sklearn.datasets import fetch_california_housing

# Load the California housing data as pandas objects (as_frame=True).
california_housing = fetch_california_housing(as_frame=True)

# A bare expression in the middle of a cell is evaluated and then discarded,
# so the preview must go through display() to actually be rendered.
display(california_housing.frame.head())

# Feature table and regression target used throughout Part 1.
X_1 = california_housing.data
y_1 = california_housing.target

# Additional Import Code for dataset W (PART 2)

In [5]:
from ucimlrepo import fetch_ucirepo

# Fetch UCI ML Repository dataset id=186 and keep it under the name wine_quality.
wine_quality = fetch_ucirepo(id=186)

# Feature table and the 'quality' target column used in Part 2.
X_2 = wine_quality.data.features
y_2 = wine_quality.data.targets["quality"]

# Part 1

## Question 1: Baseline Model: Train a RandomForestRegressor with n_estimators=100. What is the R-squared (R2) score on the test set?

In [6]:
print("############ Assignment 5 PART 1 Question 1 BEGIN ############")
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Hold out 25% of the rows as a test set; the fixed seed keeps the split reproducible.
X_1_train, X_1_test, y_1_train, y_1_test = train_test_split(
    X_1,
    y_1,
    test_size=0.25,
    random_state=0,
)

# Baseline forest with 100 trees, fitted on the training split.
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=0)
rf_regressor.fit(X_1_train, y_1_train)

# Score the baseline on the held-out rows.
y_1_pred = rf_regressor.predict(X_1_test)
r2_q1 = r2_score(y_1_test, y_1_pred)

print("RandomForestRegressor with n_estimators=100")
print(f"R-squared (test set): {r2_q1:.4f}")
print("############ Assignment 5 PART 1 Question 1 END ############")

############ Assignment 5 PART 1 Question 1 BEGIN ############
RandomForestRegressor with n_estimators=100
R-squared (test set): 0.7942
############ Assignment 5 PART 1 Question 1 END ############


## Question 2: Number of Trees: The n_estimators parameter defines the number of trees in the forest. Train two additional models: one with n_estimators=5 and another with n_estimators=500. How does the R2 score change as the number of trees increases? Why does a random forest generally not overfit by simply adding more trees?

In [7]:
print("############ Assignment 5 PART 1 Question 2 BEGIN ############")
# Fit a very small forest (5 trees) and a large one (500 trees) on the same
# training split and seed as the 100-tree baseline, so only n_estimators differs.
rf_regressor_5 = RandomForestRegressor(n_estimators=5, random_state=0)
rf_regressor_5.fit(X_1_train, y_1_train)

rf_regressor_500 = RandomForestRegressor(n_estimators=500, random_state=0)
rf_regressor_500.fit(X_1_train, y_1_train)

# Test-set R2 for each forest size.
y_1_pred_5 = rf_regressor_5.predict(X_1_test)
r2_q2_5 = r2_score(y_1_test, y_1_pred_5)
print("RandomForestRegressor with n_estimators=5")
print(f"R-squared (test set): {r2_q2_5:.4f}")

y_1_pred_500 = rf_regressor_500.predict(X_1_test)
r2_q2_500 = r2_score(y_1_test, y_1_pred_500)
print("\nRandomForestRegressor with n_estimators=500")
print(f"R-squared (test set): {r2_q2_500:.4f}")

# Report the scores that were actually computed instead of hard-coding them,
# so the sentence cannot go stale when the data, split or seed changes.
print(
    "We see that as the number of trees increases, the R2 score increases "
    f"({r2_q2_5:.4f} -> {r2_q2_500:.4f} when going from n_estimators=5 -> 500) "
)

# Why does random forest not overfit?
# TODO(review): the answer below explains why individual trees differ from one
# another; it should also state that the forest *averages* independently trained
# trees, so adding trees lowers the variance of the average rather than making
# the model more complex.
print("\nRandom forests generally don't overfit from adding more trees because each tree only looks at part of the data whether it be through looking at some of the observations (bootstrapping) or some of the features (max_features). So, each individual tree doesn't just memorize the entire training dataset. They each learn a different part of it, allowing their outputs to better generalize")
print("############ Assignment 5 PART 1 Question 2 END ############")

############ Assignment 5 PART 1 Question 2 BEGIN ############
RandomForestRegressor with n_estimators=5
R-squared (test set): 0.7578

RandomForestRegressor with n_estimators=500
R-squared (test set): 0.7955
We see that as the number of trees increases, the R2 score increases (0.7578 -> 0.7955 when going from n_estimators=5 -> 500) 

Random forests generally don't overfit from adding more trees because each tree only looks at part of the data whether it be through looking at some of the observations (bootstrapping) or some of the features (max_features). So, each individual tree doesn't just memorize the entire training dataset. They each learn a different part of it, allowing their outputs to better generalize
############ Assignment 5 PART 1 Question 2 END ############


## Question 3: Feature Sub-sampling: The power of random forests comes from decorrelating the trees. The max_features parameter controls this. Train a model with max_features=0.5 (each split considers a random 50% of the features) and compare its performance to a model with max_features=None (every split considers all features, which is equivalent to a Bagging Regressor). Which performs better, and why does this feature sub-sampling often lead to a more robust model?

In [None]:
print("############ Assignment 5 PART 1 Question 3 BEGIN ############")
# Forest with feature sub-sampling (max_features=0.5).
rf_regressor_50f = RandomForestRegressor(max_features=0.5, random_state=0)
rf_regressor_50f.fit(X_1_train, y_1_train)
y_1_pred_50f = rf_regressor_50f.predict(X_1_test)
r2_q3_50f = r2_score(y_1_test, y_1_pred_50f)

# Forest without feature sub-sampling (max_features=None);
# essentially the same model as from question 1.
rf_regressor_nonef = RandomForestRegressor(max_features=None, random_state=0)
rf_regressor_nonef.fit(X_1_train, y_1_train)
y_1_pred_nonef = rf_regressor_nonef.predict(X_1_test)
r2_q3_nonef = r2_score(y_1_test, y_1_pred_nonef)

# Side-by-side test-set scores.
print("RandomForestRegressor with max_features=0.5")
print(f"R-squared (test set): {r2_q3_50f:.4f}")
print("\nRandomForestRegressor with max_features=None")
print(f"R-squared (test set): {r2_q3_nonef:.4f}")

print("Based on R-squared, the model that performed better was the RandomForestRegressor where max_features=0.5.")

# Why feature sub-sampling tends to give a more robust model (WIP).
# Adjacent string literals inside the parentheses are concatenated by Python,
# so no line-continuation backslashes are needed.
print(
    "\nFeature sub-sampling often leads to a more robust model because it reduces correlation among individual trees in the forest. "
    "When trees are less correlated, their errors cancel each other out when aggregating predictions, leading to improved generalization performance. "
    "Also by considering different subsets of features at each split, the model can capture diverse patterns in the data which helps in reducing overfitting"
)
print("############ Assignment 5 PART 1 Question 3 END ############")

############ Assignment 5 PART 1 Question 3 BEGIN ############
RandomForestRegressor with max_features=0.5
R-squared (test set): 0.8073

RandomForestRegressor with max_features=None
R-squared (test set): 0.7942
Based on R-squared, the model that performed better was the RandomForestRegressor where max_features=0.5.

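Feature sub-sampling often leads to a more robust model because it reduces correlation among individual trees in the forest. When trees are less correlated, their errors cancel each other out when aggregating predictions, leading to improved generalization performance. Also by considering different subsets of features at each split, the model can capture diverse patterns in the data which helps in reducing overfitting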
############ Assignment 5 PART 1 Question 3 END ############


## Question 4: Comparison to a Single Tree: Train a single DecisionTreeRegressor with no depth constraints on the same data. How does its R2 score compare to your best RandomForestRegressor? Explain conceptually why an ensemble of trees (Random Forest) typically outperforms a single, complex tree.

In [9]:
print("############ Assignment 5 PART 1 Question 4 BEGIN ############")
# A single, unconstrained decision tree fitted on the same training split.
from sklearn.tree import DecisionTreeRegressor
dt_regressor = DecisionTreeRegressor(random_state=0)
dt_regressor.fit(X_1_train, y_1_train)

# Test-set R2 of the lone tree.
y_1_pred_dt = dt_regressor.predict(X_1_test)
r2_q4 = r2_score(y_1_test, y_1_pred_dt)
print("Single DecisionTreeRegressor")
print(f"R-squared (test set): {r2_q4:.4f}")

# Best score among every forest trained in questions 1-3.
forest_scores = [r2_q1, r2_q2_5, r2_q2_500, r2_q3_50f, r2_q3_nonef]
best_r2 = max(forest_scores)
print("\nBest RandomForestRegressor")
print(f"R-squared (test set): {best_r2:.4f}")

# Conceptual answer: ensemble vs. single tree (WIP; double-check this explanation).
print("The ensemble outperformed the single tree because a single tree will grow too complex and overfit to the data. By creating multiple different trees from the same dataset, we can reduce overfitting and reduce variance, improving our model's performance.")
print("############ Assignment 5 PART 1 Question 4 END ############")

############ Assignment 5 PART 1 Question 4 BEGIN ############
Single DecisionTreeRegressor
R-squared (test set): 0.5838

Best RandomForestRegressor
R-squared (test set): 0.8073
The ensemble outperformed the single tree because a single tree will grow too complex and overfit to the data. By creating multiple different trees from the same dataset, we can reduce overfitting and reduce variance, improving our model's performance.
############ Assignment 5 PART 1 Question 4 END ############


# Part 2

## Question 1: Boosting: Train a GradientBoostingClassifier with n_estimators=100 on the combined wine dataset. Report its accuracy on the test set.

In [12]:
print("############ Assignment 5 PART 2 Question 1 BEGIN ############")
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Reuse the features/target already extracted in the dataset-W cell instead of
# pulling them out of wine_quality a second time.
X = X_2
y = y_2

# Binarize the target: quality >= 7 becomes class 1, everything else class 0.
y_binary = (y >= 7).astype(int)

# 70/30 split, stratified on the binary label so both splits keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_binary, test_size=0.3, random_state=42, stratify=y_binary
)

# Boosted ensemble with 100 trees.
gb_classifier = GradientBoostingClassifier(n_estimators=100, random_state=0)
gb_classifier.fit(X_train, y_train)

# Accuracy on the held-out rows.
y_pred = gb_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"accuracy: {accuracy:.4f}")
print("############ Assignment 5 PART 2 Question 1 END ############")

############ Assignment 5 PART 2 Question 1 BEGIN ############
accuracy: 0.8441
############ Assignment 5 PART 2 Question 1 END ############


## Question 2: Boosting vs. Bagging: How does the accuracy of the GradientBoostingClassifier compare to a RandomForestClassifier (a bagging method) with the same n_estimators? Explain the fundamental difference in how boosting and bagging build their sequential vs. parallel ensembles.

In [None]:
print("############ Assignment 5 PART 2 Question 2 BEGIN ############")
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Bagging counterpart of gb_classifier: same number of trees and same seed,
# trained and scored on the same split as Part 2 Question 1.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=0)
rf_classifier.fit(X_train, y_train)
rf_accuracy = accuracy_score(y_test, rf_classifier.predict(X_test))

# Re-score the boosted model here so the comparison does not depend on a
# variable that a later cell might have overwritten.
gb_accuracy = accuracy_score(y_test, gb_classifier.predict(X_test))

print(f"GradientBoostingClassifier accuracy (test set): {gb_accuracy:.4f}")
print(f"RandomForestClassifier accuracy (test set): {rf_accuracy:.4f}")

# State the outcome from the computed numbers rather than hard-coding it.
if rf_accuracy > gb_accuracy:
    print(f"\nThe random forest is more accurate by {rf_accuracy - gb_accuracy:.4f}.")
elif rf_accuracy < gb_accuracy:
    print(f"\nGradient boosting is more accurate by {gb_accuracy - rf_accuracy:.4f}.")
else:
    print("\nBoth models reach the same test accuracy.")

print(
    "\nBagging (random forest) trains every tree independently on its own bootstrap sample, "
    "so the trees can be built in parallel, and then combines them by voting/averaging, which mainly reduces variance. "
    "Boosting builds the trees sequentially: each new tree is fit to the errors left by the ensemble built so far, "
    "so every tree depends on the previous ones and the ensemble mainly reduces bias."
)
print("############ Assignment 5 PART 2 Question 2 END ############")

## Question 3: Hard Voting Ensemble (Stacking): Create a VotingClassifier that combines three different base models: a LogisticRegression(max_iter=2000), a DecisionTreeClassifier(max_depth=5), and a KNeighborsClassifier(n_neighbors=7). Use the default voting='hard'.
- Report the accuracy of this ensemble model.
- Is the ensemble's accuracy higher than the accuracy of each of the three individual models run separately? Why might this be the case?


In [None]:
print("############ Assignment 5 PART 2 Question 3 BEGIN ############")
from sklearn.base import clone
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# The three base models requested by the question. The tree gets a fixed seed
# so that re-running the cell reproduces the same numbers.
base_models = [
    ("logistic_regression", LogisticRegression(max_iter=2000)),
    ("decision_tree", DecisionTreeClassifier(max_depth=5, random_state=0)),
    ("knn", KNeighborsClassifier(n_neighbors=7)),
]

# Hard voting (the default): each base model casts one vote per sample.
voting_hard = VotingClassifier(estimators=base_models, voting="hard")
voting_hard.fit(X_train, y_train)
voting_hard_accuracy = accuracy_score(y_test, voting_hard.predict(X_test))
print(f"Hard-voting ensemble accuracy (test set): {voting_hard_accuracy:.4f}")

# Fit a fresh clone of each base model on its own so it can be compared with the ensemble.
individual_accuracy = {}
for model_name, model in base_models:
    fitted_model = clone(model).fit(X_train, y_train)
    individual_accuracy[model_name] = accuracy_score(y_test, fitted_model.predict(X_test))
    print(f"{model_name} accuracy (test set): {individual_accuracy[model_name]:.4f}")

# Derive the conclusion from the computed scores.
best_name, best_individual_accuracy = max(individual_accuracy.items(), key=lambda item: item[1])
if voting_hard_accuracy > best_individual_accuracy:
    print("\nThe hard-voting ensemble is more accurate than each of the three individual models.")
else:
    print(
        "\nThe hard-voting ensemble is not more accurate than the best individual model "
        f"({best_name}: {best_individual_accuracy:.4f})."
    )

print(
    "\nMajority voting helps when the base models make different mistakes, because a wrong model can be outvoted by the other two. "
    "It is not guaranteed to beat the best single model: if the models tend to be wrong on the same samples, "
    "or one model is clearly stronger than the others, the weaker models can outvote the correct prediction."
)
print("############ Assignment 5 PART 2 Question 3 END ############")

## Question 4: Soft Voting Ensemble: Change the voting parameter in your VotingClassifier to 'soft'. This requires all estimators to have a predict_proba method. How does the accuracy of soft voting compare to hard voting? Explain the mechanical difference between these two voting strategies.


In [None]:
print("############ Assignment 5 PART 2 Question 4 BEGIN ############")
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier


def make_voting_classifier(voting):
    """Return an unfitted VotingClassifier over the three base models.

    Parameters
    ----------
    voting : str
        Either "hard" or "soft"; passed straight to VotingClassifier.
        All three base models expose predict_proba, which soft voting requires.
    """
    return VotingClassifier(
        estimators=[
            ("logistic_regression", LogisticRegression(max_iter=2000)),
            ("decision_tree", DecisionTreeClassifier(max_depth=5, random_state=0)),
            ("knn", KNeighborsClassifier(n_neighbors=7)),
        ],
        voting=voting,
    )


# Train both variants in this cell so the comparison is self-contained.
voting_accuracy = {}
for voting_mode in ("hard", "soft"):
    voting_model = make_voting_classifier(voting_mode).fit(X_train, y_train)
    voting_accuracy[voting_mode] = accuracy_score(y_test, voting_model.predict(X_test))
    print(f"{voting_mode} voting accuracy (test set): {voting_accuracy[voting_mode]:.4f}")

# Derive the conclusion from the computed scores.
accuracy_gap = voting_accuracy["soft"] - voting_accuracy["hard"]
if accuracy_gap > 0:
    print(f"\nSoft voting is more accurate than hard voting by {accuracy_gap:.4f}.")
elif accuracy_gap < 0:
    print(f"\nHard voting is more accurate than soft voting by {-accuracy_gap:.4f}.")
else:
    print("\nSoft and hard voting reach the same test accuracy.")

print(
    "\nHard voting: every base model predicts a class label and the ensemble returns the label with the most votes. "
    "Soft voting: every base model returns class probabilities (predict_proba), the probabilities are averaged across models, "
    "and the ensemble returns the class with the highest average probability, so a confident model counts for more than an unsure one."
)
print("############ Assignment 5 PART 2 Question 4 END ############")