## Setup:

In [None]:
import pandas as pd
import os 
from sklearn.feature_selection import RFECV
from sklearn.model_selection import KFold, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import numpy as np

# Project root used to resolve the relative data paths below. It can be overridden
# with the HOUSING_PROJECT_ROOT environment variable so the notebook is not tied to
# a single machine; the default is the original location.
PROJECT_ROOT = os.environ.get(
    'HOUSING_PROJECT_ROOT',
    '/media/roovedot/common/VSrootWorkspace/House-Price-Predictions-with-Random-Forest-Regression-Model',
)
# Only change directory when the root actually exists; otherwise relative paths are
# resolved against the directory the notebook was started from.
if os.path.isdir(PROJECT_ROOT):
    os.chdir(PROJECT_ROOT)

# Load the training data and show column dtypes / non-null counts
housing = pd.read_csv('data/train_catH_naH_outlH.csv')
housing.info()

In [2]:
# Separate the target (SalePrice) from the candidate features.
# The target itself plus 'Unnamed: 0', 'Unnamed: 0.1' and 'Id' are excluded from X
# (the unnamed columns are presumably leftover CSV indexes - confirm against the data).
non_feature_cols = ['SalePrice', 'Unnamed: 0', 'Unnamed: 0.1', 'Id']
X = housing.drop(columns=non_feature_cols)
y = housing['SalePrice']

## Feature Selection:

### Getting Correlations

In [None]:
# Correlation of every column with the target, sorted from highest to lowest
corr_matrix = housing.corr()
housing_corr = corr_matrix['SalePrice'].sort_values(ascending=False)

# Print one "<column>: <correlation>" line per column, in sorted order
for var, corr in housing_corr.items():
    # :.3f renders the correlation with three decimals
    print(f"{var}: {corr:.3f}")

### Dropping Correlations below 0.4 Absolute Value:

In [None]:
# Drop every feature whose absolute correlation with SalePrice is below the threshold.
MIN_ABS_CORR = 0.4

# Columns with a NaN correlation compare False here, so they are kept.
weak_cols = housing_corr[housing_corr.abs() < MIN_ABS_CORR].index
# housing_corr also lists columns that were never part of X, so restrict the
# selection to the columns X actually has.
weak_cols = weak_cols.intersection(X.columns)

# Single vectorized drop; rebinding X keeps the cell safe to re-run.
X = X.drop(columns=weak_cols)

X.info()



### Recursive Feature Elimination (RFE) with Cross-Validation:

In [None]:
# Seeded base estimator and seeded, shuffled 5-fold splitter
rf = RandomForestRegressor(random_state=42)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Recursive feature elimination with cross-validation: one feature is removed per
# step and each candidate subset is scored by negative mean absolute error.
rfecv = RFECV(
    estimator=rf,
    step=1,
    cv=kf,
    scoring='neg_mean_absolute_error',
)
print("Training Model...")
rfecv.fit(X, y)  # fit the selector on the correlation-filtered features

In [None]:
# Keep only the columns RFECV selected (rfecv.support_ is a boolean mask over X's columns)
X_optimal = X.loc[:, rfecv.support_]
X_optimal.info()
# index=False: with the default, the row index is written as an extra unnamed column,
# which a plain read_csv later loads back as if it were a feature.
X_optimal.to_csv('data/optimalFeatures.csv', index=False)

## Testing the effects of feature selection:

In [None]:
# Seeded, shuffled 5-fold splitter: with a fixed random_state every cross_val_score
# call that receives this splitter is evaluated on the same folds, so scores for
# different feature sets can be compared directly.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
X_full = housing.drop(columns=['SalePrice', 'Unnamed: 0', 'Unnamed: 0.1', 'Id'])

# Model 1: Random Forest with all features (seeded so the result is reproducible)
rf_full = RandomForestRegressor(random_state=42)
# The scorer returns negative MAE; flip the sign to get MAE per fold
mae_full = -cross_val_score(rf_full, X_full, y, cv=kf, scoring='neg_mean_absolute_error')
print("Mean MAE with all features:", np.mean(mae_full))

In [4]:
# Quick setup: reload the selected features from disk instead of re-running RFECV
X_optimal = pd.read_csv('data/optimalFeatures.csv')
# A CSV written with the default index=True carries the old row index as an
# 'Unnamed: N' column; drop any such column so it is never used as a feature.
X_optimal = X_optimal.loc[:, ~X_optimal.columns.str.startswith('Unnamed')]

In [None]:
# Model 2: Random Forest with the selected optimal features.
# Seeded so repeated runs give the same score; kf is the splitter created in the
# "all features" cell above.
rf_optimal = RandomForestRegressor(random_state=42)
# The scorer returns negative MAE; flip the sign to get MAE per fold
mae_optimal = -cross_val_score(rf_optimal, X_optimal, y, cv=kf, scoring='neg_mean_absolute_error')
print("Mean MAE with optimal features:", np.mean(mae_optimal))

## HyperParameter Tuning:

### With "optimal" Features:

In [None]:
# Define the parameter grid

# INITIAL PARAM GRID (kept for reference only, not executed):
# param_grid = {
#     'n_estimators': [100, 200, 500],
#     'max_depth': [None, 10, 20, 30],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'max_features': ['auto', 'sqrt', 'log2']
# }
# NOTE: recent scikit-learn releases no longer accept max_features='auto' for
# RandomForestRegressor, so this grid must be adapted before being reused.

# Refined grid around the best values found in earlier runs
param_grid = {
    'n_estimators': [650, 700, 800, 900, 1000],
    'max_depth': [25, 30, 40, 50],
    # min_samples_split must be an int >= 2 (or a float fraction); scikit-learn
    # rejects 1, so every candidate using it would fail to fit.
    'min_samples_split': [2, 3],
    'min_samples_leaf': [1],
    'max_features': ['sqrt']
}

# Set up GridSearchCV with cross-validation
rf_tuned = RandomForestRegressor(random_state=63)  # seeded base model
grid_search = GridSearchCV(
    estimator=rf_tuned,  # model
    param_grid=param_grid,  # param grid
    cv=5,  # number of cross-validation folds
    # GridSearchCV maximizes the score, so MAE is negated: the highest
    # negative MAE corresponds to the lowest MAE.
    scoring='neg_mean_absolute_error',
    n_jobs=-1  # use all available CPU cores
)

# Run the grid search on the selected features
grid_search.fit(X_optimal, y)

# Get the best parameters and score (sign flipped back to a positive MAE)
print("Best parameters:", grid_search.best_params_)
print("Best MAE with optimal features:", -grid_search.best_score_)

**First Result:**  
Best parameters: {'max_depth': 30, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 500}  
Best MAE with optimal features: 17608.179334058535  

**Second Result:**  
Best parameters: {'max_depth': 30, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 700}  
Best MAE with optimal features: 17607.785862170535  

**Third Result:**  
Best parameters: {'max_depth': 30, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 800}  
Best MAE with optimal features: 17593.4855390538  

### With All Features:

In [None]:
# Define the parameter grid

# INITIAL PARAM GRID (kept for reference only, not executed):
# param_grid = {
#     'n_estimators': [100, 200, 500],
#     'max_depth': [None, 10, 20, 30],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'max_features': ['auto', 'sqrt', 'log2']
# }
# NOTE: recent scikit-learn releases no longer accept max_features='auto' for
# RandomForestRegressor, so this grid must be adapted before being reused.

# Refined grid around the best values found in earlier runs
param_grid = {
    'n_estimators': [850, 900, 950],
    'max_depth': [30, 35, 40],
    'min_samples_split': [5],
    'min_samples_leaf': [1],
    'max_features': ['sqrt']
}

# Set up GridSearchCV with cross-validation
# Seeded so that candidates differ only by their hyperparameters and reruns match
rf_tuned = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(
    estimator=rf_tuned,  # model
    param_grid=param_grid,  # param grid
    cv=5,  # number of cross-validation folds
    # GridSearchCV maximizes the score, so MAE is negated: the highest
    # negative MAE corresponds to the lowest MAE.
    scoring='neg_mean_absolute_error',
    n_jobs=-1  # use all available CPU cores
)

# Run the grid search on the full feature set
grid_search.fit(X_full, y)

# Get the best parameters and score (sign flipped back to a positive MAE)
print("Best parameters:", grid_search.best_params_)
# This search is fitted on X_full, so label the result accordingly
print("Best MAE with all features:", -grid_search.best_score_)

**First Result:**  
Best parameters: {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 500}  
Best MAE with all features: 16680.684353549375  

**Second Result:**  
Best parameters: {'max_depth': 30, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 900}  
Best MAE with all features: 16635.980042080417  


## Final Model:

In [31]:
# Define the model with the hyperparameters chosen from the grid search
finalModel = RandomForestRegressor(
    max_depth=30,
    max_features='sqrt',
    min_samples_leaf=1,
    min_samples_split=5,
    n_estimators=900,
    random_state=42,  # seeded so the reported score is reproducible
)

# Seeded, shuffled 5-fold cross-validation setup
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Cross-validate the model on the full feature set.
# NOTE: the scoring here is mean SQUARED error (sign flipped to be positive), not the
# MAE used during tuning, so this number is not directly comparable with the grid
# search results. cross_val_score only produces scores; finalModel itself is not
# fitted by this cell.
mse_scores = -cross_val_score(finalModel, X_full, y, cv=kf, scoring='neg_mean_squared_error')

# Print the mean MSE across folds
print("Mean Squared Error with specified parameters on full features:", np.mean(mse_scores))

Mean Squared Error with specified parameters on full features: 859448941.9232883
