## 5. Model Building and Evaluation

### Import Libraries

In [None]:
import pandas as pd

from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import cross_val_score

from xgboost import XGBRegressor
import optuna

import pickle

import warnings

# Suppress every FutureWarning raised from this point on so that the
# cell outputs below are not cluttered with them
warnings.simplefilter('ignore', FutureWarning)

In [None]:
# Load the training and test datasets.
# Forward slashes are used in the relative paths because they resolve on
# Windows, Linux and macOS alike, whereas a backslash is only treated as a
# path separator on Windows.
df_train = pd.read_csv('datasets/train_dataset_reduced_features.csv')
df_test = pd.read_csv('datasets/test_dataset_reduced_features.csv')

In [None]:
# Separate the target from the features:
#   y_* is the 'Price' column, X_* is every remaining column
y_train, y_test = df_train['Price'], df_test['Price']
X_train, X_test = df_train.drop(columns='Price'), df_test.drop(columns='Price')

We can use Optuna to tune the hyperparameters of an Elastic Net model:

In [None]:
def objective(trial):
    """Optuna objective for tuning the Elastic Net model.

    Samples one hyperparameter configuration from ``trial``, evaluates an
    ``ElasticNet`` built from it with 5-fold cross-validation on the
    training data, and returns the mean R2 score across the folds.
    """
    # Define hyperparameter ranges for the Elastic Net model.
    # `suggest_float` replaces the deprecated `suggest_loguniform` and
    # `suggest_uniform` helpers; `log=True` keeps the log-scale sampling
    # for `alpha` and `tol`.
    params = {
        'alpha': trial.suggest_float('alpha', 1e-6, 10, log=True),
        'l1_ratio': trial.suggest_float('l1_ratio', 0, 1),
        'max_iter': trial.suggest_int('max_iter', 100, 1000, step=100),
        'tol': trial.suggest_float('tol', 1e-6, 1e-2, log=True),
    }

    # Train the Elastic Net model using cross-validation and return the average R2 score
    en = ElasticNet(**params)
    scores = cross_val_score(en, X_train, y_train, cv=5, scoring='r2')
    return scores.mean()

# Create the Optuna study (the objective's return value is maximised) and
# run the trials. The sampler is seeded so that repeated runs of the search
# propose the same sequence of hyperparameters.
study1 = optuna.create_study(direction='maximize',
                             sampler=optuna.samplers.TPESampler(seed=42))
study1.optimize(objective, n_trials=200)

[32m[I 2023-05-02 18:36:03,220][0m A new study created in memory with name: no-name-c771ede6-559d-42c9-9316-cfda7e1ff721[0m
[32m[I 2023-05-02 18:36:03,300][0m Trial 0 finished with value: 0.24477882325158418 and parameters: {'alpha': 9.260458340095713, 'l1_ratio': 0.446819592249442, 'max_iter': 100, 'tol': 1.722207186724377e-06}. Best is trial 0 with value: 0.24477882325158418.[0m
[32m[I 2023-05-02 18:36:03,439][0m Trial 1 finished with value: 0.24705564993162313 and parameters: {'alpha': 4.2985242745985, 'l1_ratio': 0.664364683878444, 'max_iter': 600, 'tol': 0.0013194365276477567}. Best is trial 1 with value: 0.24705564993162313.[0m
[32m[I 2023-05-02 18:36:03,582][0m Trial 2 finished with value: 0.2806748058073057 and parameters: {'alpha': 0.002029455114660197, 'l1_ratio': 0.8747826913763881, 'max_iter': 1000, 'tol': 4.609198651399722e-05}. Best is trial 2 with value: 0.2806748058073057.[0m
[32m[I 2023-05-02 18:36:03,703][0m Trial 3 finished with value: 0.281478455054526

In [None]:
# Define Elastic Net regressor with optimised hyperparameters.
# study1.best_params holds exactly the values sampled in the objective
# (alpha, l1_ratio, max_iter, tol), so it can be unpacked directly.
en = ElasticNet(**study1.best_params, random_state=42)

# Fit the regressor with the training data
en.fit(X_train, y_train)

# Make predictions on the test data
y_pred = en.predict(X_test)

# Evaluate the performance of the model using mean absolute error,
# root mean squared error and R2 score
mae = mean_absolute_error(y_test, y_pred)
# RMSE is computed as the square root of the MSE instead of passing
# `squared=False`, which was deprecated in scikit-learn 1.4 and removed in 1.6
rmse = mean_squared_error(y_test, y_pred) ** 0.5
r2 = r2_score(y_test, y_pred)

print(f'Mean Absolute Error: {mae:.2f}')
print(f'Root Mean Squared Error: {rmse:.2f}')
print(f'R2 Score: {r2:.4f}')

Mean Absolute Error: 328543.60
Root Mean Squared Error: 1001395.31
R2 Score: 0.1898


In [None]:
# Start the metric collection used for the final comparison table
# (order of values: R2, RMSE, MAE).
# The built-in round() is used rather than the `.round()` method so that this
# works for NumPy scalars and for plain Python floats, which have no `.round()`.
scores_dict = {'Elastic Net': [round(r2, 4), round(rmse, 2), round(mae, 2)]}

We can use Optuna to tune our hyperparameters for a Random Forest model:

In [None]:
# Define objective function for hyperparameter optimisation
def objective(trial):
    """Optuna objective for tuning the Random Forest model.

    Samples one hyperparameter configuration from ``trial``, evaluates a
    ``RandomForestRegressor`` built from it with 5-fold cross-validation on
    the training data, and returns the mean R2 score across the folds.
    """
    # Define hyperparameters to be optimised
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 100),
        'max_depth': trial.suggest_int('max_depth', 2, 10),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
    }

    # Create a Random Forest regressor with the given hyperparameters.
    # The seed is fixed so that differences between trials come from the
    # hyperparameters rather than from the forest's own randomness.
    model = RandomForestRegressor(**param, random_state=42)

    # Compute cross-validation scores
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')

    # Calculate the mean R2 score across cross-validation folds
    return cv_scores.mean()

# Create an Optuna study and run the trials.
# The sampler is seeded so that repeated runs of the search propose the same
# sequence of hyperparameters.
study2 = optuna.create_study(direction='maximize',  # Optimise for maximum R2 score
                             sampler=optuna.samplers.TPESampler(seed=42))
study2.optimize(objective, n_trials=200)

[32m[I 2023-05-02 18:36:26,276][0m A new study created in memory with name: no-name-0cc5ebcf-ef3b-4daf-a8e5-42f6811a8aee[0m
[32m[I 2023-05-02 18:36:26,666][0m Trial 0 finished with value: 0.4124262492150709 and parameters: {'n_estimators': 58, 'max_depth': 2, 'min_samples_split': 7, 'min_samples_leaf': 2}. Best is trial 0 with value: 0.4124262492150709.[0m
[32m[I 2023-05-02 18:36:27,456][0m Trial 1 finished with value: 0.4823040684344712 and parameters: {'n_estimators': 94, 'max_depth': 5, 'min_samples_split': 2, 'min_samples_leaf': 3}. Best is trial 1 with value: 0.4823040684344712.[0m
[32m[I 2023-05-02 18:36:28,686][0m Trial 2 finished with value: 0.47909509373174625 and parameters: {'n_estimators': 98, 'max_depth': 9, 'min_samples_split': 5, 'min_samples_leaf': 4}. Best is trial 1 with value: 0.4823040684344712.[0m
[32m[I 2023-05-02 18:36:29,200][0m Trial 3 finished with value: 0.45695041205464754 and parameters: {'n_estimators': 66, 'max_depth': 4, 'min_samples_split'

Now we can fit the model with the optimised parameters:

In [None]:
# Define random forest regressor with optimised hyperparameters.
# study2.best_params holds exactly the values sampled in the objective
# (n_estimators, max_depth, min_samples_split, min_samples_leaf), so it can
# be unpacked directly.
rf = RandomForestRegressor(**study2.best_params, random_state=42)

# Fit the regressor with the training data
rf.fit(X_train, y_train)

# Make predictions on the test data
y_pred = rf.predict(X_test)

# Evaluate the performance of the model using mean absolute error,
# root mean squared error and R2 score
mae = mean_absolute_error(y_test, y_pred)
# RMSE is computed as the square root of the MSE instead of passing
# `squared=False`, which was deprecated in scikit-learn 1.4 and removed in 1.6
rmse = mean_squared_error(y_test, y_pred) ** 0.5
r2 = r2_score(y_test, y_pred)

print(f'Mean Absolute Error: {mae:.2f}')
print(f'Root Mean Squared Error: {rmse:.2f}')
print(f'R2 Score: {r2:.4f}')

Mean Absolute Error: 179327.94
Root Mean Squared Error: 793464.51
R2 Score: 0.4913


In [None]:
# Record the Random Forest metrics (order of values: R2, RMSE, MAE).
# The built-in round() is used rather than the `.round()` method so that this
# works for NumPy scalars and for plain Python floats, which have no `.round()`.
scores_dict['Random Forest'] = [round(r2, 4), round(rmse, 2), round(mae, 2)]

In [None]:
# Objective function used by Optuna to tune the XGBoost regressor
def objective(trial):
    """Score one XGBoost configuration drawn from ``trial``.

    Returns the mean R2 over 5-fold cross-validation on the training data.
    """
    # Search space: values are requested in the same order as the keyword
    # arguments below
    n_estimators = trial.suggest_int('n_estimators', 50, 100)
    max_depth = trial.suggest_int('max_depth', 2, 10)
    learning_rate = trial.suggest_float('learning_rate', 0.001, 0.1)
    subsample = trial.suggest_float('subsample', 0.5, 1)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.5, 1)
    min_child_weight = trial.suggest_int('min_child_weight', 1, 10)

    # Build the regressor from the sampled values plus the two fixed settings
    regressor = XGBRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        min_child_weight=min_child_weight,
        objective='reg:squarederror',
        random_state=42,
    )

    # One R2 value per fold; their average is the trial's score
    fold_r2 = cross_val_score(regressor, X_train, y_train, cv=5, scoring='r2')
    return fold_r2.mean()

# Create an Optuna study and run the trials.
# The sampler is seeded so that repeated runs of the search propose the same
# sequence of hyperparameters.
study3 = optuna.create_study(direction='maximize',  # Optimise for maximum R2 score
                             sampler=optuna.samplers.TPESampler(seed=42))
study3.optimize(objective, n_trials=200)

[32m[I 2023-05-02 18:43:43,648][0m A new study created in memory with name: no-name-8361ccc9-b97d-45ac-a3a7-30d42d5e78fc[0m
[32m[I 2023-05-02 18:43:43,861][0m Trial 0 finished with value: 0.518163068391032 and parameters: {'n_estimators': 70, 'max_depth': 3, 'learning_rate': 0.09641954414819025, 'subsample': 0.5352329224834225, 'colsample_bytree': 0.9717340186456291, 'min_child_weight': 7}. Best is trial 0 with value: 0.518163068391032.[0m
[32m[I 2023-05-02 18:43:44,380][0m Trial 1 finished with value: 0.41301977992767885 and parameters: {'n_estimators': 94, 'max_depth': 8, 'learning_rate': 0.09844785735966585, 'subsample': 0.936288450464778, 'colsample_bytree': 0.8861638715512876, 'min_child_weight': 2}. Best is trial 0 with value: 0.518163068391032.[0m
[32m[I 2023-05-02 18:43:44,553][0m Trial 2 finished with value: 0.4949572708525089 and parameters: {'n_estimators': 78, 'max_depth': 2, 'learning_rate': 0.07327588889639614, 'subsample': 0.8613166660458385, 'colsample_bytree

In [None]:
# Define XGBoost regressor with optimised hyperparameters.
# study3.best_params holds only the values sampled in the objective
# (n_estimators, max_depth, learning_rate, subsample, colsample_bytree,
# min_child_weight), so it can be unpacked directly.
xgb = XGBRegressor(**study3.best_params, random_state=42)

# Fit the regressor with the training data
xgb.fit(X_train, y_train)

# Make predictions on the test data
y_pred = xgb.predict(X_test)

# Evaluate the performance of the model using mean absolute error,
# root mean squared error and R2 score
mae = mean_absolute_error(y_test, y_pred)
# RMSE is computed as the square root of the MSE instead of passing
# `squared=False`, which was deprecated in scikit-learn 1.4 and removed in 1.6
rmse = mean_squared_error(y_test, y_pred) ** 0.5
r2 = r2_score(y_test, y_pred)

print(f'Mean Absolute Error: {mae:.2f}')
print(f'Root Mean Squared Error: {rmse:.2f}')
print(f'R2 Score: {r2:.4f}')

Mean Absolute Error: 176508.32
Root Mean Squared Error: 753967.36
R2 Score: 0.5407


In [None]:
# Record the XGBoost metrics (order of values: R2, RMSE, MAE).
# The built-in round() is used rather than the `.round()` method so that this
# works for NumPy scalars and for plain Python floats, which have no `.round()`.
scores_dict['XGBoost'] = [round(r2, 4), round(rmse, 2), round(mae, 2)]

We can compare our performance metrics below:

In [None]:
# Build the comparison table: one column per model in scores_dict,
# then label the three rows with the metric each one holds
scores_df = pd.DataFrame(scores_dict)
scores_df.index = ['r2', 'rmse', 'mae']
scores_df

Unnamed: 0,Elastic Net,Random Forest,XGBoost
r2,0.1898,0.4913,0.5407
rmse,1001395.0,793464.51,753967.36
mae,328543.6,179327.94,176508.32


We can see that XGBoost slightly outperformed Random Forest and both significantly outperformed Elastic Net. 

We can save our model and our predicted values for further analysis:

In [None]:
# Save a dataframe of predictions and true values.
# The columns are built explicitly: passing y_test as the second positional
# argument of pd.DataFrame makes it the *index*, so a column rename of 'Price'
# never applies and the true values end up in a column still named 'Price'.
# y_pred holds the predictions of the most recently evaluated model (XGBoost
# when the notebook is run top to bottom).
prediction_comparison = pd.DataFrame({
    'True Price': y_test.to_numpy(),
    'Predicted Price': y_pred,
})
# Forward slashes keep the relative path valid on Windows, Linux and macOS
prediction_comparison.to_csv('datasets/prediction_comparison.csv', index=False)

# Save trained model to file using pickle.
# Forward slashes keep the relative path valid on Windows, Linux and macOS
# (a backslash is only a path separator on Windows).
# NOTE: pickle files should only ever be loaded back from a trusted source.
with open('model/xgbmodel.pkl', 'wb') as f:
    pickle.dump(xgb, f)