In [35]:
# Required Libraries
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import confusion_matrix, accuracy_score, mean_absolute_percentage_error, mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

In [36]:
# Load your dataset
data = pd.read_csv('C:/Users/user/Desktop/MSc Folder/Masters Research/Modelling Data/Modelling data.csv')

# Features and target
# Adjust column names based on your actual data
X = data.drop(columns=['Compressive Strength'])
y = data['Compressive Strength']

print(data.head())

   Fly ash  Sodium Silicate  Sodium Hydroxide  Molarity  Coarse Aggregate  \
0    408.0            103.0              41.0         8            1294.0   
1    408.0            103.0              41.0         8            1294.0   
2    408.0            103.0              41.0         8            1294.0   
3    408.0            103.0              41.0         8            1294.0   
4    408.0            103.0              41.0        14            1294.0   

   Fine Aggregate  Curing Days  Curing Temperature  SS/SH Ratio  Extra Water  \
0           554.0          7.0                  30          2.0          0.0   
1           554.0          7.0                  45          2.0          0.0   
2           554.0          7.0                  75          2.0          0.0   
3           554.0          7.0                  90          2.0          0.0   
4           554.0          7.0                  30          2.0         22.5   

   Compressive Strength  
0                  43.0  
1   

In [37]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [38]:
# Feature scaling (if needed)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [39]:
# Define the Random Forest model
rf_model = RandomForestRegressor(random_state=42)

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],      # Number of trees in the forest
    'max_depth': [10, 20, 30, None],     # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],     # Minimum number of samples to split a node
    'min_samples_leaf': [1, 2, 4],       # Minimum number of samples in a leaf
    'bootstrap': [True, False],          # Method to sample data points
    'max_features': ['auto', 'sqrt', 'log2']  # Number of features to consider when looking for the best split
}

grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5, verbose=1, n_jobs=-1)

# Train the model with GridSearchCV
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 648 candidates, totalling 3240 fits


1080 fits failed out of a total of 3240.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
275 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\user\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\user\anaconda3\Lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "C:\Users\user\anaconda3\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\user\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParamete

In [40]:
# Best hyperparameters
best_params = grid_search.best_params_
print(f"Best Hyperparameters: {best_params}")

Best Hyperparameters: {'bootstrap': False, 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 50}


In [41]:
# Train the model with best hyperparameters
best_rf_model = grid_search.best_estimator_

In [42]:
# Make predictions
y_train_pred = best_rf_model.predict(X_train)
y_test_pred = best_rf_model.predict(X_test)

In [43]:
# Compute metrics
mape = mean_absolute_percentage_error(y_test, y_test_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
mae = mean_absolute_error(y_test, y_test_pred)
r2 = r2_score(y_test, y_test_pred)

print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")
print(f"Root Mean Square Error (RMSE): {rmse:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"R² (Goodness of Fit): {r2:.2f}")

Mean Absolute Percentage Error (MAPE): 0.25%
Root Mean Square Error (RMSE): 7.21
Mean Absolute Error (MAE): 5.23
R² (Goodness of Fit): 0.76


In [45]:
# Convert predictions to a DataFrame for better visualization
predictions = pd.DataFrame({'Actual': y_test, 'Predicted': y_test_pred})
print(predictions.head())

     Actual  Predicted
19    48.74  46.677933
42    24.00  32.261017
153   49.89  48.289467
78    24.00  26.435500
145   41.92  41.319000
