In [26]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV


In [27]:
# Input table for the whole notebook.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until it is pointed at a local copy of the file.
DATA_CSV_PATH = 'D:/606/RoadMaintenance/Main/filtered_data.csv'
df = pd.read_csv(DATA_CSV_PATH)

In [28]:
# Year-based hold-out split on the YEAR_RECOR column.
# Training data: rows whose YEAR_RECOR is 2013 through 2019 (inclusive).
TRAIN_YEARS = list(range(2013, 2020))
# Testing data: rows whose YEAR_RECOR is 2020 or 2022.
# NOTE(review): 2021 is in neither list, so any 2021 rows are dropped from both
# sets -- confirm this is intentional.
TEST_YEARS = [2020, 2022]

year_recorded = df['YEAR_RECOR']
df_train = df[year_recorded.isin(TRAIN_YEARS)]
df_test = df[year_recorded.isin(TEST_YEARS)]

In [29]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score


# Column the models predict.
target = 'IRI_VN'

# Columns used as model inputs (one per line so diffs stay readable).
features = [
    'YEAR_RECOR',
    'AADT_VN',
    'curval',
    'tmiles',
    'tons',
    'value',
    'IS_IMPROVED',
    'SPEED_LIMI',
    'BEGIN_POIN',
    'END_POINT',
    'THROUGH_LA',
    'SECTION_NUM',
]

# Feature matrices and target vectors for the year-based split.
X_train = df_train[features]
y_train = df_train[target]
X_test = df_test[features]
y_test = df_test[target]

# --- Evaluation Function ---
def evaluate_model(predictions, y_test):
    """Score a set of predictions against the true target values.

    Args:
        predictions: Predicted target values.
        y_test: True target values, aligned with ``predictions``. Despite the
            name, callers in this notebook also pass training targets here.

    Returns:
        A ``(mse, r2)`` tuple: the mean squared error and the R^2 score of
        ``predictions`` with respect to ``y_test``.
    """
    return (
        mean_squared_error(y_test, predictions),
        r2_score(y_test, predictions),
    )

# --- Random Forest Regressor Section ---
print("\n--- Random Forest Regressor ---")

# Each tree is grown on a bootstrap sample of half the training rows
# (max_samples=0.5) and considers sqrt(n_features) candidates per split.
random_forest_regressor = RandomForestRegressor(
    n_estimators=80,
    max_depth=25,
    max_features='sqrt',
    bootstrap=True,
    max_samples=0.5,
    random_state=42,
)
random_forest_regressor.fit(X_train, y_train)

# Held-out performance.
rf_predictions = random_forest_regressor.predict(X_test)
rf_mse, rf_r2 = evaluate_model(rf_predictions, y_test)

# In-sample performance, reported next to the test scores to expose overfitting.
rf_train_predictions = random_forest_regressor.predict(X_train)
rf_train_mse, rf_train_r2 = evaluate_model(rf_train_predictions, y_train)

print(f"MSE: {rf_mse:.4f}, R^2: {rf_r2:.4f}")
print(f"Train MSE: {rf_train_mse:.4f}, Train R^2: {rf_train_r2:.4f}")



--- Random Forest Regressor ---
MSE: 354.3478, R^2: 0.6027
Train MSE: 91.9327, Train R^2: 0.8953


In [30]:
# --- Gradient Boosting Regressor Section ---
print("\n--- Gradient Boosting Regressor ---")

# Library-default hyperparameters; only the seed is fixed for repeatability.
gradient_boosting_regressor = GradientBoostingRegressor(random_state=42)
gradient_boosting_regressor.fit(X_train, y_train)

# Score on the held-out years.
gb_predictions = gradient_boosting_regressor.predict(X_test)
gb_scores = evaluate_model(gb_predictions, y_test)
gb_mse, gb_r2 = gb_scores
print(f"MSE: {gb_mse:.4f}, R^2: {gb_r2:.4f}")



--- Gradient Boosting Regressor ---
MSE: 418.0456, R^2: 0.5313


In [31]:
# Scratch note only: a bare string literal holding what appear to be candidate
# XGBRegressor keyword arguments. It configures nothing -- the XGBRegressor
# built in this notebook does not receive these values -- and the only effect
# of this cell is to echo the text as its output.
'''max_depth=6, 
                                 min_child_weight=1, 
                                 gamma=0, 
                                 colsample_bytree=1, 
                                 reg_alpha=0, 
                                 reg_lambda=1,
                                 subsample=0.5'''

'max_depth=6, \n                                 min_child_weight=1, \n                                 gamma=0, \n                                 colsample_bytree=1, \n                                 reg_alpha=0, \n                                 reg_lambda=1,\n                                 subsample=0.5'

In [32]:
# --- XGBoost Regressor Section ---
print("\n--- XGBoost Regressor ---")

# Small baseline booster: 10 rounds at learning rate 0.3, everything else default.
xgboost_regressor = XGBRegressor(
    n_estimators=10,
    learning_rate=0.3,
    random_state=42,
)
xgboost_regressor.fit(X_train, y_train)

# Score on the held-out years.
xgboost_predictions = xgboost_regressor.predict(X_test)
xgboost_scores = evaluate_model(xgboost_predictions, y_test)
xgboost_mse, xgboost_r2 = xgboost_scores
print(f"MSE: {xgboost_mse:.4f}, R^2: {xgboost_r2:.4f}")


--- XGBoost Regressor ---
MSE: 440.1356, R^2: 0.5065


In [33]:
# --- Voting Regressor Section ---
print("\n--- Voting Regressor ---")

# Ensemble of the three regressors configured above, combined by VotingRegressor.
ensemble_members = [
    ('rf', random_forest_regressor),
    ('gb', gradient_boosting_regressor),
    ('xgb', xgboost_regressor),
]
voting_regressor = VotingRegressor(estimators=ensemble_members)
voting_regressor.fit(X_train, y_train)

# Score on the held-out years.
voting_predictions = voting_regressor.predict(X_test)
voting_scores = evaluate_model(voting_predictions, y_test)
voting_mse, voting_r2 = voting_scores
print(f"MSE: {voting_mse:.4f}, R^2: {voting_r2:.4f}")


--- Voting Regressor ---
MSE: 383.6152, R^2: 0.5699


In [34]:
# --- Store Results ---
# One (label, test MSE, test R^2) row per model, then pivoted into the
# column-oriented mapping that the DataFrame is built from.
summary_rows = [
    ("Random Forest", rf_mse, rf_r2),
    ("Gradient Boosting", gb_mse, gb_r2),
    ("XGBoost", xgboost_mse, xgboost_r2),
    ("Voting Regressor", voting_mse, voting_r2),
]
model_labels, mse_values, r2_values = (list(column) for column in zip(*summary_rows))

model_evaluations = {
    "Models": model_labels,
    "MSE": mse_values,
    "R^2": r2_values,
}
results_df = pd.DataFrame(model_evaluations)

# --- Print Results ---
print("\nEnsemble Model Evaluations:")
print(results_df)


Ensemble Model Evaluations:
              Models         MSE       R^2
0      Random Forest  354.347782  0.602715
1  Gradient Boosting  418.045624  0.531299
2            XGBoost  440.135559  0.506532
3   Voting Regressor  383.615193  0.569902


In [35]:
# Train-vs-test comparison for every fitted model. A large gap between the
# training and test scores indicates overfitting (high variance); poor scores
# on both indicate underfitting (high bias).

# Predictions on the training data.
rf_train_predictions = random_forest_regressor.predict(X_train)
gb_train_predictions = gradient_boosting_regressor.predict(X_train)
voting_train_predictions = voting_regressor.predict(X_train)
xgboost_train_predictions = xgboost_regressor.predict(X_train)

# MSE and R^2 on the training data.
rf_train_mse, rf_train_r2 = evaluate_model(rf_train_predictions, y_train)
gb_train_mse, gb_train_r2 = evaluate_model(gb_train_predictions, y_train)
voting_train_mse, voting_train_r2 = evaluate_model(voting_train_predictions, y_train)
xgboost_train_mse, xgboost_train_r2 = evaluate_model(xgboost_train_predictions, y_train)

# Side-by-side table; the test columns reuse the metrics computed when each
# model was evaluated on X_test. Every list follows the order of "Models".
bias_variance_df = pd.DataFrame({
    "Models": ["Random Forest", "Gradient Boosting", "Voting Regressor", "XGBoost"],
    "Train MSE": [rf_train_mse, gb_train_mse, voting_train_mse, xgboost_train_mse],
    "Test MSE": [rf_mse, gb_mse, voting_mse, xgboost_mse],
    "Train R^2": [rf_train_r2, gb_train_r2, voting_train_r2, xgboost_train_r2],
    "Test R^2": [rf_r2, gb_r2, voting_r2, xgboost_r2],
})

# Fixed: the heading used "/n" (forward slash), which printed the two
# characters literally instead of emitting blank lines.
print("\nBias-Variance Tradeoff Analysis:\n")
print(bias_variance_df)

/nBias-Variance Tradeoff Analysis:/n
              Models   Train MSE    Test MSE  Train R^2  Test R^2
0      Random Forest   91.932687  354.347782   0.895264  0.602715
1  Gradient Boosting  333.379044  418.045624   0.620191  0.531299
2   Voting Regressor  191.219559  383.615193   0.782149  0.569902
3            XGBoost  238.397507  440.135559   0.728401  0.506532


In [36]:
# Candidate hyperparameters for a scikit-learn tree-ensemble regressor search.
# NOTE(review): no cell visible here consumes this grid; it may be a leftover.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3, where it
    # raises a parameter-validation error. For regressors it meant "use all
    # features", which None expresses on every version.
    'max_features': [None, 'sqrt'],
}

In [37]:
from sklearn.model_selection import GridSearchCV

# Search space for RandomForestRegressor.
# Float values for min_samples_split / min_samples_leaf are fractions of the
# number of training samples (scikit-learn convention).
# Fixed: both lists ended with a second 0.1, which only duplicated candidates
# (wasted fits in a grid search, skewed sampling in a randomized search).
# NOTE(review): min_samples_leaf fractions above 0.5 cannot be met by both
# children of a split, so those candidates never split the root -- consider
# trimming the upper end of that list.
param_grid_rf = {
    'n_estimators': [50, 60, 70, 80, 90, 100],
    'max_depth': [None, 5, 10, 15, 20, 25],
    'min_samples_split': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    'min_samples_leaf': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    'max_features': ['sqrt', 'log2', None, 1, 2, 3, 4, 5, 6],
}



# Search space for GradientBoostingRegressor.
# Fixed: max_depth no longer includes 0 -- scikit-learn requires an integer
# max_depth >= 1 (or None) for this estimator, so every candidate with 0
# failed to fit.
# Fixed: the trailing duplicate 0.1 in min_samples_split / min_samples_leaf is
# removed; it only produced redundant candidates.
# NOTE(review): min_samples_leaf fractions above 0.5 cannot be met by both
# children of a split -- consider trimming the upper end of that list.
param_grid_gb = {
    'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 150, 200],
    'learning_rate': [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 1.0],
    'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25],
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'min_samples_split': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    'min_samples_leaf': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
}



# Search space for XGBRegressor.
# Five of the parameters share the same integer ladder (0..10, then 15, 20,
# 25); it is built once and copied per key so no two keys alias one list.
xgb_integer_ladder = list(range(11)) + [15, 20, 25]

param_grid_xgb = dict(
    n_estimators=list(range(10, 101, 10)) + [150, 200],
    learning_rate=[0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 1.0],
    max_depth=list(xgb_integer_ladder),
    subsample=[0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    colsample_bytree=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    min_child_weight=list(xgb_integer_ladder),
    gamma=list(xgb_integer_ladder),
    reg_alpha=list(xgb_integer_ladder),
    reg_lambda=list(xgb_integer_ladder),
)

# # Initialize the models
# rf = RandomForestRegressor(random_state=42)
# gb = GradientBoostingRegressor(random_state=42)
# xgb = XGBRegressor(random_state=42)

# # Initialize GridSearchCV
# grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid_rf, cv=5, n_jobs=-1, verbose=2, scoring= ['neg_mean_squared_error','r2'], refit='r2')
# grid_search_gb = GridSearchCV(estimator=gb, param_grid=param_grid_gb, cv=5, n_jobs=-1, verbose=2, scoring=['neg_mean_squared_error', 'r2'], refit='r2')
# #grid_search_xgb = GridSearchCV(estimator=xgb, param_grid=param_grid_xgb, cv=5, n_jobs=-1, verbose=2, scoring=['neg_mean_squared_error', 'r2'], refit='r2')

# # Fit the models
# grid_search_rf.fit(X_train, y_train)
# grid_search_gb.fit(X_train, y_train)
# #grid_search_xgb.fit(X_train, y_train)

# # Get the best models
# best_rf = grid_search_rf.best_estimator_
# best_gb = grid_search_gb.best_estimator_
# #best_xgb = grid_search_xgb.best_estimator_

# # Make predictions using the best models
# rf_predictions = best_rf.predict(X_test)
# gb_predictions = best_gb.predict(X_test)
# #xgboost_predictions = best_xgb.predict(X_test)

# # Evaluate the models
# rf_mse, rf_r2 = evaluate_model(rf_predictions, y_test)
# gb_mse, gb_r2 = evaluate_model(gb_predictions, y_test)
# #xgboost_mse, xgboost_r2 = evaluate_model(xgboost_predictions, y_test)

# # Print results
# print(f"Best Random Forest: {grid_search_rf.best_params_}")
# print(f"Best Gradient Boosting: {grid_search_gb.best_params_}")
# #print(f"Best XGBoost: {grid_search_xgb.best_params_}")
# print(f"Random Forest MSE: {rf_mse}, R²: {rf_r2}")
# print(f"Gradient Boosting MSE: {gb_mse}, R²: {gb_r2}")
# #print(f"XGBoost MSE: {xgboost_mse}, R²: {xgboost_r2}")


In [38]:
# Bias-Variance Tradeoff
# Train-vs-test comparison for every fitted model. A large gap between the
# training and test scores indicates overfitting (high variance); poor scores
# on both indicate underfitting (high bias).
# NOTE(review): the test-side xgboost_mse / xgboost_r2 are whatever was last
# assigned in the kernel, while the train side always uses `xgboost_regressor`.
# If a later cell reassigns those metrics and this cell is re-run, the XGBoost
# row mixes two different models.

# Predictions on the training data.
rf_train_predictions = random_forest_regressor.predict(X_train)
gb_train_predictions = gradient_boosting_regressor.predict(X_train)
voting_train_predictions = voting_regressor.predict(X_train)
xgboost_train_predictions = xgboost_regressor.predict(X_train)

# MSE and R^2 on the training data.
rf_train_mse, rf_train_r2 = evaluate_model(rf_train_predictions, y_train)
gb_train_mse, gb_train_r2 = evaluate_model(gb_train_predictions, y_train)
voting_train_mse, voting_train_r2 = evaluate_model(voting_train_predictions, y_train)
xgboost_train_mse, xgboost_train_r2 = evaluate_model(xgboost_train_predictions, y_train)

# Side-by-side table; every list follows the order of "Models".
bias_variance_df = pd.DataFrame({
    "Models": ["Random Forest", "Gradient Boosting", "Voting Regressor", "XGBoost"],
    "Train MSE": [rf_train_mse, gb_train_mse, voting_train_mse, xgboost_train_mse],
    "Test MSE": [rf_mse, gb_mse, voting_mse, xgboost_mse],
    "Train R^2": [rf_train_r2, gb_train_r2, voting_train_r2, xgboost_train_r2],
    "Test R^2": [rf_r2, gb_r2, voting_r2, xgboost_r2],
})

# Fixed: the heading used "/n" (forward slash), which printed the two
# characters literally instead of emitting blank lines.
print("\nBias-Variance Tradeoff Analysis:\n")
print(bias_variance_df)

/nBias-Variance Tradeoff Analysis:/n
              Models   Train MSE    Test MSE  Train R^2  Test R^2
0      Random Forest   91.932687  354.347782   0.895264  0.602715
1  Gradient Boosting  333.379044  418.045624   0.620191  0.531299
2   Voting Regressor  191.219559  383.615193   0.782149  0.569902
3            XGBoost  238.397507  440.135559   0.728401  0.506532


In [39]:
# Scratch copy of part of the XGBoost search space (the same entries appear in
# param_grid_xgb). This is a bare string literal: it defines nothing, and the
# only effect of the cell is to echo the text as its output.
''''max_depth': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25],
'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
'colsample_bytree': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
'min_child_weight': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25],
'gamma':[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25],
'reg_alpha':[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25],
'reg_lambda': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25]'''

"'max_depth': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25],\n'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],\n'colsample_bytree': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],\n'min_child_weight': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25],\n'gamma':[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25],\n'reg_alpha':[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25],\n'reg_lambda': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25]"

In [60]:
# Search space used by the randomized XGBoost search. It rebinds
# param_grid_xgb with a much smaller n_estimators / learning_rate range;
# the remaining parameters keep the integer ladder 0..10, 15, 20, 25.
# NOTE(review): the largest learning rate is 0.04 with at most 100 boosting
# rounds -- confirm that range is intended.
param_grid_xgb = {
    'n_estimators': [50, 80, 100],
    'learning_rate': [0.005, 0.01, 0.04],
    'max_depth': [*range(11), 15, 20, 25],
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'min_child_weight': [*range(11), 15, 20, 25],
    'gamma': [*range(11), 15, 20, 25],
    'reg_alpha': [*range(11), 15, 20, 25],
    'reg_lambda': [*range(11), 15, 20, 25],
}

In [61]:
from sklearn.model_selection import RandomizedSearchCV

# Base estimators. Only `xgb` is tuned in this cell; `rf` and `gb` are still
# created, but their randomized searches (over param_grid_rf / param_grid_gb
# with n_iter=162) are currently disabled.
rf = RandomForestRegressor(random_state=42)
gb = GradientBoostingRegressor(random_state=42)
xgb = XGBRegressor(random_state=42)

# Randomized search over param_grid_xgb: 1000 sampled candidates x 5 folds.
# Both metrics are recorded; the winner is chosen and refit on R^2.
# NOTE(review): an integer `cv` gives unshuffled K-fold splits for regressors.
# If the rows of X_train are ordered (e.g. by year), the folds are not random
# samples -- confirm the splitting strategy is appropriate.
search_metrics = ['neg_mean_squared_error', 'r2']
random_search_xgb = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_grid_xgb,
    n_iter=1000,
    cv=5,
    scoring=search_metrics,
    refit='r2',
    n_jobs=-1,
    verbose=2,
    random_state=42,
)
random_search_xgb.fit(X_train, y_train)

# Best configuration, already refit on the full training set.
best_xgb = random_search_xgb.best_estimator_

# Held-out evaluation of the tuned model.
# NOTE(review): this rebinds xgboost_predictions / xgboost_mse / xgboost_r2,
# replacing the baseline XGBoost values computed earlier in the notebook.
xgboost_predictions = best_xgb.predict(X_test)
tuned_xgb_scores = evaluate_model(xgboost_predictions, y_test)
xgboost_mse, xgboost_r2 = tuned_xgb_scores

# Report the winning parameters and the test-set metrics.
print(f"Best XGBoost: {random_search_xgb.best_params_}")
print(f"XGBoost MSE: {xgboost_mse}, R²: {xgboost_r2}")


Fitting 5 folds for each of 1000 candidates, totalling 5000 fits
Best XGBoost: {'subsample': 1.0, 'reg_lambda': 7, 'reg_alpha': 2, 'n_estimators': 100, 'min_child_weight': 20, 'max_depth': 1, 'learning_rate': 0.005, 'gamma': 5, 'colsample_bytree': 0.1}
XGBoost MSE: 907.0047607421875, R²: -0.01690816879272461
