In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR

In [9]:
# Load the feature table from disk. By position, the first column is used as
# the regression target and every remaining column as a predictor.
data = pd.read_csv('HVfeature0.csv')
y, X = data.iloc[:, 0], data.iloc[:, 1:]

In [10]:
# One 5-fold splitter reused by every grid search so all models are scored on
# the same folds; shuffling with a fixed seed keeps the split reproducible.
cv = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [11]:
# Candidate models and their hyper-parameter grids.
#
# Each entry carries:
#   'name'   - label used in the progress message and the results table
#   'model'  - unfitted estimator (or pipeline) handed to GridSearchCV
#   'params' - grid of values to search; for pipelines the 'regressor__'
#              prefix routes the parameter to the step named 'regressor'
#
# The linear and SVR models are wrapped in a pipeline with StandardScaler so
# scaling is fitted inside each CV training fold; the tree ensembles are used
# on the raw features.
#
# NOTE(review): in the recorded run several winners sit on a grid boundary
# (Ridge alpha=0.01, SVR C=10, Gradient Boosting at the largest learning_rate /
# max_depth / n_estimators) - consider widening those grids.
models_config = [
    # Reported as 'Linear Regression', but the estimator is Ridge
    # (L2-regularised linear regression) with alpha tuned.
    dict(
        name='Linear Regression',
        model=Pipeline(steps=[
            ('scaler', StandardScaler()),
            ('regressor', Ridge()),
        ]),
        params={
            'regressor__alpha': [0.01, 0.1, 1, 10, 100],
        },
    ),
    dict(
        name='Random Forest',
        model=RandomForestRegressor(random_state=42),
        params={
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5],
        },
    ),
    dict(
        name='Gradient Boosting',
        model=GradientBoostingRegressor(random_state=42),
        params={
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.01, 0.1, 0.2],
            'max_depth': [3, 4, 5],
        },
    ),
    dict(
        name='Support Vector Machine',
        model=Pipeline(steps=[
            ('scaler', StandardScaler()),
            ('regressor', SVR()),
        ]),
        params={
            'regressor__C': [0.1, 1, 10],
            'regressor__epsilon': [0.01, 0.1, 0.2],
            'regressor__kernel': ['linear', 'rbf'],
        },
    ),
]

In [12]:
# Accumulates one summary record (metrics and best hyper-parameters) per model.
results = list()

In [13]:
# Run a cross-validated grid search for every configured model and record the
# CV metrics of its best hyper-parameter combination.

# Metrics computed for every candidate. scikit-learn scorers follow a
# "greater is better" convention, so the error metrics are the negated
# ('neg_*') variants and are flipped back to positive values below.
scoring = {
    'R2': 'r2',
    'MAE': 'neg_mean_absolute_error',
    'MSE': 'neg_mean_squared_error'
}

# Start from an empty list every time this cell runs, so re-executing it
# replaces the previous rows instead of appending duplicate ones.
results = []

for config in models_config:
    print(f"In processing {config['name']}...")

    # With several scorers, `refit` names the one used to pick the winner:
    # the candidate with the best mean 'MAE' score (i.e. lowest MAE) is
    # selected, which is what makes best_index_ / best_params_ available.
    grid = GridSearchCV(
        estimator=config['model'],
        param_grid=config['params'],
        scoring=scoring,
        refit='MAE',
        cv=cv,
        n_jobs=-1
    )
    grid.fit(X, y)

    # Row of cv_results_ belonging to the selected candidate.
    best_idx = grid.best_index_
    cv_scores = grid.cv_results_

    results.append({
        'Model': config['name'],
        'R2': cv_scores['mean_test_R2'][best_idx],
        # Undo the sign flip of the 'neg_*' scorers.
        'MAE': -cv_scores['mean_test_MAE'][best_idx],
        # Square root of the fold-averaged MSE (not the mean of per-fold RMSEs).
        'RMSE': np.sqrt(-cv_scores['mean_test_MSE'][best_idx]),
        'Best Parameters': grid.best_params_
    })

In processing Linear Regression...
In processing Random Forest...
In processing Gradient Boosting...
In processing Support Vector Machine...


In [14]:
# Build the comparison table, best model (lowest cross-validated MAE) first,
# and print it as plain text without the index column.
report_columns = ['Model', 'R2', 'MAE', 'RMSE', 'Best Parameters']
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by='MAE')
print("\nModel performance comparison：")
report_text = results_df[report_columns].to_string(index=False)
print(report_text)


Model performance comparison：
                 Model       R2       MAE       RMSE                                                                 Best Parameters
     Gradient Boosting 0.920179 30.473691  64.245987                     {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 200}
         Random Forest 0.921570 38.136582  63.644162                   {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 50}
     Linear Regression 0.851797 65.907466  87.709414                                                      {'regressor__alpha': 0.01}
Support Vector Machine 0.804268 70.040727 101.683798 {'regressor__C': 10, 'regressor__epsilon': 0.01, 'regressor__kernel': 'linear'}
