In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the scikit-learn Diabetes dataset and assemble one frame holding the
# feature columns followed by a trailing 'target' column.
diabetes = load_diabetes()
diabetes_df = pd.DataFrame(
    np.column_stack([diabetes['data'], diabetes['target']]),
    columns=[*diabetes['feature_names'], 'target'],
)


In [3]:
# Use every feature column shipped with the dataset as a model input
# (no columns are dropped or derived here).
features = diabetes.feature_names

In [4]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
train_data, test_data = train_test_split(
    diabetes_df,
    test_size=0.2,
    random_state=42,
)


In [5]:
# Train a VotingRegressor model with hyperparameter tuning
def _tune_regressor(estimator, param_grid, X_train, y_train, cv, scoring, n_jobs):
    """Grid-search ``param_grid`` for ``estimator`` and return the best estimator found."""
    search = GridSearchCV(estimator, param_grid, cv=cv, scoring=scoring, n_jobs=n_jobs)
    search.fit(X_train, y_train)
    return search.best_estimator_


def train_voting_regressor(X_train, y_train, cv=3, scoring='neg_mean_squared_error',
                           random_state=42, n_jobs=None):
    """Tune a random forest and a gradient-boosting model, then ensemble them.

    A ``RandomForestRegressor`` and a ``GradientBoostingRegressor`` are each
    tuned with an exhaustive grid search; the winning configurations are
    combined with a plain ``LinearRegression`` inside a ``VotingRegressor``,
    which is then fitted on the full training data.

    Parameters
    ----------
    X_train : training feature matrix, passed unchanged to every ``fit`` call.
    y_train : training targets, passed unchanged to every ``fit`` call.
    cv : cross-validation setting forwarded to ``GridSearchCV`` (default 3).
    scoring : metric forwarded to ``GridSearchCV``
        (default ``'neg_mean_squared_error'``).
    random_state : seed given to both tree-based regressors (default 42).
    n_jobs : parallelism forwarded to ``GridSearchCV`` (default ``None``).

    Returns
    -------
    VotingRegressor
        The ensemble after ``fit(X_train, y_train)``.

    Notes
    -----
    With the default arguments this behaves exactly like the original
    hard-coded version. Each grid has 27 candidates, so the default run
    fits 2 * 27 * cv models before the final ensemble fit.
    """
    # Search spaces for the two tuned models.
    param_grid_rf = {
        'n_estimators': [50, 100, 150],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
    }
    param_grid_gb = {
        'n_estimators': [50, 100, 150],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 4, 5],
    }

    best_rf = _tune_regressor(
        RandomForestRegressor(random_state=random_state),
        param_grid_rf, X_train, y_train, cv, scoring, n_jobs,
    )
    best_gb = _tune_regressor(
        GradientBoostingRegressor(random_state=random_state),
        param_grid_gb, X_train, y_train, cv, scoring, n_jobs,
    )

    # The linear model has no hyperparameters to tune here, so it joins the
    # ensemble with its defaults.
    voting_regressor = VotingRegressor(
        estimators=[('rf', best_rf), ('gb', best_gb), ('lr', LinearRegression())]
    )
    voting_regressor.fit(X_train, y_train)
    return voting_regressor

In [6]:
# Score a fitted model on held-out data
def evaluate_model(model, X_test, y_test):
    """Return ``(mse, r2)`` for ``model``'s predictions on ``X_test``.

    ``model`` must expose ``predict``; ``y_test`` holds the true targets that
    the predictions are compared against.
    """
    predicted = model.predict(X_test)
    return mean_squared_error(y_test, predicted), r2_score(y_test, predicted)

In [7]:
# Making Predictions with test data
def make_predictions(model, X_test, features, y_test=None):
    """Predict on ``X_test`` and pair each prediction with its true target.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_test : feature matrix to predict on.
    features : accepted for interface compatibility; not used by this function.
    y_test : optional true targets aligned row-for-row with ``X_test``.
        When omitted, the notebook-level ``test_data['target']`` is used,
        which is what the original implementation always did.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Actual'`` and ``'Predicted'`` with a fresh 0..n-1 index.
    """
    predictions = model.predict(X_test)
    if y_test is None:
        # Backward-compatible fallback to the notebook-global test split.
        y_test = test_data['target']
    # Convert to a plain array so any pandas index on y_test is dropped and the
    # rows line up positionally with the predictions.
    results_df = pd.DataFrame({'Actual': np.asarray(y_test), 'Predicted': predictions})
    return results_df

In [8]:
# Separate the training split into model inputs and the regression target
X_train = train_data[features]
y_train = train_data['target']

In [9]:
# Fit the tuned ensemble on the training split (runs two grid searches, so this cell is slow)
voting_regressor = train_voting_regressor(X_train=X_train, y_train=y_train)

In [10]:
# Score the fitted ensemble on the held-out split
X_test = test_data[features]
y_test = test_data['target']
mse, r2 = evaluate_model(voting_regressor, X_test, y_test)
print(f'Mean Squared Error of the model on the test set: {mse:.2f}')
print(f'R2 Score of the model on the test set: {r2:.2f}')

Mean Squared Error of the model on the test set: 2780.82
R2 Score of the model on the test set: 0.48


In [11]:
# Tabulate actual vs. predicted targets for the held-out rows
results_df = make_predictions(model=voting_regressor, X_test=X_test, features=features)
print('\nTest Data Predictions:')
print(results_df)


Test Data Predictions:
    Actual   Predicted
0    219.0  146.480651
1     70.0  181.877911
2    202.0  148.123559
3    230.0  271.076975
4    111.0  118.813435
..     ...         ...
84   153.0   96.518567
85    98.0   74.212735
86    37.0   86.185781
87    63.0   73.132624
88   184.0  162.048681

[89 rows x 2 columns]
