In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory



# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/diabetes-prediction-b-1-fall-24-25/sample_submission.csv
/kaggle/input/diabetes-prediction-b-1-fall-24-25/train.csv
/kaggle/input/diabetes-prediction-b-1-fall-24-25/test.csv


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor, VotingRegressor, BaggingRegressor, StackingRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import numpy as np

# Load the training dataset
# NOTE(review): the input listing printed earlier in this notebook shows the files
# under /kaggle/input/diabetes-prediction-b-1-fall-24-25/ -- confirm that this
# relative file name resolves in the environment where the notebook is run.
data = pd.read_csv('Ensemble_Reg_train.csv')

# Drop irrelevant columns and split features and target
X = data.drop(columns=['ID', 'y'])  # Remove 'ID' and target 'y'
y = data['y']

# Split BEFORE scaling: fitting the scaler on the full dataset would let the
# hold-out rows influence the mean/variance used to transform the training rows
# (data leakage), which makes the hold-out score optimistic.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=34)

# Standardize the features: learn the statistics from the training split only,
# then apply the same transformation to the hold-out split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Candidate base learners, keyed by the label used in the printed report
models = {
    "RandomForest": RandomForestRegressor(random_state=34),
    "SVR": SVR(),
}

# Hyperparameter grid for each candidate (keys must match `models`)
param_grid = {
    "RandomForest": {
        "n_estimators": [100, 200, 300],
        "max_depth": [5, 10, 15, 100],
    },
    "SVR": {
        "C": [0.1, 1, 10],
        "epsilon": [0.01, 0.1, 0.2],
        "kernel": ['linear', 'rbf'],
    },
}

# Tune every candidate with a 5-fold grid search on the training split and
# keep both the refitted best estimator and the full cv_results_ table.
best_estimators = {}
grid_search_results = {}
for name in models:
    search = GridSearchCV(
        estimator=models[name],
        param_grid=param_grid[name],
        scoring='neg_root_mean_squared_error',
        cv=5,
        n_jobs=-1,
    ).fit(X_train, y_train)

    best_estimators[name], grid_search_results[name] = search.best_estimator_, search.cv_results_

    # The scorer returns negated RMSE, so flip the sign for reporting
    print(f"Best {name} model:", search.best_params_)
    print(f"Best {name} RMSE:", -search.best_score_)

# Create and evaluate three different ensemble models

# 1. Voting Regressor: combines the tuned RandomForest and SVR into one ensemble
voting_members = [
    ('RandomForest', best_estimators['RandomForest']),
    ('SVR', best_estimators['SVR']),
]
voting_model = VotingRegressor(estimators=voting_members)
voting_model.fit(X_train, y_train)

# Score on the hold-out split: RMSE is the square root of the mean squared error
y_pred_voting = voting_model.predict(X_test)
voting_mse = mean_squared_error(y_test, y_pred_voting)
voting_rmse = np.sqrt(voting_mse)
print(f"Test RMSE for Voting Regressor: {voting_rmse}")

# 2. Bagging Regressor: A simple ensemble using Bagging with RandomForest as base model
# The base learner is passed as `estimator=`; the older `base_estimator=` keyword
# is deprecated since scikit-learn 1.2 and no longer accepted from 1.4 onward.
bagging_model = BaggingRegressor(estimator=best_estimators['RandomForest'], n_estimators=10, random_state=34)
bagging_model.fit(X_train, y_train)

# Score on the hold-out split
y_pred_bagging = bagging_model.predict(X_test)
bagging_rmse = np.sqrt(mean_squared_error(y_test, y_pred_bagging))
print(f"Test RMSE for Bagging Regressor: {bagging_rmse}")

# 3. Stacking Regressor: the tuned RandomForest and SVR are the base learners and
#    a LinearRegression is the final (meta) estimator
stacking_model = StackingRegressor(
    final_estimator=LinearRegression(),
    estimators=[(key, best_estimators[key]) for key in ('RandomForest', 'SVR')],
)

# fit() returns the fitted estimator, so training and hold-out prediction chain
y_pred_stacking = stacking_model.fit(X_train, y_train).predict(X_test)
stacking_rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred_stacking))
print(f"Test RMSE for Stacking Regressor: {stacking_rmse}")

# Cross-validate each ensemble with 5 folds on the TRAINING split
# (X_train / y_train); the hold-out split is not touched here.
# The scorer yields negated RMSE per fold, so the mean is sign-flipped when printed.
print("\nCross-validation results for the final models:")

cv_rmse_voting = cross_val_score(
    estimator=voting_model, X=X_train, y=y_train,
    scoring='neg_root_mean_squared_error', cv=5,
)
print(f"Cross-validated RMSE for Voting Regressor: {-np.mean(cv_rmse_voting)}")

cv_rmse_bagging = cross_val_score(
    estimator=bagging_model, X=X_train, y=y_train,
    scoring='neg_root_mean_squared_error', cv=5,
)
print(f"Cross-validated RMSE for Bagging Regressor: {-np.mean(cv_rmse_bagging)}")

cv_rmse_stacking = cross_val_score(
    estimator=stacking_model, X=X_train, y=y_train,
    scoring='neg_root_mean_squared_error', cv=5,
)
print(f"Cross-validated RMSE for Stacking Regressor: {-np.mean(cv_rmse_stacking)}")

# Build the competition submission from the unlabeled test file
test_data = pd.read_csv("Ensemble_Reg_test.csv")

# Keep the submission features and predictions under their own names so the
# hold-out objects created earlier (X_test, y_test, y_pred_*) are not
# overwritten and stay aligned with each other.
X_submission_raw = test_data.drop(columns=['ID'])  # Drop 'ID' column for the test set

# Standardize with the scaler that was already fitted (transform only, no refit)
X_submission = scaler.transform(X_submission_raw)

# Make predictions on the submission data for all ensemble models
submission_pred_voting = voting_model.predict(X_submission)
submission_pred_bagging = bagging_model.predict(X_submission)
submission_pred_stacking = stacking_model.predict(X_submission)

# Choose the best model for submission (in this case, Voting Regressor)
submission = pd.DataFrame({
    "ID": test_data['ID'],  # Use the original 'ID' column from the test data
    "y": submission_pred_voting  # Predicted values from Voting Regressor
})

# Save the submission DataFrame to a CSV file
submission.to_csv("submission.csv", index=False)

Best RandomForest model: {'max_depth': 5, 'n_estimators': 300}
Best RandomForest RMSE: 52.15356718063002
Best SVR model: {'C': 1, 'epsilon': 0.1, 'kernel': 'linear'}
Best SVR RMSE: 51.74632043951137
Test RMSE for Voting Regressor: 60.47695305771406




Test RMSE for Bagging Regressor: 62.364492222517
Test RMSE for Stacking Regressor: 61.44561232353492

Cross-validation results for the final models:
Cross-validated RMSE for Voting Regressor: 50.46663784943119




Cross-validated RMSE for Bagging Regressor: 52.12356842952043
Cross-validated RMSE for Stacking Regressor: 50.79648008661769
