In [6]:
import pandas as pd
import numpy as np
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV, RepeatedKFold, cross_val_score
from sklearn.metrics import r2_score
import warnings
warnings.filterwarnings("ignore")

In [2]:

column_names = [
    'Sex', 'Length', 'Diameter', 'Height',
    'WholeWeight', 'ShuckedWeight', 'VisceraWeight', 'ShellWeight', 'Rings'
]

# Path to uploaded file
file_path = "abalone/abalone.data"

# Load the dataset with column names
abalone_df = pd.read_csv(file_path, header=None, names=column_names)

# Display the first few rows
abalone_df

Unnamed: 0,Sex,Length,Diameter,Height,WholeWeight,ShuckedWeight,VisceraWeight,ShellWeight,Rings
0,M,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15
1,M,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7
2,F,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9
3,M,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10
4,I,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7
...,...,...,...,...,...,...,...,...,...
4172,F,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11
4173,M,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10
4174,M,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9
4175,F,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10


In [4]:
# Load and preprocess data
column_names = [
    'Sex', 'Length', 'Diameter', 'Height',
    'WholeWeight', 'ShuckedWeight', 'VisceraWeight', 'ShellWeight', 'Rings'
]
file_path = "abalone/abalone.data"
df = pd.read_csv(file_path, header=None, names=column_names)
df = pd.get_dummies(df, columns=['Sex'], drop_first=True)

X = df.drop('Rings', axis=1)
y = df['Rings']

# Define base model
base_tree = DecisionTreeRegressor(random_state=1)

# Bagging Regressor setup
bagging = BaggingRegressor(estimator=base_tree, random_state=1)

# Grid of hyperparameters
param_grid = {
    'n_estimators': [50, 100],
    'max_samples': [0.7, 0.9, 1.0],
    'max_features': [0.7, 0.9, 1.0],
    'estimator__max_depth': [None, 5, 10]
}

# Grid search with cross-validation
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

grid = GridSearchCV(estimator=bagging, param_grid=param_grid,
                    scoring='r2', cv=cv, n_jobs=-1, verbose=1)

# Fit grid search
grid.fit(X, y)

# Best model and score
print("Best R² Score: {:.3f}".format(grid.best_score_))
print("Best Parameters:", grid.best_params_)


Fitting 30 folds for each of 54 candidates, totalling 1620 fits
Fitting 30 folds for each of 54 candidates, totalling 1620 fits


Best R² Score: 0.560
Best Parameters: {'estimator__max_depth': 10, 'max_features': 0.7, 'max_samples': 0.7, 'n_estimators': 100}
Best R² Score: 0.560
Best Parameters: {'estimator__max_depth': 10, 'max_features': 0.7, 'max_samples': 0.7, 'n_estimators': 100}


In [7]:
# Define base model
base_tree = DecisionTreeRegressor(max_depth = 10, random_state=1)

# Define the model
model = BaggingRegressor(estimator=base_tree, n_estimators=100, max_features=0.7, max_samples=0.7, random_state=1)

# Set up cross-validation
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

# Cross-validate R-squared scores
r2_scores = cross_val_score(model, X, y, scoring='r2', cv=cv, n_jobs=-1)

# Report
print("Cross-validation accuracy:", r2_scores.mean())

Cross-validation accuracy: 0.559771882090571
Cross-validation accuracy: 0.559771882090571
