In [None]:
# Practical 5: Regression Models on UCI Abalone Dataset

from ucimlrepo import fetch_ucirepo 
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import pandas as pd

In [None]:
 
# fetch dataset 
abalone = fetch_ucirepo(id=1) 
  
# data (as pandas dataframes) 
X = abalone.data.features 
le = LabelEncoder()
X['Sex'] = le.fit_transform(X['Sex'])
y = abalone.data.targets 
  
# metadata 
print(abalone.metadata) 
  
# variable information 
print(abalone.variables) 


{'uci_id': 1, 'name': 'Abalone', 'repository_url': 'https://archive.ics.uci.edu/dataset/1/abalone', 'data_url': 'https://archive.ics.uci.edu/static/public/1/data.csv', 'abstract': 'Predict the age of abalone from physical measurements', 'area': 'Biology', 'tasks': ['Classification', 'Regression'], 'characteristics': ['Tabular'], 'num_instances': 4177, 'num_features': 8, 'feature_types': ['Categorical', 'Integer', 'Real'], 'demographics': [], 'target_col': ['Rings'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1994, 'last_updated': 'Mon Aug 28 2023', 'dataset_doi': '10.24432/C55C7W', 'creators': ['Warwick Nash', 'Tracy Sellers', 'Simon Talbot', 'Andrew Cawthorn', 'Wes Ford'], 'intro_paper': None, 'additional_info': {'summary': 'Predicting the age of abalone from physical measurements.  The age of abalone is determined by cutting the shell through the cone, staining it, and counting the number of rings through a microscope -- 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Sex'] = le.fit_transform(X['Sex'])


In [12]:
# Convert to numpy arrays
X = X.to_numpy()
y = y.to_numpy().ravel()  # Flatten to 1D

scaler = StandardScaler()
X = scaler.fit_transform(X)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [14]:
models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(alpha=1.0),
    'Lasso': Lasso(alpha=0.1),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42)
}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    
    print(f"\n{name}:")
    print(f"  MSE:  {mse:.4f}")
    print(f"  RMSE: {rmse:.4f}")
    print(f"  R²:   {r2:.4f}")




Linear Regression:
  MSE:  5.0625
  RMSE: 2.2500
  R²:   0.5323

Ridge:
  MSE:  5.0628
  RMSE: 2.2501
  R²:   0.5323

Lasso:
  MSE:  5.5393
  RMSE: 2.3536
  R²:   0.4883

Decision Tree:
  MSE:  9.8002
  RMSE: 3.1305
  R²:   0.0947

Random Forest:
  MSE:  5.0941
  RMSE: 2.2570
  R²:   0.5294


In [15]:
# K-Fold Cross-Validation
print("\n" + "=" * 80)
print("K-Fold Cross-Validation (k=5)")
print("=" * 80)

kf = KFold(n_splits=5, shuffle=True, random_state=42)
for name, model in models.items():
    mse_scores = -cross_val_score(model, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')
    rmse_scores = np.sqrt(mse_scores)
    r2_scores = cross_val_score(model, X_train, y_train, cv=kf, scoring='r2')
    
    print(f"\n{name}:")
    print(f"  Avg MSE:  {mse_scores.mean():.4f} (+/- {mse_scores.std():.4f})")
    print(f"  Avg RMSE: {rmse_scores.mean():.4f} (+/- {rmse_scores.std():.4f})")
    print(f"  Avg R²:   {r2_scores.mean():.4f} (+/- {r2_scores.std():.4f})")


K-Fold Cross-Validation (k=5)

Linear Regression:
  Avg MSE:  5.0780 (+/- 0.3990)
  Avg RMSE: 2.2517 (+/- 0.0878)
  Avg R²:   0.5036 (+/- 0.0513)

Ridge:
  Avg MSE:  5.0786 (+/- 0.4019)
  Avg RMSE: 2.2519 (+/- 0.0884)
  Avg R²:   0.5036 (+/- 0.0514)

Lasso:
  Avg MSE:  5.4516 (+/- 0.3623)
  Avg RMSE: 2.3336 (+/- 0.0772)
  Avg R²:   0.4678 (+/- 0.0427)

Decision Tree:
  Avg MSE:  8.9097 (+/- 0.5546)
  Avg RMSE: 2.9835 (+/- 0.0925)
  Avg R²:   0.1300 (+/- 0.0681)

Random Forest:
  Avg MSE:  4.6750 (+/- 0.4248)
  Avg RMSE: 2.1601 (+/- 0.0958)
  Avg R²:   0.5442 (+/- 0.0368)


Random Forest is performing the best as it has the lowest mean squared error and highest R square due to non-linear relationships between features.