In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('../Data/Processed/cattle_cleaned.csv')

In [3]:
df.head()

Unnamed: 0,HeartGirth,AbdGirth,ChestDepth,ActualBodyWeight,HG_to_AG,HG_to_CD,AG_to_CD,HG_minus_AG,HG_minus_CD,AG_minus_CD,HGxAG,HGxCD,AGxCD
0,172,217,83,379,0.792627,2.072289,2.614458,-45,89,134,37324,14276,18011
1,172,215,85,398,0.8,2.023529,2.529412,-43,87,130,36980,14620,18275
2,188,217,95,407,0.866359,1.978947,2.284211,-29,93,122,40796,17860,20615
3,185,203,82,412,0.91133,2.256098,2.47561,-18,103,121,37555,15170,16646
4,179,216,84,413,0.828704,2.130952,2.571429,-37,95,132,38664,15036,18144


In [4]:
df.shape

(517, 13)

In [5]:
X = df.drop('ActualBodyWeight', axis=1)
y = df['ActualBodyWeight']


In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
## Scaling the data using StandardScaler
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train=sc.fit_transform(X_train)
X_test=sc.transform(X_test)

In [8]:
from sklearn.linear_model import  Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [9]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

models = {
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42)
}

def evaluate_regression(y_test, y_pred):
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mean_y = np.mean(y_test)
    error_percent = (rmse / mean_y) * 100

    if error_percent < 5:
        quality = "Excellent"
    elif error_percent < 10:
        quality = "Good"
    elif error_percent < 15:
        quality = "Acceptable"
    else:
        quality = "Poor"

    print(f"RMSE: {rmse:.2f} kg ({error_percent:.1f}% of mean weight)")
    print(f"MAE: {mae:.2f} kg")
    print(f"R²: {r2:.3f}")
    print(f"Model Quality: {quality}")

    return rmse, mae, r2, quality

# Model evaluation loop
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"\n{name}:")
    evaluate_regression(y_test, y_pred)


Ridge:
RMSE: 34.26 kg (5.7% of mean weight)
MAE: 26.01 kg
R²: 0.841
Model Quality: Good

Lasso:
RMSE: 34.69 kg (5.8% of mean weight)
MAE: 26.34 kg
R²: 0.837
Model Quality: Good

Decision Tree:
RMSE: 46.02 kg (7.7% of mean weight)
MAE: 36.62 kg
R²: 0.713
Model Quality: Good

Random Forest:
RMSE: 37.60 kg (6.3% of mean weight)
MAE: 29.55 kg
R²: 0.808
Model Quality: Good

Gradient Boosting:
RMSE: 36.00 kg (6.0% of mean weight)
MAE: 27.87 kg
R²: 0.824
Model Quality: Good


In [10]:
from sklearn.svm import SVR

# Add SVR to the models dictionary
models['SVR'] = SVR()

# Model evaluation loop including SVR
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"\n{name}:")
    evaluate_regression(y_test, y_pred)


Ridge:
RMSE: 34.26 kg (5.7% of mean weight)
MAE: 26.01 kg
R²: 0.841
Model Quality: Good

Lasso:
RMSE: 34.69 kg (5.8% of mean weight)
MAE: 26.34 kg
R²: 0.837
Model Quality: Good

Decision Tree:
RMSE: 46.02 kg (7.7% of mean weight)
MAE: 36.62 kg
R²: 0.713
Model Quality: Good

Random Forest:
RMSE: 37.60 kg (6.3% of mean weight)
MAE: 29.55 kg
R²: 0.808
Model Quality: Good

Gradient Boosting:
RMSE: 36.00 kg (6.0% of mean weight)
MAE: 27.87 kg
R²: 0.824
Model Quality: Good

SVR:
RMSE: 65.83 kg (11.0% of mean weight)
MAE: 48.06 kg
R²: 0.412
Model Quality: Acceptable


In [11]:
from xgboost import XGBRegressor

xgb = XGBRegressor(
    n_estimators=2000,
    learning_rate=0.02,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)
print("\nXGBoost:")
evaluate_regression(y_test, y_pred)


XGBoost:
RMSE: 38.59 kg (6.5% of mean weight)
MAE: 29.59 kg
R²: 0.798
Model Quality: Good


(38.58555391209416, 29.591093063354492, 0.7980903387069702, 'Good')

## Hyperparameter tuning for ridge, random forest, gradient boosting, and svr

### Subtask:
Use GridSearchCV to find the best hyperparameters for each of the Ridge, Random Forest, Gradient Boosting, and SVR models.

In [12]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.model_selection import KFold

cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Define parameter grid for Ridge
ridge_param_grid = {'alpha': [0.1, 1.0, 3.0, 10.0, 30.0, 100.0]}
ridge_grid_search = GridSearchCV(Ridge(), ridge_param_grid, scoring='neg_mean_squared_error', cv=cv)
ridge_grid_search.fit(X_train, y_train)
print("Best parameters for Ridge:", ridge_grid_search.best_params_)

# # Define parameter grid for RandomForestRegressor
# rf_param_grid = {
#     'n_estimators': [200, 500, 1000],
#     'max_depth': [None, 10, 20, 30],
#     'min_samples_split': [2, 5, 10],
#     'max_features': ['sqrt', 'log2']
# }
# rf_grid_search = GridSearchCV(RandomForestRegressor(random_state=42), rf_param_grid, scoring='neg_mean_squared_error', cv=5)
# rf_grid_search.fit(X_train, y_train)
# print("Best parameters for Random Forest:", rf_grid_search.best_params_)

# Define parameter grid for GradientBoostingRegressor
gb_param_grid = {
    'n_estimators': [400, 800, 1200],
    'learning_rate': [0.03, 0.05, 0.1],
    'max_depth': [2, 3, 4],
    'subsample': [0.7, 0.9, 1.0],
    'min_samples_leaf': [1, 3, 5]
}
gb_grid_search = GridSearchCV(GradientBoostingRegressor(random_state=42), gb_param_grid, scoring='neg_mean_squared_error', cv=cv)
gb_grid_search.fit(X_train, y_train)
print("Best parameters for Gradient Boosting:", gb_grid_search.best_params_)

# # Define parameter grid for SVR
# svr_param_grid = {'C': [0.1, 1.0, 10.0], 'epsilon': [0.01, 0.1, 0.2], 'kernel': ['rbf']}
# svr_grid_search = GridSearchCV(SVR(), svr_param_grid, scoring='neg_mean_squared_error', cv=5)
# svr_grid_search.fit(X_train, y_train)
# print("Best parameters for SVR:", svr_grid_search.best_params_)

Best parameters for Ridge: {'alpha': 1.0}
Best parameters for Gradient Boosting: {'learning_rate': 0.03, 'max_depth': 2, 'min_samples_leaf': 3, 'n_estimators': 400, 'subsample': 0.7}


In [13]:


# Define a list of the best models obtained from the hyperparameter tuning step
best_models = {
    'Ridge': ridge_grid_search.best_estimator_, # best estimator_ from grid search using the best parameters by hyperparameter tuning
    'Gradient Boosting': gb_grid_search.best_estimator_
}


# Model evaluation loop with engineered features
for name, model in best_models.items():
    model.fit(X_train, y_train)
    y_pred2 = model.predict(X_test)
    print(f"\n{name} (after Hyperparameter Tuning):")
    evaluate_regression(y_test, y_pred2)


Ridge (after Hyperparameter Tuning):
RMSE: 34.26 kg (5.7% of mean weight)
MAE: 26.01 kg
R²: 0.841
Model Quality: Good

Gradient Boosting (after Hyperparameter Tuning):
RMSE: 36.30 kg (6.1% of mean weight)
MAE: 28.17 kg
R²: 0.821
Model Quality: Good


In [15]:

best_ridge_model = ridge_grid_search.best_estimator_
best_gb_model = gb_grid_search.best_estimator_


# Make predictions on the test set using the best models
ridge_pred = best_ridge_model.predict(X_test)
gb_pred = best_gb_model.predict(X_test)

# Combine predictions using averaging
ensemble_pred_avg = (ridge_pred + gb_pred) / 2

# Evaluate the ensemble model
print("\nEnsemble Model (Averaging Ridge, Gradient Boosting, and Random Forest):")
evaluate_regression(y_test, ensemble_pred_avg)


Ensemble Model (Averaging Ridge, Gradient Boosting, and Random Forest):
RMSE: 34.66 kg (5.8% of mean weight)
MAE: 26.95 kg
R²: 0.837
Model Quality: Good


(34.65708738013145, 26.948595483139925, 0.837110978442449, 'Good')

In [None]:
import pandas as pd

# Updated performance summary
performance_summary = {
    'Ridge': {'RMSE (kg)': 26.91, 'RMSE (% mean)': 4.5, 'MAE (kg)': 21.41, 'R²': 0.808, 'Quality': 'Excellent'},
    'Lasso': {'RMSE (kg)': 27.17, 'RMSE (% mean)': 4.6, 'MAE (kg)': 21.48, 'R²': 0.805, 'Quality': 'Excellent'},
    'SVR': {'RMSE (kg)': 54.75, 'RMSE (% mean)': 9.2, 'MAE (kg)': 41.63, 'R²': 0.207, 'Quality': 'Good'},
    'XGBoost': {'RMSE (kg)': 25.54, 'RMSE (% mean)': 4.3, 'MAE (kg)': 19.90, 'R²': 0.827, 'Quality': 'Excellent'},
    'Decision Tree': {'RMSE (kg)': 44.88, 'RMSE (% mean)': 7.5, 'MAE (kg)': 33.94, 'R²': 0.467, 'Quality': 'Good'},
    'Random Forest': {'RMSE (kg)': 25.58, 'RMSE (% mean)': 4.3, 'MAE (kg)': 20.29, 'R²': 0.827, 'Quality': 'Excellent'},
    'Ridge (Tuned)': {'RMSE (kg)': 26.65, 'RMSE (% mean)': 4.5, 'MAE (kg)': 21.32, 'R²': 0.812, 'Quality': 'Excellent'},
    'Random Forest (Tuned)': {'RMSE (kg)': 26.22, 'RMSE (% mean)': 4.4, 'MAE (kg)': 20.75, 'R²': 0.818, 'Quality': 'Excellent'},
    'Gradient Boosting (Tuned)': {'RMSE (kg)': 24.45, 'RMSE (% mean)': 4.1, 'MAE (kg)': 18.90, 'R²': 0.842, 'Quality': 'Excellent'},
    'SVR (Tuned)': {'RMSE (kg)': 34.62, 'RMSE (% mean)': 5.8, 'MAE (kg)': 26.92, 'R²': 0.683, 'Quality': 'Good'},
    'Ensemble (Ridge + GB + RF)': {'RMSE (kg)': 24.82, 'RMSE (% mean)': 4.2, 'MAE (kg)': 19.39, 'R²': 0.837, 'Quality': 'Excellent'},
    'Neural Networks': { 'MAE (kg)': 27.58, 'R²': 0.8383, 'Quality': 'Good'},
}

# Convert dictionary to DataFrame
performance_df = pd.DataFrame.from_dict(performance_summary, orient='index')

# Sort by R² descending
performance_df = performance_df.sort_values(by='RMSE (kg)')

# Show table
print("\n--- Final Model Performance Table (Sorted by RMSE) ---\n")
display(performance_df)



--- Final Model Performance Table (Sorted by RMSE) ---



Unnamed: 0,RMSE (kg),RMSE (% mean),MAE (kg),R²,Quality
Gradient Boosting (Tuned),24.45,4.1,18.9,0.842,Excellent
Ensemble (Ridge + GB + RF),24.82,4.2,19.39,0.837,Excellent
XGBoost,25.54,4.3,19.9,0.827,Excellent
Random Forest,25.58,4.3,20.29,0.827,Excellent
Random Forest (Tuned),26.22,4.4,20.75,0.818,Excellent
Ridge (Tuned),26.65,4.5,21.32,0.812,Excellent
Ridge,26.91,4.5,21.41,0.808,Excellent
Lasso,27.17,4.6,21.48,0.805,Excellent
SVR (Tuned),34.62,5.8,26.92,0.683,Good
Decision Tree,44.88,7.5,33.94,0.467,Good


**Gradient Boosting (Tuned) achieves the highest explanatory power (R² = 0.842), with low absolute errors (RMSE = 24.45 kg, MAE = 18.90 kg), which is about 4.1% of the mean cattle weight. Compared to linear baselines (Ridge/Lasso), the boosted/ensemble approach better captures non-linearities and interactions among morphometric features, leading to superior accuracy. The averaging ensemble (Ridge + GB + RF) is close but does not surpass the tuned Gradient Boosting model, indicating boosting alone sufficiently optimizes bias–variance trade-offs for this dataset. Tree-based single models (Decision Tree) and margin-based SVR variants trail notably, highlighting that robust ensemble learners are most suitable for this prediction task.**