<a href="https://colab.research.google.com/github/shahbhavya7/cattle_model/blob/main/cattle_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats.mstats import winsorize

In [2]:
# Load the dataset from an Excel workbook given by a relative path,
# so the file must be present in the kernel's working directory.
ds=pd.read_excel('cattle_analysis.xlsx')

In [None]:
# Print column names, dtypes and non-null counts for a first look at the data.
ds.info()

In [None]:
# Pairwise correlations between the numeric columns of the raw frame.
# numeric_only=True keeps this cell working on pandas >= 2.0 even if a
# non-numeric column (for example a string identifier) is present; with an
# all-numeric frame the result is unchanged.
correlation_matrix=ds.corr(numeric_only=True)
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5)
plt.title('Correlation Matrix Heatmap')
plt.show()

In [None]:
# Show the column labels of the loaded frame.
ds.columns

In [None]:
# Count missing values per column.
ds.isnull().sum()

In [None]:
# Count fully duplicated rows.
ds.duplicated().sum()

In [3]:
# Remove the AnimalNo column so it is not used as a feature
# (presumably a per-animal identifier -- confirm against the data).
# errors='ignore' makes the cell idempotent: re-running it after the column
# is already gone no longer raises KeyError.
ds=ds.drop(columns=['AnimalNo'],errors='ignore')

In [4]:
# Features are every column except the last; the target is the last column.
# NOTE(review): this relies on column order in the workbook -- the target is
# presumably the body weight, judging by the labels printed later; confirm.
X=ds.iloc[:,:-1]
y=ds.iloc[:,-1]

In [5]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [None]:
# Draw one box plot per training feature (a separate figure each) to inspect
# the spread and outliers of every column.
for col in X_train.columns:
  sns.boxplot(data=X_train,x=col)
  plt.show()
  print('\n')


In [6]:
def get_limits(X,lower=0.02,upper=0.98):
  """Compute per-column quantile bounds for a DataFrame.

  Parameters
  ----------
  X : object exposing ``.columns`` and column lookup via ``X[col]``
      (a pandas DataFrame in this notebook).
  lower, upper : float
      Quantile levels passed to ``np.quantile`` for the low and high bound.

  Returns
  -------
  dict
      Maps each column label to a ``(low_bound, high_bound)`` tuple.
  """
  bounds={}
  for name in X.columns:
    values=X[name]
    bounds[name]=(np.quantile(values,lower),np.quantile(values,upper))
  return bounds

In [8]:
def apply_limits(X,limits):
  """Return a copy of ``X`` with every column clipped to its stored bounds.

  Parameters
  ----------
  X : pandas DataFrame
      Frame to clip; it is not modified.
  limits : dict
      Maps each column label of ``X`` to a ``(low, high)`` pair, as produced
      by ``get_limits``. A column of ``X`` missing from ``limits`` raises
      ``KeyError``.

  Returns
  -------
  pandas DataFrame
      Copy of ``X`` whose columns have been passed through ``np.clip``.
  """
  clipped=X.copy()
  for name in X.columns:
    low,high=limits[name]
    clipped[name]=np.clip(X[name],low,high)
  return clipped

In [9]:
def get_manual_cap_limits(df, column):
    """Return the 1.5 * IQR fences for one column.

    Parameters
    ----------
    df : pandas DataFrame
        Frame holding the column.
    column : hashable
        Label of the column to analyse.

    Returns
    -------
    tuple
        ``(lower_limit, upper_limit)`` where the lower limit is
        ``Q1 - 1.5 * IQR`` and the upper limit is ``Q3 + 1.5 * IQR``.
    """
    first_quartile, third_quartile = (df[column].quantile(q) for q in (0.25, 0.75))
    spread = third_quartile - first_quartile
    return first_quartile - 1.5 * spread, third_quartile + 1.5 * spread

In [16]:
from sklearn.base import BaseEstimator, TransformerMixin
class Winsorizer(BaseEstimator,TransformerMixin):
  """Clip every column to quantile bounds learned during ``fit``.

  Parameters
  ----------
  lower, upper : float
      Quantile levels used for the low and high clipping bound.

  Attributes
  ----------
  limits : dict
      Per-column ``(low, high)`` bounds; empty until ``fit`` is called.
  lower_quantile, upper_quantile : float
      Aliases of ``lower`` / ``upper`` kept for backward compatibility.
  """
  def __init__(self,lower=0.02,upper=0.98):
    # BaseEstimator.get_params()/clone() look up attributes named exactly
    # like the __init__ arguments, so the values must be stored as
    # ``self.lower`` / ``self.upper``; otherwise cloning the estimator
    # (e.g. inside GridSearchCV) fails with AttributeError.
    self.lower=lower
    self.upper=upper
    self.limits={}

  @property
  def lower_quantile(self):
    """Alias of ``lower`` (the attribute name used previously)."""
    return self.lower

  @lower_quantile.setter
  def lower_quantile(self,value):
    self.lower=value

  @property
  def upper_quantile(self):
    """Alias of ``upper`` (the attribute name used previously)."""
    return self.upper

  @upper_quantile.setter
  def upper_quantile(self,value):
    self.upper=value

  def fit(self,X,y=None):
    """Learn the per-column bounds from ``X``; ``y`` is ignored. Returns ``self``."""
    self.limits=get_limits(X,self.lower,self.upper)
    return self

  def transform(self,X):
    """Return a copy of ``X`` clipped to the bounds learned in ``fit``."""
    return apply_limits(X,self.limits)

In [17]:
class Manual_capper(BaseEstimator,TransformerMixin):
  """Cap a single column at bounds computed by ``get_manual_cap_limits``.

  Parameters
  ----------
  column : hashable
      Label of the column to cap.

  Attributes
  ----------
  limits : tuple
      ``(lower, upper)`` bounds; ``(None, None)`` until ``fit`` is called.
  """
  def __init__(self,column):
    self.column=column
    self.limits=(None,None)

  def fit(self,X,y=None):
    """Compute the bounds for ``self.column`` from ``X``; ``y`` is ignored. Returns ``self``."""
    self.limits=get_manual_cap_limits(X,self.column)
    return self

  def transform(self,X):
    """Return a copy of ``X`` with ``self.column`` clipped to the stored bounds."""
    low,high=self.limits
    capped=X.copy()
    capped[self.column]=np.clip(capped[self.column],low,high)
    return capped

In [18]:
from sklearn.base import BaseEstimator, TransformerMixin

class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Replace three raw measurement columns with two ratio features.

    ``Rumpheight`` and ``BLBodylengthcm`` are each divided by
    ``WHHeightAtWither``; the three source columns are then dropped.
    The transformer is stateless, so ``fit`` learns nothing.
    """
    def fit(self, X, y=None):
        """No-op fit; returns ``self``."""
        return self

    def transform(self, X):
        """Return a new frame with the ratio columns added and the source columns removed."""
        wither_height = X['WHHeightAtWither']
        with_ratios = X.assign(
            Rump_Wither_Ratio=X['Rumpheight'] / wither_height,
            Length_Wither_Ratio=X['BLBodylengthcm'] / wither_height,
        )
        return with_ratios.drop(columns=['Rumpheight','WHHeightAtWither','BLBodylengthcm'])

In [32]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

def evaluate_regression(y_test, y_pred):
    """Print and return RMSE, MAE, R² and a coarse quality label.

    Parameters
    ----------
    y_test : array-like
        True target values.
    y_pred : array-like
        Predicted target values.

    Returns
    -------
    tuple
        ``(rmse, mae, r2, quality)`` where ``quality`` is one of
        "Excellent", "Good", "Acceptable" or "Poor".

    Notes
    -----
    The printed lines label the errors in kg and relative to the "mean
    weight" -- this assumes the target is a weight in kilograms.
    """
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    # Errors expressed as a percentage of the mean true value.
    target_mean = np.mean(y_test)
    rmse_percent = (rmse / target_mean) * 100
    mae_percent = (mae / target_mean) * 100

    # Quality bands, checked from strictest to loosest: RMSE% must be below
    # the first number AND R² above the second for the label to apply.
    quality_bands = (
        (5, 0.85, "Excellent"),
        (10, 0.75, "Good"),
        (15, 0.6, "Acceptable"),
    )
    quality = "Poor"
    for max_rmse_percent, min_r2, label in quality_bands:
        if rmse_percent < max_rmse_percent and r2 > min_r2:
            quality = label
            break

    print(f"RMSE: {rmse:.2f} kg ({rmse_percent:.1f}% of mean weight)")
    print(f"MAE: {mae:.2f} kg ({mae_percent:.1f}% of mean weight)")
    print(f"R²: {r2:.3f}")
    print(f"Model Quality: {quality}")

    return rmse, mae, r2, quality


In [38]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge,Lasso ,LinearRegression
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor

# End-to-end model: ratio features -> quantile clipping of every column ->
# IQR capping of CannonBoneDiameter -> standardisation -> random forest.
# All preprocessing statistics are learned from the training split only.
pipeline = Pipeline([
    ('features',FeatureEngineer()),
    ('winsor', Winsorizer()),
    ('capper', Manual_capper('CannonBoneDiameter')),
    ('scaler', StandardScaler()),
    # Seeded so the reported metrics are reproducible across runs, matching
    # the random_state used for the train/test split.
    ('model', RandomForestRegressor(random_state=42))
])

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

In [39]:
# Score the pipeline's predictions on the held-out test split.
evaluate_regression(y_test, y_pred)

RMSE: 34.74 kg (5.8% of mean weight)
MAE: 26.80 kg (4.5% of mean weight)
R²: 0.836
Model Quality: Good


(np.float64(34.73980699981818), 26.801153846153852, 0.8363324826736054, 'Good')

In [None]:
# for col in X_train_win.columns:
#   sns.boxplot(data=X_train_win,x=col)
#   plt.show()
#   print('\n')

In [None]:
# skew_result = X_win.skew(numeric_only=True).to_dict()
# print(skew_result)

In [None]:
# skew_result = y_win.skew(numeric_only=True)
# print(skew_result)

0.12408513676691647


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

def _run_grid_search(message, estimator, param_grid):
    """Fit a 5-fold, negative-MSE-scored GridSearchCV on the training split.

    Prints ``message`` followed by the best parameter combination and
    returns the fitted search object.
    """
    search = GridSearchCV(estimator, param_grid, scoring='neg_mean_squared_error', cv=5)
    search.fit(X_train, y_train)
    print(message, search.best_params_)
    return search


# NOTE(review): every search below is fitted on the raw X_train, i.e. without
# the feature-engineering / clipping / scaling steps used by `pipeline`.

# Ridge
ridge_param_grid = {'alpha': [0.1, 1.0, 10.0]}
ridge_grid_search = _run_grid_search("Best parameters for Ridge:", Ridge(), ridge_param_grid)

# Lasso
lasso_param_grid = {'alpha': [0.001, 0.01, 0.1, 1.0, 10.0]}
lasso_grid_search = _run_grid_search("Best parameters for Lasso:", Lasso(max_iter=5000), lasso_param_grid)

# Linear Regression (no hyperparameters to tune, but still use CV for consistency)
linreg_param_grid = {}
linreg_grid_search = _run_grid_search(
    "Best parameters for Linear Regression (no tuning):",
    LinearRegression(),
    linreg_param_grid,
)

# Random Forest
rf_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}
rf_grid_search = _run_grid_search(
    "Best parameters for Random Forest:",
    RandomForestRegressor(random_state=42),
    rf_param_grid,
)

# XGBoost
xgb_param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}
xgb_grid_search = _run_grid_search(
    "Best parameters for XGBoost:",
    XGBRegressor(random_state=42, n_jobs=-1, verbosity=0),
    xgb_param_grid,
)

# LightGBM
lgbm_param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [-1, 5, 10]
}
lgbm_grid_search = _run_grid_search(
    "Best parameters for LightGBM:",
    LGBMRegressor(random_state=42),
    lgbm_param_grid,
)

# CatBoost
cat_param_grid = {
    'iterations': [200, 500],
    'learning_rate': [0.01, 0.1],
    'depth': [4, 6, 8]
}
cat_grid_search = _run_grid_search(
    "Best parameters for CatBoost:",
    CatBoostRegressor(random_state=42, verbose=0),
    cat_param_grid,
)

# SVR
svr_param_grid = {
    'C': [0.1, 1.0, 10.0],
    'epsilon': [0.01, 0.1, 0.2],
    'kernel': ['rbf']
}
svr_grid_search = _run_grid_search("Best parameters for SVR:", SVR(), svr_param_grid)


In [None]:
# Collect the refitted best estimator of every grid search, keyed by the
# label printed above its metrics, then score each one on the test split.
fitted_searches = {
    'Ridge': ridge_grid_search,
    'Lasso': lasso_grid_search,
    'LinearRegression': linreg_grid_search,
    'Random Forest': rf_grid_search,
    'XGboost': xgb_grid_search,
    'Lightboost': lgbm_grid_search,
    'Catboost': cat_grid_search,
    'SVR': svr_grid_search,
}
best_models = {label: search.best_estimator_ for label, search in fitted_searches.items()}

for name, model in best_models.items():
    y_pred = model.predict(X_test)
    print(f"\n{name}:")
    evaluate_regression(y_test, y_pred)

RMSE: 30.04 kg (5.0% of mean weight)
MAE: 22.97 kg (3.8% of mean weight)
R²: 0.866
Model Quality: Good
(np.float64(30.039672370500533), 22.9675643106626, 0.866380202946881, 'Good')