# Machine Learning Modeling for Sow Productivity

Full Title: A Comparative Analysis of Machine Learning Classifiers for Modeling the Number of Liveborn Piglets

Ji Yang*, Mohsen Jafarikia*,†,‡, Patrick Gagnon§, Laurence Maignel†, Brent DeVries#, Julang Li*, Dan Tulpan*

*Department of Animal Biosciences, Centre for Genetic Improvement of Livestock, University of Guelph, Guelph, Ontario, N1G 2W1, Canada <br>
†Canadian Centre for Swine Improvement Inc., Ottawa, Ontario, K1V 0M7, Canada <br>
‡Department of Animal Science and Aquaculture, Dalhousie University, Truro, Nova Scotia, B2N 5E3, Canada <br>
§Le Centre de développement du porc du Québec Inc., Québec City, Québec, J4H 4E7, Canada <br>
#Hypor, Hendrix Genetics, Regina, Saskatchewan, S4N 0N7, Canada

Script Author: Ji Yang

## 1. Introduction

The objectives of this study are:
1. 

## 2. Materials and Methods

### 2.1 Materials

This study utilized two datasets: the 'CDPQ Dataset' and the 'Hypor Dataset'.

In [1]:
# -------------------------------------------------------------------
# Step 1. Define and preprocessing of the CDPQ and Hypor Datasets
# -------------------------------------------------------------------
import pandas as pd
import numpy as np
from utils import merge_consecutive_records, bin

# --- Preprocessing CDPQ Dataset ---
# Rename columns
class_labels = {0: 'Low', 1: 'Medium', 2: 'High'}
class_labels_int = list(class_labels.keys())
class_labels_str = list(class_labels.values())

def preprocess(df, dataset_name, class_labels_str):
    if dataset_name == "cdpq":
        df = df.rename(columns={
            "Breeding Weight": "Breeding BW",
            "Farrowing Weight": "Farrowing BW",
            "Weaning Weight": "Weaning BW",
            "Breeding Backfat": "Breeding BFT",
            "Farrowing Backfat": "Farrowing BFT",
            "Weaning Backfat": "Weaning BFT"
        })

        # Setting farm and rearranging columns
        df['Farm'] = 1
        df = df[["Sow ID", "Farm", "Parity", "Gestation Length", "Lactation Length", 
                "Stillborn", "Mummies", "Piglets Weaned", "Breeding BW", "Farrowing BW", 
                "Weaning BW", "Breeding BFT", "Farrowing BFT", "Weaning BFT", "Liveborn"]]

    elif dataset_name == "cdpq_subset":
        df['Farm'] = 1
        df = df[["Sow ID", "Farm", "Parity", "Gestation Length", "Lactation Length", 
                "Stillborn", "Mummies", "Piglets Weaned", "Liveborn"]]
        
    else:

        df['Farm'] = df['Farm'].astype('category').copy()
        df['Farm'] = df['Farm'].cat.codes + 1
        df = df[["Sow ID", "Farm", "Parity", "Gestation Length", "Lactation Length",
                "Stillborn", "Mummies", "Piglets Weaned", "Liveborn"]]
        
    df['Sow ID'] = df['Sow ID'].astype('object').copy()
    df['Farm'] = df['Farm'].astype('category').copy()
    df = merge_consecutive_records(df, 'Sow ID')

    

    df['Classification'] = bin(df['Liveborn (Next Parity)'], inner_quantiles=[0.25, 0.75], labels=class_labels_int)
    df["Classification"] = df["Classification"].astype('category').copy()
    df = df.dropna()

    X = df[df.columns[2:-2]]
    y = df[df.columns[-1]]
    group_col = 'Farm' if dataset_name == "hypor" else 'Sow ID'
    group = df[group_col]
    
    # --- Save the processed dataset ---
    processed_dataset_path = f"../processed_data/{dataset_name}_processed_dataset.csv"
    df.to_csv(processed_dataset_path, index=False)
    print(f"Processed dataset saved at: {processed_dataset_path}")
    
    return X, y, group

# Working file paths
def read(dataset_name):
    dataset_path = f"../raw_data/{dataset_name}_raw_dataset.xlsx"
    figure_directory = f"../figures/{dataset_name}/"

    try:
        df = pd.read_excel(dataset_path)
    except FileNotFoundError:
        print(f"Data file not found at: {dataset_path}")
    
    return df, figure_directory

In [2]:
# -------------------------------------------------------------------
# Step 2. Load the datasets
# -------------------------------------------------------------------

df_cdpq, _ = read("cdpq")
_X, _y, _group = preprocess(df_cdpq, "cdpq_subset", class_labels_str)

df_hypor, _ = read("hypor")
X, y, group = preprocess(df_hypor, "hypor", class_labels_str)
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Sow ID'] = df['Sow ID'].astype('object').copy()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Farm'] = df['Farm'].astype('category').copy()


Processed dataset saved at: ../processed_data/cdpq_subset_processed_dataset.csv
Processed dataset saved at: ../processed_data/hypor_processed_dataset.csv


In [3]:
# -------------------------------------------------------------------
# Step 3. Machine Learning Algorithm Definitions
# -------------------------------------------------------------------
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from model import Estimator, EstimatorGlobalParameters

seed = 42
max_iter = int( 1E5 )
# Define the number of parallel jobs. Use a specific number instead of -1
# to prevent memory exhaustion on systems with many cores.
# Adjust this value based on your system's available RAM.
n_jobs = 1
model_params = EstimatorGlobalParameters(
    X, y, group, split_method='logo', 
    random_state=seed, directory=f"../figures/hypor_train_cdpq_test/", n_jobs=n_jobs
)

CLASSIFIERS = {
    "DT": DecisionTreeClassifier(random_state=seed),
    "KNN": KNeighborsClassifier(),
    "LR": LogisticRegression(random_state=seed, max_iter=max_iter, n_jobs=n_jobs),
    "MLP": MLPClassifier(random_state=seed, max_iter=max_iter),
    "RF": RandomForestClassifier(random_state=seed, n_jobs=n_jobs),
    "SGD": SGDClassifier(random_state=seed, max_iter=max_iter, n_jobs=n_jobs),
    "SVM": SVC(random_state=seed, max_iter=-1),
}

CLASSIFIER_PARAMS = {
    "DT":       {'classifier__class_weight': 'balanced', 'classifier__criterion': 'gini', 'classifier__max_depth': 3, 'classifier__splitter': 'random'},
    "KNN":      {'classifier__algorithm': 'ball_tree', 'classifier__n_neighbors': 13, 'classifier__p': 1, 'classifier__weights': 'uniform'},
    "LR":       {'classifier__C': np.float64(0.81), 'classifier__class_weight': None, 'classifier__fit_intercept': True},
    "MLP":      {'classifier__activation': 'relu', 'classifier__alpha': 0.001, 'classifier__hidden_layer_sizes': (50, 50), 'classifier__learning_rate': 'constant', 'classifier__solver': 'adam'},
    "RF":       {'classifier__class_weight': None, 'classifier__criterion': 'gini', 'classifier__max_depth': None, 'classifier__n_estimators': 100},
    "SGD":      {'classifier__alpha': 0.01, 'classifier__class_weight': 'balanced', 'classifier__fit_intercept': True, 'classifier__loss': 'modified_huber', 'classifier__penalty': 'l1'},
    "SVM":      {'classifier__C': np.float64(1.41), 'classifier__class_weight': None, 'classifier__degree': 1, 'classifier__gamma': 'scale', 'classifier__kernel': 'rbf'}
}

In [4]:
# -------------------------------------------------------------------
# Step 4. Machine Learning Algorithms Pipeline
# -------------------------------------------------------------------
from sklearn.metrics import f1_score, balanced_accuracy_score, precision_score, recall_score, confusion_matrix
from functools import partial
from graphing import plot_confusion_matrices

# Suppress the specific UserWarning from pkg_resources that is triggered by multiprocessing.
# This warning is not actionable from within this codebase and clutters the output.
import warnings
warnings.filterwarnings("ignore", message="A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.", category=UserWarning)
warnings.filterwarnings("ignore", message="y_pred contains classes not in y_true", category=UserWarning)
warnings.filterwarnings("ignore", message="pkg_resources is deprecated as an API.", category=UserWarning)

guiding_scorer = 'f1_weighted'
scoring_functions = {
    'balanced_accuracy': balanced_accuracy_score,
    'precision_weighted': partial(precision_score, average='weighted', labels=class_labels_int),
    'recall_weighted': partial(recall_score, average='weighted', labels=class_labels_int),
    'f1_none': partial(f1_score, average=None, labels=class_labels_int),
    'f1_weighted': partial(f1_score, average='weighted', labels=class_labels_int),
    'confusion_matrix': partial(confusion_matrix, labels=class_labels_int)
}

print("--- Classifier Performance Evaluation ---")

OPT_CLASSIFIERS = {}
all_cms = {}
for clf_name, clf in CLASSIFIERS.items():
    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', clf)
    ])

    pipe.set_params(**CLASSIFIER_PARAMS[clf_name])
    OPT_CLASSIFIERS[clf_name] = Estimator(pipe, clf_name, model_params)
    OPT_CLASSIFIERS[clf_name].fit()
    
    scores = OPT_CLASSIFIERS[clf_name].score(scoring_functions, X_test=_X, y_test=_y)
    print(clf_name)
    print(scores)
    
    all_cms[clf_name] = [scores['confusion_matrix']]

--- Classifier Performance Evaluation ---
DT
{'balanced_accuracy': 0.4248288282229396, 'precision_weighted': 0.42810545081943535, 'recall_weighted': 0.41776315789473684, 'f1_none': array([0.37974684, 0.44117647, 0.42225392]), 'f1_weighted': 0.41865973700255327, 'confusion_matrix': array([[135, 135,  83],
       [168, 225, 169],
       [ 55,  98, 148]])}
KNN
{'balanced_accuracy': 0.3692030343789107, 'precision_weighted': 0.4219894871688783, 'recall_weighted': 0.43338815789473684, 'f1_none': array([0.32163743, 0.55970696, 0.18276762]), 'f1_weighted': 0.3972914291788692, 'confusion_matrix': array([[110, 225,  18],
       [151, 382,  29],
       [ 70, 196,  35]])}
LR
{'balanced_accuracy': 0.35811559082459904, 'precision_weighted': 0.5247044722408114, 'recall_weighted': 0.4761513157894737, 'f1_none': array([0.17307692, 0.63114754, 0.02597403]), 'f1_weighted': 0.34837109679039685, 'confusion_matrix': array([[ 36, 316,   1],
       [ 21, 539,   2],
       [  6, 291,   4]])}
MLP
{'balanced_acc

In [5]:
from utils import ensemble

default_overfit = ['DT', 'KNN', 'RF']
optimized_overfit = ['KNN', 'RF']
OPT_CLASSIFIERS = ensemble(OPT_CLASSIFIERS, exclusion_list=optimized_overfit)

for clf_name in ['Voting', 'Stacking']:
    OPT_CLASSIFIERS[clf_name].fit()
    scores = OPT_CLASSIFIERS[clf_name].score(scoring_functions, X_test=_X, y_test=_y) 
    print(scores)
    
    all_cms[clf_name] = [scores['confusion_matrix']]
    
plot_confusion_matrices(
    all_cms,
    class_labels=class_labels,
    normalize='true', # Can be 'true', 'pred', or None
    figure_directory=model_params.directory
)

{'balanced_accuracy': 0.3690886506153384, 'precision_weighted': 0.45546720802312907, 'recall_weighted': 0.46710526315789475, 'f1_none': array([0.24267782, 0.60951189, 0.12921348]), 'f1_weighted': 0.3841317536974686, 'confusion_matrix': array([[ 58, 285,  10],
       [ 53, 487,  22],
       [ 14, 264,  23]])}
{'balanced_accuracy': 0.36742417175934255, 'precision_weighted': 0.47761608938887856, 'recall_weighted': 0.47368421052631576, 'f1_none': array([0.22123894, 0.62043796, 0.0952381 ]), 'f1_weighted': 0.374547815120289, 'confusion_matrix': array([[ 50, 300,   3],
       [ 36, 510,  16],
       [ 13, 272,  16]])}
Confusion matrix plot saved to: ../figures/hypor_train_cdpq_test/confusion_matrices/cm_summary_normalized_true_optimized.png
