In [23]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [24]:
# @title 1. Install Libraries
!pip install deap openpyxl scikit-learn --quiet
print("Required libraries checked/installed.")

Required libraries checked/installed.


In [25]:
!pip install xgboost --quiet
print("XGBoost installation checked.")

XGBoost installation checked.


In [26]:
# @title 2. Import Libraries & Mount Google Drive
import pandas as pd
import numpy as np
import random
import pickle
import os
import time
import traceback # For detailed error printing

from google.colab import drive

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Import DEAP components after installation
try:
    from deap import base, creator, tools, algorithms
except ImportError:
    print("DEAP library not found after installation attempt. Please check installation.")
    raise SystemExit("DEAP required.")


print("Libraries imported.")

Libraries imported.


In [41]:
# @title 3. Define Data Path, Target Column, Threshold & Load Data

# === Configuration ===
# --- PATH TO YOUR *NEW* DATASET (with correct CCS score) ---
data_path = '/content/drive/MyDrive/MatchFound.xlsx' # <--- *** REPLACE THIS PATH ***
# --- NAME OF THE COLUMN WITH THE RAW CCS SCORE ---
target_score_column = 'Compatibility_Score' # <--- *** VERIFY/CHANGE THIS COLUMN NAME ***
# --- THRESHOLD FOR YES/NO CLASSIFICATION ---
compatibility_threshold = 20 # <--- *** VERIFY/CHANGE THIS THRESHOLD ***
# --- PATH TO SAVE FINAL MODEL COMPONENTS ---
# Create the directory for saving if it doesn't exist
model_save_directory = '/content/drive/MyDrive/MyModels' # Define directory
model_save_path = os.path.join(model_save_directory, 'cattle_predictor_v5.pkl') # Define full path
if not os.path.exists(model_save_directory):
    print(f"Creating save directory: {model_save_directory}")
    os.makedirs(model_save_directory)


print(f"Attempting to load data from: {data_path}")
if not os.path.exists(data_path):
    print(f"ERROR: File not found: {data_path}"); raise SystemExit("Dataset not found.")

try:
    df = pd.read_excel(data_path, engine='openpyxl')
    print(f"Dataset loaded successfully. Shape: {df.shape}")
    if target_score_column not in df.columns:
        print(f"ERROR: Target score column '{target_score_column}' not found!")
        print(f"Available columns: {df.columns.tolist()}")
        raise SystemExit("Target column missing.")
except Exception as e:
    print(f"Error loading XLSX: {e}"); raise SystemExit("Data loading failed.")


Attempting to load data from: /content/drive/MyDrive/MatchFound.xlsx
Dataset loaded successfully. Shape: (8000, 38)


In [42]:
# @title 4. Create Targets, Engineer Features & Define Final X/y

# Create Binary Classification Target
binary_target_column = 'Compatible_Class'
df[binary_target_column] = (df[target_score_column] >= compatibility_threshold).astype(int)

print(f"\nCreated binary target '{binary_target_column}' based on threshold >= {compatibility_threshold}")
print(df[binary_target_column].value_counts(normalize=True))

# Define Targets (y)
y_reg = df[target_score_column]
y_class = df[binary_target_column]

# Define Initial Features (X) - excluding targets and IDs
columns_to_exclude = [target_score_column, binary_target_column, 'Cow_ID', 'Bull_ID', 'Compatibility_Score', 'Compatible']
columns_to_exclude = [col for col in columns_to_exclude if col in df.columns]
X = df.drop(columns=columns_to_exclude)

print(f"\nInitial Features (X shape): {X.shape}")

# --- FEATURE ENGINEERING ---
print("\n--- Engineering New Features ---")

# Ensure required columns exist before creating new features
required_cols_for_eng = [
    'Cow_Age', 'Bull_Age', 'Cow_Weight', 'Bull_Weight', 'Cow_Height', 'Bull_Height',
    'Cow_Milk_Yield', 'Bull_Mother_Milk_Yield', 'Cow_Drought_Resistance',
    'Bull_Drought_Resistance', 'Cow_Health_Status', 'Bull_Health_Status',
    'Cow_Temperament', 'Bull_Temperament'
    # Add other columns if needed by your specific calculations below
]

missing_req_cols = [col for col in required_cols_for_eng if col not in X.columns]
if missing_req_cols:
    print(f"Warning: Missing required columns for feature engineering: {missing_req_cols}. Skipping related features.")
else:
    try:
        # Example 1: Age Difference
        X['FE_Age_Diff'] = abs(X['Cow_Age'] - X['Bull_Age'])

        # Example 2: Weight Difference % (handle division by zero)
        X['FE_Weight_Diff_Pct'] = np.where(
            X['Cow_Weight'] > 0,
            abs(X['Cow_Weight'] - X['Bull_Weight']) / X['Cow_Weight'] * 100,
            0 # Assign 0 if Cow_Weight is 0 or less
        )

        # Example 3: Height Difference % (handle division by zero)
        X['FE_Height_Diff_Pct'] = np.where(
            X['Cow_Height'] > 0,
            abs(X['Cow_Height'] - X['Bull_Height']) / X['Cow_Height'] * 100,
            0
        )

        # Example 4: Milk Yield Sum (handle potential NaNs if Milk Yield wasn't imputed yet - though it should be later)
        X['FE_Milk_Sum'] = X['Cow_Milk_Yield'].fillna(0) + X['Bull_Mother_Milk_Yield'].fillna(0)

        # Example 5: Drought Resistance Difference
        X['FE_Drought_Diff'] = abs(X['Cow_Drought_Resistance'] - X['Bull_Drought_Resistance'])

        # Example 6: Combined Health Status (simple sum)
        X['FE_Combined_Health'] = X['Cow_Health_Status'] + X['Bull_Health_Status']

        # Example 7: Temperament Interaction (Simple numeric encoding)
        def encode_temperament(row):
            cow_t = row['Cow_Temperament']
            bull_t = row['Bull_Temperament']
            if cow_t == 'Calm' and bull_t == 'Calm': return 0
            if (cow_t == 'Calm' and bull_t == 'Aggressive') or \
               (cow_t == 'Aggressive' and bull_t == 'Calm'): return 1
            if cow_t == 'Aggressive' and bull_t == 'Aggressive': return 2
            return 3 # For unknown/missing combinations

        X['FE_Temperament_Combo'] = X.apply(encode_temperament, axis=1)

        print("Added engineered features based on CCS logic.")
        print("New engineered features:", [col for col in X.columns if col.startswith('FE_')])

    except KeyError as e:
        print(f"Error during feature engineering (missing column?): {e}. Skipping feature engineering.")
    except Exception as e:
        print(f"An unexpected error occurred during feature engineering: {e}")


# --- Final Feature Definition ---
# X now includes the original features plus the engineered ones
print(f"\nFinal Features with Engineering (X shape): {X.shape}")
print(f"Regression Target (y_reg shape): {y_reg.shape}")
print(f"Classification Target (y_class shape): {y_class.shape}")


Created binary target 'Compatible_Class' based on threshold >= 20
Compatible_Class
1    0.76675
0    0.23325
Name: proportion, dtype: float64

Initial Features (X shape): (8000, 34)

--- Engineering New Features ---
Added engineered features based on CCS logic.
New engineered features: ['FE_Age_Diff', 'FE_Weight_Diff_Pct', 'FE_Height_Diff_Pct', 'FE_Milk_Sum', 'FE_Drought_Diff', 'FE_Combined_Health', 'FE_Temperament_Combo']

Final Features with Engineering (X shape): (8000, 41)
Regression Target (y_reg shape): (8000,)
Classification Target (y_class shape): (8000,)


In [43]:
# @title 5. Preprocessing Setup & Application

# Identify feature types
numerical_features = X.select_dtypes(include=np.number).columns.tolist()
categorical_features = X.select_dtypes(exclude=np.number).columns.tolist()

print(f"\nIdentified {len(numerical_features)} numerical features.")
print(f"Identified {len(categorical_features)} categorical features.")

# Define preprocessing steps (using pipelines)
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))]) # handle_unknown is important

# Create the preprocessor object
# Ensure remainder='passthrough' only if you intend to keep non-numeric/non-categorical columns unprocessed
# Usually, it's better to handle all columns explicitly. If X only contains num/cat, remainder='drop' is safer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)],
    remainder='drop') # Drop columns not specified as numerical or categorical


# Fit the preprocessor and transform the data *before* splitting
print("\nFitting preprocessor and transforming data...")
try:
    X_processed = preprocessor.fit_transform(X)
    print("Preprocessing complete.")
    # Get feature names after OneHotEncoding
    try:
        feature_names_out = preprocessor.get_feature_names_out()
        print(f"Total features after preprocessing: {len(feature_names_out)}")
    except Exception as e:
        print(f"Could not get feature names from preprocessor: {e}")
        num_processed_features = X_processed.shape[1]
        feature_names_out = [f"feature_{i}" for i in range(num_processed_features)]
        print(f"Using generic feature names. Total features: {num_processed_features}")

except Exception as e:
    print(f"Error during preprocessing fit_transform: {e}")
    traceback.print_exc()
    raise SystemExit("Preprocessing failed.")


# Split PREPROCESSED data
# Stratify ensures similar class distribution in train/test splits
X_train_proc, X_test_proc, y_train_reg, y_test_reg, y_train_class, y_test_class = train_test_split(
    X_processed, y_reg, y_class, test_size=0.25, random_state=42, stratify=y_class
)

print(f"\nData split after preprocessing:")
print(f"X_train_proc shape: {X_train_proc.shape}, X_test_proc shape: {X_test_proc.shape}")
print(f"y_train_class distribution:\n{y_train_class.value_counts(normalize=True)}")
print(f"y_test_class distribution:\n{y_test_class.value_counts(normalize=True)}")



Identified 33 numerical features.
Identified 8 categorical features.

Fitting preprocessor and transforming data...
Preprocessing complete.
Total features after preprocessing: 152

Data split after preprocessing:
X_train_proc shape: (6000, 152), X_test_proc shape: (2000, 152)
y_train_class distribution:
Compatible_Class
1    0.766833
0    0.233167
Name: proportion, dtype: float64
y_test_class distribution:
Compatible_Class
1    0.7665
0    0.2335
Name: proportion, dtype: float64




In [44]:
# @title 6. Genetic Algorithm for Feature Selection Setup (Using XGBoost in Fitness)

# --- Add XGBoost imports ---
import xgboost as xgb
from sklearn.model_selection import cross_val_score # Already imported but good practice

# --- GA Parameters ---
# (Keep GA parameters like N_FEATURES, POP_SIZE_FS, NGEN_FS etc. the same)
N_FEATURES = X_train_proc.shape[1]
POP_SIZE_FS = 50
NGEN_FS = 20
CXPB_FS = 0.6
MUTPB_FS = 0.2
WEIGHT_CLASSIFICATION = 0.5
WEIGHT_REGRESSION = 0.5

# --- Fitness Function ---
y_train_reg_std = y_train_reg.std()
if y_train_reg_std == 0: y_train_reg_std = 1

def evaluate_feature_subset(individual, X_data, y_reg_data, y_class_data, y_reg_std_dev):
    """Fitness function using XGBoost for evaluation."""
    selected_indices = [i for i, bit in enumerate(individual) if bit == 1]
    if not selected_indices: return (0.0,)

    X_subset = X_data[:, selected_indices]

    # --- Evaluate Classifier (XGBoost) ---
    try:
        # *** USE XGBClassifier ***
        # scale_pos_weight helps with imbalance. Calculate ratio of majority to minority class.
        # Handle potential division by zero or case where only one class exists in y_class_data during CV split
        count0 = np.sum(y_class_data == 0)
        count1 = np.sum(y_class_data == 1)
        scale_pos_weight_val = count0 / count1 if count1 > 0 else 1

        clf = xgb.XGBClassifier(n_estimators=30, # Fewer estimators for speed
                                random_state=42,
                                use_label_encoder=False, # Recommended for newer XGBoost versions
                                eval_metric='logloss', # Common classification metric
                                scale_pos_weight=scale_pos_weight_val, # Handle imbalance
                                n_jobs=1, # Can sometimes conflict with CV n_jobs
                                max_depth=6) # Limit depth for speed

        class_scores = cross_val_score(clf, X_subset, y_class_data, cv=3, scoring='f1_weighted', n_jobs=-1)
        avg_class_score = np.mean(class_scores)
    except Exception as e:
        # print(f"Classifier CV Error: {e}") # Optional debug
        avg_class_score = 0.0

    # --- Evaluate Regressor (XGBoost) ---
    try:
        # *** USE XGBRegressor ***
        reg = xgb.XGBRegressor(n_estimators=30, # Fewer estimators for speed
                               random_state=42,
                               eval_metric='rmse',
                               n_jobs=1,
                               max_depth=6) # Limit depth for speed

        reg_scores = cross_val_score(reg, X_subset, y_reg_data, cv=3, scoring='neg_root_mean_squared_error', n_jobs=-1)
        avg_rmse = -np.mean(reg_scores)
        normalized_rmse = avg_rmse / y_reg_std_dev if y_reg_std_dev > 0 else avg_rmse
        reg_fitness_comp = max(0.0, 1.0 - normalized_rmse)
    except Exception as e:
        # print(f"Regressor CV Error: {e}") # Optional debug
        reg_fitness_comp = 0.0

    # --- Combine Scores ---
    combined_fitness = (WEIGHT_CLASSIFICATION * avg_class_score) + \
                       (WEIGHT_REGRESSION * reg_fitness_comp)

    return (combined_fitness,)

# --- DEAP Setup for Feature Selection (Binary Individuals) ---
# (DEAP setup remains exactly the same as before)
creator.create("FitnessMaxFS", base.Fitness, weights=(1.0,))
creator.create("IndividualFS", list, fitness=creator.FitnessMaxFS)
toolbox_fs = base.Toolbox()
toolbox_fs.register("attr_bool", random.randint, 0, 1)
toolbox_fs.register("individual", tools.initRepeat, creator.IndividualFS, toolbox_fs.attr_bool, N_FEATURES)
toolbox_fs.register("population", tools.initRepeat, list, toolbox_fs.individual)
toolbox_fs.register("evaluate", evaluate_feature_subset,
                    X_data=X_train_proc,
                    y_reg_data=y_train_reg,
                    y_class_data=y_train_class,
                    y_reg_std_dev=y_train_reg_std)
toolbox_fs.register("mate", tools.cxUniform, indpb=0.5)
toolbox_fs.register("mutate", tools.mutFlipBit, indpb=0.05)
toolbox_fs.register("select", tools.selTournament, tournsize=3)

print("\nDEAP toolbox for Feature Selection configured (using XGBoost in fitness).")


DEAP toolbox for Feature Selection configured (using XGBoost in fitness).




In [45]:
# @title 7. Run GA for Feature Selection

print(f"\nStarting GA Feature Selection: Population={POP_SIZE_FS}, Generations={NGEN_FS}")
start_time_fs = time.time()

pop_fs = toolbox_fs.population(n=POP_SIZE_FS)
hof_fs = tools.HallOfFame(1) # Keep only the best

stats_fs = tools.Statistics(lambda ind: ind.fitness.values)
stats_fs.register("avg", np.mean)
stats_fs.register("std", np.std)
stats_fs.register("min", np.min)
stats_fs.register("max", np.max)

# Run the GA
try:
    algorithms.eaSimple(pop_fs, toolbox_fs, cxpb=CXPB_FS, mutpb=MUTPB_FS, ngen=NGEN_FS,
                        stats=stats_fs, halloffame=hof_fs, verbose=True)
except Exception as e_ga:
    print(f"Error during GA execution: {e_ga}")
    traceback.print_exc()
    raise SystemExit("GA failed.")


end_time_fs = time.time()
print(f"GA Feature Selection finished in {end_time_fs - start_time_fs:.2f} seconds.")

# --- Extract Best Feature Set ---
if len(hof_fs) == 0:
     print("ERROR: HallOfFame is empty. GA might not have run correctly or found any valid individuals.")
     # Fallback: Use all features if GA fails? Or stop?
     # selected_feature_indices = list(range(N_FEATURES)) # Option: Use all
     raise SystemExit("GA did not produce a best individual.")
else:
    best_individual_fs = hof_fs[0]
    selected_feature_indices = [i for i, bit in enumerate(best_individual_fs) if bit == 1]
    if not selected_feature_indices:
        print("WARNING: GA selected zero features. Fitness function or GA parameters might need tuning. Using all features as fallback.")
        selected_feature_indices = list(range(N_FEATURES)) # Fallback to all features
    else:
         selected_feature_names = [feature_names_out[i] for i in selected_feature_indices]
         print(f"\nGA selected {len(selected_feature_indices)} features out of {N_FEATURES}.")
         # print("Selected feature names:", selected_feature_names) # Optional


Starting GA Feature Selection: Population=50, Generations=20
gen	nevals	avg     	std      	min     	max     
0  	50    	0.525597	0.0928546	0.382078	0.671878
1  	40    	0.609488	0.0599722	0.414749	0.680416
2  	34    	0.649101	0.0410421	0.445992	0.683529
3  	29    	0.667375	0.0166839	0.599959	0.693891
4  	39    	0.674205	0.0173656	0.611106	0.696591
5  	28    	0.685071	0.0127742	0.606028	0.6943  
6  	38    	0.687107	0.0153657	0.620431	0.700368
7  	38    	0.688507	0.0208491	0.568029	0.701131
8  	33    	0.683834	0.0458143	0.474374	0.702514
9  	28    	0.696262	0.00880264	0.650356	0.702992
10 	30    	0.697026	0.0104196 	0.632177	0.702992
11 	26    	0.698606	0.00682922	0.665303	0.703414
12 	30    	0.696827	0.0132032 	0.627962	0.703691
13 	33    	0.700209	0.00510466	0.667358	0.703691
14 	35    	0.699298	0.0110421 	0.63099 	0.705537
15 	26    	0.699958	0.0111172 	0.6281  	0.705537
16 	22    	0.70014 	0.0133487 	0.60922 	0.705537
17 	26    	0.698172	0.0176622 	0.60619 	0.705537
18 	29    	0.6992

In [46]:
# @title 8. Tune and Train Final Models (XGBoost) using Selected Features

import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform # Use uniform for learning_rate etc.

print("\n--- Tuning and Training Final XGBoost Models on Selected Features ---")

# Select the best features from the training and test sets (as before)
X_train_selected = X_train_proc[:, selected_feature_indices]
X_test_selected = X_test_proc[:, selected_feature_indices]

print(f"X_train_selected shape: {X_train_selected.shape}")
print(f"X_test_selected shape: {X_test_selected.shape}")

# --- 1. Tune and Train Final Classifier (XGBoost) ---

# Define parameter distribution for Randomized Search (XGBoost specific)
param_dist_clf_xgb = {
    'n_estimators': randint(low=100, high=500),
    'max_depth': randint(low=3, high=10), # XGBoost often uses shallower trees
    'learning_rate': uniform(loc=0.01, scale=0.2), # Sample between 0.01 and 0.21
    'subsample': uniform(loc=0.6, scale=0.4), # Sample between 0.6 and 1.0
    'colsample_bytree': uniform(loc=0.6, scale=0.4), # Sample between 0.6 and 1.0
    'gamma': uniform(loc=0, scale=0.5), # Minimum loss reduction
    'scale_pos_weight': [y_train_class.value_counts()[0] / y_train_class.value_counts()[1] if y_train_class.value_counts()[1] > 0 else 1] # Use previously calculated ratio
}

# Create a base classifier instance
xgb_clf_tune = xgb.XGBClassifier(random_state=42,
                                 use_label_encoder=False,
                                 eval_metric='logloss',
                                 n_jobs=1) # Set n_jobs=1 in base estimator for tuning

# Setup Randomized Search
n_iter_search = 50 # Number of parameter settings to sample
random_search_clf_xgb = RandomizedSearchCV(estimator=xgb_clf_tune,
                                         param_distributions=param_dist_clf_xgb,
                                         n_iter=n_iter_search,
                                         cv=3,
                                         verbose=1,
                                         random_state=42,
                                         n_jobs=-1, # Use cores for CV folds
                                         scoring='accuracy') # Or 'f1_weighted'

print(f"\nTuning final XGBoost classifier ({n_iter_search} iterations)...")
start_time_clf_tune = time.time()
random_search_clf_xgb.fit(X_train_selected, y_train_class)
end_time_clf_tune = time.time()
print(f"Classifier tuning finished in {end_time_clf_tune - start_time_clf_tune:.2f} seconds.")

print("\nBest parameters found for classifier:", random_search_clf_xgb.best_params_)
print(f"Best cross-validation score ({random_search_clf_xgb.scoring}): {random_search_clf_xgb.best_score_:.4f}")

# Use the best estimator found by the search as the final classifier
final_classifier = random_search_clf_xgb.best_estimator_
print("Final classifier set to the best XGBoost estimator found.")


# --- 2. Tune and Train Final Regressor (XGBoost) ---

# Define parameter distribution for Randomized Search (XGBoost specific)
param_dist_reg_xgb = {
    'n_estimators': randint(low=100, high=500),
    'max_depth': randint(low=3, high=10),
    'learning_rate': uniform(loc=0.01, scale=0.2),
    'subsample': uniform(loc=0.6, scale=0.4),
    'colsample_bytree': uniform(loc=0.6, scale=0.4),
    'gamma': uniform(loc=0, scale=0.5),
    'reg_alpha': uniform(loc=0, scale=1), # L1 regularization
    'reg_lambda': uniform(loc=0, scale=1) # L2 regularization
}

# Create a base regressor instance
xgb_reg_tune = xgb.XGBRegressor(random_state=42,
                                eval_metric='rmse',
                                n_jobs=1) # Set n_jobs=1 in base estimator

# Setup Randomized Search
random_search_reg_xgb = RandomizedSearchCV(estimator=xgb_reg_tune,
                                         param_distributions=param_dist_reg_xgb,
                                         n_iter=n_iter_search,
                                         cv=3,
                                         verbose=1,
                                         random_state=42,
                                         n_jobs=-1, # Use cores for CV folds
                                         scoring='r2') # Or 'neg_root_mean_squared_error'

print(f"\nTuning final XGBoost regressor ({n_iter_search} iterations)...")
start_time_reg_tune = time.time()
random_search_reg_xgb.fit(X_train_selected, y_train_reg)
end_time_reg_tune = time.time()
print(f"Regressor tuning finished in {end_time_reg_tune - start_time_reg_tune:.2f} seconds.")

print("\nBest parameters found for regressor:", random_search_reg_xgb.best_params_)
print(f"Best cross-validation score ({random_search_reg_xgb.scoring}): {random_search_reg_xgb.best_score_:.4f}")

# Use the best estimator found by the search as the final regressor
final_regressor = random_search_reg_xgb.best_estimator_
print("Final regressor set to the best XGBoost estimator found.")

print("\n--- Final XGBoost Model Training/Tuning Complete ---")


--- Tuning and Training Final XGBoost Models on Selected Features ---
X_train_selected shape: (6000, 81)
X_test_selected shape: (2000, 81)

Tuning final XGBoost classifier (50 iterations)...
Fitting 3 folds for each of 50 candidates, totalling 150 fits


Parameters: { "use_label_encoder" } are not used.



Classifier tuning finished in 138.89 seconds.

Best parameters found for classifier: {'colsample_bytree': np.float64(0.8447411578889518), 'gamma': np.float64(0.06974693032602092), 'learning_rate': np.float64(0.06842892970704363), 'max_depth': 9, 'n_estimators': 289, 'scale_pos_weight': np.float64(0.30406433384046944), 'subsample': np.float64(0.6362425738131283)}
Best cross-validation score (accuracy): 0.8853
Final classifier set to the best XGBoost estimator found.

Tuning final XGBoost regressor (50 iterations)...
Fitting 3 folds for each of 50 candidates, totalling 150 fits
Regressor tuning finished in 249.84 seconds.

Best parameters found for regressor: {'colsample_bytree': np.float64(0.6391336642604005), 'gamma': np.float64(0.24580793755841618), 'learning_rate': np.float64(0.10469435415611314), 'max_depth': 3, 'n_estimators': 353, 'reg_alpha': np.float64(0.43385164923797304), 'reg_lambda': np.float64(0.39850473439737344), 'subsample': np.float64(0.8463400392208866)}
Best cross-val

In [47]:
# @title 8. Train Final Models using Selected Features

print("\n--- Training Final Models on Selected Features ---")

# Select the best features from the training and test sets
try:
    X_train_selected = X_train_proc[:, selected_feature_indices]
    X_test_selected = X_test_proc[:, selected_feature_indices]
    print(f"X_train_selected shape: {X_train_selected.shape}")
    print(f"X_test_selected shape: {X_test_selected.shape}")
except IndexError as e_idx:
     print(f"Error selecting features with indices: {e_idx}")
     print(f"Selected indices: {selected_feature_indices}")
     print(f"X_train_proc shape: {X_train_proc.shape}")
     raise SystemExit("Feature selection failed.")


--- Training Final Models on Selected Features ---
X_train_selected shape: (6000, 81)
X_test_selected shape: (2000, 81)


In [48]:
# --- Train Final Classifier ---
# Use more robust parameters for the final models
final_classifier = RandomForestClassifier(n_estimators=150, # Increased estimators
                                         random_state=42,
                                         n_jobs=-1,
                                         max_depth=18,     # Slightly deeper
                                         min_samples_split=8, # Adjusted
                                         min_samples_leaf=4,  # Added min_samples_leaf
                                         class_weight='balanced' # Add class weighting if data is imbalanced
                                         )
print("Training final classifier...")
final_classifier.fit(X_train_selected, y_train_class)
print("Classifier training complete.")

# --- Train Final Regressor ---
final_regressor = RandomForestRegressor(n_estimators=150, # Increased estimators
                                        random_state=42,
                                        n_jobs=-1,
                                        max_depth=18,    # Slightly deeper
                                        min_samples_split=8, # Adjusted
                                        min_samples_leaf=4   # Added min_samples_leaf
                                        )
print("Training final regressor...")
final_regressor.fit(X_train_selected, y_train_reg)
print("Regressor training complete.")


Training final classifier...
Classifier training complete.
Training final regressor...
Regressor training complete.


In [49]:
from sklearn.metrics import mean_absolute_error


In [50]:
# @title 9. Evaluate Final Models

print("\n--- Evaluating Final CLASSIFIER on Test Set (Selected Features) ---")
try:
    y_pred_class = final_classifier.predict(X_test_selected)
    accuracy = accuracy_score(y_test_class, y_pred_class)
    f1 = f1_score(y_test_class, y_pred_class, average='weighted')
    print(f"Classifier Accuracy: {accuracy:.4f}")
    print(f"Classifier Weighted F1-Score: {f1:.4f}")
    # from sklearn.metrics import classification_report, confusion_matrix
    # print(classification_report(y_test_class, y_pred_class))
    # print(confusion_matrix(y_test_class, y_pred_class))
except Exception as e_eval_clf:
    print(f"Error during classifier evaluation: {e_eval_clf}")


print("\n--- Evaluating Final REGRESSOR on Test Set (Selected Features) ---")
try:
    y_pred_reg = final_regressor.predict(X_test_selected)
    r2 = r2_score(y_test_reg, y_pred_reg)
    rmse = np.sqrt(mean_squared_error(y_test_reg, y_pred_reg))
    mae = mean_absolute_error(y_test_reg, y_pred_reg)
    print(f"Regressor R-squared (R²): {r2:.4f}")
    print(f"Regressor Root Mean Squared Error (RMSE): {rmse:.4f}")
    print(f"Regressor Mean Absolute Error (MAE): {mae:.4f}")
except Exception as e_eval_reg:
     print(f"Error during regressor evaluation: {e_eval_reg}")


--- Evaluating Final CLASSIFIER on Test Set (Selected Features) ---
Classifier Accuracy: 0.8710
Classifier Weighted F1-Score: 0.8707

--- Evaluating Final REGRESSOR on Test Set (Selected Features) ---
Regressor R-squared (R²): 0.7594
Regressor Root Mean Squared Error (RMSE): 3.5785
Regressor Mean Absolute Error (MAE): 2.8056


In [51]:
# @title 10. Save Components & Define Prediction Function

# --- Define Min/Max for Percentage Conversion ---
# Verify these based on your ACTUAL calculate_ccs function logic
THEORETICAL_MIN_CCS = -50 # Example value
THEORETICAL_MAX_CCS = 85  # Example value

def convert_ccs_to_percentage(ccs_score, min_ccs=THEORETICAL_MIN_CCS, max_ccs=THEORETICAL_MAX_CCS):
    """Converts a raw CCS score to a percentage (0-100)."""
    if max_ccs == min_ccs: return 50.0
    clipped_score = np.clip(ccs_score, min_ccs, max_ccs)
    percentage = ((clipped_score - min_ccs) / (max_ccs - min_ccs)) * 100
    return percentage


In [52]:
# --- Save Model Components ---
# Saving preprocessor, selected indices, classifier, regressor, scale factors, threshold
save_components = {
    'preprocessor': preprocessor,
    'selected_feature_indices': selected_feature_indices,
    'classifier': final_classifier,
    'regressor': final_regressor,
    'min_ccs': THEORETICAL_MIN_CCS,
    'max_ccs': THEORETICAL_MAX_CCS,
    'compatibility_threshold': compatibility_threshold
}

# Ensure directory exists (defined earlier) before saving
try:
    with open(model_save_path, 'wb') as f:
        pickle.dump(save_components, f)
    print(f"\nModel components saved successfully to Google Drive: {model_save_path}")
except Exception as e:
    print(f"\nError saving model components: {e}")


# --- Load Model Function Definition ---
def load_combined_model(filepath):
    """Loads the saved model components."""
    if not os.path.exists(filepath):
        print(f"ERROR: Model file not found at {filepath}")
        return None
    try:
        with open(filepath, 'rb') as f:
            components = pickle.load(f)
        print(f"Model components loaded successfully from {filepath}")
        return components
    except Exception as e:
        print(f"Error loading model components: {e}")
        traceback.print_exc()
        return None


# --- Prediction Function Definition ---
def predict_cattle_compatibility(new_data_df, model_components):
    """
    Predicts Yes/No compatibility and percentage score for new cattle data.
    """
    if model_components is None:
        print("ERROR: Model components not loaded."); return None

    try:
        # Extract components
        preprocessor = model_components['preprocessor']
        selected_indices = model_components['selected_feature_indices']
        classifier = model_components['classifier']
        regressor = model_components['regressor']
        min_ccs = model_components['min_ccs']
        max_ccs = model_components['max_ccs']

        # 1. Preprocess the new data
        # Ensure input df has columns expected by preprocessor
        # Handle potential errors during transform
        try:
             X_new_processed = preprocessor.transform(new_data_df)
        except ValueError as ve:
             print(f"ValueError during preprocessing transform: {ve}")
             print("Ensure input DataFrame columns exactly match those used during preprocessor fitting.")
             # Optionally try to get expected columns:
             # if hasattr(preprocessor, 'feature_names_in_'):
             #      print("Preprocessor expected columns:", preprocessor.feature_names_in_)
             # else:
             #      # Need to infer expected columns from transformers if possible
             #      pass
             return None
        except Exception as e_prep:
            print(f"Error during preprocessing transform: {e_prep}")
            traceback.print_exc()
            return None


        # 2. Select the features identified by the GA
        try:
             X_new_selected = X_new_processed[:, selected_indices]
        except IndexError as e_idx_pred:
             print(f"Error selecting features during prediction: {e_idx_pred}")
             print(f"Processed data shape: {X_new_processed.shape}, Selected indices count: {len(selected_indices)}")
             return None


        # 3. Predict Class (0/1)
        class_predictions = classifier.predict(X_new_selected)

        # 4. Predict Raw CCS Score
        ccs_predictions = regressor.predict(X_new_selected)

        # 5. Convert CCS to Percentage
        percentage_predictions = [convert_ccs_to_percentage(score, min_ccs, max_ccs) for score in ccs_predictions]

        # 6. Format Output
        results = []
        for i in range(len(class_predictions)):
            prediction_label = "Yes" if class_predictions[i] == 1 else "No"
            results.append({
                "Prediction": prediction_label,
                "Confidence_Score_Percent": round(percentage_predictions[i], 2),
                "Raw_CCS_Score": round(ccs_predictions[i], 2)
            })
        return results

    except Exception as e_pred:
        print(f"An error occurred during prediction steps: {e_pred}")
        traceback.print_exc()
        return None

print("\nHelper functions for loading and prediction defined.")


Model components saved successfully to Google Drive: /content/drive/MyDrive/MyModels/cattle_predictor_v5.pkl

Helper functions for loading and prediction defined.


In [53]:
# @title 9. Evaluate Final Models

# --- This section calculates and prints the Classifier's Accuracy ---
print("\n--- Evaluating Final CLASSIFIER on Test Set (Selected Features) ---")

# 1. Predict class labels (0 or 1) on the test set using the selected features
y_pred_class = final_classifier.predict(X_test_selected)

# 2. Calculate accuracy by comparing predictions (y_pred_class) to the true labels (y_test_class)
accuracy = accuracy_score(y_test_class, y_pred_class)

# 3. Calculate F1-score (another useful classification metric)
f1 = f1_score(y_test_class, y_pred_class, average='weighted')

# 4. Print the results
print(f"Classifier Accuracy: {accuracy:.4f}") # <-- THIS IS THE ACCURACY ON THE TEST SET
print(f"Classifier Weighted F1-Score: {f1:.4f}")

# Optional detailed report (currently commented out)
# from sklearn.metrics import classification_report, confusion_matrix
# print(classification_report(y_test_class, y_pred_class))
# print(confusion_matrix(y_test_class, y_pred_class))


# --- This section evaluates the Regressor (predicting the score) ---
print("\n--- Evaluating Final REGRESSOR on Test Set (Selected Features) ---")
# (Code for R², RMSE, MAE follows...)


--- Evaluating Final CLASSIFIER on Test Set (Selected Features) ---
Classifier Accuracy: 0.8710
Classifier Weighted F1-Score: 0.8707

--- Evaluating Final REGRESSOR on Test Set (Selected Features) ---


In [54]:
# @title 11. Example Usage (with Added Sample Pair)

import pandas as pd # Ensure pandas is imported in this scope if needed
import numpy as np  # Ensure numpy is imported

print("\n--- Loading saved model and predicting on dummy data ---")
loaded_model = load_combined_model(model_save_path) # Assumes model_save_path is defined

if loaded_model:
    # --- Define the specific sample pair ---
    sample_pair = {
        'Cow': { 'Breed': 'Gir', 'Age': 6, 'Weight': 450, 'Height': 140, 'Milk_Yield': 8,
                 'Health_Status': 0, 'Drought_Resistance': 70, 'Temperament': 'Calm',
                 # Add other Cow keys expected by X, even if None/NaN in this sample
                 'Genetic_Diversity_Score': np.nan, 'Fertility_Rate': np.nan, 'Breeding_Success_Rate': np.nan,
                 'Disease_Resistance_Score': np.nan, 'Market_Value': np.nan, 'Mother_Milk_Yield': np.nan,
                 'Disease': np.nan, 'Past_Breeding_Success': np.nan, 'Same_Parents': np.nan
               },
        'Bull': {'Breed': 'Jersey', 'Age': 7, 'Weight': 470, 'Height': 142, 'Health_Status': 0,
                 'Mother_Milk_Yield': 9, 'Drought_Resistance': 75, 'Temperament': 'Calm',
                 # Add other Bull keys expected by X, even if None/NaN in this sample
                 'Milk_Yield': np.nan, 'Genetic_Diversity_Score': np.nan, 'Fertility_Rate': np.nan,
                 'Breeding_Success_Rate': np.nan, 'Disease_Resistance_Score': np.nan, 'Market_Value': np.nan,
                 'Disease': np.nan, 'Past_Breeding_Success': np.nan, 'Same_Parents': np.nan
                },
        # Top-level keys expected by X
        'Same_Parents': 0,
        'Trait_Difference': 18,
        'Genetic_Diversity': 8, # This might override individual scores depending on how X was defined
        'Fertility_Rate': 65,
        'Breeding_Success_Rate': 55,
        'Disease_Resistance_Score': 6.5,
        'Market_Value': 25000,
        'Past_Breeding_Success': 'High'
    }

    # --- Flatten the sample_pair into a dictionary matching DataFrame columns ---
    flat_sample = {}
    for prefix, inner_dict in sample_pair.items():
        if isinstance(inner_dict, dict):
            for key, value in inner_dict.items():
                flat_sample[f"{prefix}_{key}"] = value
        else:
            # Handle top-level keys directly
            flat_sample[prefix] = inner_dict

    # Convert the flattened sample to a DataFrame row
    new_pair_df = pd.DataFrame([flat_sample])
    print("Flattened sample pair prepared.")


    # --- Create original dummy data ---
    dummy_data = {
        'Cow_Breed': ['Angus', 'Holstein', 'UnknownBreed'], 'Cow_Age': [5, 6, 7], 'Cow_Weight': [550.0, 600.0, 580.0],
        'Cow_Height': [130.0, 140.0, 135.0], 'Cow_Milk_Yield': [8.5, 9.0, np.nan], 'Cow_Health_Status': [0, 1, 0],
        'Cow_Genetic_Diversity_Score': [7.5, 8.1, 7.0], 'Cow_Fertility_Rate': [60.0, 65.0, 55.0],
        'Cow_Breeding_Success_Rate': [50.0, 55.0, 45.0], 'Cow_Drought_Resistance': [70.0, 60.0, 65.0],
        'Cow_Disease_Resistance_Score': [6.0, 7.0, 5.5], 'Cow_Market_Value': [15000, 18000, 16000],
        'Cow_Temperament': ['Calm', 'Calm', 'Aggressive'], 'Cow_Mother_Milk_Yield': [7.0, 7.5, 6.5],
        'Cow_Disease': ['FootRot', 'Mastitis', np.nan], 'Cow_Past_Breeding_Success': ['Moderate', 'High', 'Low'],
        'Cow_Same_Parents': [0, 0, 1], # Note: This might conflict with top-level Same_Parents if kept
        'Bull_Breed': ['Brahman', 'Angus', 'Brahman'], 'Bull_Age': [4, 5, 6], 'Bull_Weight': [650.0, 680.0, 700.0],
        'Bull_Height': [145.0, 150.0, 155.0], 'Bull_Milk_Yield': [np.nan, np.nan, np.nan],
        'Bull_Health_Status': [0, 0, 1], 'Bull_Genetic_Diversity_Score': [8.0, 7.8, 7.5],
        'Bull_Fertility_Rate': [70.0, 75.0, 68.0], 'Bull_Breeding_Success_Rate': [60.0, 65.0, 58.0],
        'Bull_Drought_Resistance': [80.0, 50.0, 75.0], 'Bull_Disease_Resistance_Score': [7.5, 6.5, 7.0],
        'Bull_Market_Value': [20000, 22000, 21000], 'Bull_Temperament': ['Aggressive', 'Calm', 'Calm'],
        'Bull_Mother_Milk_Yield': [np.nan, np.nan, np.nan], 'Bull_Disease': ['None', 'BLV', 'FootRot'],
        'Bull_Past_Breeding_Success': ['High', 'Moderate', 'Moderate'],
        'Bull_Same_Parents': [0, 1, 0], # Note: This might conflict with top-level Same_Parents if kept
        # --- Top-level combined features (as used in Feature Engineering) ---
        'Same_Parents': [0, 1, 0], # Example values matching number of rows
        'Trait_Difference': [15, 25, 10],
        'Genetic_Diversity': [8, 6, 7.5],
        'Fertility_Rate': [65, 70, 60], # Example values
        'Breeding_Success_Rate': [55, 60, 50], # Example values
        'Disease_Resistance_Score': [6.5, 7.0, 6.0], # Example values
        'Market_Value': [25000, 19000, 23000], # Example values
        # Add *ALL* other columns present in X (before preprocessing)
    }
    original_dummy_df = pd.DataFrame(dummy_data)

    # --- Concatenate the original dummy data and the new sample pair ---
    combined_dummy_df = pd.concat([original_dummy_df, new_pair_df], ignore_index=True)
    print(f"Combined dummy data shape: {combined_dummy_df.shape}")


    # --- Dynamically ensure all columns from training X are present ---
    final_dummy_df = None # Initialize
    if 'preprocessor' in loaded_model:
        try:
            # Get expected feature names from the fitted preprocessor
            if hasattr(loaded_model['preprocessor'], 'feature_names_in_'):
                expected_cols = loaded_model['preprocessor'].feature_names_in_
            else:
                 print("Warning: Cannot automatically determine expected columns from preprocessor. Using columns from Cell 4's X.")
                 # This assumes 'X' from cell 4 is still available and correct. It's less robust.
                 if 'X' in globals():
                     expected_cols = list(X.columns)
                 else:
                     raise ValueError("Original X dataframe not available to determine expected columns.")

            print(f"\nPreprocessor expects {len(expected_cols)} columns for prediction.")

            # Check for missing columns in the combined dummy data
            current_dummy_cols = combined_dummy_df.columns
            missing_in_dummy = [col for col in expected_cols if col not in current_dummy_cols]
            if missing_in_dummy:
                print(f"Adding missing expected columns to combined dummy data: {missing_in_dummy}")
                for col in missing_in_dummy:
                    combined_dummy_df[col] = np.nan # Add missing columns with NaN

            # Select and reorder columns to match preprocessor's expectation
            try:
                final_dummy_df = combined_dummy_df[expected_cols] # Ensure correct order and columns
                print("Combined dummy data columns aligned with preprocessor expectations.")
            except KeyError as e_key:
                 print(f"KeyError aligning dummy data columns: {e_key}. Check column names in dummy data and expected columns.")
                 final_dummy_df = None # Prevent prediction if alignment fails
            except Exception as e_align:
                 print(f"Error aligning dummy data columns: {e_align}")
                 final_dummy_df = None

        except Exception as e_cols:
             print(f"Error preparing dummy data columns: {e_cols}")
             final_dummy_df = None
    else:
         print("ERROR: Preprocessor not found in loaded model components.")
         final_dummy_df = None

    # --- Run Prediction if dummy data is ready ---
    if final_dummy_df is not None:
        predictions = predict_cattle_compatibility(final_dummy_df, loaded_model) # Use the final aligned df

        if predictions:
            print("\n--- Predictions for Dummy Data (including added sample) ---")
            results_df = pd.DataFrame(predictions)
            # Add an identifier column for clarity
            results_df['Source'] = ['Dummy'] * len(original_dummy_df) + ['Added Sample'] * len(new_pair_df)
            print(results_df.to_string()) # Print full DataFrame results
        else:
            print("Prediction function returned None (failed).")
    else:
         print("Dummy data preparation failed, skipping prediction.")

else:
    print("Could not load model components to run prediction example.")


--- Loading saved model and predicting on dummy data ---
Model components loaded successfully from /content/drive/MyDrive/MyModels/cattle_predictor_v5.pkl
Flattened sample pair prepared.
Combined dummy data shape: (4, 42)

Preprocessor expects 41 columns for prediction.
Adding missing expected columns to combined dummy data: ['FE_Age_Diff', 'FE_Weight_Diff_Pct', 'FE_Height_Diff_Pct', 'FE_Milk_Sum', 'FE_Drought_Diff', 'FE_Combined_Health', 'FE_Temperament_Combo']
Combined dummy data columns aligned with preprocessor expectations.

--- Predictions for Dummy Data (including added sample) ---
  Prediction  Confidence_Score_Percent  Raw_CCS_Score        Source
0        Yes                     58.49          28.96         Dummy
1        Yes                     58.80          29.38         Dummy
2        Yes                     53.35          22.02         Dummy
3        Yes                     55.26          24.60  Added Sample


