In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# @title 1. Install Libraries
!pip install deap openpyxl scikit-learn --quiet
print("Required libraries checked/installed.")

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/135.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m133.1/135.4 kB[0m [31m5.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.4/135.4 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hRequired libraries checked/installed.


In [4]:
# @title 2. Import Libraries & Mount Google Drive
import pandas as pd
import numpy as np
import random
import pickle
import os
import time
import traceback # For detailed error printing

from google.colab import drive

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Import DEAP components after installation
try:
    from deap import base, creator, tools, algorithms
except ImportError:
    print("DEAP library not found after installation attempt. Please check installation.")
    raise SystemExit("DEAP required.")


print("Libraries imported.")

Libraries imported.


In [5]:
# @title 3. Define Data Path, Target Column, Threshold & Load Data

# === Configuration ===
# --- PATH TO YOUR *NEW* DATASET (with correct CCS score) ---
data_path = '/content/drive/MyDrive/MatchFound.xlsx' # <--- *** REPLACE THIS PATH ***
# --- NAME OF THE COLUMN WITH THE RAW CCS SCORE ---
target_score_column = 'Compatibility_Score' # <--- *** VERIFY/CHANGE THIS COLUMN NAME ***
# --- THRESHOLD FOR YES/NO CLASSIFICATION ---
compatibility_threshold = 20 # <--- *** VERIFY/CHANGE THIS THRESHOLD ***
# --- PATH TO SAVE FINAL MODEL COMPONENTS ---
# Create the directory for saving if it doesn't exist
model_save_directory = '/content/drive/MyDrive/MyModels' # Define directory
model_save_path = os.path.join(model_save_directory, 'cattle_predictor_v2.pkl') # Define full path
if not os.path.exists(model_save_directory):
    print(f"Creating save directory: {model_save_directory}")
    os.makedirs(model_save_directory)


print(f"Attempting to load data from: {data_path}")
if not os.path.exists(data_path):
    print(f"ERROR: File not found: {data_path}"); raise SystemExit("Dataset not found.")

try:
    df = pd.read_excel(data_path, engine='openpyxl')
    print(f"Dataset loaded successfully. Shape: {df.shape}")
    if target_score_column not in df.columns:
        print(f"ERROR: Target score column '{target_score_column}' not found!")
        print(f"Available columns: {df.columns.tolist()}")
        raise SystemExit("Target column missing.")
except Exception as e:
    print(f"Error loading XLSX: {e}"); raise SystemExit("Data loading failed.")


Attempting to load data from: /content/drive/MyDrive/MatchFound.xlsx
Dataset loaded successfully. Shape: (8000, 38)


In [6]:
# @title 4. Create Targets & Define Features

# Create Binary Classification Target
binary_target_column = 'Compatible_Class'
df[binary_target_column] = (df[target_score_column] >= compatibility_threshold).astype(int)

print(f"\nCreated binary target '{binary_target_column}' based on threshold >= {compatibility_threshold}")
print(f"Class distribution:\n{df[binary_target_column].value_counts(normalize=True)}")

# Define Targets (y) and Features (X)
y_reg = df[target_score_column]
y_class = df[binary_target_column]

# Ensure columns_to_exclude contains the correct target column name
columns_to_exclude = [target_score_column, binary_target_column, 'Cow_ID', 'Bull_ID', 'Compatible'] # Add 'Compatible' if it exists from old versions
# Check if 'Compatibility_Score' should also be excluded if it's different from target_score_column
if 'Compatibility_Score' in df.columns and 'Compatibility_Score' != target_score_column:
     columns_to_exclude.append('Compatibility_Score')

columns_to_exclude = [col for col in columns_to_exclude if col in df.columns] # Keep only existing columns
X = df.drop(columns=columns_to_exclude)

print(f"\nFeatures (X shape): {X.shape}")
# print("Feature columns:", X.columns.tolist()) # Uncomment to check features
print(f"Regression Target (y_reg shape): {y_reg.shape}")
print(f"Classification Target (y_class shape): {y_class.shape}")


Created binary target 'Compatible_Class' based on threshold >= 20
Class distribution:
Compatible_Class
1    0.76675
0    0.23325
Name: proportion, dtype: float64

Features (X shape): (8000, 34)
Regression Target (y_reg shape): (8000,)
Classification Target (y_class shape): (8000,)


In [7]:
# @title 5. Preprocessing Setup & Application

# Identify feature types
numerical_features = X.select_dtypes(include=np.number).columns.tolist()
categorical_features = X.select_dtypes(exclude=np.number).columns.tolist()

print(f"\nIdentified {len(numerical_features)} numerical features.")
print(f"Identified {len(categorical_features)} categorical features.")

# Define preprocessing steps (using pipelines)
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))]) # handle_unknown is important

# Create the preprocessor object
# Ensure remainder='passthrough' only if you intend to keep non-numeric/non-categorical columns unprocessed
# Usually, it's better to handle all columns explicitly. If X only contains num/cat, remainder='drop' is safer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)],
    remainder='drop') # Drop columns not specified as numerical or categorical


# Fit the preprocessor and transform the data *before* splitting
print("\nFitting preprocessor and transforming data...")
try:
    X_processed = preprocessor.fit_transform(X)
    print("Preprocessing complete.")
    # Get feature names after OneHotEncoding
    try:
        feature_names_out = preprocessor.get_feature_names_out()
        print(f"Total features after preprocessing: {len(feature_names_out)}")
    except Exception as e:
        print(f"Could not get feature names from preprocessor: {e}")
        num_processed_features = X_processed.shape[1]
        feature_names_out = [f"feature_{i}" for i in range(num_processed_features)]
        print(f"Using generic feature names. Total features: {num_processed_features}")

except Exception as e:
    print(f"Error during preprocessing fit_transform: {e}")
    traceback.print_exc()
    raise SystemExit("Preprocessing failed.")


# Split PREPROCESSED data
# Stratify ensures similar class distribution in train/test splits
X_train_proc, X_test_proc, y_train_reg, y_test_reg, y_train_class, y_test_class = train_test_split(
    X_processed, y_reg, y_class, test_size=0.25, random_state=42, stratify=y_class
)

print(f"\nData split after preprocessing:")
print(f"X_train_proc shape: {X_train_proc.shape}, X_test_proc shape: {X_test_proc.shape}")
print(f"y_train_class distribution:\n{y_train_class.value_counts(normalize=True)}")
print(f"y_test_class distribution:\n{y_test_class.value_counts(normalize=True)}")



Identified 26 numerical features.
Identified 8 categorical features.

Fitting preprocessor and transforming data...




Preprocessing complete.
Total features after preprocessing: 145

Data split after preprocessing:
X_train_proc shape: (6000, 145), X_test_proc shape: (2000, 145)
y_train_class distribution:
Compatible_Class
1    0.766833
0    0.233167
Name: proportion, dtype: float64
y_test_class distribution:
Compatible_Class
1    0.7665
0    0.2335
Name: proportion, dtype: float64


In [8]:
# @title 6. Genetic Algorithm for Feature Selection Setup

# --- GA Parameters ---
N_FEATURES = X_train_proc.shape[1]
# --- MODIFICATION: Increased GA Parameters ---
POP_SIZE_FS = 80   # Increased Population size (was 50)
NGEN_FS = 30       # Increased Number of generations (was 20)
# --- End Modification ---
CXPB_FS = 0.6      # Crossover probability
MUTPB_FS = 0.2     # Mutation probability (for an individual)
# Fitness function weights (adjust based on priority)
WEIGHT_CLASSIFICATION = 0.5
WEIGHT_REGRESSION = 0.5

# --- Fitness Function ---
# Calculate std dev of regression target once for normalization
y_train_reg_std = y_train_reg.std()
if y_train_reg_std == 0: y_train_reg_std = 1 # Avoid division by zero

def evaluate_feature_subset(individual, X_data, y_reg_data, y_class_data, y_reg_std_dev):
    """Fitness function for GA Feature Selection."""
    selected_indices = [i for i, bit in enumerate(individual) if bit == 1]

    if not selected_indices: return (0.0,)

    X_subset = X_data[:, selected_indices]

    # --- Evaluate Classifier (CV) ---
    try:
        # Use slightly faster settings for CV during fitness eval
        clf = RandomForestClassifier(n_estimators=25, random_state=42, n_jobs=2, max_depth=8, min_samples_leaf=5)
        class_scores = cross_val_score(clf, X_subset, y_class_data, cv=3, scoring='f1_weighted', n_jobs=2) # Use 2 jobs for CV
        avg_class_score = np.mean(class_scores)
    except ValueError: # Handles cases where a split might have only one class
         avg_class_score = 0.0
    except Exception as e_clf:
        # print(f"Classifier CV Error: {e_clf}") # Optional debug
        avg_class_score = 0.0

    # --- Evaluate Regressor (CV) ---
    try:
        # Use slightly faster settings for CV during fitness eval
        reg = RandomForestRegressor(n_estimators=25, random_state=42, n_jobs=2, max_depth=8, min_samples_leaf=5)
        reg_scores = cross_val_score(reg, X_subset, y_reg_data, cv=3, scoring='neg_root_mean_squared_error', n_jobs=2) # Use 2 jobs for CV
        avg_rmse = -np.mean(reg_scores)
        normalized_rmse = avg_rmse / y_reg_std_dev if y_reg_std_dev > 0 else avg_rmse
        reg_fitness_comp = max(0.0, 1.0 - normalized_rmse)
    except Exception as e_reg:
        # print(f"Regressor CV Error: {e_reg}") # Optional debug
        reg_fitness_comp = 0.0

    # --- Combine Scores ---
    combined_fitness = (WEIGHT_CLASSIFICATION * avg_class_score) + \
                       (WEIGHT_REGRESSION * reg_fitness_comp)

    return (combined_fitness,)

# --- DEAP Setup for Feature Selection (Binary Individuals) ---
# Clear previous creations if re-running cell
if "FitnessMaxFS" in creator.__dict__: del creator.FitnessMaxFS
if "IndividualFS" in creator.__dict__: del creator.IndividualFS

creator.create("FitnessMaxFS", base.Fitness, weights=(1.0,)) # Maximize combined score
creator.create("IndividualFS", list, fitness=creator.FitnessMaxFS) # Individual is list of 0s/1s

toolbox_fs = base.Toolbox()
toolbox_fs.register("attr_bool", random.randint, 0, 1)
toolbox_fs.register("individual", tools.initRepeat, creator.IndividualFS, toolbox_fs.attr_bool, N_FEATURES)
toolbox_fs.register("population", tools.initRepeat, list, toolbox_fs.individual)
toolbox_fs.register("evaluate", evaluate_feature_subset,
                    X_data=X_train_proc,
                    y_reg_data=y_train_reg,
                    y_class_data=y_train_class,
                    y_reg_std_dev=y_train_reg_std)
toolbox_fs.register("mate", tools.cxUniform, indpb=0.5)
toolbox_fs.register("mutate", tools.mutFlipBit, indpb=0.05) # Prob per bit
toolbox_fs.register("select", tools.selTournament, tournsize=3)

print("\nDEAP toolbox for Feature Selection configured.")


DEAP toolbox for Feature Selection configured.


In [9]:
# @title 7. Run GA for Feature Selection

print(f"\nStarting GA Feature Selection: Population={POP_SIZE_FS}, Generations={NGEN_FS}")
start_time_fs = time.time()

pop_fs = toolbox_fs.population(n=POP_SIZE_FS)
hof_fs = tools.HallOfFame(1) # Keep only the best

stats_fs = tools.Statistics(lambda ind: ind.fitness.values)
stats_fs.register("avg", np.mean)
stats_fs.register("std", np.std)
stats_fs.register("min", np.min)
stats_fs.register("max", np.max)

# Run the GA
try:
    algorithms.eaSimple(pop_fs, toolbox_fs, cxpb=CXPB_FS, mutpb=MUTPB_FS, ngen=NGEN_FS,
                        stats=stats_fs, halloffame=hof_fs, verbose=True)
except Exception as e_ga:
    print(f"Error during GA execution: {e_ga}")
    traceback.print_exc()
    raise SystemExit("GA failed.")


end_time_fs = time.time()
print(f"GA Feature Selection finished in {end_time_fs - start_time_fs:.2f} seconds.")

# --- Extract Best Feature Set ---
if len(hof_fs) == 0:
     print("ERROR: HallOfFame is empty. GA might not have run correctly or found any valid individuals.")
     # Fallback: Use all features if GA fails? Or stop?
     # selected_feature_indices = list(range(N_FEATURES)) # Option: Use all
     raise SystemExit("GA did not produce a best individual.")
else:
    best_individual_fs = hof_fs[0]
    selected_feature_indices = [i for i, bit in enumerate(best_individual_fs) if bit == 1]
    if not selected_feature_indices:
        print("WARNING: GA selected zero features. Fitness function or GA parameters might need tuning. Using all features as fallback.")
        selected_feature_indices = list(range(N_FEATURES)) # Fallback to all features
    else:
         selected_feature_names = [feature_names_out[i] for i in selected_feature_indices]
         print(f"\nGA selected {len(selected_feature_indices)} features out of {N_FEATURES}.")
         # print("Selected feature names:", selected_feature_names) # Optional


Starting GA Feature Selection: Population=80, Generations=30
gen	nevals	avg     	std     	min     	max    
0  	80    	0.407367	0.048741	0.341118	0.50019
1  	55    	0.444584	0.0448656	0.35882 	0.504514
2  	53    	0.478191	0.0275316	0.393145	0.503519
3  	62    	0.491292	0.0157661	0.402939	0.510553
4  	56    	0.489898	0.0238132	0.389221	0.504431
5  	55    	0.497751	0.00419147	0.48849 	0.508599
6  	56    	0.497397	0.0120198 	0.398392	0.509158
7  	61    	0.499267	0.01184   	0.408951	0.513735
8  	40    	0.501524	0.0114537 	0.407416	0.513735
9  	59    	0.5016  	0.0119445 	0.402569	0.513735
10 	56    	0.500648	0.018493  	0.373412	0.517385
11 	64    	0.502733	0.0155467 	0.376833	0.514163
12 	51    	0.505247	0.00672559	0.466351	0.516157
13 	57    	0.504267	0.0165759 	0.410118	0.516157
14 	47    	0.505227	0.0179147 	0.401243	0.516157
15 	51    	0.508916	0.00540433	0.496442	0.518298
16 	56    	0.507663	0.0162916 	0.370052	0.516157
17 	63    	0.508024	0.0136098 	0.405415	0.516157
18 	51    	0.5093

In [10]:
# @title 8. Train Final Models using Selected Features

print("\n--- Training Final Models on Selected Features ---")

# Select the best features from the training and test sets
try:
    X_train_selected = X_train_proc[:, selected_feature_indices]
    X_test_selected = X_test_proc[:, selected_feature_indices]
    print(f"X_train_selected shape: {X_train_selected.shape}")
    print(f"X_test_selected shape: {X_test_selected.shape}")
except IndexError as e_idx:
     print(f"Error selecting features with indices: {e_idx}")
     print(f"Selected indices: {selected_feature_indices}")
     print(f"X_train_proc shape: {X_train_proc.shape}")
     raise SystemExit("Feature selection failed.")


--- Training Final Models on Selected Features ---
X_train_selected shape: (6000, 50)
X_test_selected shape: (2000, 50)


In [11]:
# --- Train Final Classifier ---
# Use more robust parameters for the final models
final_classifier = RandomForestClassifier(n_estimators=150, # Increased estimators
                                         random_state=42,
                                         n_jobs=-1,
                                         max_depth=18,     # Slightly deeper
                                         min_samples_split=8, # Adjusted
                                         min_samples_leaf=4,  # Added min_samples_leaf
                                         class_weight='balanced' # Add class weighting if data is imbalanced
                                         )
print("Training final classifier...")
final_classifier.fit(X_train_selected, y_train_class)
print("Classifier training complete.")

# --- Train Final Regressor ---
final_regressor = RandomForestRegressor(n_estimators=150, # Increased estimators
                                        random_state=42,
                                        n_jobs=-1,
                                        max_depth=18,    # Slightly deeper
                                        min_samples_split=8, # Adjusted
                                        min_samples_leaf=4   # Added min_samples_leaf
                                        )
print("Training final regressor...")
final_regressor.fit(X_train_selected, y_train_reg)
print("Regressor training complete.")


Training final classifier...
Classifier training complete.
Training final regressor...
Regressor training complete.


In [16]:
from sklearn.metrics import mean_absolute_error


In [17]:
# @title 9. Evaluate Final Models

print("\n--- Evaluating Final CLASSIFIER on Test Set (Selected Features) ---")
try:
    y_pred_class = final_classifier.predict(X_test_selected)
    accuracy = accuracy_score(y_test_class, y_pred_class)
    f1 = f1_score(y_test_class, y_pred_class, average='weighted')
    print(f"Classifier Accuracy: {accuracy:.4f}")
    print(f"Classifier Weighted F1-Score: {f1:.4f}")
    # from sklearn.metrics import classification_report, confusion_matrix
    # print(classification_report(y_test_class, y_pred_class))
    # print(confusion_matrix(y_test_class, y_pred_class))
except Exception as e_eval_clf:
    print(f"Error during classifier evaluation: {e_eval_clf}")


print("\n--- Evaluating Final REGRESSOR on Test Set (Selected Features) ---")
try:
    y_pred_reg = final_regressor.predict(X_test_selected)
    r2 = r2_score(y_test_reg, y_pred_reg)
    rmse = np.sqrt(mean_squared_error(y_test_reg, y_pred_reg))
    mae = mean_absolute_error(y_test_reg, y_pred_reg)
    print(f"Regressor R-squared (R²): {r2:.4f}")
    print(f"Regressor Root Mean Squared Error (RMSE): {rmse:.4f}")
    print(f"Regressor Mean Absolute Error (MAE): {mae:.4f}")
except Exception as e_eval_reg:
     print(f"Error during regressor evaluation: {e_eval_reg}")


--- Evaluating Final CLASSIFIER on Test Set (Selected Features) ---
Classifier Accuracy: 0.8285
Classifier Weighted F1-Score: 0.8217

--- Evaluating Final REGRESSOR on Test Set (Selected Features) ---
Regressor R-squared (R²): 0.5940
Regressor Root Mean Squared Error (RMSE): 4.6490
Regressor Mean Absolute Error (MAE): 3.7374


In [18]:
# @title 10. Save Components & Define Prediction Function

# --- Define Min/Max for Percentage Conversion ---
# Verify these based on your ACTUAL calculate_ccs function logic
THEORETICAL_MIN_CCS = -50 # Example value
THEORETICAL_MAX_CCS = 85  # Example value

def convert_ccs_to_percentage(ccs_score, min_ccs=THEORETICAL_MIN_CCS, max_ccs=THEORETICAL_MAX_CCS):
    """Converts a raw CCS score to a percentage (0-100)."""
    if max_ccs == min_ccs: return 50.0
    clipped_score = np.clip(ccs_score, min_ccs, max_ccs)
    percentage = ((clipped_score - min_ccs) / (max_ccs - min_ccs)) * 100
    return percentage


In [19]:
# --- Save Model Components ---
# Saving preprocessor, selected indices, classifier, regressor, scale factors, threshold
save_components = {
    'preprocessor': preprocessor,
    'selected_feature_indices': selected_feature_indices,
    'classifier': final_classifier,
    'regressor': final_regressor,
    'min_ccs': THEORETICAL_MIN_CCS,
    'max_ccs': THEORETICAL_MAX_CCS,
    'compatibility_threshold': compatibility_threshold
}

# Ensure directory exists (defined earlier) before saving
try:
    with open(model_save_path, 'wb') as f:
        pickle.dump(save_components, f)
    print(f"\nModel components saved successfully to Google Drive: {model_save_path}")
except Exception as e:
    print(f"\nError saving model components: {e}")


# --- Load Model Function Definition ---
def load_combined_model(filepath):
    """Loads the saved model components."""
    if not os.path.exists(filepath):
        print(f"ERROR: Model file not found at {filepath}")
        return None
    try:
        with open(filepath, 'rb') as f:
            components = pickle.load(f)
        print(f"Model components loaded successfully from {filepath}")
        return components
    except Exception as e:
        print(f"Error loading model components: {e}")
        traceback.print_exc()
        return None


# --- Prediction Function Definition ---
def predict_cattle_compatibility(new_data_df, model_components):
    """
    Predicts Yes/No compatibility and percentage score for new cattle data.
    """
    if model_components is None:
        print("ERROR: Model components not loaded."); return None

    try:
        # Extract components
        preprocessor = model_components['preprocessor']
        selected_indices = model_components['selected_feature_indices']
        classifier = model_components['classifier']
        regressor = model_components['regressor']
        min_ccs = model_components['min_ccs']
        max_ccs = model_components['max_ccs']

        # 1. Preprocess the new data
        # Ensure input df has columns expected by preprocessor
        # Handle potential errors during transform
        try:
             X_new_processed = preprocessor.transform(new_data_df)
        except ValueError as ve:
             print(f"ValueError during preprocessing transform: {ve}")
             print("Ensure input DataFrame columns exactly match those used during preprocessor fitting.")
             # Optionally try to get expected columns:
             # if hasattr(preprocessor, 'feature_names_in_'):
             #      print("Preprocessor expected columns:", preprocessor.feature_names_in_)
             # else:
             #      # Need to infer expected columns from transformers if possible
             #      pass
             return None
        except Exception as e_prep:
            print(f"Error during preprocessing transform: {e_prep}")
            traceback.print_exc()
            return None


        # 2. Select the features identified by the GA
        try:
             X_new_selected = X_new_processed[:, selected_indices]
        except IndexError as e_idx_pred:
             print(f"Error selecting features during prediction: {e_idx_pred}")
             print(f"Processed data shape: {X_new_processed.shape}, Selected indices count: {len(selected_indices)}")
             return None


        # 3. Predict Class (0/1)
        class_predictions = classifier.predict(X_new_selected)

        # 4. Predict Raw CCS Score
        ccs_predictions = regressor.predict(X_new_selected)

        # 5. Convert CCS to Percentage
        percentage_predictions = [convert_ccs_to_percentage(score, min_ccs, max_ccs) for score in ccs_predictions]

        # 6. Format Output
        results = []
        for i in range(len(class_predictions)):
            prediction_label = "Yes" if class_predictions[i] == 1 else "No"
            results.append({
                "Prediction": prediction_label,
                "Confidence_Score_Percent": round(percentage_predictions[i], 2),
                "Raw_CCS_Score": round(ccs_predictions[i], 2)
            })
        return results

    except Exception as e_pred:
        print(f"An error occurred during prediction steps: {e_pred}")
        traceback.print_exc()
        return None

print("\nHelper functions for loading and prediction defined.")


Model components saved successfully to Google Drive: /content/drive/MyDrive/MyModels/cattle_predictor_v2.pkl

Helper functions for loading and prediction defined.


In [15]:
# @title 11. Example Usage

print("\n--- Loading saved model and predicting on dummy data ---")
loaded_model = load_combined_model(model_save_path)

if loaded_model:
    # Create dummy data - MUST match columns of original X BEFORE preprocessing
    # Example using minimal representative data
    dummy_data = {
        # --- Numerical ---
        'Cow_Age': [5, 7], 'Cow_Weight': [550.0, 610.0], 'Cow_Height': [130.0, 138.0],
        'Cow_Milk_Yield': [8.5, 7.0], 'Cow_Genetic_Diversity_Score': [7.5, 6.8],
        'Cow_Fertility_Rate': [60.0, 75.0], 'Cow_Breeding_Success_Rate': [50.0, 65.0],
        'Cow_Drought_Resistance': [70.0, 55.0], 'Cow_Disease_Resistance_Score': [6.0, 7.2],
        'Cow_Market_Value': [15000, 21000], 'Cow_Mother_Milk_Yield': [7.0, 8.1],
        'Bull_Age': [4, 6], 'Bull_Weight': [650.0, 710.0], 'Bull_Height': [145.0, 152.0],
        'Bull_Milk_Yield': [np.nan, np.nan], # Use NaN if typically missing
        'Bull_Genetic_Diversity_Score': [8.0, 7.1], 'Bull_Fertility_Rate': [70.0, 65.0],
        'Bull_Breeding_Success_Rate': [60.0, 55.0], 'Bull_Drought_Resistance': [80.0, 60.0],
        'Bull_Disease_Resistance_Score': [7.5, 6.8], 'Bull_Market_Value': [20000, 19000],
        'Bull_Mother_Milk_Yield': [np.nan, np.nan], # Use NaN if typically missing
        # --- Categorical ---
        'Cow_Breed': ['Angus', 'Brahman'], 'Cow_Health_Status': [0, 1], # Assuming numeric encoding used directly if applicable, else string
        'Cow_Temperament': ['Calm', 'Aggressive'], 'Cow_Disease': ['FootRot', 'None'],
        'Cow_Past_Breeding_Success': ['Moderate', 'High'], 'Cow_Same_Parents': [0, 1], # Assuming numeric
        'Bull_Breed': ['Brahman', 'Angus'], 'Bull_Health_Status': [0, 0],
        'Bull_Temperament': ['Aggressive', 'Calm'], 'Bull_Disease': ['None', 'BLV'],
        'Bull_Past_Breeding_Success': ['High', 'Low'], 'Bull_Same_Parents': [0, 0],
        # --- Features from your specific calculate_ccs if used as direct inputs ---
        'Trait_Difference': [15, 22],  # Example
        'Genetic_Diversity': [8.1, 6.5], # Example - Ensure names match columns in X
        # Add *ALL* other columns present in X (before preprocessing) with dummy/NaN values
    }

    # --- Dynamically ensure all columns from training X are present ---
    if 'preprocessor' in loaded_model:
        try:
            # Get expected feature names from the fitted preprocessor
            if hasattr(loaded_model['preprocessor'], 'feature_names_in_'):
                expected_cols = loaded_model['preprocessor'].feature_names_in_
            else: # Fallback if feature_names_in_ not available (older sklearn?)
                  # Infer from transformers if possible - this is more complex
                  # For now, assume X used for fitting is available or handle error
                 print("Warning: Cannot automatically determine expected columns from preprocessor.")
                 # You might need to manually define expected_cols based on your training X here
                 expected_cols = list(X.columns) # Assuming X from Cell 4 is available (less robust)

            print(f"\nPreprocessor expects {len(expected_cols)} columns.")
            dummy_df = pd.DataFrame(dummy_data) # Create initial DF

            missing_in_dummy = [col for col in expected_cols if col not in dummy_df.columns]
            if missing_in_dummy:
                print(f"Adding missing expected columns to dummy data: {missing_in_dummy}")
                for col in missing_in_dummy:
                    dummy_df[col] = np.nan # Add missing columns with NaN

            # Select and potentially reorder columns to match preprocessor's expectation
            try:
                dummy_df = dummy_df[expected_cols]
                print("Dummy data columns aligned with preprocessor expectations.")
            except KeyError as e_key:
                 print(f"KeyError aligning dummy data columns: {e_key}. Check column names.")
                 dummy_df = None # Prevent prediction if alignment fails

        except Exception as e_cols:
             print(f"Error preparing dummy data columns: {e_cols}")
             dummy_df = None
    else:
         print("ERROR: Preprocessor not found in loaded model components.")
         dummy_df = None

    # --- Run Prediction if dummy data is ready ---
    if dummy_df is not None:
        predictions = predict_cattle_compatibility(dummy_df, loaded_model)

        if predictions:
            print("\n--- Predictions for Dummy Data ---")
            results_df = pd.DataFrame(predictions)
            print(results_df.to_string()) # Print full DataFrame results
            # for i, result in enumerate(predictions):
            #     print(f"Pair {i+1}:")
            #     print(f"  Prediction: {result['Prediction']}")
            #     print(f"  Percentage Score: {result['Confidence_Score_Percent']}%")
            #     print(f"  Predicted Raw CCS: {result['Raw_CCS_Score']}")
            #     print("-" * 10)
        else:
            print("Prediction function returned None (failed).")
    else:
         print("Dummy data preparation failed, skipping prediction.")

else:
    print("Could not load model components to run prediction example.")


--- Loading saved model and predicting on dummy data ---
Model components loaded successfully from /content/drive/MyDrive/MyModels/cattle_predictor_v2.pkl

Preprocessor expects 34 columns.
Dummy data columns aligned with preprocessor expectations.

--- Predictions for Dummy Data ---
  Prediction  Confidence_Score_Percent  Raw_CCS_Score
0        Yes                     63.28          35.42
1        Yes                     57.22          27.25




In [20]:
# @title 9. Evaluate Final Models

# --- This section calculates and prints the Classifier's Accuracy ---
print("\n--- Evaluating Final CLASSIFIER on Test Set (Selected Features) ---")

# 1. Predict class labels (0 or 1) on the test set using the selected features
y_pred_class = final_classifier.predict(X_test_selected)

# 2. Calculate accuracy by comparing predictions (y_pred_class) to the true labels (y_test_class)
accuracy = accuracy_score(y_test_class, y_pred_class)

# 3. Calculate F1-score (another useful classification metric)
f1 = f1_score(y_test_class, y_pred_class, average='weighted')

# 4. Print the results
print(f"Classifier Accuracy: {accuracy:.4f}") # <-- THIS IS THE ACCURACY ON THE TEST SET
print(f"Classifier Weighted F1-Score: {f1:.4f}")

# Optional detailed report (currently commented out)
# from sklearn.metrics import classification_report, confusion_matrix
# print(classification_report(y_test_class, y_pred_class))
# print(confusion_matrix(y_test_class, y_pred_class))


# --- This section evaluates the Regressor (predicting the score) ---
print("\n--- Evaluating Final REGRESSOR on Test Set (Selected Features) ---")
# (Code for R², RMSE, MAE follows...)


--- Evaluating Final CLASSIFIER on Test Set (Selected Features) ---
Classifier Accuracy: 0.8285
Classifier Weighted F1-Score: 0.8217

--- Evaluating Final REGRESSOR on Test Set (Selected Features) ---


In [21]:
# @title 11. Example Usage (with Added Sample Pair)

import pandas as pd # Ensure pandas is imported in this scope if needed
import numpy as np  # Ensure numpy is imported

print("\n--- Loading saved model and predicting on dummy data ---")
loaded_model = load_combined_model(model_save_path) # Assumes model_save_path is defined

if loaded_model:
    # --- Define the specific sample pair ---
    sample_pair = {
        'Cow': { 'Breed': 'Gir', 'Age': 6, 'Weight': 450, 'Height': 140, 'Milk_Yield': 8,
                 'Health_Status': 0, 'Drought_Resistance': 70, 'Temperament': 'Calm',
                 # Add other Cow keys expected by X, even if None/NaN in this sample
                 'Genetic_Diversity_Score': np.nan, 'Fertility_Rate': np.nan, 'Breeding_Success_Rate': np.nan,
                 'Disease_Resistance_Score': np.nan, 'Market_Value': np.nan, 'Mother_Milk_Yield': np.nan,
                 'Disease': np.nan, 'Past_Breeding_Success': np.nan, 'Same_Parents': np.nan
               },
        'Bull': {'Breed': 'Jersey', 'Age': 7, 'Weight': 470, 'Height': 142, 'Health_Status': 0,
                 'Mother_Milk_Yield': 9, 'Drought_Resistance': 75, 'Temperament': 'Calm',
                 # Add other Bull keys expected by X, even if None/NaN in this sample
                 'Milk_Yield': np.nan, 'Genetic_Diversity_Score': np.nan, 'Fertility_Rate': np.nan,
                 'Breeding_Success_Rate': np.nan, 'Disease_Resistance_Score': np.nan, 'Market_Value': np.nan,
                 'Disease': np.nan, 'Past_Breeding_Success': np.nan, 'Same_Parents': np.nan
                },
        # Top-level keys expected by X
        'Same_Parents': 0,
        'Trait_Difference': 18,
        'Genetic_Diversity': 8, # This might override individual scores depending on how X was defined
        'Fertility_Rate': 65,
        'Breeding_Success_Rate': 55,
        'Disease_Resistance_Score': 6.5,
        'Market_Value': 25000,
        'Past_Breeding_Success': 'High'
    }

    # --- Flatten the sample_pair into a dictionary matching DataFrame columns ---
    flat_sample = {}
    for prefix, inner_dict in sample_pair.items():
        if isinstance(inner_dict, dict):
            for key, value in inner_dict.items():
                flat_sample[f"{prefix}_{key}"] = value
        else:
            # Handle top-level keys directly
            flat_sample[prefix] = inner_dict

    # Convert the flattened sample to a DataFrame row
    new_pair_df = pd.DataFrame([flat_sample])
    print("Flattened sample pair prepared.")


    # --- Create original dummy data ---
    dummy_data = {
        'Cow_Breed': ['Angus', 'Holstein', 'UnknownBreed'], 'Cow_Age': [5, 6, 7], 'Cow_Weight': [550.0, 600.0, 580.0],
        'Cow_Height': [130.0, 140.0, 135.0], 'Cow_Milk_Yield': [8.5, 9.0, np.nan], 'Cow_Health_Status': [0, 1, 0],
        'Cow_Genetic_Diversity_Score': [7.5, 8.1, 7.0], 'Cow_Fertility_Rate': [60.0, 65.0, 55.0],
        'Cow_Breeding_Success_Rate': [50.0, 55.0, 45.0], 'Cow_Drought_Resistance': [70.0, 60.0, 65.0],
        'Cow_Disease_Resistance_Score': [6.0, 7.0, 5.5], 'Cow_Market_Value': [15000, 18000, 16000],
        'Cow_Temperament': ['Calm', 'Calm', 'Aggressive'], 'Cow_Mother_Milk_Yield': [7.0, 7.5, 6.5],
        'Cow_Disease': ['FootRot', 'Mastitis', np.nan], 'Cow_Past_Breeding_Success': ['Moderate', 'High', 'Low'],
        'Cow_Same_Parents': [0, 0, 1], # Note: This might conflict with top-level Same_Parents if kept
        'Bull_Breed': ['Brahman', 'Angus', 'Brahman'], 'Bull_Age': [4, 5, 6], 'Bull_Weight': [650.0, 680.0, 700.0],
        'Bull_Height': [145.0, 150.0, 155.0], 'Bull_Milk_Yield': [np.nan, np.nan, np.nan],
        'Bull_Health_Status': [0, 0, 1], 'Bull_Genetic_Diversity_Score': [8.0, 7.8, 7.5],
        'Bull_Fertility_Rate': [70.0, 75.0, 68.0], 'Bull_Breeding_Success_Rate': [60.0, 65.0, 58.0],
        'Bull_Drought_Resistance': [80.0, 50.0, 75.0], 'Bull_Disease_Resistance_Score': [7.5, 6.5, 7.0],
        'Bull_Market_Value': [20000, 22000, 21000], 'Bull_Temperament': ['Aggressive', 'Calm', 'Calm'],
        'Bull_Mother_Milk_Yield': [np.nan, np.nan, np.nan], 'Bull_Disease': ['None', 'BLV', 'FootRot'],
        'Bull_Past_Breeding_Success': ['High', 'Moderate', 'Moderate'],
        'Bull_Same_Parents': [0, 1, 0], # Note: This might conflict with top-level Same_Parents if kept
        # --- Top-level combined features (as used in Feature Engineering) ---
        'Same_Parents': [0, 1, 0], # Example values matching number of rows
        'Trait_Difference': [15, 25, 10],
        'Genetic_Diversity': [8, 6, 7.5],
        'Fertility_Rate': [65, 70, 60], # Example values
        'Breeding_Success_Rate': [55, 60, 50], # Example values
        'Disease_Resistance_Score': [6.5, 7.0, 6.0], # Example values
        'Market_Value': [25000, 19000, 23000], # Example values
        # Add *ALL* other columns present in X (before preprocessing)
    }
    original_dummy_df = pd.DataFrame(dummy_data)

    # --- Concatenate the original dummy data and the new sample pair ---
    combined_dummy_df = pd.concat([original_dummy_df, new_pair_df], ignore_index=True)
    print(f"Combined dummy data shape: {combined_dummy_df.shape}")


    # --- Dynamically ensure all columns from training X are present ---
    final_dummy_df = None # Initialize
    if 'preprocessor' in loaded_model:
        try:
            # Get expected feature names from the fitted preprocessor
            if hasattr(loaded_model['preprocessor'], 'feature_names_in_'):
                expected_cols = loaded_model['preprocessor'].feature_names_in_
            else:
                 print("Warning: Cannot automatically determine expected columns from preprocessor. Using columns from Cell 4's X.")
                 # This assumes 'X' from cell 4 is still available and correct. It's less robust.
                 if 'X' in globals():
                     expected_cols = list(X.columns)
                 else:
                     raise ValueError("Original X dataframe not available to determine expected columns.")

            print(f"\nPreprocessor expects {len(expected_cols)} columns for prediction.")

            # Check for missing columns in the combined dummy data
            current_dummy_cols = combined_dummy_df.columns
            missing_in_dummy = [col for col in expected_cols if col not in current_dummy_cols]
            if missing_in_dummy:
                print(f"Adding missing expected columns to combined dummy data: {missing_in_dummy}")
                for col in missing_in_dummy:
                    combined_dummy_df[col] = np.nan # Add missing columns with NaN

            # Select and reorder columns to match preprocessor's expectation
            try:
                final_dummy_df = combined_dummy_df[expected_cols] # Ensure correct order and columns
                print("Combined dummy data columns aligned with preprocessor expectations.")
            except KeyError as e_key:
                 print(f"KeyError aligning dummy data columns: {e_key}. Check column names in dummy data and expected columns.")
                 final_dummy_df = None # Prevent prediction if alignment fails
            except Exception as e_align:
                 print(f"Error aligning dummy data columns: {e_align}")
                 final_dummy_df = None

        except Exception as e_cols:
             print(f"Error preparing dummy data columns: {e_cols}")
             final_dummy_df = None
    else:
         print("ERROR: Preprocessor not found in loaded model components.")
         final_dummy_df = None

    # --- Run Prediction if dummy data is ready ---
    if final_dummy_df is not None:
        predictions = predict_cattle_compatibility(final_dummy_df, loaded_model) # Use the final aligned df

        if predictions:
            print("\n--- Predictions for Dummy Data (including added sample) ---")
            results_df = pd.DataFrame(predictions)
            # Add an identifier column for clarity
            results_df['Source'] = ['Dummy'] * len(original_dummy_df) + ['Added Sample'] * len(new_pair_df)
            print(results_df.to_string()) # Print full DataFrame results
        else:
            print("Prediction function returned None (failed).")
    else:
         print("Dummy data preparation failed, skipping prediction.")

else:
    print("Could not load model components to run prediction example.")


--- Loading saved model and predicting on dummy data ---
Model components loaded successfully from /content/drive/MyDrive/MyModels/cattle_predictor_v2.pkl
Flattened sample pair prepared.
Combined dummy data shape: (4, 42)

Preprocessor expects 34 columns for prediction.
Combined dummy data columns aligned with preprocessor expectations.

--- Predictions for Dummy Data (including added sample) ---
  Prediction  Confidence_Score_Percent  Raw_CCS_Score        Source
0        Yes                     63.28          35.42         Dummy
1        Yes                     62.02          33.72         Dummy
2        Yes                     55.60          25.06         Dummy
3        Yes                     61.05          32.42  Added Sample




In [22]:
# @title 11. Example Usage (with Added Sample Pair & More Detail)

import pandas as pd # Ensure pandas is imported in this scope if needed
import numpy as np  # Ensure numpy is imported

print("\n--- Loading saved model and predicting on dummy data ---")
# Assumes model_save_path is defined in Cell 10 and points to the correct saved file
loaded_model = load_combined_model(model_save_path)

if loaded_model:
    # --- Define Dummy Data (including one with Same_Parents=1) ---
    # Ensure keys match the column names in the *original X dataframe* (before FE/preprocessing)
    dummy_data_list = [
        # Dummy Entry 1 (Different Parents)
        {
            'Cow_Breed': 'Angus', 'Cow_Age': 5, 'Cow_Weight': 550.0, 'Cow_Height': 130.0,
            'Cow_Milk_Yield': 8.5, 'Cow_Health_Status': 0, 'Cow_Genetic_Diversity_Score': 7.5,
            'Cow_Fertility_Rate': 60.0, 'Cow_Breeding_Success_Rate': 50.0, 'Cow_Drought_Resistance': 70.0,
            'Cow_Disease_Resistance_Score': 6.0, 'Cow_Market_Value': 15000, 'Cow_Temperament': 'Calm',
            'Cow_Mother_Milk_Yield': 7.0, 'Cow_Disease': 'FootRot', 'Cow_Past_Breeding_Success': 'Moderate',
            'Bull_Breed': 'Brahman', 'Bull_Age': 4, 'Bull_Weight': 650.0, 'Bull_Height': 145.0,
            'Bull_Milk_Yield': np.nan, 'Bull_Health_Status': 0, 'Bull_Genetic_Diversity_Score': 8.0,
            'Bull_Fertility_Rate': 70.0, 'Bull_Breeding_Success_Rate': 60.0, 'Bull_Drought_Resistance': 80.0,
            'Bull_Disease_Resistance_Score': 7.5, 'Bull_Market_Value': 20000, 'Bull_Temperament': 'Aggressive',
            'Bull_Mother_Milk_Yield': 8.8, 'Bull_Disease': 'None', 'Bull_Past_Breeding_Success': 'High',
            # --- Top-level/Combined Features ---
            'Same_Parents': 0, # Different parents
            'Trait_Difference': 15,
            'Genetic_Diversity': 8.1, # Example overall score
            'Fertility_Rate': 65, # Example overall score
            'Breeding_Success_Rate': 55, # Example overall score
            'Disease_Resistance_Score': 6.8, # Example overall score
            'Market_Value': 17500 # Example overall score - Note: Model might use Cow/Bull MV instead if selected
        },
        # Dummy Entry 2 (Same Parents)
        {
            'Cow_Breed': 'Holstein', 'Cow_Age': 7, 'Cow_Weight': 610.0, 'Cow_Height': 138.0,
            'Cow_Milk_Yield': 7.0, 'Cow_Health_Status': 1, 'Cow_Genetic_Diversity_Score': 6.8,
            'Cow_Fertility_Rate': 75.0, 'Cow_Breeding_Success_Rate': 65.0, 'Cow_Drought_Resistance': 55.0,
            'Cow_Disease_Resistance_Score': 7.2, 'Cow_Market_Value': 21000, 'Cow_Temperament': 'Aggressive',
            'Cow_Mother_Milk_Yield': 8.1, 'Cow_Disease': 'None', 'Cow_Past_Breeding_Success': 'High',
            'Bull_Breed': 'Angus', 'Bull_Age': 6, 'Bull_Weight': 710.0, 'Bull_Height': 152.0,
            'Bull_Milk_Yield': np.nan, 'Bull_Health_Status': 0, 'Bull_Genetic_Diversity_Score': 7.1,
            'Bull_Fertility_Rate': 65.0, 'Bull_Breeding_Success_Rate': 55.0, 'Bull_Drought_Resistance': 60.0,
            'Bull_Disease_Resistance_Score': 6.8, 'Bull_Market_Value': 19000, 'Bull_Temperament': 'Calm',
            'Bull_Mother_Milk_Yield': 7.2, 'Bull_Disease': 'BLV', 'Bull_Past_Breeding_Success': 'Low',
            # --- Top-level/Combined Features ---
            'Same_Parents': 1, # Same parents
            'Trait_Difference': 22,
            'Genetic_Diversity': 6.5,
            'Fertility_Rate': 70,
            'Breeding_Success_Rate': 60,
            'Disease_Resistance_Score': 7.0,
            'Market_Value': 20000
        },
         # Dummy Entry 3 (Missing values test)
        {
            'Cow_Breed': 'Brahman', 'Cow_Age': 8, 'Cow_Weight': 580.0, 'Cow_Height': 135.0,
            'Cow_Milk_Yield': np.nan, 'Cow_Health_Status': 0, 'Cow_Genetic_Diversity_Score': 7.0,
            'Cow_Fertility_Rate': 55.0, 'Cow_Breeding_Success_Rate': 45.0, 'Cow_Drought_Resistance': 65.0,
            'Cow_Disease_Resistance_Score': 5.5, 'Cow_Market_Value': 16000, 'Cow_Temperament': 'Calm',
            'Cow_Mother_Milk_Yield': 6.5, 'Cow_Disease': np.nan, 'Cow_Past_Breeding_Success': 'Low',
            'Bull_Breed': 'Gir', 'Bull_Age': 5, 'Bull_Weight': np.nan, 'Bull_Height': 148.0,
            'Bull_Milk_Yield': np.nan, 'Bull_Health_Status': 1, 'Bull_Genetic_Diversity_Score': 7.9,
            'Bull_Fertility_Rate': 72.0, 'Bull_Breeding_Success_Rate': 62.0, 'Bull_Drought_Resistance': 70.0,
            'Bull_Disease_Resistance_Score': np.nan, 'Bull_Market_Value': 23000, 'Bull_Temperament': 'Calm',
            'Bull_Mother_Milk_Yield': 8.0, 'Bull_Disease': 'None', 'Bull_Past_Breeding_Success': 'Moderate',
            # --- Top-level/Combined Features ---
            'Same_Parents': 0,
            'Trait_Difference': np.nan,
            'Genetic_Diversity': 7.2,
            'Fertility_Rate': 63.5,
            'Breeding_Success_Rate': 53.5,
            'Disease_Resistance_Score': 6.0, # Example overall score
            'Market_Value': 19500
        }
    ]
    # Note: The Feature Engineered columns (FE_...) are NOT included here;
    # they should be calculated from the base columns if needed by the model
    # (They were added to X in Cell 4/5 before preprocessing in the training script)

    # Create DataFrame from the list of dictionaries
    combined_dummy_df = pd.DataFrame(dummy_data_list)
    print(f"Dummy data created. Shape: {combined_dummy_df.shape}")

    # --- Dynamically ensure all columns from training X are present ---
    final_dummy_df = None # Initialize
    if 'preprocessor' in loaded_model:
        try:
            # Get expected feature names from the fitted preprocessor
            if hasattr(loaded_model['preprocessor'], 'feature_names_in_'):
                expected_cols = loaded_model['preprocessor'].feature_names_in_
            else:
                 print("Warning: Cannot automatically determine expected columns from preprocessor. Using columns from Cell 4's X.")
                 if 'X' in globals(): expected_cols = list(X.columns)
                 else: raise ValueError("Original X dataframe not available.")

            print(f"\nPreprocessor expects {len(expected_cols)} columns.")
            # print("Expected columns:", expected_cols) # Uncomment to debug

            # Check for missing columns in the combined dummy data
            current_dummy_cols = combined_dummy_df.columns
            missing_in_dummy = [col for col in expected_cols if col not in current_dummy_cols]
            if missing_in_dummy:
                print(f"Adding missing expected columns to dummy data: {missing_in_dummy}")
                for col in missing_in_dummy:
                    # Add missing columns with NaN (imputer will handle these)
                    combined_dummy_df[col] = np.nan

            # Ensure columns are in the correct order
            try:
                # Select only the expected columns in the correct order
                final_dummy_df = combined_dummy_df[expected_cols]
                print("Dummy data columns aligned with preprocessor expectations.")
                # print("Final dummy columns:", final_dummy_df.columns.tolist()) # Uncomment to debug
            except KeyError as e_key:
                 print(f"KeyError aligning dummy data columns: {e_key}. Check names.")
                 final_dummy_df = None
            except Exception as e_align:
                 print(f"Error aligning dummy data columns: {e_align}")
                 final_dummy_df = None

        except Exception as e_cols:
             print(f"Error preparing dummy data columns: {e_cols}")
             final_dummy_df = None
    else:
         print("ERROR: Preprocessor not found in loaded model components.")
         final_dummy_df = None

    # --- Run Prediction if dummy data is ready ---
    if final_dummy_df is not None:
        predictions = predict_cattle_compatibility(final_dummy_df, loaded_model) # Use the final aligned df

        if predictions:
            print("\n--- Predictions for Dummy Data ---")
            results_df = pd.DataFrame(predictions)
            # Add index for clarity if needed
            # results_df.index = [f"Pair {i+1}" for i in range(len(results_df))]
            print(results_df.to_string()) # Print full DataFrame results
        else:
            print("Prediction function returned None (failed).")
    else:
         print("Dummy data preparation failed, skipping prediction.")

else:
    print("Could not load model components to run prediction example.")


--- Loading saved model and predicting on dummy data ---
Model components loaded successfully from /content/drive/MyDrive/MyModels/cattle_predictor_v2.pkl
Dummy data created. Shape: (3, 39)

Preprocessor expects 34 columns.
Adding missing expected columns to dummy data: ['Cow_Same_Parents', 'Bull_Same_Parents']
Dummy data columns aligned with preprocessor expectations.

--- Predictions for Dummy Data ---
  Prediction  Confidence_Score_Percent  Raw_CCS_Score
0        Yes                     59.70          30.60
1        Yes                     57.22          27.25
2        Yes                     55.90          25.47




In [24]:
# @title 11. Example Usage (with Multiple Added Sample Pairs)

import pandas as pd # Ensure pandas is imported
import numpy as np  # Ensure numpy is imported

print("\n--- Loading saved model and predicting on dummy data ---")
# Assumes model_save_path is defined in Cell 10 and points to the correct saved file
loaded_model = load_combined_model(model_save_path)

if loaded_model:
    # --- Define the specific sample pairs ---

    # Pair 1: "Bad Pair" (from previous request)
    pair1_bad_data = {
        'Cow_Breed': 'Sahiwal', 'Cow_Age': 4, 'Cow_Weight': 350, 'Cow_Height': 130,
        'Cow_Milk_Yield': 4, 'Cow_Health_Status': 2, 'Cow_Temperament': 'Aggressive',
        'Cow_Genetic_Diversity_Score': 3, 'Cow_Fertility_Rate': 50, 'Cow_Breeding_Success_Rate': 45,
        'Cow_Drought_Resistance': 80, 'Cow_Disease_Resistance_Score': 4, 'Cow_Market_Value': 15000,
        'Cow_Mother_Milk_Yield': np.nan, 'Cow_Disease': np.nan, 'Cow_Past_Breeding_Success': 'Low',
        'Bull_Breed': 'Sahiwal', 'Bull_Age': 15, 'Bull_Weight': 550, 'Bull_Height': 160,
        'Bull_Milk_Yield': np.nan, 'Bull_Health_Status': 1, 'Bull_Temperament': 'Aggressive',
        'Bull_Mother_Milk_Yield': 3, 'Bull_Genetic_Diversity_Score': 3, 'Bull_Fertility_Rate': 50,
        'Bull_Breeding_Success_Rate': 45, 'Bull_Drought_Resistance': 80,
        'Bull_Disease_Resistance_Score': 4, 'Bull_Market_Value': 15000, 'Bull_Disease': np.nan,
        'Bull_Past_Breeding_Success': 'Low',
        'Same_Parents': 1, 'Trait_Difference': 40, 'Genetic_Diversity': 3, 'Fertility_Rate': 50,
        'Breeding_Success_Rate': 45, 'Disease_Resistance_Score': 4, 'Market_Value': 15000,
        'Past_Breeding_Success': 'Low'
        # Add ALL other columns expected by X with np.nan if not specified above
    }
    pair1_df = pd.DataFrame([pair1_bad_data])

    # Pair 2: Very Bad Pair (Same Parents, Sick, Low Stats)
    pair2_data = {
        'Cow_Breed': 'Jersey', 'Cow_Age': 3, 'Cow_Weight': 300, 'Cow_Height': 125,
        'Cow_Milk_Yield': 3, 'Cow_Health_Status': 2, 'Cow_Temperament': 'Aggressive',
        'Cow_Genetic_Diversity_Score': 2, 'Cow_Fertility_Rate': 45, 'Cow_Breeding_Success_Rate': 40,
        'Cow_Drought_Resistance': 20, 'Cow_Disease_Resistance_Score': 3, 'Cow_Market_Value': 12000,
        'Cow_Mother_Milk_Yield': np.nan, 'Cow_Disease': np.nan, 'Cow_Past_Breeding_Success': 'Low',
        'Bull_Breed': 'Jersey', 'Bull_Age': 10, 'Bull_Weight': 500, 'Bull_Height': 155,
        'Bull_Milk_Yield': np.nan, 'Bull_Health_Status': 2, 'Bull_Temperament': 'Aggressive',
        'Bull_Mother_Milk_Yield': 4, 'Bull_Genetic_Diversity_Score': 2, 'Bull_Fertility_Rate': 45,
        'Bull_Breeding_Success_Rate': 40, 'Bull_Drought_Resistance': 20,
        'Bull_Disease_Resistance_Score': 3, 'Bull_Market_Value': 12000, 'Bull_Disease': np.nan,
        'Bull_Past_Breeding_Success': 'Low',
        'Same_Parents': 1, 'Trait_Difference': 35, 'Genetic_Diversity': 2, 'Fertility_Rate': 45,
        'Breeding_Success_Rate': 40, 'Disease_Resistance_Score': 3, 'Market_Value': 12000,
        'Past_Breeding_Success': 'Low'
        # Add ALL other columns expected by X with np.nan if not specified above
    }
    pair2_df = pd.DataFrame([pair2_data])

    # Pair 3: Different Parents, Mixed Health/Stats
    pair3_data = {
        'Cow_Breed': 'Holstein', 'Cow_Age': 7, 'Cow_Weight': 450, 'Cow_Height': 140,
        'Cow_Milk_Yield': 6, 'Cow_Health_Status': 1, 'Cow_Temperament': 'Neutral', # Handle 'Neutral' if needed
        'Cow_Genetic_Diversity_Score': 4, 'Cow_Fertility_Rate': 55, 'Cow_Breeding_Success_Rate': 35,
        'Cow_Drought_Resistance': 80, 'Cow_Disease_Resistance_Score': 4, 'Cow_Market_Value': 18000,
        'Cow_Mother_Milk_Yield': np.nan, 'Cow_Disease': np.nan, 'Cow_Past_Breeding_Success': 'Moderate',
        'Bull_Breed': 'Gir', 'Bull_Age': 14, 'Bull_Weight': 700, 'Bull_Height': 175,
        'Bull_Milk_Yield': np.nan, 'Bull_Health_Status': 2, 'Bull_Temperament': 'Aggressive',
        'Bull_Mother_Milk_Yield': 4, 'Bull_Genetic_Diversity_Score': 4, 'Bull_Fertility_Rate': 55,
        'Bull_Breeding_Success_Rate': 35, 'Bull_Drought_Resistance': 80,
        'Bull_Disease_Resistance_Score': 4, 'Bull_Market_Value': 18000, 'Bull_Disease': np.nan,
        'Bull_Past_Breeding_Success': 'Moderate',
        'Same_Parents': 0, 'Trait_Difference': 32, 'Genetic_Diversity': 4, 'Fertility_Rate': 55,
        'Breeding_Success_Rate': 35, 'Disease_Resistance_Score': 4, 'Market_Value': 18000,
        'Past_Breeding_Success': 'Moderate'
        # Add ALL other columns expected by X with np.nan if not specified above
    }
    # Handle 'Neutral' Temperament - map it to known category or NaN if encoder can't handle it
    if 'Cow_Temperament' in pair3_data and pair3_data['Cow_Temperament'] == 'Neutral':
        # Option 1: Map to 'Calm' or 'Aggressive' if appropriate
        # pair3_data['Cow_Temperament'] = 'Calm'
        # Option 2: Map to NaN so imputer makes it 'Missing'
         pair3_data['Cow_Temperament'] = np.nan
         print("Note: Mapping 'Neutral' temperament to NaN for preprocessing.")

    pair3_df = pd.DataFrame([pair3_data])


    # --- Create original dummy data (from previous example, keep maybe 1 good one) ---
    original_dummy_data_list = [
        # Dummy Entry 1 (Good, Different Parents)
        {
            'Cow_Breed': 'Angus', 'Cow_Age': 5, 'Cow_Weight': 550.0, 'Cow_Height': 130.0,
            'Cow_Milk_Yield': 8.5, 'Cow_Health_Status': 0, 'Cow_Genetic_Diversity_Score': 7.5,
            'Cow_Fertility_Rate': 60.0, 'Cow_Breeding_Success_Rate': 50.0, 'Cow_Drought_Resistance': 70.0,
            'Cow_Disease_Resistance_Score': 6.0, 'Cow_Market_Value': 15000, 'Cow_Temperament': 'Calm',
            'Cow_Mother_Milk_Yield': 7.0, 'Cow_Disease': 'FootRot', 'Cow_Past_Breeding_Success': 'Moderate',
            'Bull_Breed': 'Brahman', 'Bull_Age': 4, 'Bull_Weight': 650.0, 'Bull_Height': 145.0,
            'Bull_Milk_Yield': np.nan, 'Bull_Health_Status': 0, 'Bull_Genetic_Diversity_Score': 8.0,
            'Bull_Fertility_Rate': 70.0, 'Bull_Breeding_Success_Rate': 60.0, 'Bull_Drought_Resistance': 80.0,
            'Bull_Disease_Resistance_Score': 7.5, 'Bull_Market_Value': 20000, 'Bull_Temperament': 'Aggressive',
            'Bull_Mother_Milk_Yield': 8.8, 'Bull_Disease': 'None', 'Bull_Past_Breeding_Success': 'High',
            'Same_Parents': 0, 'Trait_Difference': 15, 'Genetic_Diversity': 8.1, 'Fertility_Rate': 65,
            'Breeding_Success_Rate': 55, 'Disease_Resistance_Score': 6.8, 'Market_Value': 17500,
            'Past_Breeding_Success': 'Moderate'
        },
    ]
    original_dummy_df = pd.DataFrame(original_dummy_data_list)

    # --- Concatenate the original dummy data and the new sample pairs ---
    combined_dummy_df = pd.concat([original_dummy_df, pair1_df, pair2_df, pair3_df], ignore_index=True)
    print(f"Combined dummy data shape (with all samples): {combined_dummy_df.shape}")


    # --- Dynamically ensure all columns from training X are present ---
    final_dummy_df = None # Initialize
    if 'preprocessor' in loaded_model:
        try:
            # Get expected feature names
            if hasattr(loaded_model['preprocessor'], 'feature_names_in_'):
                expected_cols = loaded_model['preprocessor'].feature_names_in_
            else:
                 print("Warning: Cannot determine expected columns. Using columns from Cell 4's X.")
                 if 'X' in globals(): expected_cols = list(X.columns)
                 else: raise ValueError("Original X dataframe not available.")

            print(f"\nPreprocessor expects {len(expected_cols)} columns.")

            # Check for missing columns
            current_dummy_cols = combined_dummy_df.columns
            missing_in_dummy = [col for col in expected_cols if col not in current_dummy_cols]
            if missing_in_dummy:
                print(f"Adding missing expected columns: {missing_in_dummy}")
                for col in missing_in_dummy:
                    combined_dummy_df[col] = np.nan

            # Align columns
            try:
                final_dummy_df = combined_dummy_df[expected_cols]
                print("Dummy data columns aligned.")
            except KeyError as e_key:
                 print(f"KeyError aligning dummy data columns: {e_key}. Check names.")
                 final_dummy_df = None
            except Exception as e_align:
                 print(f"Error aligning dummy data columns: {e_align}")
                 final_dummy_df = None
        except Exception as e_cols:
             print(f"Error preparing dummy data columns: {e_cols}")
             final_dummy_df = None
    else:
         print("ERROR: Preprocessor not found in loaded model.")
         final_dummy_df = None

    # --- Run Prediction ---
    if final_dummy_df is not None:
        predictions = predict_cattle_compatibility(final_dummy_df, loaded_model)

        if predictions:
            print("\n--- Predictions for Dummy Data (including added samples) ---")
            results_df = pd.DataFrame(predictions)
            # Add labels to identify the pairs
            num_original = len(original_dummy_df)
            num_pair1 = len(pair1_df)
            num_pair2 = len(pair2_df)
            num_pair3 = len(pair3_df) # Should be 1
            labels = (['Original Dummy'] * num_original +
                      ['Bad Pair 1'] * num_pair1 +
                      ['Bad Pair 2 (Same Parent)'] * num_pair2 +
                      ['Mixed Pair 3'] * num_pair3)
            results_df['Source'] = labels[:len(results_df)] # Ensure labels match length

            print(results_df.to_string())
        else:
            print("Prediction function returned None (failed).")
    else:
         print("Dummy data preparation failed.")

else:
    print("Could not load model components.")


--- Loading saved model and predicting on dummy data ---
Model components loaded successfully from /content/drive/MyDrive/MyModels/cattle_predictor_v2.pkl
Note: Mapping 'Neutral' temperament to NaN for preprocessing.
Combined dummy data shape (with all samples): (4, 40)

Preprocessor expects 34 columns.
Adding missing expected columns: ['Cow_Same_Parents', 'Bull_Same_Parents']
Dummy data columns aligned.

--- Predictions for Dummy Data (including added samples) ---
  Prediction  Confidence_Score_Percent  Raw_CCS_Score                    Source
0        Yes                     59.70          30.60            Original Dummy
1         No                     50.08          17.61                Bad Pair 1
2         No                     51.06          18.93  Bad Pair 2 (Same Parent)
3         No                     50.83          18.62              Mixed Pair 3


