## Setting up and Installing Dependencies

In [1]:
# ✅ Install Required Libraries
!pip install ultralytics opencv-python-headless pandas numpy scikit-learn xgboost

# ✅ Import Libraries
import os
import glob
import cv2
import numpy as np
import pandas as pd
import torch
from ultralytics import YOLO
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
import matplotlib.pyplot as plt




In [13]:
# ✅ Enable GPU in Google Colab
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")


Using device: cuda


### Load Yolo11 Models from Google Drive

In [17]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [18]:
from ultralytics import YOLO
import os

# ✅ Set the correct model path
model_path = "/content/drive/MyDrive/cassava models/Models/"

# ✅ Verify that the files exist before loading
models = ["best_early.pt", "best_late.pt", "best_full.pt"]
for model_name in models:
    full_path = os.path.join(model_path, model_name)
    if not os.path.exists(full_path):
        raise FileNotFoundError(f"Error: {full_path} not found. Check if the file is correctly uploaded!")

# ✅ Load YOLO Models with Corrected Path
model_early = YOLO(model_path + "best_early.pt").to('cuda')
model_late = YOLO(model_path + "best_late.pt").to('cuda')
model_full = YOLO(model_path + "best_full.pt").to('cuda')

# ✅ Choose the model to use
selected_model = model_full  # Change if needed

print("✅ Models loaded successfully!")


✅ Models loaded successfully!


In [None]:
# ✅ Paths to Training & Testing Data
data_path = "/content/drive/MyDrive/cassava models/data/"
train_image_dir = os.path.join(data_path, "train/A6dzrkjqvl")  # Folder containing training images
test_image_dir = os.path.join(data_path, "test/A6dzrkjqvl")    # Folder containing test images
train_label_path = os.path.join(data_path, "Train.csv")  # CSV with 'image_id' & 'RootVolume'

# ✅ Confirm Paths Exist
print("Checking Directories:")
print("Model Path Exists:", os.path.exists(model_path))
print("Train Image Directory Exists:", os.path.exists(train_image_dir))
print("Test Image Directory Exists:", os.path.exists(test_image_dir))
print("Train Labels CSV Exists:", os.path.exists(train_label_path))

# ✅ List some sample files
print("Sample Training Images:", os.listdir(train_image_dir)[:5])  # Show first 5 images
print("Sample Test Images:", os.listdir(test_image_dir)[:5])  # Show first 5 images


Checking Directories:
Model Path Exists: True
Train Image Directory Exists: True
Test Image Directory Exists: True
Train Labels CSV Exists: True
Sample Training Images: ['A6dzrkjqvl_L_039.png', 'A6dzrkjqvl_L_027.png', 'A6dzrkjqvl_L_033.png', 'A6dzrkjqvl_L_018.png', 'A6dzrkjqvl_L_003.png']
Sample Test Images: ['A6dzrkjqvl_L_080.png', 'A6dzrkjqvl_L_046.png', 'A6dzrkjqvl_L_074.png', 'A6dzrkjqvl_L_026.png', 'A6dzrkjqvl_L_009.png']


In [None]:
import glob

# ✅ Get List of Image Paths (Recursively from Subfolders)
train_images = sorted(glob.glob(os.path.join(train_image_dir, "**", "*.png"), recursive=True))
test_images = sorted(glob.glob(os.path.join(test_image_dir, "**", "*.png"), recursive=True))

# ✅ Check if images are now detected
print(f"✅ Found {len(train_images)} training images and {len(test_images)} test images.")


✅ Found 164 training images and 164 test images.


In [None]:
def extract_features(model, image_paths):
    """
    Extracts features from images using YOLOv11 for instance segmentation.
    Features:
      - num_detections: Number of root segments detected.
      - total_area: Sum of bounding box areas.
      - avg_size: Average bounding box size per detection.
    """
    feature_list = []
    for img_path in image_paths:
        img = cv2.imread(img_path)
        results = model.predict(img, conf=0.25)

        # Extract Bounding Box Features
        num_detections = len(results[0].boxes)
        total_area = sum((box.xyxy[0][2] - box.xyxy[0][0]) * (box.xyxy[0][3] - box.xyxy[0][1]) for box in results[0].boxes)
        avg_size = total_area / num_detections if num_detections > 0 else 0

        # Store Features
        feature_list.append({
            "image_id": os.path.basename(img_path).replace(".jpg", ""),
            "num_detections": num_detections,
            "total_area": total_area,
            "avg_size": avg_size
        })

    return pd.DataFrame(feature_list)

# ✅ Extract Features from Training & Testing Data
print("Extracting features from training images...")
train_features = extract_features(selected_model, train_images)

print("Extracting features from test images...")
test_features = extract_features(selected_model, test_images)


Extracting features from training images...

0: 32x640 3 roots, 29.8ms
Speed: 2.9ms preprocess, 29.8ms inference, 2.3ms postprocess per image at shape (1, 3, 32, 640)

0: 32x640 2 roots, 68.4ms
Speed: 0.9ms preprocess, 68.4ms inference, 5.8ms postprocess per image at shape (1, 3, 32, 640)

0: 32x640 2 roots, 24.1ms
Speed: 2.4ms preprocess, 24.1ms inference, 1.8ms postprocess per image at shape (1, 3, 32, 640)

0: 32x640 (no detections), 24.7ms
Speed: 0.9ms preprocess, 24.7ms inference, 0.4ms postprocess per image at shape (1, 3, 32, 640)

0: 32x640 (no detections), 25.0ms
Speed: 0.7ms preprocess, 25.0ms inference, 0.4ms postprocess per image at shape (1, 3, 32, 640)

0: 32x640 (no detections), 23.7ms
Speed: 1.8ms preprocess, 23.7ms inference, 0.4ms postprocess per image at shape (1, 3, 32, 640)

0: 32x640 1 root, 23.6ms
Speed: 1.8ms preprocess, 23.6ms inference, 1.7ms postprocess per image at shape (1, 3, 32, 640)

0: 32x640 (no detections), 24.0ms
Speed: 1.4ms preprocess, 24.0ms infer

# Sort, Filter and Merge Training Data

In [None]:
# ✅ Load Training Labels
train_labels = pd.read_csv(train_label_path)

# ✅ Remove duplicate column names if present
train_labels = train_labels.loc[:, ~train_labels.columns.duplicated()]

# ✅ Rename 'FolderName' to 'image_id' to match extracted features
train_labels.rename(columns={'FolderName': 'image_id'}, inplace=True)

# ✅ Ensure 'image_id' is unique before merging
train_labels = train_labels.drop_duplicates(subset=["image_id"])

# ✅ Print train_labels sample
print("Sample train_labels after renaming:")
print(train_labels.head())

# ✅ Merge Features with Labels
train_data = pd.merge(train_features, train_labels, on="image_id", how="inner")

# ✅ Check if merging was successful
print(f"✅ Matching IDs after fix: {len(train_data)}")


Sample train_labels after renaming:
                 ID    image_id  PlantNumber Side  Start  End  RootVolume  \
0  ID_826322_Lbmaya  A6dzrkjqvl            3    L     33   42         0.9   
1  ID_718181_Bslpwx  Ypktwvqjbn            7    L     33   41         1.5   
2  ID_465762_L1n61d  Ox18ob0syv            4    R     21   28         2.7   
3  ID_626872_Pbmx2e  Hqcekwpxgu            2    R     30   39         2.6   
4  ID_518846_Opko8c  Ummqfuetoc            1    R     17   26         2.7   

             Genotype  Stage  
0  IITA-TMS-IBA000070  Early  
1           IBA154810  Early  
2           IBA980581  Early  
3  IITA-TMS-IBA000070  Early  
4           IBA980581  Early  
✅ Matching IDs after fix: 0


In [None]:
print("Unique image_id in train_features:", train_features["image_id"].nunique())
print("Unique image_id in train_labels:", train_labels["image_id"].nunique())

print("Matching IDs before merging:", len(set(train_labels["image_id"]).intersection(set(train_features["image_id"]))))


Unique image_id in train_features: 164
Unique image_id in train_labels: 98
Matching IDs before merging: 0


In [None]:
print("Sample image_id in train_features:")
print(train_features["image_id"].unique()[:5])  # First 5 unique IDs

print("Sample image_id in train_labels:")
print(train_labels["image_id"].unique()[:5])  # First 5 unique IDs


Sample image_id in train_features:
['A6dzrkjqvl_L_001.png' 'A6dzrkjqvl_L_002.png' 'A6dzrkjqvl_L_003.png' 'A6dzrkjqvl_L_004.png' 'A6dzrkjqvl_L_005.png']
Sample image_id in train_labels:
['A6dzrkjqvl' 'Ypktwvqjbn' 'Ox18ob0syv' 'Hqcekwpxgu' 'Ummqfuetoc']


In [None]:
import re

# ✅ Extract only Folder Name by removing `_L_###` and the file extension
train_features["image_id"] = train_features["image_id"].apply(lambda x: re.sub(r"_L_\d+", "", x))

# ✅ Convert to lowercase to avoid case mismatches
train_features["image_id"] = train_features["image_id"].str.lower()
train_labels["image_id"] = train_labels["image_id"].str.lower()

# ✅ Check if IDs match after fix
print("Matching IDs after fix:", len(set(train_labels["image_id"]).intersection(set(train_features["image_id"]))))


Matching IDs after fix: 0


In [None]:
import re

# ✅ Remove "_R_###" and file extensions from image_id
train_features["image_id"] = train_features["image_id"].apply(lambda x: re.sub(r"_r_\d+", "", x))
train_features["image_id"] = train_features["image_id"].apply(lambda x: os.path.splitext(x)[0])  # Remove .png/.jpg extensions

# ✅ Convert to lowercase to ensure consistency
train_features["image_id"] = train_features["image_id"].str.lower()
train_labels["image_id"] = train_labels["image_id"].str.lower()

# ✅ Check if IDs match after fix
print("Fixed train_features image_id (after removing _R_###):")
print(train_features["image_id"].unique()[:10])

print("Fixed train_labels image_id:")
print(train_labels["image_id"].unique()[:10])


Fixed train_features image_id (after removing _R_###):
['a6dzrkjqvl']
Fixed train_labels image_id:
['a6dzrkjqvl' 'ypktwvqjbn' 'ox18ob0syv' 'hqcekwpxgu' 'ummqfuetoc' 'b5myqsh1wi' 'l8w7zu7wek' 'izbgyxre0g' 'vinlgebupo' 'hc3b9gicdo']


In [None]:
print("Unique image_id in train_features:", train_features["image_id"].nunique())
print("Unique image_id in train_labels:", train_labels["image_id"].nunique())

# ✅ Confirm how many IDs match
matching_ids = len(set(train_labels["image_id"]).intersection(set(train_features["image_id"])))
print(f"✅ Matching IDs after fix: {matching_ids}")


Unique image_id in train_features: 1
Unique image_id in train_labels: 98
✅ Matching IDs after fix: 1


In [None]:
# ✅ Filter train_labels to include only 'a6dzrkjqvl'
train_labels = train_labels[train_labels["image_id"] == "a6dzrkjqvl"]

# ✅ Print to confirm the filtering
print("Filtered train_labels:")
print(train_labels.head())


Filtered train_labels:
                 ID    image_id  PlantNumber Side  Start  End  RootVolume  \
0  ID_826322_Lbmaya  a6dzrkjqvl            3    L     33   42         0.9   

             Genotype  Stage  
0  IITA-TMS-IBA000070  Early  


In [None]:
# ✅ Check if IDs match
matching_ids = len(set(train_labels["image_id"]).intersection(set(train_features["image_id"])))
print(f"✅ Matching IDs after filtering: {matching_ids}")


✅ Matching IDs after filtering: 1


In [None]:
# ✅ Merge filtered data
train_data = pd.merge(train_features, train_labels, on="image_id", how="inner")

# ✅ Print merge success message
print(f"✅ Successfully merged {len(train_data)} records for folder 'a6dzrkjqvl'.")


✅ Successfully merged 164 records for folder 'a6dzrkjqvl'.


In [None]:
# ✅ Select feature columns for training
X = train_data[["num_detections", "total_area", "avg_size"]]

# ✅ Target variable (RootVolume)
y = train_data["RootVolume"]

# ✅ Print dataset shapes
print(f"Features shape: {X.shape}, Target shape: {y.shape}")


Features shape: (164, 3), Target shape: (164,)


In [None]:
import torch

# ✅ Convert all tensor values to floats
train_data["total_area"] = train_data["total_area"].apply(lambda x: x.item() if isinstance(x, torch.Tensor) else x)
train_data["avg_size"] = train_data["avg_size"].apply(lambda x: x.item() if isinstance(x, torch.Tensor) else x)

# ✅ Verify Data Types Again
print(train_data.dtypes)


image_id           object
num_detections      int64
total_area        float64
avg_size          float64
ID                 object
PlantNumber         int64
Side               object
Start               int64
End                 int64
RootVolume        float64
Genotype           object
Stage              object
dtype: object


In [None]:
# Keep only numeric columns + image_id + RootVolume
train_data_fixed = train_data[["image_id", "RootVolume", "total_area", "avg_size"]].copy()

# Check final dataset
print(train_data_fixed.head())


     image_id  RootVolume   total_area    avg_size
0  a6dzrkjqvl         0.9  1930.466309  643.488770
1  a6dzrkjqvl         0.9  1522.300415  761.150208
2  a6dzrkjqvl         0.9  1482.281128  741.140564
3  a6dzrkjqvl         0.9     0.000000    0.000000
4  a6dzrkjqvl         0.9     0.000000    0.000000


# Train the Model

In [None]:
model = XGBRegressor(n_estimators=500, learning_rate=0.05, max_depth=6)


In [None]:
print("Unique y_train values:", set(y_train))


NameError: name 'y_train' is not defined

In [None]:
print(y_train.value_counts())  # See how many times each value appears
print(y_train.describe())  # Check mean, min, and max


RootVolume
0.9    11
Name: count, dtype: int64
count    1.100000e+01
mean     9.000000e-01
std      2.328823e-16
min      9.000000e-01
25%      9.000000e-01
50%      9.000000e-01
75%      9.000000e-01
max      9.000000e-01
Name: RootVolume, dtype: float64


In [None]:
print(train_data_fixed["RootVolume"].value_counts())


RootVolume
0.9    164
Name: count, dtype: int64


In [None]:
X_train = X_train[(X_train["total_area"] > 0) & (X_train["avg_size"] > 0)]
y_train = y_train.loc[X_train.index]  # Keep matching y_train values


In [None]:
X_train["total_area"] = np.log1p(X_train["total_area"])  # log(1 + x)
X_train["avg_size"] = np.log1p(X_train["avg_size"])


In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train[["total_area", "avg_size"]] = scaler.fit_transform(X_train[["total_area", "avg_size"]])
X_val[["total_area", "avg_size"]] = scaler.transform(X_val[["total_area", "avg_size"]])


In [None]:

# Step 2: Initialize the Random Forest Regressor
# Random Forest Hyperparameters (Tune based on your data)
model = RandomForestRegressor(n_estimators=100, random_state=42, max_depth=10, min_samples_split=10)

# Step 3: Fit the model to training data
model.fit(X_train, y_train)

# Step 4: Model Evaluation using Cross-Validation
cv_scores = cross_val_score(model, X_train, y_train, cv=3, scoring='neg_mean_squared_error')

# Print Cross-Validation RMSE
print("Average RMSE from cross-validation: ", np.mean(np.sqrt(-cv_scores)))

# Step 5: Evaluate Model Performance
# Make predictions
y_pred = model.predict(X_test)

# Calculate RMSE for test data
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Test RMSE: ", rmse)

# Step 6: Visualize Predictions vs True Values
plt.scatter(y_test, y_pred)
plt.xlabel('True Values')
plt.ylabel('Predictions')
plt.title('True vs Predicted Volume')
plt.show()


Average RMSE from cross-validation:  7.771561172376096e-16


NameError: name 'X_test' is not defined

In [None]:
# Predict and evaluate RMSE
y_pred = model.predict(X_val)
rmse = mean_squared_error(y_val, y_pred) ** 0.5  # Manually take square root

print(f"✅ RMSE: {rmse:.4f}")


NotFittedError: need to call fit or load_model beforehand

In [None]:
print("Sample y_val:", y_val[:10].tolist())
print("Sample y_pred:", y_pred[:10].tolist())


Sample y_val: [0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9]
Sample y_pred: [0.8999999761581421, 0.8999999761581421, 0.8999999761581421, 0.8999999761581421, 0.8999999761581421, 0.8999999761581421, 0.8999999761581421, 0.8999999761581421, 0.8999999761581421, 0.8999999761581421]


In [None]:
print(X_train.describe())


       total_area      avg_size
count   15.000000  1.500000e+01
mean     0.000000  1.480297e-17
std      1.035098  1.035098e+00
min     -2.155628 -2.310451e+00
25%     -0.014388  2.740422e-01
50%      0.475530  3.913931e-01
75%      0.614823  4.647092e-01
max      1.290258  9.796889e-01


# Tracking using Logs

In [None]:
import logging

# Set up logging
logging.basicConfig(filename="model_training.log", level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# Example logging during training
logging.info("Starting model training...")

for epoch in range(10):  # Example loop
    loss = 0.05 * epoch  # Simulated loss
    logging.info(f"Epoch {epoch+1}: Loss = {loss:.4f}")

logging.info("Training completed!")
