In [1]:
!pip install transformers 
!pip install protobuf==3.20.3

Collecting protobuf==3.20.3
  Downloading protobuf-3.20.3-py2.py3-none-any.whl.metadata (720 bytes)
Downloading protobuf-3.20.3-py2.py3-none-any.whl (162 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.1/162.1 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: protobuf
  Attempting uninstall: protobuf
    Found existing installation: protobuf 6.33.0
    Uninstalling protobuf-6.33.0:
      Successfully uninstalled protobuf-6.33.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.12.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
opentelemetry-proto 1.37.0 requires protobuf<7.0,>=5.0, but you have protobuf 3.20.3 which is incompatible.
onnx 1.18.0 requires protobuf>=4.25.1, but you have protobuf 3.20.3 which is incompatible.
a2a-sdk 0.3.10 requires p

In [2]:
import pandas as pd 
import numpy as np 
import os 
import glob 
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR
import timm # For pre-trained models
from tqdm import tqdm # For progress bars
from PIL import Image



In [3]:
import torch
from transformers import AutoImageProcessor, AutoModel

# Local directory holding the pre-downloaded DINOv2 checkpoint.
# The exact path may differ depending on which model dataset is attached.
MODEL_PATH = '/kaggle/input/dinov2/pytorch/small/1' 

# Load the image processor from the local files.
# use_fast=False pins the slow processor this checkpoint was saved with:
# transformers warns that the default switches to the fast processor in v4.52,
# which would introduce minor differences in the preprocessed inputs.
processor = AutoImageProcessor.from_pretrained(MODEL_PATH, use_fast=False)

# Load the model weights from the same local directory.
model = AutoModel.from_pretrained(MODEL_PATH)

# The model is only used for inference here, so put it in evaluation mode.
model.eval()
print("DINOv2 model loaded successfully from local files.")

2025-11-18 10:15:23.123789: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763460923.320252      13 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763460923.379337      13 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


DINOv2 model loaded successfully from local files.


In [4]:
# --- Setup ---
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")



Using device: cpu


In [5]:
# Root directory of the competition data; image paths in train.csv are joined
# onto this directory later on.
BASE_PATH = '/kaggle/input/csiro-biomass'

# Raw training table, read as-is from train.csv.
train_csv_path = os.path.join(BASE_PATH, 'train.csv')
train_meta = pd.read_csv(train_csv_path)

In [6]:
# 1. Pivot the target variables
# Reshape from long to wide: one row per image_path, one column per
# target_name, cells holding the corresponding 'target' value.
# (Equivalent to DataFrame.pivot(index='image_path', columns='target_name',
# values='target'); raises if an (image_path, target_name) pair is duplicated.)
long_targets = train_meta.set_index(['image_path', 'target_name'])['target']
targets_df = long_targets.unstack('target_name')

In [7]:
# 2. Get the unique metadata for each image
# The metadata columns are repeated on every row belonging to the same image,
# so keep only the first row per image_path and index the result by it.
meta_columns = ['image_path', 'Sampling_Date', 'State', 'Species',
                'Pre_GSHH_NDVI', 'Height_Ave_cm']
meta_df = (
    train_meta.loc[:, meta_columns]
    .drop_duplicates(subset='image_path')
    .set_index('image_path')
)





In [8]:
# 3. Join them together
# Index-aligned left join on image_path: metadata columns first, pivoted
# target columns after them. image_path is then restored as a regular column.
joined_df = meta_df.join(targets_df, how='left')
train_df = joined_df.reset_index()

In [9]:
# 4. Define our 5 output targets
# These are the five pivoted target columns, in the order they appear at the
# end of train_df. They are named explicitly rather than taken positionally
# (columns[-5:]) so that adding a metadata column cannot silently change them.
TARGET_COLS = ['Dry_Clover_g', 'Dry_Dead_g', 'Dry_Green_g', 'Dry_Total_g', 'GDM_g']
missing_targets = [col for col in TARGET_COLS if col not in train_df.columns]
if missing_targets:
    raise KeyError(f"Target columns missing from train_df: {missing_targets}")

# Per-target weights, keyed by column name so that the weight vector is always
# built in TARGET_COLS order. The previous positional list was written in the
# order [Total, Green, Dead, Clover, Grass], which does not match TARGET_COLS
# and therefore attached the weights to the wrong targets.
# NOTE(review): the original comment named a fifth target "Dry_Grass_g", which
# is not a column of train_df; it is assumed here to mean GDM_g. Confirm the
# weights against the competition's metric definition -- in particular whether
# the 0.2 weight belongs to Dry_Green_g or to GDM_g.
TARGET_WEIGHTS = {
    'Dry_Total_g': 0.5,
    'Dry_Green_g': 0.2,
    'Dry_Dead_g': 0.1,
    'Dry_Clover_g': 0.1,
    'GDM_g': 0.1,
}
COMP_WEIGHTS = torch.tensor([TARGET_WEIGHTS[col] for col in TARGET_COLS]).to(device)

# Now train_df is ready!
print(train_df.head())

               image_path Sampling_Date State            Species  \
0  train/ID1011485656.jpg      2015/9/4   Tas    Ryegrass_Clover   
1  train/ID1012260530.jpg      2015/4/1   NSW            Lucerne   
2  train/ID1025234388.jpg      2015/9/1    WA  SubcloverDalkeith   
3  train/ID1028611175.jpg     2015/5/18   Tas           Ryegrass   
4  train/ID1035947949.jpg     2015/9/11   Tas           Ryegrass   

   Pre_GSHH_NDVI  Height_Ave_cm  Dry_Clover_g  Dry_Dead_g  Dry_Green_g  \
0           0.62         4.6667        0.0000     31.9984      16.2751   
1           0.55        16.0000        0.0000      0.0000       7.6000   
2           0.38         1.0000        6.0500      0.0000       0.0000   
3           0.66         5.0000        0.0000     30.9703      24.2376   
4           0.54         3.5000        0.4343     23.2239      10.5261   

   Dry_Total_g    GDM_g  
0      48.2735  16.2750  
1       7.6000   7.6000  
2       6.0500   6.0500  
3      55.2079  24.2376  
4      34.1844  

In [10]:
# Sequential 80/20 hold-out: the first 80% of rows (in train_df order, no
# shuffling) are used for training and the remainder for validation.
split_at = round(len(train_df) * 0.8)
train_df_sample = train_df.iloc[:split_at]
val_df_sample = train_df.iloc[split_at:]

In [11]:
# Sanity check: (rows, columns) of the validation split.
val_df_sample.shape

(71, 11)

In [12]:
# Augmentation pipelines built with albumentations.

import albumentations as A
from albumentations.pytorch import ToTensorV2
import cv2 # Albumentations uses OpenCV

# --- 1. Define your transforms ---
# These are applied ONLY to the training data
train_transforms = A.Compose([
    A.Resize(256, 256),            # Resize all images to 256x256
    A.HorizontalFlip(p=0.5),     # 50% chance to flip horizontally
    A.VerticalFlip(p=0.5),       # 50% chance to flip vertically
    A.RandomRotate90(p=0.5),     # 50% chance to rotate 90 degrees
    
    # Color/Distortion
    A.GaussNoise(p=0.2),
    # The legacy max_holes / max_height / max_width arguments make the
    # installed albumentations emit a warning on this line. The range-based
    # arguments below request 8 holes of 32x32 px, which is what the legacy
    # arguments meant when the matching min_* values were left unset.
    # NOTE(review): confirm against the installed albumentations version.
    A.CoarseDropout(num_holes_range=(8, 8),
                    hole_height_range=(32, 32),
                    hole_width_range=(32, 32),
                    p=0.5),
    
    # Normalize (use mean/std of ImageNet)
    A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    
    # Convert to PyTorch Tensor
    ToTensorV2()
])

# These are for validation. We ONLY resize and normalize. No flipping/distortion.
val_transforms = A.Compose([
    A.Resize(256, 256),
    A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ToTensorV2()
])


# --- 2. Update the Custom Dataset Class ---
class BiomassDataset(Dataset):
    """Dataset yielding ``(image, labels, features)`` for each row of ``df``.

    For every item the image is read from disk, passed through ``processor``
    and ``model`` to obtain the model's ``pooler_output``, and optionally run
    through an albumentations pipeline.

    Args:
        df: Table with an ``image_path`` column and the columns in
            ``target_cols``.
        target_cols: Names of the label columns; they are stored as float32.
        base_path: Directory that each ``image_path`` entry is joined onto.
        processor: Callable invoked as ``processor(image, return_tensors='pt')``
            whose result is unpacked into ``model(**inputs)``.
        model: Model whose output exposes ``pooler_output``.
        transforms: Optional albumentations-style callable invoked as
            ``transforms(image=image)`` and returning a dict with an
            ``'image'`` entry. When falsy, the RGB array is returned as read.
    """

    def __init__(self, df, target_cols, base_path, processor,model,transforms=None,):
        self.df = df
        self.image_paths = [os.path.join(base_path, p) for p in df['image_path']]
        self.labels = df[target_cols].values.astype(np.float32)
        self.transforms = transforms

        self.model = model
        self.processor = processor

    def __len__(self):
        """Return the number of images in the dataset."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Load one sample.

        Returns:
            A tuple ``(image, labels, features)``: the (optionally transformed)
            RGB image, a tensor of the float32 labels, and a numpy array
            holding ``pooler_output`` with one extra leading axis (the same
            layout the previous list-wrapped ``np.array`` call produced).

        Raises:
            FileNotFoundError: if OpenCV cannot read the image file.
        """
        image_path = self.image_paths[idx]

        # cv2.imread signals failure by returning None instead of raising;
        # fail here with the offending path rather than inside cvtColor.
        image = cv2.imread(image_path)
        if image is None:
            raise FileNotFoundError(f"Could not read image: {image_path}")
        # OpenCV loads images as BGR; convert to RGB.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # Features are computed from the untransformed RGB image.
        inputs = self.processor(image, return_tensors='pt')
        with torch.no_grad():
            model_out = self.model(**inputs)
        # Prepend one axis so the returned shape matches the earlier
        # np.array([pooler_output]) layout that downstream cells flatten.
        features = model_out.pooler_output.detach().cpu().numpy()[np.newaxis]

        # Apply transforms (if they exist); albumentations returns a dict.
        if self.transforms:
            image = self.transforms(image=image)['image']

        # Labels are stored as a float32 numpy array; convert the row to a tensor.
        labels = torch.tensor(self.labels[idx])

        return image, labels, features


  A.CoarseDropout(max_holes=8, max_height=32, max_width=32, p=0.5),


In [13]:

# --- 3. Create Datasets ---
# Both splits share every constructor argument except the dataframe.
# Note that transforms=None is passed, so no augmentation pipeline is attached.
shared_dataset_args = dict(
    target_cols=TARGET_COLS,
    base_path=BASE_PATH,
    transforms=None,
    processor=processor,
    model=model,
)
train_dataset = BiomassDataset(df=train_df_sample, **shared_dataset_args)
val_dataset = BiomassDataset(df=val_df_sample, **shared_dataset_args)

# Training batches are shuffled; validation batches keep dataframe order.
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True, num_workers=2)
val_loader = DataLoader(dataset=val_dataset, batch_size=64, shuffle=False, num_workers=2)



In [14]:
# Smoke test: pull one batch from the training loader and report the shape of
# each of its three components.
first_batch = next(iter(train_loader))
images, labels, encodings = first_batch

print(f"Image batch shape: {images.shape}")
print(f"Labels batch shape: {labels.shape}")
print(f"Feature batch shape: {encodings.shape}")
# Shapes observed on a previous run (no transforms attached, so the images are
# not resized and stay channel-last):
#   Image batch shape: torch.Size([32, 1000, 2000, 3])
#   Labels batch shape: torch.Size([32, 5])
#   Feature batch shape: torch.Size([32, 1, 1, 384])

Image batch shape: torch.Size([32, 1000, 2000, 3])
Labels batch shape: torch.Size([32, 5])
Feature batch shape: torch.Size([32, 1, 1, 384])


In [15]:
import numpy as np
import torch
from tqdm import tqdm

# Per-batch accumulators for the training split.
all_features = []
all_labels = []

print("Extracting features from existing pipeline...")

for images, labels, encodings in tqdm(train_loader):
    # Collapse every axis after the batch axis into a single feature axis.
    rows_in_batch = encodings.shape[0]
    all_features.append(encodings.reshape(rows_in_batch, -1).cpu().numpy())
    all_labels.append(labels.cpu().numpy())

# Stack the per-batch arrays row-wise into the full training matrices.
X_train = np.concatenate(all_features, axis=0)
y_train = np.concatenate(all_labels, axis=0)


Extracting features from existing pipeline...


100%|██████████| 9/9 [00:55<00:00,  6.16s/it]


In [16]:
# Per-batch accumulators for the validation split.
all_features_val = []
all_labels_val = []
for images, labels, encodings in tqdm(val_loader):
    # Keep the batch axis and flatten everything after it.
    flat_encodings = torch.flatten(encodings, start_dim=1)
    all_features_val.append(flat_encodings.cpu().numpy())
    all_labels_val.append(labels.cpu().numpy())

# Stack the per-batch arrays row-wise into the full validation matrices.
X_val = np.concatenate(all_features_val, axis=0)
y_val = np.concatenate(all_labels_val, axis=0)


100%|██████████| 2/2 [00:23<00:00, 11.68s/it]


In [17]:

# Report the validation matrices: one row per image in both; X holds the
# flattened encodings and y the target columns.
print(
    f"Final Feature Matrix X: {X_val.shape}",
    f"Final Label Matrix y:   {y_val.shape}",
    sep="\n",
)

Final Feature Matrix X: (71, 384)
Final Label Matrix y:   (71, 5)


In [18]:

# Report the training matrices: one row per image in both; X holds the
# flattened encodings and y the target columns.
train_feature_shape = X_train.shape
train_label_shape = y_train.shape
print(f"Final Feature Matrix X: {train_feature_shape}")
print(f"Final Label Matrix y:   {train_label_shape}")

Final Feature Matrix X: (286, 384)
Final Label Matrix y:   (286, 5)


In [19]:
import xgboost as xgb
import lightgbm as lgb
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error


# --- MODEL 1: XGBoost ---
print("Training XGBoost...")
# tree_method='hist' selects the histogram-based tree builder.
xgb_params = dict(n_estimators=1000, learning_rate=0.05, max_depth=6,
                  tree_method='hist', n_jobs=-1)
xgb_est = xgb.XGBRegressor(**xgb_params)
# MultiOutputRegressor fits a separate copy of the estimator for each target.
model_xgb = MultiOutputRegressor(xgb_est)
model_xgb.fit(X_train, y_train)

# --- MODEL 2: LightGBM ---
print("Training LightGBM...")
lgb_params = dict(n_estimators=1000, learning_rate=0.05, num_leaves=31,
                  n_jobs=-1, verbose=-1)
lgb_est = lgb.LGBMRegressor(**lgb_params)
model_lgb = MultiOutputRegressor(lgb_est)
model_lgb.fit(X_train, y_train)

# --- PREDICT & ENSEMBLE ---
print("Calculating Ensemble Predictions...")
preds_xgb = model_xgb.predict(X_val)
preds_lgb = model_lgb.predict(X_val)

# Equal-weight blend of the two models, then floor at zero because biomass
# cannot be negative.
blended_preds = 0.5 * (preds_xgb + preds_lgb)
ensemble_preds = np.clip(blended_preds, 0, None)

# Score: mean absolute error on the validation split.
val_mae = mean_absolute_error(y_val, ensemble_preds)
print(f"Ensemble Validation MAE: {val_mae:.4f}")

Training XGBoost...
Training LightGBM...
Calculating Ensemble Predictions...
Ensemble Validation MAE: 8.6996


In [20]:
import joblib
import os

# Make sure the output folder exists before writing into it.
os.makedirs("models", exist_ok=True)

print("Saving models...")

# Persist both fitted multi-output wrappers, XGBoost first and LightGBM second.
artifacts = (
    ("models/xgboost_ensemble.pkl", model_xgb),
    ("models/lightgbm_ensemble.pkl", model_lgb),
)
for artifact_path, fitted_model in artifacts:
    joblib.dump(fitted_model, artifact_path)

print("Models saved successfully in the 'models' folder!")

Saving models...
Models saved successfully in the 'models' folder!
