In [None]:
# !pip install transformers 
# !pip install protobuf==3.20.3

In [5]:
import sys


# Add the path to the wheel file
# Note: Update 'protobuf-downloader' to match the name of your notebook from Step 2
!pip install /kaggle/input/my-proto-wheel/protobuf-3.20.3-py2.py3-none-any.whl

Processing /kaggle/input/my-proto-wheel/protobuf-3.20.3-py2.py3-none-any.whl
protobuf is already installed with the same version as the provided wheel. Use --force-reinstall to force an installation of the wheel.


In [6]:
import pandas as pd
import os
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoImageProcessor, AutoModel

import torch.nn as nn
from tqdm import tqdm
import numpy as np

In [7]:
import warnings

warnings.filterwarnings('ignore')

# --- 1. Setup & Config ---
BASE_PATH = '/kaggle/input/csiro-biomass'
TEST_IMAGE_PATH = os.path.join(BASE_PATH)
TEST_META_PATH = os.path.join(BASE_PATH, 'test.csv')
 # Path to your saved model

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")



import cv2 
import pandas as pd
import numpy as np
import torch
import os
import joblib # For loading XGB/LGBM
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

MODEL_PATH = '/kaggle/input/dinov2/pytorch/base/1' 
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- LOAD DINO MODEL ---
print("Loading DINOv2...")
processor = AutoImageProcessor.from_pretrained(MODEL_PATH)
dino_model = AutoModel.from_pretrained(MODEL_PATH)
dino_model.to(device)
dino_model.eval()

# --- LOAD ML MODELS ---
print("Loading Ensemble Models...")
# Make sure these paths match where you saved them
xgb_model = joblib.load("/kaggle/input/baseline-dino-v3/models/lightgbm_ensemble.pkl")
lgb_model = joblib.load("/kaggle/input/baseline-dino-v3/models/xgboost_ensemble.pkl")

Loading DINOv2...
Loading Ensemble Models...


In [8]:
# --- !! IMPORTANT: Use the CORRECT target columns !! ---
# These names are used to create the final submission file
TARGET_COLS =['Dry_Clover_g', 'Dry_Dead_g', 'Dry_Green_g', 'Dry_Total_g', 'GDM_g']

IMG_SIZE = 256 # Must be the same size you trained with

In [9]:
class BiomassTestDataset(Dataset):
    def __init__(self, df, base_path, processor, model):
        self.df = df
        self.image_paths = [os.path.join(base_path, p) for p in df['image_path']]
        self.processor = processor
        self.model = model
        self.device = model.device

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image_path = self.image_paths[idx]
        
        # 1. Load Image
        image = cv2.imread(image_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        
        # 2. Process Image for DINO
        inputs = self.processor(images=image, return_tensors="pt").to(self.device)
        
        # 3. Extract Features
        with torch.no_grad():
            outputs = self.model(**inputs)
            # Get the pooler_output and move to CPU immediately
            # Output shape from DINO is [1, 384]
            features = outputs.pooler_output.cpu()
            
        # Return the flattened feature vector [384,]
        return features.squeeze(0)

In [10]:
# --- Prepare DataLoader ---
test_df = pd.read_csv(TEST_META_PATH)

test_dataset = BiomassTestDataset(
    df=test_df,
    base_path=TEST_IMAGE_PATH,
    processor=processor,
    model=dino_model
)

test_loader = DataLoader(
    test_dataset,
    batch_size=32, 
    shuffle=False,
    num_workers=2
)

# --- Run Feature Extraction ---
all_features = []

print("Extracting features from Test set...")
with torch.no_grad():
    for feature_batch in tqdm(test_loader, desc="Extracting"):
        # feature_batch shape is [32, 384]
        all_features.append(feature_batch.numpy())

# Create the Final Feature Matrix
X_test = np.vstack(all_features)
print(f"Test Feature Shape: {X_test.shape}") # Should be (N_test_images, 384)

Extracting features from Test set...


Extracting: 100%|██████████| 1/1 [00:05<00:00,  5.18s/it]

Test Feature Shape: (5, 768)





In [None]:
# print("Predicting with Ensemble...")

# # 1. Predict with XGBoost
# preds_xgb = xgb_model.predict(X_test)

# # 2. Predict with LightGBM
# preds_lgb = lgb_model.predict(X_test)

# # 3. Weighted Average (50/50)
# ensemble_preds = (0.5 * preds_xgb) + (0.5 * preds_lgb)

# # 4. Clip negative values (Biomass cannot be < 0)
# ensemble_preds = np.maximum(ensemble_preds, 0)

# print(f"Final Predictions Shape: {ensemble_preds.shape}")

In [13]:
N_TARGETS=len(TARGET_COLS)

In [15]:
META_LEARNER_COEFFS = np.array([
    [1.267, -0.217],  # Target 1
    [0.686, 0.262],   # Target 2
    [0.985, 0.211],   # Target 3
    [1.137, 0.003],   # Target 4
    [0.822, 0.372]    # Target 5
])

In [16]:
# --- PREDICTION STEP: APPLYING OPTIMAL WEIGHTS ---

print("\nPredicting with Optimal Ensemble Blend...")

# 1. Predict with XGBoost
preds_xgb = xgb_model.predict(X_test)

# 2. Predict with LightGBM
preds_lgb = lgb_model.predict(X_test)

# 3. Optimal Weighted Average (Target-Specific Stacking)
ensemble_preds_optimal = np.zeros_like(preds_xgb)

# Apply the learned optimal weights (coefficients) column by column
for i in range(N_TARGETS):
    xgb_weight = META_LEARNER_COEFFS[i, 0]
    lgbm_weight = META_LEARNER_COEFFS[i, 1]
    
    # Blend the predictions for the current target (column i)
    ensemble_preds_optimal[:, i] = (
        xgb_weight * preds_xgb[:, i] + 
        lgbm_weight * preds_lgb[:, i]
    )

# 4. Clip negative values (Biomass cannot be < 0)
ensemble_preds = np.maximum(ensemble_preds_optimal, 0)

print(f"Final Predictions Shape: {ensemble_preds.shape}")
# --- Create Submission File ---
print("\nCreating submission file...")


Predicting with Optimal Ensemble Blend...
Final Predictions Shape: (5, 5)

Creating submission file...


In [17]:

# --- Create Submission File ---
print("Creating submission file...")

# 1. Create a "wide" DataFrame with predictions
pred_df = pd.DataFrame(ensemble_preds, columns=TARGET_COLS)

# 2. Add the 'image_path' from the original test_df
submission_df = test_df[['image_path']].copy()
submission_df = pd.concat([submission_df, pred_df], axis=1)

# 3. "Melt" the DataFrame from "wide" to "long" format
submission_df_long = submission_df.melt(
    id_vars=['image_path'],
    value_vars=TARGET_COLS,
    var_name='target_name',
    value_name='target'
)

# 4. Create the 'sample_id'
# Clean up the filename (remove path and extension)
submission_df_long['image_name_base'] = submission_df_long['image_path'].apply(
    lambda p: os.path.splitext(os.path.basename(p))[0]
)

# Create ID: image_base + __ + target_name
submission_df_long['sample_id'] = submission_df_long['image_name_base'] + '__' + submission_df_long['target_name']

# 5. Filter duplicates and Select Columns
final_submission = submission_df_long[['sample_id', 'target']]
final_submission = final_submission.drop_duplicates(subset=['sample_id'])

# 6. Save
final_submission.to_csv('submission.csv', index=False)

print("submission.csv created successfully!")
print(final_submission.head())

Creating submission file...
submission.csv created successfully!
                     sample_id     target
0   ID1001187975__Dry_Clover_g   2.219094
5     ID1001187975__Dry_Dead_g  22.625428
10   ID1001187975__Dry_Green_g  31.627779
15   ID1001187975__Dry_Total_g  65.787762
20         ID1001187975__GDM_g  33.710126
