In [None]:
import os
import random
import numpy as np
import pandas as pd

from dataclasses import dataclass
from typing import List, Tuple, Union

import seaborn as sns
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import cv2

from torch.utils.data import DataLoader, Dataset, random_split
from torchvision import transforms
from sklearn.model_selection import StratifiedGroupKFold

In [None]:
def system_config(SEED_VALUE=42, package_list=None):
    """Seed all RNGs, select the compute device and enable deterministic settings.

    Parameters
    ----------
    SEED_VALUE : int
        Seed applied to ``random``, NumPy and torch (and to CUDA / MPS when
        that backend is selected).
    package_list : str or iterable of str, optional
        Extra packages to ``pip install`` when the notebook runs inside a
        Kaggle kernel. A string is split like a shell command line
        (``"pkg_a pkg_b==1.2"``). ``None`` or an empty value installs nothing.

    Returns
    -------
    tuple of (str, bool)
        The device name (``"cuda"``, ``"mps"`` or ``"cpu"``) and whether a GPU
        backend was selected.
    """
    # Local imports: only needed for the optional package installation.
    import shlex
    import subprocess
    import sys

    random.seed(SEED_VALUE)
    np.random.seed(SEED_VALUE)
    torch.manual_seed(SEED_VALUE)

    def is_running_in_kaggle():
        return "KAGGLE_KERNEL_RUN_TYPE" in os.environ

    def install_packages():
        """pip-install `package_list` into the running interpreter, if any."""
        if not package_list:
            # Nothing requested: do not run `pip install None`.
            return
        if isinstance(package_list, str):
            packages = shlex.split(package_list)
        else:
            packages = [str(pkg) for pkg in package_list]

        print("Installing required packages...")
        # `sys.executable -m pip` targets the interpreter the kernel is running.
        result = subprocess.run(
            [sys.executable, "-m", "pip", "install", *packages],
            check=False,
        )
        if result.returncode != 0:
            # Best effort, like the shell escape it replaces: report, do not raise.
            print(f"Warning: pip exited with status {result.returncode}.")

    # GPU check
    if torch.cuda.is_available():
        print("Using CUDA GPU")

        if is_running_in_kaggle():
            install_packages()

        DEVICE = torch.device("cuda")
        print("Device: ", DEVICE)
        GPU_AVAILABLE = True

        torch.cuda.manual_seed_all(SEED_VALUE)
        torch.cuda.manual_seed(SEED_VALUE)

        torch.backends.cudnn.enabled = True        # Provides highly optimized primitives for DL operations.
        torch.backends.cudnn.deterministic = True  # Ensures determinism even when cuDNN is enabled.
        torch.backends.cudnn.benchmark = False     # Setting to True can cause non-deterministic behavior.

    elif torch.backends.mps.is_available() and torch.backends.mps.is_built():
        print('Using Apple Silicon GPU')

        # Set the device to the Apple Silicon GPU Metal Performance Shader (MPS).
        DEVICE = torch.device("mps")
        print("Device: ", DEVICE)
        # Environment variable that allows PyTorch to fall back to CPU execution
        # when encountering operations that are not currently supported by MPS.
        os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'
        GPU_AVAILABLE = True

        torch.mps.manual_seed(SEED_VALUE)
        torch.use_deterministic_algorithms(True)

    else:
        print('Using CPU')
        DEVICE = torch.device('cpu')
        print("Device: ", DEVICE)
        GPU_AVAILABLE = False

        if is_running_in_kaggle():
            install_packages()
            print('Note: Change runtime type to GPU for better performance.')

        torch.use_deterministic_algorithms(True)

    return str(DEVICE), GPU_AVAILABLE

DEVICE, GPU_AVAILABLE = system_config()

In [None]:
# Notebook-wide configuration constants.
RANDOM_SEED = 42
N_SPLITS = 5  # number of cross-validation folds
DATA_DIR = os.path.join("..", "data")

# Weight of each target in the weighted R^2 metric. The insertion order here
# also defines the order of TARGET_NAMES below.
LABEL_WEIGHTS = dict(
    Dry_Green_g=0.1,
    Dry_Dead_g=0.1,
    Dry_Clover_g=0.1,
    GDM_g=0.2,
    Dry_Total_g=0.5,
)
TARGET_NAMES = [*LABEL_WEIGHTS]

In [None]:
# Competition metric: globally weighted R^2 (a score where higher is better).
def globally_weighted_r_squared(y_true: torch.Tensor, y_pred: torch.Tensor, target_class: list[str], eps: float = 1e-12, label_weights=None) -> torch.Tensor:
    """Compute a weighted R^2 over all flattened (sample, target) values.

    Parameters
    ----------
    y_true, y_pred : torch.Tensor
        Ground truth and predictions. Any shape is accepted as long as both
        hold the same number of elements; they are flattened before use.
    target_class : list[str]
        Target name of every flattened element, in the same order as the
        flattened tensors. Each name selects that element's weight.
    eps : float
        Added to the total sum of squares to avoid division by zero.
    label_weights : mapping of str to float, optional
        Weight per target name. Defaults to the module-level ``LABEL_WEIGHTS``.

    Returns
    -------
    torch.Tensor
        Scalar tensor ``1 - ss_res / ss_tot``.

    Raises
    ------
    ValueError
        If the flattened shapes differ or ``target_class`` does not have one
        entry per element.
    """
    if label_weights is None:
        label_weights = LABEL_WEIGHTS

    # reshape (unlike view) also accepts non-contiguous tensors.
    y_true = y_true.reshape(-1)
    y_pred = y_pred.reshape(-1)

    if y_true.shape != y_pred.shape:
        raise ValueError(f"Shapes must match. Got {y_true.shape} vs {y_pred.shape}.")
    if len(target_class) != y_true.numel():
        raise ValueError(
            f"target_class must have one entry per element. "
            f"Got {len(target_class)} names for {y_true.numel()} values."
        )

    # Work in floating point: building the weights with an integer dtype
    # would truncate every fractional weight to 0 and produce NaN.
    dtype = torch.promote_types(y_true.dtype, y_pred.dtype)
    if not dtype.is_floating_point:
        dtype = torch.get_default_dtype()
    y_true = y_true.to(dtype)
    y_pred = y_pred.to(dtype)

    weights = torch.tensor([label_weights[cls] for cls in target_class], device=y_true.device, dtype=dtype)

    y_bar = torch.sum(weights * y_true) / weights.sum()
    ss_res = torch.sum(weights * (y_true - y_pred) ** 2)
    ss_tot = torch.sum(weights * (y_true - y_bar) ** 2) + eps

    r_squared = 1 - (ss_res / ss_tot)
    return r_squared

# Sanity check with float tensors: integer tensors would give the weight
# vector an integer dtype and truncate every weight to 0.
# All-zero predictions should score far below 0 (about -8.05 here).
globally_weighted_r_squared(
    torch.tensor([1.0, 2.0, 3.0, 4.0, 5.0]),
    torch.tensor([0.0, 0.0, 0.0, 0.0, 0.0]),
    ["Dry_Green_g", "Dry_Dead_g", "Dry_Clover_g", "GDM_g", "Dry_Total_g"],
)

In [None]:
# Assign every image id to one cross-validation fold, stratified on binned
# NDVI and height.
# TODO: implement robust validation set
train_df = pd.read_csv(os.path.join(DATA_DIR, "raw", "train.csv"))
# Image id = image path without the "train/" prefix and the ".jpg" suffix.
train_df["id"] = train_df["image_path"].apply(lambda row: row.replace("train/", "").replace(".jpg", ""))

# One row per id, keeping the two covariates used for stratification.
train_df_ids = train_df.groupby("id", as_index=False).agg(
    Pre_GSHH_NDVI = ("Pre_GSHH_NDVI", "first"),
    Height_Ave_cm = ("Height_Ave_cm", "first"),
)

# Tercile bins of each covariate; their combination is the stratification label.
train_df_ids["ndvi_bin"] = pd.qcut(train_df_ids["Pre_GSHH_NDVI"], q=3, duplicates="drop")
train_df_ids["h_bin"] = pd.qcut(train_df_ids["Height_Ave_cm"], q=3, duplicates="drop")

train_df_ids["stratify_group"] = train_df_ids["ndvi_bin"].astype(str) + "_" + train_df_ids["h_bin"].astype(str)

sgkf = StratifiedGroupKFold(n_splits=N_SPLITS, shuffle=True, random_state= RANDOM_SEED)

train_df_ids["fold"] = -1  # sentinel meaning "not assigned yet"

X = train_df_ids[["Pre_GSHH_NDVI", "Height_Ave_cm"]]
y = train_df_ids["stratify_group"]
groups = train_df_ids["id"]

fold_col = train_df_ids.columns.get_loc("fold")
for fold, (_, val_idx) in enumerate(sgkf.split(X, y, groups=groups)):
    # split() yields positional indices, so write by position, not by label.
    train_df_ids.iloc[val_idx, fold_col] = fold

# The sentinel is -1, never NaN, so an isna() check alone cannot detect
# ids that were left without a fold.
assert train_df_ids["fold"].ge(0).all(), "some ids were not assigned to a fold"

train_df = train_df.merge(train_df_ids[["id", "fold"]], on="id", how="left")
assert train_df["fold"].isna().sum() == 0
assert train_df["fold"].between(0, N_SPLITS - 1).all()

# Per-fold summary of the stratification covariates, to eyeball the balance.
check = train_df_ids.groupby("fold")[["Pre_GSHH_NDVI", "Height_Ave_cm"]].describe()
check

In [None]:
# Preview the training table, which now carries the derived `id` and `fold` columns.
train_df.head()

In [None]:
FOLD = 0  # index of the fold held out for validation

# Rows of the held-out fold form the validation split; all others are training rows.
_in_val_fold = train_df["fold"] == FOLD
val_df_fold = train_df[_in_val_fold].reset_index(drop=True)
train_df_fold = train_df[~_in_val_fold].reset_index(drop=True)

# No image id may appear on both sides of the split.
assert not (set(train_df_fold["id"]) & set(val_df_fold["id"]))

def build_targets(df, target_names=TARGET_NAMES):
    """Pivot target rows into one row per id with one column per target.

    `df` needs the columns "id", "target_name" and "target". The result has
    an "id" column followed by the `target_names` columns, in that order.
    """
    wide = df.pivot(index="id", columns="target_name", values="target")
    ordered = wide.loc[:, target_names]
    return ordered.reset_index()

# Wide target tables for the training and validation splits.
train_targets, val_targets = (
    build_targets(split_df, TARGET_NAMES)
    for split_df in (train_df_fold, val_df_fold)
)

val_targets.head()

In [None]:
class BiomassDataset(Dataset):
    """Dataset yielding ``(green-channel image, target vector)`` pairs.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per image. Must contain the target columns plus either an
        "image_path" column or an "id" column from which the path is rebuilt.
    image_root : str
        Directory that the relative image paths are joined onto.
    transform : callable, optional
        Applied to the 2-D green-channel array before it is returned.
    target_names : list of str, optional
        Target columns to read, in output order. Defaults to the module-level
        ``TARGET_NAMES``.
    """

    def __init__(self, df, image_root, transform=None, target_names=None):
        self.df = df
        self.image_root = image_root
        self.transform = transform
        # Resolved here so the module-level default is only needed when no
        # explicit list is given.
        self.target_names = list(TARGET_NAMES if target_names is None else target_names)

    def __len__(self):
        return len(self.df)

    def _relative_image_path(self, row):
        """Return the image path relative to `image_root` for one row."""
        if "image_path" in row.index:
            return row["image_path"]
        # Pivoted target tables keep only "id"; rebuild the path by undoing the
        # id derivation (strip "train/" and ".jpg").
        # NOTE(review): assumes every image path is "train/<id>.jpg" - confirm.
        return f"train/{row['id']}.jpg"

    def __getitem__(self, index):
        row = self.df.iloc[index]
        img_path = os.path.join(self.image_root, self._relative_image_path(row))
        image = cv2.imread(img_path)
        if image is None:
            # cv2.imread signals an unreadable or missing file by returning None.
            raise FileNotFoundError(f"Could not read image: {img_path}")
        image = image[:, :, 1]  # Take only the green channel

        if self.transform:
            image = self.transform(image)

        # A row mixing strings and floats has object dtype, which torch.tensor
        # rejects; convert the selected targets to float32 explicitly.
        y = torch.tensor(
            row[self.target_names].to_numpy(dtype=np.float32),
            dtype=torch.float32
        )
        return image, y

In [None]:
# ToTensor comes first: the dataset hands the transform a NumPy array, while
# the random flip/rotation transforms only accept PIL images or tensors.
train_transforms = transforms.Compose([
    transforms.ToTensor(),
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    transforms.RandomRotation(degrees=15),
])

# Validation uses no augmentation, only the conversion to a tensor.
val_transforms = transforms.Compose([
    transforms.ToTensor()
])

In [None]:
BATCH_SIZE = 16
NUM_WORKERS = 4
IMAGE_ROOT = os.path.join(DATA_DIR, "resized")

train_ds = BiomassDataset(
    train_targets,
    image_root=IMAGE_ROOT,
    transform=train_transforms
)

val_ds = BiomassDataset(
    val_targets,
    image_root=IMAGE_ROOT,
    transform=val_transforms
)
# Backward-compatible alias: this Dataset used to be stored under `val_df`.
val_df = val_ds

# The loaders must wrap the Dataset objects, not the raw `train_df` DataFrame.
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS)
# Validation does not need shuffling; a fixed order keeps runs comparable.
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS)


In [None]:
class Regressor(nn.Module):
    """Fully connected regressor applied to flattened inputs.

    Parameters
    ----------
    in_features : int
        Number of values per sample after flattening. Defaults to 64 * 128.
    n_outputs : int, optional
        Number of regression outputs. Defaults to ``len(TARGET_NAMES)``.
    """

    def __init__(self, in_features=64*128, n_outputs=None):
        super().__init__()
        self.in_features = in_features
        self.n_outputs = len(TARGET_NAMES) if n_outputs is None else n_outputs

        # Layer order is unchanged, so state_dict keys stay the same.
        self.model = nn.Sequential(
            nn.Linear(in_features=in_features, out_features=256),
            nn.ReLU(),
            nn.Linear(in_features=256, out_features=128),
            nn.ReLU(),
            nn.Linear(in_features=128, out_features=64),
            nn.ReLU(),
            nn.Linear(in_features=64, out_features=self.n_outputs)
        )

    def forward(self, x):
        """Flatten every sample to a vector and return ``(batch, n_outputs)``."""
        # flatten (unlike view) also handles non-contiguous inputs.
        x = torch.flatten(x, start_dim=1)
        return self.model(x)

def train(DEVICE: torch.device,
          model: nn.Module,
          optimizer: optim.Optimizer,
          train_loader: DataLoader,
          epoch_index: int
           ) -> Tuple[float, float]:
    """Run one training epoch with MSE loss.

    Parameters
    ----------
    DEVICE : torch.device
        Device the model and every batch are moved to.
    model : nn.Module
        Model to optimise; called as ``model(images)``.
    optimizer : optim.Optimizer
        Optimizer holding the model's parameters.
    train_loader : DataLoader
        Yields ``(images, targets)`` batches. Targets are expected to have one
        column per entry of ``TARGET_NAMES``.
    epoch_index : int
        Epoch number, used only in the log line.

    Returns
    -------
    tuple of (float, float)
        Sample-weighted mean MSE loss and the globally weighted R^2 of the
        epoch.

    Raises
    ------
    ValueError
        If the loader yields no samples.
    """
    model.train()     # Set model to training mode
    model.to(DEVICE)  # Move model to device

    criterion = nn.MSELoss()  # built once instead of on every step
    running_loss = 0.0
    n_samples = 0  # must start at 0: it is accumulated with += below

    all_y_true = []
    all_y_pred = []

    for images, targets in train_loader:
        images = images.to(DEVICE)
        targets = targets.to(DEVICE)

        optimizer.zero_grad()

        # Forward
        outputs = model(images)
        loss = criterion(outputs, targets)

        # Backprop
        loss.backward()
        optimizer.step()

        batch_size = images.size(0)
        # The loss is a batch mean; weight it by the batch size so the epoch
        # average is correct when the last batch is smaller.
        running_loss += loss.item() * batch_size
        n_samples += batch_size

        all_y_true.append(targets.detach().cpu().reshape(-1))
        all_y_pred.append(outputs.detach().cpu().reshape(-1))

    if n_samples == 0:
        raise ValueError("train_loader yielded no samples; cannot compute epoch metrics.")

    epoch_loss = running_loss / n_samples
    y_true = torch.cat(all_y_true)
    y_pred = torch.cat(all_y_pred)
    # Flattening (batch, target) rows gives all targets of sample 0, then of
    # sample 1, and so on, so the names repeat once per sample.
    all_target_class = TARGET_NAMES * n_samples
    epoch_r2 = globally_weighted_r_squared(y_true, y_pred, all_target_class).item()
    print(f"Epoch {epoch_index}: Train loss={epoch_loss:.4f}, R^2={epoch_r2:.4f}")
    return epoch_loss, epoch_r2

def validate