# 🧠 Soil Classification - Training Notebook
Welcome to the training notebook for our soil classification challenge. In this notebook, we:
- Load labeled soil + synthetic data
- Train a classifier to detect "soil" vs "not soil"
- Track F1 score
- Save the best model

---


We'll split this into cells as they would appear in a notebook

## Cell 1: Imports & Setup

In [None]:
import os
import pandas as pd
import numpy as np
from PIL import Image
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import transforms, models
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

from src.preprocessing import SoilDataset, TestSoilDataset, get_transforms


## Cell 2: Paths & Data Loading

In [None]:
# Root of the competition data; every other path is derived from it.
BASE_DIR = './data/soil_competition-2025'
TRAIN_DIR, TEST_DIR, LABEL_FILE = (
    os.path.join(BASE_DIR, leaf)
    for leaf in ('train', 'test', 'train_labels.csv')
)

# Label table read from LABEL_FILE; the 'label' column drives the stratification below.
df = pd.read_csv(LABEL_FILE)

# 80/20 train/validation split, stratified on the label column and seeded
# so the split is reproducible across runs.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)


## Cell 3: Transforms and Datasets

In [None]:
# get_transforms() returns a (train, validation) pair of transform pipelines.
train_transform, val_transform = get_transforms()

# Both splits read their images from TRAIN_DIR; only the rows and the transform differ.
train_dataset = SoilDataset(train_df, TRAIN_DIR, train_transform)
val_dataset = SoilDataset(val_df, TRAIN_DIR, val_transform)

# Shuffle only the training batches; validation order stays fixed.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=32,
    num_workers=2,
    shuffle=True,
)
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=32,
    num_workers=2,
    shuffle=False,
)


✅ Using device: cuda (NVIDIA GeForce RTX 4050 Laptop GPU)




## CUDA setup and check (running locally on a laptop in VS Code)

In [None]:
# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# ImageNet-pretrained ResNet-18 backbone. The `pretrained=True` flag is
# deprecated in torchvision >= 0.13; IMAGENET1K_V1 is the checkpoint that flag
# used to load, so the starting weights are unchanged.
model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)

# Replace the 1000-class head with a single sigmoid unit so the network
# outputs one probability per image (shape: batch x 1).
model.fc = nn.Sequential(
    nn.Linear(model.fc.in_features, 1),
    nn.Sigmoid()
)
model = model.to(device)


PyTorch version: 2.5.1+cu121
CUDA available: True
GPU device name: NVIDIA GeForce RTX 4050 Laptop GPU


In [None]:
# NOTE(review): this cell repeats the device/model setup of the previous code
# cell. Running it again replaces `model` with a freshly initialised network,
# discarding any training done so far.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# `pretrained=True` is deprecated in torchvision >= 0.13; IMAGENET1K_V1 is the
# checkpoint that flag used to load, so the starting weights are unchanged.
model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)

# Single sigmoid output unit: one probability per image (shape: batch x 1).
model.fc = nn.Sequential(
    nn.Linear(model.fc.in_features, 1),
    nn.Sigmoid()
)
model = model.to(device)


CUDA Available: True
GPU Device: NVIDIA GeForce RTX 4050 Laptop GPU
Output shape: torch.Size([8, 1000])


In [None]:
# Adam over all model parameters (nothing is frozen here), learning rate 1e-4.
optimizer = optim.Adam(params=model.parameters(), lr=1e-4)

# Binary cross-entropy on probabilities; the model head already ends in a Sigmoid.
criterion = nn.BCELoss()


## Loss & Optimizer (defined above) and Baseline Training Loop with Early Stopping

In [None]:
# Baseline training loop with early stopping on validation F1.
# Uses the notebook-level `model`, `device`, `criterion`, `optimizer`,
# `train_loader` and `val_loader` defined in earlier cells.
best_f1 = 0
epochs = 10
patience = 2  # number of consecutive non-improving epochs tolerated
wait = 0      # consecutive epochs without an F1 improvement so far

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for imgs, labels in train_loader:
        imgs = imgs.to(device)
        # BCELoss needs float targets with the same (batch, 1) shape as the
        # model output; view(-1, 1) handles labels shaped (batch,) or (batch, 1).
        labels = labels.to(device).float().view(-1, 1)

        optimizer.zero_grad()
        outputs = model(imgs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Validation
    model.eval()
    preds, targets = [], []
    with torch.no_grad():
        for imgs, labels in val_loader:
            imgs = imgs.to(device)
            outputs = model(imgs)
            # The head emits a single sigmoid probability per image, so the
            # class is obtained by thresholding at 0.5. (argmax over dim=1 of
            # a one-column output is always 0, which pinned F1 at 0.)
            preds.extend((outputs.view(-1) > 0.5).long().cpu().numpy())
            # assumes labels are 0/1 class indicators — TODO confirm against SoilDataset
            targets.extend(labels.view(-1).long().cpu().numpy())

    f1 = f1_score(targets, preds)
    print(f"Epoch {epoch+1}: Loss={total_loss:.4f}, Val F1={f1:.4f}")

    # Save best model
    if f1 > best_f1:
        # NOTE(review): this path is relative to '..' while BASE_DIR uses './data';
        # confirm the intended checkpoint location.
        torch.save(model.state_dict(), "../data/model_best.pth")
        best_f1 = f1
        wait = 0
        print("✅ Best model saved!")
    else:
        wait += 1
        if wait >= patience:
            print("🛑 Early stopping.")
            break


## Training Function

In [None]:
def train_model(model, train_loader, val_loader, epochs=10):
    """Train `model` and pick the decision threshold that maximises validation F1.

    Each epoch runs one optimisation pass over `train_loader`, then scores
    `val_loader` and sweeps a grid of probability cut-offs. Whenever the best
    F1 of the sweep beats the best seen so far, the model weights are written
    to './model_best.pth' and the matching cut-off is remembered.

    Relies on the notebook-level `device`, `optimizer` and `criterion`; they
    are not passed in as arguments.

    Args:
        model: network whose output is treated as one probability per sample.
        train_loader: iterable of (images, labels) training batches.
        val_loader: iterable of (images, labels) validation batches.
        epochs: number of passes over `train_loader`.

    Returns:
        The cut-off belonging to the best validation F1 observed, or 0.5 if
        no epoch achieved an F1 above 0.
    """
    best_f1, best_threshold = 0, 0.5

    # The candidate cut-offs are the same for every epoch, so build them once.
    thresholds = np.arange(0.1, 0.9, 0.01)

    for epoch in range(epochs):
        # ---- optimisation pass ----
        model.train()
        train_loss = 0

        for batch_images, batch_labels in train_loader:
            inputs = batch_images.to(device)
            # Float targets with a trailing unit axis, as the loss is called
            # against the model output directly.
            expected = batch_labels.float().unsqueeze(1).to(device)

            optimizer.zero_grad()
            batch_loss = criterion(model(inputs), expected)
            batch_loss.backward()
            optimizer.step()

            train_loss += batch_loss.item()

        # ---- validation pass ----
        model.eval()
        val_probs, val_targets = [], []

        with torch.no_grad():
            for batch_images, batch_labels in val_loader:
                inputs = batch_images.to(device)
                truth = batch_labels.to(device)
                scores = model(inputs)
                val_probs.extend(scores.cpu().numpy().flatten())
                val_targets.extend(truth.cpu().numpy())

        val_probs = np.array(val_probs)
        val_targets = np.array(val_targets)

        # Sweep the cut-offs and keep the first one with the highest F1.
        f1_scores = [
            f1_score(val_targets, (val_probs > cutoff).astype(int))
            for cutoff in thresholds
        ]
        winner = np.argmax(f1_scores)
        max_f1 = f1_scores[winner]
        max_thresh = thresholds[winner]

        if max_f1 > best_f1:
            best_f1, best_threshold = max_f1, max_thresh
            torch.save(model.state_dict(), './model_best.pth')  # checkpoint the best epoch

        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Val F1: {max_f1:.4f} at threshold {max_thresh:.2f}")

    return best_threshold


In [1]:
pip install torchviz


Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting torchviz
  Downloading torchviz-0.0.3-py3-none-any.whl.metadata (2.1 kB)
Downloading torchviz-0.0.3-py3-none-any.whl (5.7 kB)
Installing collected packages: torchviz
Successfully installed torchviz-0.0.3
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


## Train the Model

In [None]:
# Run the full training routine and keep the decision threshold it returns.
best_threshold = train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    epochs=10,
)
print(f"Best threshold selected: {best_threshold}")
