# Soil Image Classification – Advanced Pipeline
This notebook implements:
- 5-Fold Stratified CV ensemble with EfficientNet-B3
- Class-weighted Focal Loss
- Cosine Annealing LR Scheduler
- Test-Time Augmentation (TTA)
- Pseudo-Labeling of high-confidence test predictions
- Limited to 5 epochs per training phase

In [2]:
import os
import pandas as pd
import numpy as np
from PIL import Image
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from torchvision import models
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
# Paths and dataframes
DATA_DIR = 'soil_classification-2025'
TRAIN_DIR = os.path.join(DATA_DIR, 'train')
TEST_DIR = os.path.join(DATA_DIR, 'test')
train_df = pd.read_csv(os.path.join(DATA_DIR, 'train_labels.csv'))
test_df = pd.read_csv(os.path.join(DATA_DIR, 'test_ids.csv'))

# Label encoding
le = LabelEncoder()
train_df['label'] = le.fit_transform(train_df['soil_type'])

In [4]:
class SoilDataset(Dataset):
    def __init__(self, df, img_dir, transform=None, test=False):
        self.df = df
        self.img_dir = img_dir
        self.transform = transform
        self.test = test
    def __len__(self):
        return len(self.df)
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        img = Image.open(os.path.join(self.img_dir, row['image_id'])).convert('RGB')
        if self.transform:
            img = self.transform(img)
        if self.test:
            return img, row['image_id']
        return img, row['label']

In [5]:
transform_train = transforms.Compose([
    transforms.Resize((224,224)), transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15), transforms.ColorJitter(brightness=0.2, contrast=0.2),
    transforms.ToTensor(), transforms.Normalize([0.485,0.456,0.406],[0.229,0.224,0.225])
])
transform_test = transforms.Compose([
    transforms.Resize((224,224)), transforms.ToTensor(),
    transforms.Normalize([0.485,0.456,0.406],[0.229,0.224,0.225])
])

In [6]:
class FocalLoss(nn.Module):
    def __init__(self, gamma=2, weight=None):
        super().__init__()
        self.gamma = gamma
        self.ce = nn.CrossEntropyLoss(weight=weight)
    def forward(self, input, target):
        logpt = -self.ce(input, target)
        pt = torch.exp(logpt)
        loss = -((1-pt)**self.gamma) * logpt
        return loss.mean()

# Function to get model
def get_model(weights=None):
    m = models.efficientnet_b3(pretrained=True)
    m.classifier[1] = nn.Linear(m.classifier[1].in_features, 4)
    if weights:
        m.load_state_dict(torch.load(weights))
    return m.to(device)

In [7]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
test_probs = np.zeros((len(test_df), 4))

for fold, (train_idx, val_idx) in enumerate(skf.split(train_df, train_df['label'])):
    print(f'Fold {fold+1}')
    tr_df = train_df.iloc[train_idx].reset_index(drop=True)
    vl_df = train_df.iloc[val_idx].reset_index(drop=True)
    
    # Compute class weights
    freq = tr_df['label'].value_counts().sort_index().values
    weights = 1.0 / torch.tensor(freq, dtype=torch.float).to(device)
    criterion = FocalLoss(weight=weights)
    
    # DataLoaders
    train_ds = SoilDataset(tr_df, TRAIN_DIR, transform_train)
    val_ds = SoilDataset(vl_df, TRAIN_DIR, transform_train)
    train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=32, shuffle=False)

    model = get_model()
    optimizer = optim.Adam(model.parameters(), lr=1e-4)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=5)
    best_f1 = 0.0

    for epoch in range(5):
        model.train(); total_loss=0
        for imgs, labels in tqdm(train_loader, desc=f'Epoch {epoch+1}'):
            imgs, labels = imgs.to(device), labels.to(device)
            optimizer.zero_grad()
            out = model(imgs)
            loss = criterion(out, labels)
            loss.backward(); optimizer.step(); total_loss += loss.item()
        scheduler.step()
        
        # Validate
        model.eval(); preds, truths = [], []
        with torch.no_grad():
            for imgs, labels in val_loader:
                imgs = imgs.to(device)
                out = model(imgs); _, p = torch.max(out,1)
                preds.extend(p.cpu().numpy()); truths.extend(labels.numpy())
        f1s = f1_score(truths, preds, average=None)
        min_f1 = f1s.min(); print(f'Epoch {epoch+1} Min F1: {min_f1:.4f}')
        if min_f1 > best_f1:
            best_f1 = min_f1; torch.save(model.state_dict(), f'best_fold{fold}.pth')
    
    # Test set predictions with TTA
    tta_transforms = [
        transforms.RandomHorizontalFlip(p=1.0),
        transforms.RandomVerticalFlip(p=1.0),
        transforms.RandomRotation(10)
    ]
    model.load_state_dict(torch.load(f'best_fold{fold}.pth'))
    model.eval()
    for i, row in tqdm(test_df.iterrows(), total=len(test_df), desc='Test TTA'):
        img = Image.open(os.path.join(TEST_DIR, row['image_id'])).convert('RGB')
        probs = []
        for t in tta_transforms:
            aug = transforms.Compose([t, transform_test])
            x = aug(img).unsqueeze(0).to(device)
            with torch.no_grad():
                out = torch.softmax(model(x), dim=1).cpu().numpy()
            probs.append(out)
        avg = np.mean(probs, axis=0)
        test_probs[i] += avg.squeeze()

# Average ensemble probabilities
test_probs /= 5

Fold 1


Epoch 1:   0%|          | 0/31 [01:10<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# Pseudo-label high-confidence predictions
conf_threshold = 0.99
pseudo_labels = np.argmax(test_probs, axis=1)
confidences = np.max(test_probs, axis=1)
pseudo_df = test_df[confidences > conf_threshold].copy()
pseudo_df['label'] = pseudo_labels[confidences > conf_threshold]
print(f'Pseudo-labeled {len(pseudo_df)} samples')

# Combine and retrain final model
combined_df = pd.concat([train_df, pseudo_df], ignore_index=True)
final_model = get_model()
freq = combined_df['label'].value_counts().sort_index().values
weights = 1.0 / torch.tensor(freq, dtype=torch.float).to(device)
criterion = FocalLoss(weight=weights)
optimizer = optim.Adam(final_model.parameters(), lr=1e-4)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=5)

ds = SoilDataset(combined_df, TRAIN_DIR, transform_train)
loader = DataLoader(ds, batch_size=32, shuffle=True)

for epoch in range(5):
    final_model.train(); total_loss = 0
    for imgs, labels in tqdm(loader, desc=f'Final Epoch {epoch+1}'):
        imgs, labels = imgs.to(device), labels.to(device)
        optimizer.zero_grad()
        out = final_model(imgs)
        loss = criterion(out, labels)
        loss.backward(); optimizer.step(); total_loss += loss.item()
    scheduler.step()
    print(f'Final Epoch {epoch+1} Loss: {total_loss/len(loader):.4f}')

# Final predictions
final_model.eval(); final_preds, ids = [], []
with torch.no_grad():
    for imgs, row in tqdm(test_df.iterrows(), total=len(test_df), desc='Final Predict'):
        img = Image.open(os.path.join(TEST_DIR, row['image_id'])).convert('RGB')
        x = transform_test(img).unsqueeze(0).to(device)
        out = final_model(x); _, p = torch.max(out,1)
        final_preds.append(p.item()); ids.append(row['image_id'])
pred_labels = le.inverse_transform(final_preds)
pd.DataFrame({'image_id': ids, 'soil_type': pred_labels}).to_csv('submission.csv', index=False)
print('submission.csv created')

Pseudo-labeled 0 samples


Final Epoch 1: 100%|██████████| 39/39 [33:33<00:00, 51.62s/it]


Final Epoch 1 Loss: 0.4475


Final Epoch 2:   0%|          | 0/39 [00:00<?, ?it/s]