<a href="https://colab.research.google.com/github/saikoushiknalubola/anndata_annam/blob/main/Challenge-2/Soil_Classification2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Authenticate with Kaggle. Some Kaggle data sources are private, so this cell
# has to run before the data-source import cell that follows.
import kagglehub
kagglehub.login()


In [None]:
# Import the Kaggle data source for the 'soil-classification-part-2'
# competition via kagglehub; the value returned by the call is kept in
# soil_classification_part_2_path. The generated template says this cell may
# be deleted once the data has been imported.
# Note: this notebook environment differs from Kaggle's Python environment,
# so libraries used by the notebook may be missing here.

soil_classification_part_2_path = kagglehub.competition_download('soil-classification-part-2')

print('Data source import complete.')


In [None]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import models, transforms
from PIL import Image
from tqdm import tqdm
from sklearn.metrics import f1_score

# Set device: use the GPU when one is visible to torch, otherwise the CPU
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed the random number generators once, up front, so that a
# Restart & Run All reproduces the same shuffling and weight initialisation.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
# Paths
# The Kaggle mount point below only exists inside Kaggle notebooks. When the
# data was fetched with kagglehub (e.g. on Colab) the files live under the
# directory stored in `soil_classification_part_2_path`, so prefer that
# location when it actually contains the data and otherwise fall back to the
# Kaggle mount point.
COMPETITION_SUBDIR = 'soil_competition-2025'
KAGGLE_DATA_ROOT = '/kaggle/input/soil-classification-part-2/' + COMPETITION_SUBDIR


def resolve_data_root(download_path=None, fallback=KAGGLE_DATA_ROOT):
    """Return the directory that holds train/, test/ and the CSV files.

    Args:
        download_path: Directory returned by the kagglehub download, or None
            when that cell was not run.
        fallback: Path returned when no candidate under ``download_path``
            contains ``train_labels.csv``.

    Returns:
        The first candidate directory containing ``train_labels.csv``
        (``<download_path>/soil_competition-2025`` first, then
        ``download_path`` itself), or ``fallback``.
    """
    if download_path:
        root = str(download_path)
        for candidate in (os.path.join(root, COMPETITION_SUBDIR), root):
            if os.path.isfile(os.path.join(candidate, 'train_labels.csv')):
                return candidate
    return fallback


# globals().get(...) keeps this cell working even if the download cell was
# deleted or skipped, in which case the Kaggle mount point is used.
DATA_ROOT = resolve_data_root(globals().get('soil_classification_part_2_path'))

TRAIN_DIR = f'{DATA_ROOT}/train'
TEST_DIR = f'{DATA_ROOT}/test'
TRAIN_CSV = f'{DATA_ROOT}/train_labels.csv'
TEST_CSV = f'{DATA_ROOT}/test_ids.csv'

In [None]:
# Read the training labels and the test ids from their CSV files
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV))

In [None]:
# Image transforms
# Both pipelines resize to the same size and normalise with the same
# per-channel statistics; only the training pipeline adds a random flip.
IMAGE_SIZE = (224, 224)
NORMALIZE_MEAN = [0.485, 0.456, 0.406]
NORMALIZE_STD = [0.229, 0.224, 0.225]

_tensor_steps = [
    transforms.ToTensor(),
    transforms.Normalize(NORMALIZE_MEAN, NORMALIZE_STD),
]

train_transform = transforms.Compose(
    [transforms.Resize(IMAGE_SIZE), transforms.RandomHorizontalFlip(), *_tensor_steps]
)

test_transform = transforms.Compose(
    [transforms.Resize(IMAGE_SIZE), *_tensor_steps]
)

In [None]:
# Dataset class
class SoilBinaryDataset(Dataset):
    """Image dataset driven by a DataFrame of image ids (and labels).

    Args:
        df: DataFrame with an 'image_id' column holding file names relative
            to ``img_dir``. Unless ``is_test`` is True it must also provide a
            'label' column.
        img_dir: Directory that contains the image files.
        transform: Optional callable applied to the RGB PIL image.
        is_test: When True, items are the image only (no label).

    Raises:
        KeyError: If ``df`` lacks a required column. Checking here gives a
            clear message up front instead of a failure inside a DataLoader
            worker.
    """

    def __init__(self, df, img_dir, transform=None, is_test=False):
        required = ['image_id'] if is_test else ['image_id', 'label']
        missing = [col for col in required if col not in df.columns]
        if missing:
            raise KeyError(f"DataFrame is missing required column(s): {missing}")

        self.df = df.reset_index(drop=True)  # reset index for safe positional indexing
        self.img_dir = img_dir
        self.transform = transform
        self.is_test = is_test

    def __len__(self):
        """Return the number of rows (images) in the DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load image ``idx``; return ``image`` for test data, else ``(image, label)``."""
        # Select the column first: indexing the row first materialises an
        # object-dtype Series for every sample.
        img_id = self.df['image_id'].iloc[idx]
        img_path = os.path.join(self.img_dir, img_id)

        # convert() returns a fully loaded copy, so the file handle can be
        # released as soon as the with-block ends.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert('RGB')

        # Compare against None explicitly so a transform object that happens
        # to be falsy is still applied.
        if self.transform is not None:
            image = self.transform(image)

        if self.is_test:
            return image

        # Plain int so batches collate to an integer tensor in both the
        # training and the validation loop.
        label = int(self.df['label'].iloc[idx])
        return image, label

In [None]:
# Wrap the complete labelled training table in one dataset; it is split afterwards
full_dataset = SoilBinaryDataset(
    df=train_df,
    img_dir=TRAIN_DIR,
    transform=train_transform,
    is_test=False,
)

In [None]:
# Split into train and validation datasets (80% train, 20% val)
# A dedicated, seeded generator makes the split identical every time this
# cell runs, so re-running it cannot move validation images into the
# training subset of an already (partly) trained model.
SPLIT_SEED = 42
train_size = int(0.8 * len(full_dataset))
val_size = len(full_dataset) - train_size
train_dataset, val_dataset = random_split(
    full_dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

In [None]:
# For validation, use test_transform (no augmentation).
# Both subsets returned by random_split wrap the SAME underlying dataset
# object, so assigning `val_dataset.dataset.transform` would also remove the
# augmentation from train_dataset. Instead, rebind the validation indices to a
# separate dataset instance that carries its own augmentation-free transform.
# (Subset exposes `.indices`, so re-running this cell is harmless.)
val_dataset = torch.utils.data.Subset(
    SoilBinaryDataset(train_df, TRAIN_DIR, transform=test_transform, is_test=False),
    val_dataset.indices,
)

In [None]:
# Data loaders
BATCH_SIZE = 32

# Options shared by both loaders; only the shuffling differs.
_loader_options = {"batch_size": BATCH_SIZE, "num_workers": 2}

train_loader = DataLoader(train_dataset, shuffle=True, **_loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **_loader_options)

In [None]:
# Unlabelled test images: no augmentation, and no shuffling so that the
# predictions come out in the row order of test_df.
test_dataset = SoilBinaryDataset(
    df=test_df,
    img_dir=TEST_DIR,
    transform=test_transform,
    is_test=True,
)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=BATCH_SIZE,
    num_workers=2,
    shuffle=False,
)

In [None]:
# Handle class imbalance: each class is weighted by (number of labels) /
# (labels of that class), computed from the original train_df labels.
class_counts = train_df['label'].value_counts().to_dict()
total = sum(class_counts.values())

weights = []
for class_idx in (0, 1):
    # A class that never occurs in train_df is treated as having a count of 1.
    weights.append(total / class_counts.get(class_idx, 1))

class_weights = torch.tensor(weights, dtype=torch.float32).to(DEVICE)

In [None]:
# Define model
class SoilBinaryClassifier(nn.Module):
    """ResNet-18 whose final fully connected layer is replaced by a new linear head.

    Args:
        num_classes: Number of output logits of the new head. Defaults to 2,
            which is what the rest of the notebook expects.
    """

    def __init__(self, num_classes=2):
        super().__init__()
        # `pretrained=True` is deprecated in recent torchvision releases in
        # favour of `weights=`; IMAGENET1K_V1 is the checkpoint the old flag
        # selected. Fall back to the old flag when the enum is unavailable.
        weights_enum = getattr(models, 'ResNet18_Weights', None)
        if weights_enum is not None:
            self.base_model = models.resnet18(weights=weights_enum.IMAGENET1K_V1)
        else:
            self.base_model = models.resnet18(pretrained=True)

        # Swap the backbone's classification layer for one sized to our task.
        num_ftrs = self.base_model.fc.in_features
        self.base_model.fc = nn.Linear(num_ftrs, num_classes)

    def forward(self, x):
        """Return the logits the backbone produces for the input batch ``x``."""
        return self.base_model(x)

model = SoilBinaryClassifier().to(DEVICE)

In [None]:
# Loss and optimizer: class-weighted cross-entropy, Adam over every model parameter
LEARNING_RATE = 1e-3

criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [None]:
# Training loop with validation
def _train_one_epoch(net, loader, loss_fn, opt, device, desc):
    """Run one optimisation pass over ``loader`` and return the mean batch loss."""
    net.train()
    running_loss = 0.0

    for inputs, labels in tqdm(loader, desc=desc):
        inputs = inputs.to(device)
        labels = labels.long().to(device)

        opt.zero_grad()
        outputs = net(inputs)
        loss = loss_fn(outputs, labels)
        loss.backward()
        opt.step()

        running_loss += loss.item()

    # max(..., 1) avoids a ZeroDivisionError on an empty loader.
    return running_loss / max(len(loader), 1)


def _evaluate(net, loader, loss_fn, device, desc):
    """Score ``net`` on ``loader`` without gradients; return (mean loss, accuracy, weighted F1)."""
    net.eval()
    loss_sum = 0.0
    num_correct = 0
    num_seen = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for inputs, labels in tqdm(loader, desc=desc):
            inputs = inputs.to(device)
            # Cast exactly like the training pass so both loops hand the
            # loss the same integer target dtype.
            labels = labels.long().to(device)

            outputs = net(inputs)
            loss_sum += loss_fn(outputs, labels).item()

            preds = torch.argmax(outputs, dim=1)
            num_correct += (preds == labels).sum().item()
            num_seen += labels.size(0)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    mean_loss = loss_sum / max(len(loader), 1)
    accuracy = num_correct / num_seen if num_seen else 0.0
    # weighted average: per-class F1 weighted by class support
    weighted_f1 = f1_score(all_labels, all_preds, average='weighted') if num_seen else 0.0
    return mean_loss, accuracy, weighted_f1


# NOTE(review): the run recorded with this notebook reports Val Acc 1.0000 and
# a near-zero loss from the first epoch. Confirm that train_df['label'] really
# contains both classes before trusting these metrics.
EPOCHS = 10
for epoch in range(EPOCHS):
    avg_train_loss = _train_one_epoch(
        model, train_loader, criterion, optimizer, DEVICE,
        desc=f"Training Epoch {epoch+1}",
    )
    avg_val_loss, val_acc, f1 = _evaluate(
        model, val_loader, criterion, DEVICE,
        desc=f"Validation Epoch {epoch+1}",
    )

    print(f"Epoch {epoch+1}/{EPOCHS} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | Val Acc: {val_acc:.4f} | Val F1: {f1:.4f}")

Training Epoch 1: 100%|██████████| 31/31 [00:05<00:00,  5.26it/s]
Validation Epoch 1: 100%|██████████| 8/8 [00:01<00:00,  4.91it/s]


Epoch 1/10 | Train Loss: 0.0868 | Val Loss: 0.0001 | Val Acc: 1.0000 | Val F1: 1.0000


Training Epoch 2: 100%|██████████| 31/31 [00:06<00:00,  4.98it/s]
Validation Epoch 2: 100%|██████████| 8/8 [00:01<00:00,  5.26it/s]


Epoch 2/10 | Train Loss: 0.0001 | Val Loss: 0.0000 | Val Acc: 1.0000 | Val F1: 1.0000


Training Epoch 3: 100%|██████████| 31/31 [00:06<00:00,  5.06it/s]
Validation Epoch 3: 100%|██████████| 8/8 [00:01<00:00,  5.13it/s]


Epoch 3/10 | Train Loss: 0.0000 | Val Loss: 0.0000 | Val Acc: 1.0000 | Val F1: 1.0000


Training Epoch 4: 100%|██████████| 31/31 [00:06<00:00,  4.83it/s]
Validation Epoch 4: 100%|██████████| 8/8 [00:01<00:00,  4.98it/s]


Epoch 4/10 | Train Loss: 0.0000 | Val Loss: 0.0000 | Val Acc: 1.0000 | Val F1: 1.0000


Training Epoch 5: 100%|██████████| 31/31 [00:06<00:00,  5.03it/s]
Validation Epoch 5: 100%|██████████| 8/8 [00:01<00:00,  4.83it/s]


Epoch 5/10 | Train Loss: 0.0000 | Val Loss: 0.0000 | Val Acc: 1.0000 | Val F1: 1.0000


Training Epoch 6: 100%|██████████| 31/31 [00:06<00:00,  4.96it/s]
Validation Epoch 6: 100%|██████████| 8/8 [00:01<00:00,  5.18it/s]


Epoch 6/10 | Train Loss: 0.0000 | Val Loss: 0.0000 | Val Acc: 1.0000 | Val F1: 1.0000


Training Epoch 7: 100%|██████████| 31/31 [00:06<00:00,  5.10it/s]
Validation Epoch 7: 100%|██████████| 8/8 [00:01<00:00,  5.41it/s]


Epoch 7/10 | Train Loss: 0.0000 | Val Loss: 0.0000 | Val Acc: 1.0000 | Val F1: 1.0000


Training Epoch 8: 100%|██████████| 31/31 [00:06<00:00,  4.99it/s]
Validation Epoch 8: 100%|██████████| 8/8 [00:01<00:00,  5.41it/s]


Epoch 8/10 | Train Loss: 0.0000 | Val Loss: 0.0000 | Val Acc: 1.0000 | Val F1: 1.0000


Training Epoch 9: 100%|██████████| 31/31 [00:05<00:00,  5.48it/s]
Validation Epoch 9: 100%|██████████| 8/8 [00:01<00:00,  5.10it/s]


Epoch 9/10 | Train Loss: 0.0000 | Val Loss: 0.0000 | Val Acc: 1.0000 | Val F1: 1.0000


Training Epoch 10: 100%|██████████| 31/31 [00:06<00:00,  5.15it/s]
Validation Epoch 10: 100%|██████████| 8/8 [00:01<00:00,  5.24it/s]

Epoch 10/10 | Train Loss: 0.0000 | Val Loss: 0.0000 | Val Acc: 1.0000 | Val F1: 1.0000





In [None]:
# Prediction on test set: collect the index of the largest logit for every
# test image, in loader order.
model.eval()
predictions = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Predicting on test set"):
        logits = model(batch.to(DEVICE))
        predicted = torch.argmax(logits, dim=1)
        predictions.extend(predicted.cpu().numpy())

Predicting on test set: 100%|██████████| 31/31 [00:04<00:00,  6.48it/s]


In [None]:
# Build the submission table (image ids from test_df plus one predicted label
# each) and write it to submission.csv without the index column.
submission = test_df[['image_id']].assign(label=predictions)
submission.to_csv('submission.csv', index=False)
print("Saved submission.csv")

Saved submission.csv
