In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

 🚀 Optimized Soil Classification Training Pipeline

In [5]:
import os
import zipfile
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from torchvision.models import convnext_tiny
from PIL import Image
import torch.nn as nn
import torch.nn.functional as F
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
import numpy as np
from sklearn.model_selection import train_test_split
from torch.cuda.amp import GradScaler, autocast
from tqdm import tqdm

# ✅ Device config

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")



# 🔧 Kaggle credentials

In [7]:
os.environ['KAGGLE_USERNAME'] = 'shivam790'
os.environ['KAGGLE_KEY'] = '382c7bf4cacced2e14d0360737efc6c9'


# 📥 Download and Extract Dataset

In [8]:
!kaggle competitions download -c soil-classification
with zipfile.ZipFile("soil-classification.zip", 'r') as zip_ref:
    zip_ref.extractall("soil_data")

# 📄 Load and encode labels

In [9]:
base_path = "soil_data/soil_classification-2025"
train_csv = os.path.join(base_path, "train_labels.csv")
test_csv = os.path.join(base_path, "test_ids.csv")
train_dir = os.path.join(base_path, "train")
test_dir = os.path.join(base_path, "test")

# 📄 Load and encode labels
train_df = pd.read_csv(train_csv)
train_df["image_path"] = train_df["image_id"].apply(lambda x: os.path.join(train_dir, x))
label_encoder = LabelEncoder()
train_df["label"] = label_encoder.fit_transform(train_df["soil_type"])


# 📂 Dataset classes

In [10]:
class SoilDataset(Dataset):
    def __init__(self, df, transform=None):
        self.df = df
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        img_path = self.df.iloc[idx]['image_path']
        label = self.df.iloc[idx]['label']
        image = Image.open(img_path).convert("RGB")
        if self.transform:
            image = self.transform(image)
        return image, label

class TestSoilDataset(Dataset):
    def __init__(self, df, img_dir, transform=None):
        self.df = df
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        img_name = self.df.iloc[idx]['image_id']
        img_path = os.path.join(self.img_dir, img_name)
        image = Image.open(img_path).convert("RGB")
        if self.transform:
            image = self.transform(image)
        return image


# 🎨 Transforms

In [11]:
transform_train = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.AutoAugment(),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])
transform_test = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])

# 🔀 Split data

In [12]:
train_split, val_split = train_test_split(train_df, test_size=0.15, stratify=train_df['label'])
train_dataset = SoilDataset(train_split, transform_train)
val_dataset = SoilDataset(val_split, transform_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)



# 🧠 Load ConvNeXt Tiny

In [13]:
model = convnext_tiny(weights='IMAGENET1K_V1')
model.classifier[2] = nn.Linear(model.classifier[2].in_features, len(label_encoder.classes_))
model.to(device)

Downloading: "https://download.pytorch.org/models/convnext_tiny-983f1562.pth" to /root/.cache/torch/hub/checkpoints/convnext_tiny-983f1562.pth
100%|██████████| 109M/109M [00:00<00:00, 174MB/s] 


ConvNeXt(
  (features): Sequential(
    (0): Conv2dNormActivation(
      (0): Conv2d(3, 96, kernel_size=(4, 4), stride=(4, 4))
      (1): LayerNorm2d((96,), eps=1e-06, elementwise_affine=True)
    )
    (1): Sequential(
      (0): CNBlock(
        (block): Sequential(
          (0): Conv2d(96, 96, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3), groups=96)
          (1): Permute()
          (2): LayerNorm((96,), eps=1e-06, elementwise_affine=True)
          (3): Linear(in_features=96, out_features=384, bias=True)
          (4): GELU(approximate='none')
          (5): Linear(in_features=384, out_features=96, bias=True)
          (6): Permute()
        )
        (stochastic_depth): StochasticDepth(p=0.0, mode=row)
      )
      (1): CNBlock(
        (block): Sequential(
          (0): Conv2d(96, 96, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3), groups=96)
          (1): Permute()
          (2): LayerNorm((96,), eps=1e-06, elementwise_affine=True)
          (3): Linear(in_features=

In [14]:
# ⚙️ Training loop with AMP + F1 tracking
def train_model(model, train_loader, val_loader, epochs=5):
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', patience=2)
    scaler = GradScaler()

    best_f1 = 0.0
    for epoch in range(epochs):
        model.train()
        train_preds, train_labels = [], []
        running_loss = 0.0

        for images, labels in tqdm(train_loader):
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            with autocast():
                outputs = model(images)
                loss = criterion(outputs, labels)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            running_loss += loss.item()
            train_preds.extend(outputs.argmax(1).cpu().numpy())
            train_labels.extend(labels.cpu().numpy())

        train_f1 = f1_score(train_labels, train_preds, average='macro')
        print(f"Epoch {epoch+1}: Train Loss={running_loss/len(train_loader):.4f}, F1={train_f1:.4f}")

        # Validation
        model.eval()
        val_preds, val_labels = [], []
        with torch.no_grad():
            for images, labels in val_loader:
                images, labels = images.to(device), labels.to(device)
                outputs = model(images)
                val_preds.extend(outputs.argmax(1).cpu().numpy())
                val_labels.extend(labels.cpu().numpy())
        val_f1 = f1_score(val_labels, val_preds, average='macro')
        print(f"Validation F1 Score: {val_f1:.4f}")
        scheduler.step(val_f1)

        if val_f1 > best_f1:
            best_f1 = val_f1
            torch.save(model.state_dict(), "best_model.pth")
    return model      
 


In [15]:
# 🚂 Train
model = train_model(model, train_loader, val_loader, epochs=5)

# 🧪 Load test data
test_df = pd.read_csv(test_csv)
test_dataset = TestSoilDataset(test_df, test_dir, transform_test)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# 🔮 Predict
model.load_state_dict(torch.load("best_model.pth"))
model.eval()
preds = []
with torch.no_grad():
    for images in test_loader:
        images = images.to(device)
        outputs = model(images)
        probs = F.softmax(outputs, dim=1)
        preds.append(probs.cpu())

all_probs = torch.cat(preds, dim=0)
final_preds = torch.argmax(all_probs, dim=1)
final_labels = label_encoder.inverse_transform(final_preds.numpy())

# 📤 Submission
submission = pd.DataFrame({
    "image_id": test_df["image_id"],
    "soil_type": final_labels
})
submission.to_csv("optimized_submission.csv", index=False)

  scaler = GradScaler()
  with autocast():
100%|██████████| 33/33 [05:54<00:00, 10.74s/it]


Epoch 1: Train Loss=0.6302, F1=0.7592
Validation F1 Score: 0.9478


  with autocast():
100%|██████████| 33/33 [05:43<00:00, 10.42s/it]


Epoch 2: Train Loss=0.2333, F1=0.9218
Validation F1 Score: 0.9482


  with autocast():
100%|██████████| 33/33 [05:51<00:00, 10.66s/it]


Epoch 3: Train Loss=0.1865, F1=0.9307
Validation F1 Score: 0.9537


  with autocast():
100%|██████████| 33/33 [05:46<00:00, 10.49s/it]


Epoch 4: Train Loss=0.1286, F1=0.9491
Validation F1 Score: 0.9870


  with autocast():
100%|██████████| 33/33 [05:44<00:00, 10.44s/it]


Epoch 5: Train Loss=0.1006, F1=0.9564
Validation F1 Score: 0.9866
