In [None]:

"""
Author: sanskar khandelwal
Team Name: TheLastTransformer
Team Members: sanskar khandelwal 
Leaderboard Rank: 56
"""


# Cell 1: Notebook Metadata
This cell specifies author information, team details, and leaderboard rank for documentation.

# Soil Classification - Training Notebook
Welcome to the training notebook for our soil classification challenge. In this notebook, we:
- Combine labeled soil images, ResNet18 features, and a one-shot SVM
- Train a classifier to detect "soil" vs "not soil"
- Track F1 score
- Save the best model

---


# Cell: Step 1 - Import Libraries Section
Provide a section header indicating the start of library imports.

In [None]:
import os
import numpy as np
import pandas as pd
import torch
from torchvision import models, transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from tqdm import tqdm

In [None]:
### 🖼️ Step 2: Image Transformations
# Preprocessing applied to every image: resize to 224x224, convert to a tensor,
# then normalise each of the three channels with mean 0.5 and std 0.5.
# NOTE(review): torchvision documents mean=[0.485, 0.456, 0.406] /
# std=[0.229, 0.224, 0.225] for its ImageNet-pretrained weights — confirm that
# the 0.5/0.5 statistics used here are intentional for the model that consumes
# these tensors.
transform = transforms.Compose(
    [
        transforms.Resize(size=(224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
    ]
)


# Cell 6: Step 2 - Image Transformations
Define the image preprocessing pipeline including resizing, tensor conversion, and normalization.

In [None]:
# NOTE(review): `get_transforms`, `val_df` and `TRAIN_DIR` are not defined in
# any cell shown in this notebook, and `train_df` / `SoilDataset` are only
# defined in later cells — confirm where these come from before relying on a
# fresh "Restart & Run All".
train_transform, val_transform = get_transforms()

# Training split: shuffled mini-batches of 32 loaded by two worker processes.
train_dataset = SoilDataset(train_df, TRAIN_DIR, train_transform)
train_loader = DataLoader(dataset=train_dataset, batch_size=32, num_workers=2, shuffle=True)

# Validation split: same batch size and workers, but kept in a fixed order.
val_dataset = SoilDataset(val_df, TRAIN_DIR, val_transform)
val_loader = DataLoader(dataset=val_dataset, batch_size=32, num_workers=2, shuffle=False)


✅ Using device: cuda (NVIDIA GeForce RTX 4050 Laptop GPU)




# Cell 8: Create Datasets and DataLoaders
Instantiate `SoilDataset` objects and wrap them in DataLoaders with batching and shuffling.

# Cell 9: CUDA Setup Note
Describe checking for GPU availability and local VS Code configuration.

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# `weights=ResNet18_Weights.IMAGENET1K_V1` loads the same checkpoint that the
# deprecated `pretrained=True` flag selected.
model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)

# Replace the final fully connected layer with a single-unit head followed by
# a sigmoid. The layers are referenced through `torch.nn` because the import
# cell does not bind a bare `nn` name.
model.fc = torch.nn.Sequential(
    torch.nn.Linear(model.fc.in_features, 1),
    torch.nn.Sigmoid(),
)
model = model.to(device)


PyTorch version: 2.5.1+cu121
CUDA available: True
GPU device name: NVIDIA GeForce RTX 4050 Laptop GPU


# Cell 10: Model Initialization
Set device to GPU/CPU, load pretrained ResNet18, modify final layer, and move model to device.

In [None]:
# NOTE(review): this cell repeats the model-initialisation cell directly above
# it; running it rebinds `device` and `model`, replacing the previous model
# object with a freshly loaded one.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# `weights=ResNet18_Weights.IMAGENET1K_V1` loads the same checkpoint that the
# deprecated `pretrained=True` flag selected.
model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)

# Replace the final fully connected layer with a single-unit head followed by
# a sigmoid. The layers are referenced through `torch.nn` because the import
# cell does not bind a bare `nn` name.
model.fc = torch.nn.Sequential(
    torch.nn.Linear(model.fc.in_features, 1),
    torch.nn.Sigmoid(),
)
model = model.to(device)


CUDA Available: True
GPU Device: NVIDIA GeForce RTX 4050 Laptop GPU
Output shape: torch.Size([8, 1000])


# Cell 11: Duplicate Model Setup
Repeat device and model initialization (likely redundant).

In [None]:
### 📂 Step 4: Paths and CSVs
# Root of the competition dataset. Every path below is derived from it so that
# pointing the notebook at a different copy of the data is a one-line change.
DATA_ROOT = '/kaggle/input/soil-classification-part-2/soil_competition-2025'

# CSV metadata files and the image directories for each split.
train_csv = f'{DATA_ROOT}/train_labels.csv'
test_csv = f'{DATA_ROOT}/test_ids.csv'
train_dir = f'{DATA_ROOT}/train'
test_dir = f'{DATA_ROOT}/test'

# Load the metadata tables into DataFrames.
train_df = pd.read_csv(train_csv)
test_df = pd.read_csv(test_csv)


# Cell 12: Step 4 - Paths and CSV Loading
Define dataset paths and load training and test CSV metadata into DataFrames.

Loss & Optimizer

# Cell 13: Loss & Optimizer Section Header
Introduce the configuration of loss function and optimizer for training.

In [None]:
### 🧺 Step 5: Custom Dataset
class SoilDataset(Dataset):
    """Dataset that yields ``(image, image_id)`` pairs for rows of a DataFrame.

    Args:
        dataframe: Table with an ``image_id`` column holding image file names.
        img_dir: Directory that the ``image_id`` file names are joined onto.
        transform: Optional callable applied to the RGB ``PIL`` image before
            it is returned. When ``None`` the converted image is returned
            unchanged.
    """

    def __init__(self, dataframe, img_dir, transform=None):
        self.df = dataframe
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self):
        """Return the number of rows in the backing DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load the image at positional row ``idx``.

        Returns:
            A ``(image, image_id)`` tuple, where ``image`` is the RGB image
            after ``transform`` (if any) has been applied.
        """
        image_id = self.df.iloc[idx]['image_id']
        image_path = os.path.join(self.img_dir, image_id)
        # `convert` returns an independent in-memory copy, so the underlying
        # file can be closed as soon as the conversion is done rather than
        # being left for the garbage collector.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")
        if self.transform is not None:
            image = self.transform(image)
        return image, image_id

# Cell 14: Custom Dataset Class
Implement `SoilDataset` to load images, apply transforms, and return tensors with IDs.

Training Function

# Cell 15: Training Function Header
Section header for defining the model training loop function.

In [None]:
def _collect_validation_outputs(model, val_loader):
    """Run ``model`` over ``val_loader`` without gradients.

    Returns:
        ``(outputs, targets)`` as NumPy arrays; outputs are flattened.
    """
    outputs_all = []
    targets_all = []
    with torch.no_grad():
        for images, labels in val_loader:
            outputs = model(images.to(device))
            outputs_all.extend(outputs.cpu().numpy().flatten())
            # Labels are only needed on the host, so they are not sent to `device`.
            targets_all.extend(labels.cpu().numpy())
    return np.array(outputs_all), np.array(targets_all)


def _best_f1_threshold(probs, targets):
    """Scan thresholds 0.10..0.89 (step 0.01) and return ``(best_f1, best_threshold)``."""
    thresholds = np.arange(0.1, 0.9, 0.01)
    f1_scores = [f1_score(targets, (probs > t).astype(int)) for t in thresholds]
    # argmax returns the first maximum, i.e. the lowest threshold among ties.
    best_idx = int(np.argmax(f1_scores))
    return f1_scores[best_idx], thresholds[best_idx]


def train_model(model, train_loader, val_loader, epochs=10):
    """Train ``model``, tune the decision threshold on validation data, and checkpoint.

    Each epoch runs one pass over ``train_loader``, then scores the model
    outputs on ``val_loader`` against a range of thresholds. Whenever the best
    validation F1 of an epoch exceeds the best seen so far, the model's
    ``state_dict`` is written to ``./model_best.pth``.

    NOTE(review): ``device``, ``optimizer``, ``criterion`` and ``f1_score`` are
    read from the notebook's global namespace; ``optimizer``, ``criterion`` and
    ``f1_score`` are not defined or imported in the cells shown (``f1_score``
    is presumably ``sklearn.metrics.f1_score`` — confirm). Both loaders must
    yield ``(images, labels)`` batches with numeric label tensors.

    Args:
        model: Network whose outputs are compared directly against thresholds.
        train_loader: Iterable of ``(images, labels)`` training batches.
        val_loader: Iterable of ``(images, labels)`` validation batches.
        epochs: Number of passes over ``train_loader``.

    Returns:
        The threshold that produced the best validation F1 across all epochs
        (``0.5`` if no epoch achieved an F1 above zero).
    """
    best_f1 = 0
    best_threshold = 0.5

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        num_batches = 0

        for images, labels in train_loader:
            images = images.to(device)
            labels = labels.float().unsqueeze(1).to(device)

            optimizer.zero_grad()
            loss = criterion(model(images), labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        # Report the mean per-batch loss so the figure does not scale with the
        # number of batches; guard against an empty loader.
        train_loss = total_loss / max(num_batches, 1)

        # Validation
        model.eval()
        all_probs, all_targets = _collect_validation_outputs(model, val_loader)

        # Threshold tuning
        max_f1, max_thresh = _best_f1_threshold(all_probs, all_targets)

        if max_f1 > best_f1:
            best_f1 = max_f1
            best_threshold = max_thresh
            torch.save(model.state_dict(), './model_best.pth')  # Save best model

        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Val F1: {max_f1:.4f} at threshold {max_thresh:.2f}")

    return best_threshold


# Cell 16: Define Training Loop
Implement `train_model` to handle training epochs, validation, threshold tuning, and checkpointing.

In [None]:
# Loaders used for feature extraction. Both keep the dataset order
# (shuffle=False). Note that this rebinds the name `train_loader`.
feature_train_ds = SoilDataset(train_df, train_dir, transform)
feature_test_ds = SoilDataset(test_df, test_dir, transform)

train_loader = DataLoader(dataset=feature_train_ds, batch_size=32, shuffle=False)
test_loader = DataLoader(dataset=feature_test_ds, batch_size=32, shuffle=False)


# Cell 17: Initialize DataLoaders for Feature Extraction
Create DataLoaders specifically for the feature extraction phase.

In [None]:
### 🔍 Step 6: Feature Extraction
def extract_features(dataloader, model, device):
    """Run ``model`` over every batch and collect its outputs as one array.

    The model is evaluated in inference mode: if it is currently in training
    mode it is switched to ``eval()`` for the duration of the call, so layers
    such as dropout and batch normalisation behave deterministically, and it
    is switched back afterwards so the caller's state is unchanged.

    Args:
        dataloader: Iterable yielding ``(images, image_ids)`` batches.
        model: Callable applied to each image batch after it is moved to
            ``device``.
        device: Device the image batches are moved to before the forward pass.

    Returns:
        ``(features, ids)`` where ``features`` is the row-wise stack of all
        batch outputs as a NumPy array and ``ids`` is the flat list of ids in
        the same order.
    """
    features = []
    ids = []
    # Plain callables have no `training` flag; only toggle real modules.
    was_training = getattr(model, "training", False)
    if was_training:
        model.eval()
    try:
        with torch.no_grad():
            for images, image_ids in tqdm(dataloader, desc="Extracting features"):
                batch_features = model(images.to(device)).cpu().numpy()
                features.append(batch_features)
                ids.extend(image_ids)
    finally:
        if was_training:
            model.train()
    return np.vstack(features), ids

# NOTE(review): `resnet` is not defined in any cell shown in this notebook
# (the model cells above bind `model`) — confirm where it is created.
# Only the features are kept for the training split; its ids are discarded.
train_features, _ = extract_features(dataloader=train_loader, model=resnet, device=device)
test_features, test_ids = extract_features(dataloader=test_loader, model=resnet, device=device)

# Cell 18: Step 6 - Feature Extraction
Define and apply feature extraction using the trained model and DataLoader.

In [1]:
pip install torchviz


Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting torchviz
  Downloading torchviz-0.0.3-py3-none-any.whl.metadata (2.1 kB)
Downloading torchviz-0.0.3-py3-none-any.whl (5.7 kB)
Installing collected packages: torchviz
Successfully installed torchviz-0.0.3
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


# Cell 19: Install Additional Dependencies
Install packages like `torchviz` required for model visualization.

### 💾 Step 7: Save Features

# Cell 20: Step 7 - Save Features Section
Section header for saving extracted feature arrays to disk.

In [None]:

# Persist the extracted arrays next to the notebook for downstream use.
for output_name, payload in (
    ("train_features.npy", train_features),
    ("test_features.npy", test_features),
    ("test_ids.npy", test_ids),
):
    np.save(output_name, payload)



# Cell 21: Save Extracted Features
Save NumPy arrays of train features, test features, and IDs for downstream tasks.