<a href="https://colab.research.google.com/github/saikoushiknalubola/anndata_annam/blob/main/Challenge-1./notebooks/training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Authenticate with Kaggle. Some of the Kaggle data sources are private, so
# this cell has to be run in order to import your Kaggle data sources.
import kagglehub
kagglehub.login()


In [None]:
# Download the Kaggle data sources used by this notebook. Run this cell once;
# after that it can be deleted.
# Note: this notebook environment differs from Kaggle's Python environment,
# so libraries used by the notebook may be missing here.
# NOTE(review): the path returned below is not referenced by the other cells
# visible in this notebook, which read from a hard-coded /kaggle/input/...
# directory instead - confirm that directory exists where this is run.

soil_classification_path = kagglehub.competition_download('soil-classification')

print('Data source import complete.')


In [None]:
### training.ipynb

"""
Author: Annam.ai IIT Ropar
Team Name: anndata
Team Members: N. Saikoushik, M. Sai Teja, G. Navya Sri, N. Chandhana Priya, V. Asmitha
Leaderboard Rank: 22
"""

import os
import numpy as np
import pandas as pd
import torch
import torchvision.transforms as transforms
from torchvision import models
from PIL import Image
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

# Run on the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Dataset locations: the label CSV and the image folder share one root
data_dir = "/kaggle/input/soil-classification/soil_classification-2025"
train_labels_file = os.path.join(data_dir, "train_labels.csv")
train_dir = os.path.join(data_dir, "train")

# Read the training labels table
df = pd.read_csv(train_labels_file)

# Encode the soil-type names as integer class ids in a new column
le = LabelEncoder()
df = df.assign(encoded_label=le.fit_transform(df['soil_type']))

# Image preprocessing: fixed 224x224 resize, then conversion to a tensor.
# NOTE(review): no mean/std normalisation step is applied here; any code that
# reuses the saved classifier has to preprocess images the same way.
preprocessing_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
]
transform = transforms.Compose(preprocessing_steps)

# Feature extractor: ResNet18 loaded with ImageNet weights. Its `fc` layer is
# replaced by an identity, so a forward pass returns the features that would
# have fed the classification head rather than class scores.
# `torch`, `models` and `device` are already imported / created at the top of
# this cell, so they are not imported or defined a second time here.
model_cnn = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
model_cnn.fc = torch.nn.Identity()

# The network is only used for inference: move it to the selected device and
# switch it to evaluation mode.
model_cnn = model_cnn.to(device)
model_cnn.eval()

# Extract features
def extract_features(image_paths):
    """Run every image through ``model_cnn`` and stack the resulting vectors.

    Each file is opened with PIL, converted to RGB, passed through the
    module-level ``transform`` and fed to the network as a batch of one on
    ``device``.

    Args:
        image_paths: Iterable of image file paths.

    Returns:
        ``np.ndarray`` built from one flattened feature vector per input
        path, in input order.
    """
    features = []
    # Gradients are never needed for feature extraction, so autograd is
    # disabled once for the whole loop.
    with torch.no_grad():
        for path in tqdm(image_paths):
            # The context manager releases the file handle as soon as the RGB
            # copy has been made, instead of leaving it to garbage collection.
            with Image.open(path) as img:
                rgb_img = img.convert('RGB')
            img_tensor = transform(rgb_img).unsqueeze(0).to(device)
            feat = model_cnn(img_tensor).cpu().numpy().flatten()
            features.append(feat)
    return np.array(features)

# Resolve every image id to a path under train_dir, then compute the feature
# matrix and the matching integer targets.
image_paths = df['image_id'].map(lambda fname: os.path.join(train_dir, fname)).tolist()
y = df['encoded_label'].to_numpy()
X = extract_features(image_paths)

# Stratified 5-fold cross-validation of a random forest on the CNN features
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_scores = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    X_val, y_val = X[val_idx], y[val_idx]

    # Fit a fresh forest on this fold's training split
    clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(
        X[train_idx], y[train_idx]
    )
    val_preds = clf.predict(X_val)

    # Macro-averaged F1 on the held-out split, kept for the final mean
    f1 = f1_score(y_val, val_preds, average='macro')
    fold_scores.append(f1)

    print(f"Fold {fold+1} F1-score: {f1:.4f}")
    print(classification_report(y_val, val_preds, target_names=le.classes_))

print(f"Average F1-score: {np.mean(fold_scores):.4f}")

# Save the final model
import joblib

# Refit a forest with the same settings on all of the training data and
# persist it for later use.
final_clf = RandomForestClassifier(n_estimators=100, random_state=42)
final_clf.fit(X, y)
joblib.dump(final_clf, "final_model.pkl")

# le.classes_ may be an object-dtype array; np.save pickles object arrays,
# which forces every reader to call np.load(..., allow_pickle=True).
# Casting to str stores a plain unicode array that loads either way.
np.save("label_encoder_classes.npy", le.classes_.astype(str))
print("Final model and label encoder saved.")


Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 161MB/s]
100%|██████████| 1222/1222 [01:19<00:00, 15.39it/s]


Fold 1 F1-score: 0.9394
               precision    recall  f1-score   support

Alluvial soil       0.95      0.93      0.94       106
   Black Soil       0.95      0.91      0.93        46
    Clay soil       1.00      0.93      0.96        40
     Red soil       0.87      0.98      0.92        53

     accuracy                           0.94       245
    macro avg       0.94      0.94      0.94       245
 weighted avg       0.94      0.94      0.94       245

Fold 2 F1-score: 0.9394
               precision    recall  f1-score   support

Alluvial soil       0.95      0.94      0.95       106
   Black Soil       0.98      0.94      0.96        47
    Clay soil       0.90      0.90      0.90        40
     Red soil       0.93      0.98      0.95        52

     accuracy                           0.94       245
    macro avg       0.94      0.94      0.94       245
 weighted avg       0.94      0.94      0.94       245

Fold 3 F1-score: 0.9545
               precision    recall  f1-sco