In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input tree and echo the full path of every file found,
# in the order os.walk visits them.
for path in (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
):
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
import os
import zipfile
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from torchvision import transforms
from torchvision.models import resnet50
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from PIL import Image, ImageFilter, ImageEnhance
from tqdm import tqdm
from torch.cuda.amp import autocast, GradScaler

# --- 0. Setup Kaggle and Data ---
# Credentials are deliberately NOT stored in this notebook. The Kaggle CLI picks
# up KAGGLE_USERNAME / KAGGLE_KEY from the environment (or ~/.kaggle/kaggle.json),
# so supply them out-of-band, e.g. through the platform's secrets mechanism or
# the shell that launches the kernel.
# SECURITY: any API key that has ever been saved inside a shared notebook must
# be treated as compromised and regenerated from the Kaggle account settings.

if not os.path.exists("soil-classification-part-2.zip"):
    # os.system returns a non-zero status when the CLI fails (missing
    # credentials, competition rules not accepted, no network, ...). Fail
    # loudly here rather than letting the unzip step below raise a confusing
    # FileNotFoundError.
    exit_status = os.system("kaggle competitions download -c soil-classification-part-2")
    if exit_status != 0:
        raise RuntimeError(
            "Downloading 'soil-classification-part-2' with the Kaggle CLI failed "
            f"(exit status {exit_status}). Make sure KAGGLE_USERNAME and KAGGLE_KEY "
            "are set in the environment or that ~/.kaggle/kaggle.json exists."
        )

# Extraction is skipped when the target directory already exists, so re-running
# the cell is cheap.
if not os.path.exists("soil_competition-2025"):
    with zipfile.ZipFile("soil-classification-part-2.zip", 'r') as zip_ref:
        zip_ref.extractall("soil_competition-2025")

In [2]:
import os
import pandas as pd
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.svm import OneClassSVM
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
import numpy as np
import timm  # For DINOv2

# --- Paths ---
# All competition files live under one directory; the folder name appears twice
# (presumably the archive contains a top-level folder named like the extraction
# directory -- verify against the unpacked data).
DATA_ROOT = "/kaggle/working/soil_competition-2025/soil_competition-2025"
train_csv = f"{DATA_ROOT}/train_labels.csv"
train_dir = f"{DATA_ROOT}/train"
test_csv = f"{DATA_ROOT}/test_ids.csv"
test_dir = f"{DATA_ROOT}/test"

# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

# --- Load DINO ViT-B/16 backbone ---
# NOTE: the timm tag ".dino" refers to the original self-supervised DINO
# weights, not DINOv2.
model = timm.create_model("vit_base_patch16_224.dino", pretrained=True)

# Replace the head with a pass-through so model(x) returns whatever feeds the
# head (used below as the image embedding).
model.head = torch.nn.Identity()

# Inference only: switch to eval mode, then move the weights to the chosen device.
model.eval()
model.to(DEVICE)

# --- Image preprocessing ---
from torchvision import transforms

# The "vit_base_patch16_224.dino" checkpoint is configured in timm for the
# standard ImageNet channel statistics. Normalising with 0.5/0.5 instead feeds
# the backbone inputs on a different scale than it was pretrained on, which
# distorts the extracted embeddings.
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

image_transform = transforms.Compose([
    # Fixed 224x224 input, matching the patch16_224 model variant.
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])

# --- Dataset Class ---
class ImageDataset(Dataset):
    """Dataset yielding ``(image, image_id)`` pairs for the rows of a DataFrame.

    Args:
        df: DataFrame with an ``image_id`` column holding file names relative
            to ``image_dir``.
        image_dir: directory that contains the image files.
        transform: callable applied to the RGB ``PIL.Image`` before it is
            returned; ``None`` returns the RGB image untouched.
    """

    def __init__(self, df, image_dir, transform):
        self.df = df
        self.image_dir = image_dir
        self.transform = transform

    def __len__(self):
        """Number of rows (images) in the backing DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load the ``idx``-th image (positional index) and return ``(image, image_id)``."""
        # Index the column first so pandas does not build a full row Series
        # just to read one field.
        image_id = self.df['image_id'].iloc[idx]
        img_path = os.path.join(self.image_dir, image_id)
        # Image.open is lazy and keeps the file handle on the returned object.
        # convert() produces an independent in-memory copy, so the handle can
        # be closed deterministically as soon as the conversion is done.
        with Image.open(img_path) as raw:
            image = raw.convert("RGB")
        if self.transform is not None:
            image = self.transform(image)
        return image, image_id

# --- Extract embeddings ---
def extract_embeddings(loader):
    """Run every batch from ``loader`` through the global ``model`` and collect the outputs.

    Args:
        loader: iterable yielding ``(images, ids)`` pairs. ``images`` is moved
            to ``DEVICE`` and passed to ``model``; ``ids`` is an iterable of
            identifiers for that batch.

    Returns:
        ``(features, ids)``: the per-batch model outputs stacked row-wise with
        ``np.vstack`` and a flat list of the identifiers in iteration order.
    """
    feature_chunks = []
    collected_ids = []
    progress = tqdm(loader, desc="Extracting embeddings")
    # Inference only: disable autograd bookkeeping for the forward passes.
    with torch.no_grad():
        for batch, batch_ids in progress:
            outputs = model(batch.to(DEVICE))
            # Move results to host memory right away so device memory is not
            # held across the whole loop.
            feature_chunks.append(outputs.cpu().numpy())
            collected_ids.extend(batch_ids)
    stacked = np.vstack(feature_chunks)
    return stacked, collected_ids

# --- Train loader ---
df_train = pd.read_csv(train_csv)
train_dataset = ImageDataset(df=df_train, image_dir=train_dir, transform=image_transform)
# Batches of 32 in file order (no shuffle argument is passed).
train_loader = DataLoader(train_dataset, batch_size=32)

# Only the feature matrix is needed for fitting; the returned ids are dropped.
train_feats = extract_embeddings(train_loader)[0]

# --- Normalize & Train One-Class SVM ---
# Standardise every embedding dimension with statistics learned from the
# training embeddings; the fitted scaler is reused for the test set later.
scaler = StandardScaler().fit(train_feats)
train_feats_scaled = scaler.transform(train_feats)

# RBF one-class SVM fitted on the training embeddings only. nu=0.05 bounds the
# fraction of training points allowed to fall outside the learned region.
svm = OneClassSVM(nu=0.05, kernel='rbf', gamma='scale').fit(train_feats_scaled)

# --- Test loader ---
df_test = pd.read_csv(test_csv)
test_dataset = ImageDataset(df=df_test, image_dir=test_dir, transform=image_transform)
test_loader = DataLoader(test_dataset, batch_size=32)

# Embeddings and ids come back in the same (loader) order.
test_feats, test_ids = extract_embeddings(test_loader)

# Apply the scaler fitted on the training embeddings -- it is not refitted here.
test_feats_scaled = scaler.transform(test_feats)

# One prediction per test embedding, aligned with test_ids.
preds = svm.predict(test_feats_scaled)

# The SVM predicts +1 for inliers (treated as soil) and -1 for outliers (not
# soil); the submission encodes these as label 1 and 0 respectively.
results = [
    dict(image_id=img_id, label=int(pred == 1))
    for img_id, pred in zip(test_ids, preds)
]

# --- Save Submission ---
df_out = pd.DataFrame(results)
df_out.to_csv("submission.csv", index=False)
print("✅ submission.csv saved.")


model.safetensors:   0%|          | 0.00/343M [00:00<?, ?B/s]

Extracting embeddings: 100%|██████████| 39/39 [07:26<00:00, 11.44s/it]
Extracting embeddings: 100%|██████████| 31/31 [05:11<00:00, 10.06s/it]

✅ submission.csv saved.



