# Match Plants: Same or Different — Ablation: no pretrain — Group split by image ID

This notebook trains a model that looks at two plant photos and decides if they show the **same plant** or **different plants**.

You will get:
- A trained model.
- A CSV file `yourname_results.csv` with predictions for the test pairs.

No coding experience is required to run it: just run the cells in order.

In [1]:
# Basic tools and settings
import os
import numpy as np
import pandas as pd
import torch
import torchvision
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Repeatable results: seed both NumPy and PyTorch random number generators
np.random.seed(1)
torch.manual_seed(1)

# Pick the compute device: Apple GPU (MPS) first, then NVIDIA GPU (CUDA),
# otherwise fall back to CPU.
# getattr guards PyTorch builds that do not ship the torch.backends.mps module.
_mps_backend = getattr(torch.backends, "mps", None)
if (
    _mps_backend is not None
    and _mps_backend.is_available()
    and _mps_backend.is_built()
):
    DEVICE = "mps"
elif torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# File locations
# Set the MATCH_PLANTS_DATA_DIR environment variable to point at your own copy
# of the data; when it is not set, the original location is used.
DATA_DIR = os.environ.get(
    "MATCH_PLANTS_DATA_DIR", "/Users/shirong/match_plants/data"
)
IMG_DIR = os.path.join(DATA_DIR, "data")
TRAIN_CSV = os.path.join(DATA_DIR, "train_data.csv")
TEST_CSV = os.path.join(DATA_DIR, "test_data.csv")

## Step 1: Load the pair lists

`train_data.csv` includes the correct answer (`class`) for each pair.
`test_data.csv` does **not** include answers, so we predict them.

In [2]:
# Read the two pair lists from disk
train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

# Quick sanity report: device in use, table sizes, and label balance
print("device", DEVICE)
for split_name, frame in (("train", train_df), ("test", test_df)):
    print(split_name, frame.shape)
print(train_df["class"].value_counts())

# Last expression of the cell: show the first training rows
train_df.head()

device mps
train (2400, 4)
test (600, 3)
class
0    1601
1     799
Name: count, dtype: int64


Unnamed: 0,Pair_Num,img_idx1,img_idx2,class
0,372,182,684,0
1,71,477,990,0
2,2561,769,240,1
3,1104,906,36,0
4,2149,123,344,1


## Step 2: Define how to read image pairs

This `Dataset` tells PyTorch how to load two images and (for training) the label that says whether they match.

In [3]:
class PairDataset(Dataset):
    """Dataset of image pairs described by the rows of a DataFrame.

    Each row must provide ``img_idx1`` and ``img_idx2`` (image IDs that map to
    ``<IMG_DIR>/<id>.jpg``) and, unless ``is_test`` is true, a ``class`` label.

    Args:
        df: DataFrame listing the pairs; its index is reset so that positional
            access lines up with ``__getitem__`` indices.
        transform: Callable applied to each RGB PIL image.
        is_test: When true, items carry no label.
    """

    def __init__(self, df, transform, is_test=False):
        self.df = df.reset_index(drop=True)
        self.transform = transform
        self.is_test = is_test

    def __len__(self):
        """Return the number of pairs."""
        return len(self.df)

    @staticmethod
    def _image_filename(img_id):
        """Return the ``<id>.jpg`` file name for an image ID.

        ``DataFrame.iloc[i]`` upcasts a row to float when the frame mixes int
        and float columns, which would turn ID ``182`` into ``182.0`` and yield
        a non-existent ``182.0.jpg``. Integral floats are therefore converted
        back to ``int``; every other value is formatted unchanged.
        """
        if isinstance(img_id, float) and img_id.is_integer():
            img_id = int(img_id)
        return f"{img_id}.jpg"

    @classmethod
    def _load_rgb(cls, img_id):
        """Load the image with the given ID from ``IMG_DIR`` as an RGB image."""
        path = os.path.join(IMG_DIR, cls._image_filename(img_id))
        # The context manager releases the file handle explicitly; convert()
        # returns a new, fully loaded image that stays valid afterwards.
        with Image.open(path) as img:
            return img.convert("RGB")

    def __getitem__(self, idx):
        """Return ``(x1, x2)`` for test data, else ``(x1, x2, label)``.

        ``x1`` and ``x2`` are the transformed images of the pair; ``label`` is
        the row's ``class`` value as a float32 tensor.
        """
        row = self.df.iloc[idx]
        x1 = self.transform(self._load_rgb(row["img_idx1"]))
        x2 = self.transform(self._load_rgb(row["img_idx2"]))
        if self.is_test:
            return x1, x2
        # For training, also return the label
        return x1, x2, torch.tensor(row["class"], dtype=torch.float32)

## Step 3: Image preprocessing

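We resize and normalize images (using the standard ImageNet mean and std) so the model receives consistently scaled inputs. Note that in this ablation the backbone is **not** pretrained.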
For training we also use small random changes (augmentation) to improve generalization.

In [4]:
# Per-channel normalization statistics (the standard ImageNet values)
mean = (0.485, 0.456, 0.406)
std = (0.229, 0.224, 0.225)

# Training pipeline: random crop/flip/color changes, then tensor + normalize
train_tf = transforms.Compose(
    [
        transforms.RandomResizedCrop(size=224, scale=(0.6, 1.0)),
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.ColorJitter(
            brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1
        ),
        transforms.ToTensor(),
        transforms.Normalize(mean=mean, std=std),
    ]
)

# Validation/test pipeline: deterministic resize and center crop, no augmentation
val_tf = transforms.Compose(
    [
        transforms.Resize(size=256),
        transforms.CenterCrop(size=224),
        transforms.ToTensor(),
        transforms.Normalize(mean=mean, std=std),
    ]
)

## Step 4: The model (Siamese ResNet18)

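Each image is passed through the same ResNet18 backbone to get a feature vector. In this ablation the backbone starts from random weights (`weights=None`) instead of pretrained ones.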
We then compare the two vectors and predict if they are the same plant.

In [5]:
class SiameseNet(nn.Module):
    """Siamese ResNet18 that scores whether two images match.

    Both images go through one shared ResNet18 backbone (its classification
    layer replaced by an identity). The two feature vectors are combined as
    ``[|f1 - f2|, f1 * f2]`` and a small MLP head maps that to a single logit.

    Args:
        weights: Forwarded to ``torchvision.models.resnet18``. The default
            ``None`` keeps this notebook's "no pretrain" ablation (randomly
            initialised backbone); pass torchvision weights to start from a
            pretrained backbone instead.
    """

    def __init__(self, weights=None):
        super().__init__()
        # Shared backbone that turns an image into a feature vector
        # (NOT pretrained unless `weights` is given).
        backbone = torchvision.models.resnet18(weights=weights)
        # Read the feature size from the model instead of hard-coding 512.
        feat_dim = backbone.fc.in_features
        backbone.fc = nn.Identity()
        self.backbone = backbone

        # Small head that compares two image vectors
        self.head = nn.Sequential(
            nn.Linear(feat_dim * 2, 256),
            nn.ReLU(inplace=True),
            nn.Dropout(0.2),
            nn.Linear(256, 1),
        )

    def forward(self, x1, x2):
        """Return one logit per pair (turned into a probability with sigmoid).

        Args:
            x1: Batch of first images.
            x2: Batch of second images, aligned with ``x1``.
        """
        f1 = self.backbone(x1)
        f2 = self.backbone(x2)
        # Compare the two features: absolute difference and element-wise product
        feat = torch.cat([torch.abs(f1 - f2), f1 * f2], dim=1)
        # The head outputs (batch, 1); drop the trailing dimension
        return self.head(feat).squeeze(1)

## Step 5: Train and pick the best threshold

We split the training pairs into a train and validation set.
During validation we search for the best probability threshold `best_t` that maximizes F1.

In [6]:
# Split by image ID rather than by pair, so that no image is shared between
# the training pairs and the validation pairs (avoids leakage).
all_ids = pd.Index(sorted(set(train_df["img_idx1"]) | set(train_df["img_idx2"])))
train_ids, val_ids = train_test_split(all_ids, test_size=0.2, random_state=42)
train_ids = set(train_ids)
val_ids = set(val_ids)

# A pair is kept only when BOTH of its images fall on the same side of the
# split; pairs that straddle the two ID sets are dropped.
first_img = train_df["img_idx1"]
second_img = train_df["img_idx2"]
train_pairs = train_df.loc[first_img.isin(train_ids) & second_img.isin(train_ids)]
val_pairs = train_df.loc[first_img.isin(val_ids) & second_img.isin(val_ids)]

print("train pairs", train_pairs.shape, "val pairs", val_pairs.shape)


epoch 1 loss 0.9447 val_f1 0.5086 best_t 0.49


epoch 2 loss 0.9277 val_f1 0.5191 best_t 0.44


epoch 3 loss 0.9180 val_f1 0.5477 best_t 0.40


epoch 4 loss 0.9064 val_f1 0.5632 best_t 0.37


epoch 5 loss 0.8900 val_f1 0.5286 best_t 0.45
best overall 0.5632183908045977 0.37


## Step 6: Train a final model on all training data

Now we train once more using **all** training pairs.
We reuse `best_t` from the validation step.

In [7]:
# Dataset and shuffled loader over every labelled pair, with training augmentation
full_ds = PairDataset(train_df, train_tf)
full_loader = DataLoader(full_ds, batch_size=16, shuffle=True, num_workers=0)

final_model = SiameseNet().to(DEVICE)

# Weight the positive class by the negative/positive ratio of the labels
pos = float(train_df["class"].sum())
neg = float(len(train_df) - pos)
pos_weight = torch.tensor([neg / pos], dtype=torch.float32, device=DEVICE)

criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
optimizer = torch.optim.AdamW(
    final_model.parameters(), lr=1e-4, weight_decay=1e-4
)

# Five passes over the full training set
for epoch in range(1, 6):
    final_model.train()
    total_loss = 0.0
    for batch in full_loader:
        # Move the two image batches and the labels to the compute device
        x1, x2, y = (tensor.to(DEVICE) for tensor in batch)
        optimizer.zero_grad()
        loss = criterion(final_model(x1, x2), y)
        loss.backward()
        optimizer.step()
        # loss is a batch mean; scale by batch size to get a per-sample sum
        total_loss += loss.item() * y.size(0)
    print(f"final epoch {epoch} loss {total_loss/len(full_ds):.4f}")

final epoch 1 loss 0.9316


final epoch 2 loss 0.9258


final epoch 3 loss 0.8967


final epoch 4 loss 0.8837


final epoch 5 loss 0.8432


## Step 7: Predict for the test pairs

We apply the trained model to the test pairs and write the predictions to a CSV file (`shirong_results.csv` in the code below; change `out_path` to use your own name).

In [8]:
# Score every test pair with the final model and save the submission file
final_model.eval()

# Deterministic preprocessing and fixed order so rows line up with test_df
test_ds = PairDataset(test_df, val_tf, is_test=True)
test_loader = DataLoader(test_ds, batch_size=32, shuffle=False, num_workers=0)

all_probs = []
with torch.no_grad():
    for batch in test_loader:
        x1, x2 = (tensor.to(DEVICE) for tensor in batch)
        batch_probs = torch.sigmoid(final_model(x1, x2))
        all_probs.append(batch_probs.cpu().numpy().astype(np.float32))

all_probs = np.concatenate(all_probs)

# A pair is predicted "same plant" (1) when its probability reaches best_t
preds = (all_probs >= best_t).astype(int)

out = pd.DataFrame(
    {
        "Pair_Num": test_df["Pair_Num"],
        "Predicted_Result": preds,
    }
)

out_path = "shirong_results.csv"
out.to_csv(out_path, index=False)

# Last expression of the cell: preview the first predictions
out.head()

Unnamed: 0,Pair_Num,Predicted_Result
0,2458,1
1,2172,1
2,2658,0
3,1278,0
4,894,1


## Notes and tips

- `val_f1` is the F1 score on the validation split. Higher is better.
- `best_t` is the best probability threshold found for F1.
- The test file has no labels, so the output is a prediction only.
- The code writes `shirong_results.csv`; change `out_path` (or rename the file) so it matches the required `yourname_results.csv` naming rule.