In [1]:
# Clone the PlantXViT repository into ./PlantXViT; the following cells run from inside it.
!git clone https://github.com/sakanaowo/PlantXViT

Cloning into 'PlantXViT'...
remote: Enumerating objects: 50280, done.[K
remote: Counting objects: 100% (9/9), done.[K
remote: Compressing objects: 100% (8/8), done.[K
remote: Total 50280 (delta 0), reused 5 (delta 0), pack-reused 50271 (from 1)[K
Receiving objects: 100% (50280/50280), 1.66 GiB | 61.52 MiB/s, done.
Resolving deltas: 100% (30385/30385), done.
Updating files: 100% (50019/50019), done.


## Preprocess the apple dataset


In [2]:
# Work from the repository root so relative paths such as configs/config.yaml resolve.
%cd PlantXViT

/content/PlantXViT


In [3]:
from utils.config_loader import load_config

# Load the project settings. Later cells index this object by key:
# config['dataset'], config['training'] and config['output'].
config = load_config('configs/config.yaml')

In [4]:
!pip install pandas



In [5]:
#import lib
import pandas as pd
import os
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import Dataset
from torchvision import transforms
import pickle

In [6]:
# Pull the apple-dataset settings out of the loaded config.
apple_config = config['dataset']['apple']
img_dir, csv_path, label_encoder_path = (
    apple_config[key] for key in ('data_dir', 'csv_path', 'label_encoder')
)
# Stored as a tuple; it is handed to transforms.Resize further down.
image_size = tuple(apple_config['image_size'])

In [9]:
# Read the annotation CSV and add a 'label' column: for every row, the name of
# the class column (healthy / multiple_diseases / rust / scab) holding the largest value.
df = pd.read_csv(csv_path).assign(
    label=lambda frame: frame[['healthy', 'multiple_diseases', 'rust', 'scab']].idxmax(axis=1)
)

In [10]:
# Encode class names as integer ids.
# The encoder is always fit on the class names recomputed from the class columns,
# never on df['label'] itself: this cell overwrites df['label'] with integers, so
# fitting on that column would make a second run of the cell learn the integers
# (0, 1, 2, ...) as "classes" and silently lose the name <-> id mapping.
label_encoder = LabelEncoder()
class_names = df[['healthy', 'multiple_diseases', 'rust', 'scab']].idxmax(axis=1)
df['label'] = label_encoder.fit_transform(class_names)


In [12]:
# Persist the fitted label encoder so the class-name <-> id mapping can be reloaded later.
encoder_dir = os.path.dirname(label_encoder_path)
# os.path.dirname returns '' for a bare filename and os.makedirs('') raises,
# so only create the directory when the path actually has one.
if encoder_dir:
    os.makedirs(encoder_dir, exist_ok=True)
with open(label_encoder_path, 'wb') as f:
    pickle.dump(label_encoder, f)

In [13]:
# Show the path used for the label-encoder pickle.
print(label_encoder_path)

./data/processed/apple_label_encoder.pkl


In [18]:
# Split into train/validation (80/20), stratified on the label so both splits
# keep the class proportions; random_state makes the split reproducible.
train_df, val_df = train_test_split(df, test_size=0.2,
                                    stratify=df['label'],
                                    random_state=42)

# Work on independent copies: adding columns to the frames returned by
# train_test_split can otherwise trigger pandas' SettingWithCopyWarning.
train_df = train_df.copy()
val_df = val_df.copy()

# Derive label_idx with a dedicated encoder. Re-binding `label_encoder` here would
# replace the encoder that was fit on the class names (and pickled above) with one
# fit on already-encoded integers, losing the name <-> id mapping in memory.
index_encoder = LabelEncoder()
train_df['label_idx'] = index_encoder.fit_transform(train_df['label'])
val_df['label_idx'] = index_encoder.transform(val_df['label'])

## Image transforms and dataset


In [39]:
# NOTE(review): image_dir is assigned here, but the cell that builds the datasets
# passes img_dir (read from the config) instead, so this value is not used there.
image_dir = "./data/raw/plant-pathology-2020-fgvc7/images"
# Reload the train/validation splits from the CSV files that the to_csv cell
# writes to these same paths (that cell must have been run first).
train_df = pd.read_csv("./data/processed/apple_train.csv")
val_df = pd.read_csv("./data/processed/apple_val.csv")

In [40]:
# Preprocessing applied to every image: resize to image_size, convert to a
# tensor, then normalise each channel with the ImageNet mean / std values.
_channel_mean = [0.485, 0.456, 0.406]
_channel_std = [0.229, 0.224, 0.225]
_preprocessing_steps = [
    transforms.Resize(image_size),
    transforms.ToTensor(),
    transforms.Normalize(mean=_channel_mean, std=_channel_std),
]
transform = transforms.Compose(_preprocessing_steps)

class AppleDataset(Dataset):
    """Dataset that yields (image, label) pairs described by a DataFrame.

    Each row of ``dataframe`` must provide:
      * ``image_id``  - file name without extension; the image is read from
        ``<image_dir>/<image_id>.jpg``
      * ``label_idx`` - integer class id

    Args:
        dataframe: table with the columns listed above.
        image_dir: directory containing the ``.jpg`` files.
        transform: callable applied to the RGB PIL image; its result is
            returned as the first element of each item.
    """

    def __init__(self, dataframe, image_dir, transform):
        self.df = dataframe
        self.image_dir = image_dir
        self.transform = transform

    def __len__(self):
        """Number of rows (samples) in the underlying DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(transformed_image, label)`` for the row at position ``idx``.

        The label is always a 0-dim ``torch.long`` tensor.
        """
        row = self.df.iloc[idx]
        img_path = os.path.join(self.image_dir, row['image_id'] + ".jpg")
        # Open inside a context manager so the file handle is released as soon
        # as the pixel data has been converted into an independent RGB image.
        with Image.open(img_path) as img:
            image = img.convert("RGB")
        image = self.transform(image)
        # Force an integer class id: without an explicit dtype the tensor type is
        # inferred from the value, so a label column read as float would produce a
        # float tensor, which is not a valid class-index target for the loss.
        label = torch.tensor(int(row['label_idx']), dtype=torch.long)
        return image, label


In [41]:
#Data loader
from torch.utils.data import DataLoader

# Wrap each split in an AppleDataset and then in a DataLoader.
# Only the training loader shuffles; validation keeps the DataFrame order.
batch_size = config["training"]["batch_size"]

train_dataset, val_dataset = (
    AppleDataset(split, img_dir, transform) for split in (train_df, val_df)
)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)


In [19]:
# Write both splits to ./data/processed, keeping only the id and label columns.
os.makedirs("./data/processed", exist_ok=True)

export_columns = ['image_id', 'label', 'label_idx']
for split_frame, csv_file in (
    (train_df, "./data/processed/apple_train.csv"),
    (val_df, "./data/processed/apple_val.csv"),
):
    split_frame[export_columns].to_csv(csv_file, index=False)

## Build the model


In [34]:
import torch
import torch.nn as nn
import torchvision.models as models
from torchvision.models import VGG16_Weights

In [21]:

# Inception-style block: three parallel branches whose outputs are concatenated
class InceptionBlock(nn.Module):
    """Three parallel convolution branches concatenated along the channel axis.

    Every branch emits 128 channels and preserves the spatial size, so an input
    of shape (B, in_channels, H, W) produces an output of shape (B, 384, H, W).

    Branches:
      * ``branch1x1``   - 1x1 convolution
      * ``branch3x3``   - 1x3 convolution followed by a 3x1 convolution
      * ``branch_pool`` - 3x3 max-pool (stride 1) followed by a 1x1 convolution
    """

    def __init__(self, in_channels):
        super().__init__()
        width = 128  # output channels of every branch

        self.branch1x1 = nn.Conv2d(in_channels, width, kernel_size=1)

        factorised_3x3 = [
            nn.Conv2d(in_channels, width, kernel_size=(1, 3), padding=(0, 1)),
            nn.Conv2d(width, width, kernel_size=(3, 1), padding=(1, 0)),
        ]
        self.branch3x3 = nn.Sequential(*factorised_3x3)

        pool_then_project = [
            nn.MaxPool2d(kernel_size=3, stride=1, padding=1),
            nn.Conv2d(in_channels, width, kernel_size=1),
        ]
        self.branch_pool = nn.Sequential(*pool_then_project)

    def forward(self, x):
        """Run all branches on ``x`` and stack their outputs channel-wise."""
        branches = (self.branch1x1, self.branch3x3, self.branch_pool)
        return torch.cat([branch(x) for branch in branches], dim=1)



In [22]:

# Patch embedding: cut the feature map into square patches, then project each one linearly
class PatchEmbedding(nn.Module):
    """Split a feature map into non-overlapping patches and embed each patch.

    Args:
        in_channels: number of channels C of the incoming feature map.
        patch_size: side length p of every square patch (also the stride).
        emb_size: size of the embedding produced for each patch.

    Input:  (B, C, H, W)
    Output: (B, num_patches, emb_size), patches ordered row by row.
    Rows/columns that do not fill a whole patch are dropped by ``unfold``.
    """

    def __init__(self, in_channels, patch_size=5, emb_size=16):
        super().__init__()
        self.patch_size = patch_size
        self.emb_size = emb_size
        flat_patch_dim = in_channels * patch_size * patch_size
        self.proj = nn.Linear(flat_patch_dim, emb_size)

    def forward(self, x):
        batch, channels, _, _ = x.shape
        p = self.patch_size
        # (B, C, H, W) -> (B, C, nH, nW, p, p): tile height first, then width
        patches = x.unfold(2, p, p).unfold(3, p, p)
        # bring the patch grid forward -> (B, nH, nW, C, p, p)
        patches = patches.permute(0, 2, 3, 1, 4, 5)
        # flatten the grid into a sequence and every patch into one vector
        tokens = patches.reshape(batch, -1, channels * p * p)
        return self.proj(tokens)  # shape: (B, num_patches, emb_size)


In [23]:


# -------- Transformer Encoder Block (ViT block) --------
class TransformerBlock(nn.Module):
    """Pre-norm transformer encoder block: self-attention + MLP, each with a residual.

    Args:
        emb_size: embedding size of every token (must be divisible by the 2 heads).
        dropout: dropout probability applied at the end of the MLP.

    Input / output shape: (B, num_tokens, emb_size).
    """

    def __init__(self, emb_size=16, dropout=0.1):
        super().__init__()
        self.norm1 = nn.LayerNorm(emb_size)
        self.attn = nn.MultiheadAttention(emb_size, num_heads=2, batch_first=True)
        self.norm2 = nn.LayerNorm(emb_size)
        self.mlp = nn.Sequential(
            nn.Linear(emb_size, emb_size * 2),
            nn.GELU(),
            nn.Linear(emb_size * 2, emb_size),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        # Normalise once and reuse the result as query, key and value. Calling
        # norm1 three times tripled the LayerNorm cost and handed attention three
        # distinct tensors, which prevents it from recognising self-attention.
        normed = self.norm1(x)
        # The attention weights are never used, so do not compute/return them.
        x_attn, _ = self.attn(normed, normed, normed, need_weights=False)
        x = x + x_attn                      # residual around attention
        x = x + self.mlp(self.norm2(x))     # residual around the MLP
        return x


In [24]:


# -------- PlantXViT Model --------
class PlantXViT(nn.Module):
    """CNN + transformer classifier.

    Pipeline: first two VGG16 convolution blocks -> InceptionBlock ->
    PatchEmbedding -> ``num_blocks`` TransformerBlocks -> LayerNorm ->
    mean over tokens -> linear classifier.

    Args:
        num_classes: number of output classes (size of the logits vector).
        patch_size: side length of the square patches cut from the feature map.
        emb_size: embedding size of each patch token.
        num_blocks: number of stacked TransformerBlocks.
        dropout: dropout probability passed to every TransformerBlock.

    The shape comments below assume a (B, 3, 224, 224) input.
    """

    def __init__(self, num_classes=4, patch_size=5, emb_size=16, num_blocks=4, dropout=0.1):
        super().__init__()

        # VGG16 (2 blocks)
        vgg = models.vgg16(weights=VGG16_Weights.DEFAULT)
        # The convolutional layers live in `vgg.features`; the VGG module itself is
        # not subscriptable, so `vgg[:10]` raises TypeError. features[:10] covers the
        # first two conv blocks including their max-pools.
        self.vgg_block = nn.Sequential(*vgg.features[:10])  # output: (B, 128, 56, 56)

        # Inception-like block → (B, 384, 56, 56)
        self.inception = InceptionBlock(in_channels=128)

        # Patch Embedding → (B, 121, 16): 56 // 5 = 11 patches per side
        self.patch_embed = PatchEmbedding(in_channels=384, patch_size=patch_size, emb_size=emb_size)

        # Transformer blocks
        self.transformer = nn.Sequential(*[TransformerBlock(emb_size, dropout) for _ in range(num_blocks)])

        # Classification head
        self.norm = nn.LayerNorm(emb_size)
        self.global_pool = nn.AdaptiveAvgPool1d(1)  # (B, emb_size, 1)
        self.classifier = nn.Linear(emb_size, num_classes)

    def forward(self, x):
        """Return class logits of shape (B, num_classes) for an image batch ``x``."""
        x = self.vgg_block(x)  # (B, 128, 56, 56)
        x = self.inception(x)  # (B, 384, 56, 56)
        x = self.patch_embed(x)  # (B, 121, 16)
        x = self.transformer(x)  # (B, 121, 16)
        x = self.norm(x)  # (B, 121, 16)
        x = x.permute(0, 2, 1)  # (B, 16, 121) - pooling works on the last axis
        x = self.global_pool(x).squeeze(-1)  # (B, 16) - mean over the tokens
        return self.classifier(x)  # (B, num_classes)


In [25]:
# Smoke test: push one random 224x224 RGB image through a freshly built model
# and check the shape of the logits.
model_kwargs = dict(num_classes=4, patch_size=5, emb_size=16, num_blocks=4, dropout=0.1)
model = PlantXViT(**model_kwargs)

dummy_input = torch.randn(1, 3, 224, 224)
output = model(dummy_input)
print("Output shape:", output.shape)  # expected: torch.Size([1, 4])


Downloading: "https://download.pytorch.org/models/vgg16-397923af.pth" to /root/.cache/torch/hub/checkpoints/vgg16-397923af.pth
100%|██████████| 528M/528M [00:02<00:00, 209MB/s]


Output shape: torch.Size([1, 4])


## Train the model

In [26]:
def train_one_epoch(model, dataloader, criterion, optimizer, device):
    """Run one optimisation pass over ``dataloader``.

    Args:
        model: network to train; switched to training mode.
        dataloader: iterable of ``(images, labels)`` batches.
        criterion: loss called as ``criterion(outputs, labels)``.
        optimizer: optimizer stepped once per batch.
        device: device every batch is moved to.

    Returns:
        ``(epoch_loss, accuracy)`` - the sample-weighted mean loss and the
        fraction of correct predictions over all samples seen.
    """
    model.train()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    for batch_images, batch_labels in dataloader:
        batch_images = batch_images.to(device)
        batch_labels = batch_labels.to(device)

        logits = model(batch_images)
        batch_loss = criterion(logits, batch_labels)

        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        # the loss is a per-batch mean, so weight it by the batch size
        loss_sum += batch_loss.item() * batch_images.size(0)
        predictions = logits.argmax(dim=1)
        n_correct += (predictions == batch_labels).sum().item()
        n_seen += batch_labels.size(0)

    return loss_sum / n_seen, n_correct / n_seen


In [27]:
def evaluate(model, dataloader, criterion, device):
    """Measure loss and accuracy over ``dataloader`` without updating the model.

    Args:
        model: network to evaluate; switched to evaluation mode.
        dataloader: iterable of ``(images, labels)`` batches.
        criterion: loss called as ``criterion(outputs, labels)``.
        device: device every batch is moved to.

    Returns:
        ``(epoch_loss, accuracy)`` - the sample-weighted mean loss and the
        fraction of correct predictions over all samples seen.
    """
    model.eval()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    # gradients are not needed for evaluation
    with torch.no_grad():
        for batch_images, batch_labels in dataloader:
            batch_images = batch_images.to(device)
            batch_labels = batch_labels.to(device)

            logits = model(batch_images)
            batch_loss = criterion(logits, batch_labels)

            # the loss is a per-batch mean, so weight it by the batch size
            loss_sum += batch_loss.item() * batch_images.size(0)
            predictions = logits.argmax(dim=1)
            n_correct += (predictions == batch_labels).sum().item()
            n_seen += batch_labels.size(0)

    return loss_sum / n_seen, n_correct / n_seen


In [30]:
# Show the model path stored in the config.
# NOTE(review): the recorded output ends in .h5, while train_model is called with
# a hard-coded .pth save_path, so this config value is not used by the training cells.
print(config['output']['model_path'])

./outputs/models/plantxvit_best.h5


In [31]:
import os
import torch

def train_model(
    model, train_loader, val_loader,
    criterion, optimizer,
    num_epochs, device,
    save_path="./outputs/models/plantxvit_best.pth"
):
    """Train ``model`` for ``num_epochs`` epochs and checkpoint the best weights.

    After every epoch the model is evaluated on ``val_loader``; whenever the
    validation loss improves on the best value so far, ``model.state_dict()`` is
    written to ``save_path`` (the parent directory is created if needed).

    Args:
        model: network to train.
        train_loader: batches used by ``train_one_epoch``.
        val_loader: batches used by ``evaluate``.
        criterion: loss function.
        optimizer: optimizer updating ``model``.
        num_epochs: number of epochs to run.
        device: device the batches are moved to.
        save_path: file that receives the best ``state_dict``.

    Returns:
        dict with the per-epoch lists ``train_loss``, ``val_loss``,
        ``train_acc`` and ``val_acc``.
    """
    best_val_loss = float('inf')
    history = {"train_loss": [], "val_loss": [], "train_acc": [], "val_acc": []}

    # os.path.dirname returns '' for a bare filename such as "best.pth", and
    # os.makedirs('') raises FileNotFoundError - so remember whether there is a
    # directory to create at all.
    save_dir = os.path.dirname(save_path)

    for epoch in range(1, num_epochs + 1):
        train_loss, train_acc = train_one_epoch(model, train_loader, criterion, optimizer, device)
        val_loss, val_acc = evaluate(model, val_loader, criterion, device)

        print(f"Epoch [{epoch}/{num_epochs}] "
              f"Train Loss: {train_loss:.4f}, Acc: {train_acc:.4f} | "
              f"Val Loss: {val_loss:.4f}, Acc: {val_acc:.4f}")

        history["train_loss"].append(train_loss)
        history["val_loss"].append(val_loss)
        history["train_acc"].append(train_acc)
        history["val_acc"].append(val_acc)

        # Save best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            if save_dir:
                os.makedirs(save_dir, exist_ok=True)
            torch.save(model.state_dict(), save_path)
            print("✅ Saved best model.")

    return history


In [32]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device.type)

cuda


In [42]:
# from model import PlantXViT  # or copy the classes directly from the cells above
import torch
import torch.nn as nn

# Keep the CPU fallback: hard-coding torch.device("cuda") makes `.to(device)`
# fail on machines without a GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Fresh model, loss and optimizer for the training run.
model = PlantXViT(num_classes=4).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)




In [43]:
# Train for 50 epochs. train_model writes the weights with the lowest validation
# loss to save_path and returns the per-epoch loss/accuracy history.
history = train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=50,
    device=device,
    save_path="./outputs/models/plantxvit_best.pth"
)


Epoch [1/50] Train Loss: 1.2122, Acc: 0.4093 | Val Loss: 1.1135, Acc: 0.5260
✅ Saved best model.
Epoch [2/50] Train Loss: 1.0404, Acc: 0.5714 | Val Loss: 1.0010, Acc: 0.6219
✅ Saved best model.
Epoch [3/50] Train Loss: 0.9143, Acc: 0.6641 | Val Loss: 0.9283, Acc: 0.6466
✅ Saved best model.
Epoch [4/50] Train Loss: 0.8451, Acc: 0.6923 | Val Loss: 0.8129, Acc: 0.7315
✅ Saved best model.
Epoch [5/50] Train Loss: 0.7735, Acc: 0.7424 | Val Loss: 0.7609, Acc: 0.7096
✅ Saved best model.
Epoch [6/50] Train Loss: 0.7011, Acc: 0.7589 | Val Loss: 0.6491, Acc: 0.8082
✅ Saved best model.
Epoch [7/50] Train Loss: 0.5787, Acc: 0.8187 | Val Loss: 0.5140, Acc: 0.8630
✅ Saved best model.
Epoch [8/50] Train Loss: 0.4586, Acc: 0.8791 | Val Loss: 0.4547, Acc: 0.8822
✅ Saved best model.
Epoch [9/50] Train Loss: 0.3316, Acc: 0.9210 | Val Loss: 0.3782, Acc: 0.9096
✅ Saved best model.
Epoch [10/50] Train Loss: 0.2677, Acc: 0.9354 | Val Loss: 0.3527, Acc: 0.9068
✅ Saved best model.
Epoch [11/50] Train Loss: 0.2

KeyboardInterrupt: 