In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import timm
import albumentations as albu
from albumentations.pytorch import ToTensorV2

from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torch.optim import AdamW, Adam
from torch.optim.lr_scheduler import StepLR

from sklearn.metrics import f1_score
from tqdm import tqdm
import copy

import os
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import models
import albumentations as albu
from albumentations.pytorch import ToTensorV2

from PIL import Image
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

In [None]:
class CustomDataset(Dataset):
    """
    Image dataset driven by a DataFrame of file names and labels.

    Args:
        df (pd.DataFrame): Must expose 'file_name' and 'label' columns; both
            are read once, in the constructor.
        data_dir (str): Directory that every 'file_name' entry is joined onto.
        transform (callable, optional): Invoked as ``transform(image=array)``;
            the ``'image'`` entry of its result replaces the raw array.
            Skipped when None.

    Returns (per item):
        (image, label): ``image`` is the transform output, or the raw RGB
        NumPy array when no transform is set; ``label`` is the entry of
        df['label'] at the same position.
    """
    def __init__(self, df, data_dir, transform=None):
        super().__init__()
        self.data_dir = data_dir
        self.transform = transform

        # Resolve every path and pull the label column up front so that
        # __getitem__ never has to touch the DataFrame.
        file_names = df['file_name'].values
        self.image_paths = [
            os.path.join(self.data_dir, name) for name in file_names
        ]
        self.labels = df['label'].values

    def __len__(self):
        # One sample per stored path.
        return len(self.image_paths)

    def __getitem__(self, idx):
        # Decode the file as 3-channel RGB and hand it over as a NumPy array.
        pixels = np.array(Image.open(self.image_paths[idx]).convert('RGB'))

        # Run the optional transform pipeline and keep its 'image' entry.
        if self.transform is None:
            sample = pixels
        else:
            sample = self.transform(image=pixels)['image']

        return sample, self.labels[idx]


In [None]:
BATCH_SIZE = 32
SEED = 42
IMG_SIZE = 224

# SEED is also passed to the splits below; seeding the NumPy and PyTorch
# global generators here makes a fresh-kernel run repeatable for any code
# that draws from them.
np.random.seed(SEED)
torch.manual_seed(SEED)

# ======================
# 3. Create DataFrame from PNG files
# ======================
# The original location remains the default; set HIEROGLYPH_DATA_DIR to run
# the notebook on another machine without editing this cell.
data_dir = os.environ.get(
    "HIEROGLYPH_DATA_DIR",
    "/Users/aayankhare/Desktop/D-en-ominators/datasets/archaeohack-starterpack/data/utf-pngs",
)

# Collect all PNG files; sorting keeps the row order (and so the split) stable.
image_files = sorted(f for f in os.listdir(data_dir) if f.endswith('.png'))
if not image_files:
    # Fail here with a readable message instead of deep inside the split.
    raise FileNotFoundError(f"No .png files found in {data_dir!r}")

# Extract Gardiner number (label) from filename (e.g., "A1.png" -> "A1")
labels = [os.path.splitext(f)[0] for f in image_files]

# Full, unsplit table. It gets its own name so that `train_df` only ever
# refers to the training split.
full_df = pd.DataFrame({
    'file_name': image_files,
    'label': labels
})

# Create label encoder to convert Gardiner numbers to numeric labels
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
full_df['label_encoded'] = label_encoder.fit_transform(full_df['label'])

# Get number of unique classes
num_classes = len(label_encoder.classes_)
print(f"Number of unique classes (Gardiner numbers): {num_classes}")
print(f"Total images: {len(full_df)}")

# Train/val/test split (70% / 15% / 15%).
# Note: Cannot use stratified split since each class has only 1 sample;
# a regular random split is used instead.
# NOTE(review): with a single image per class, any class that lands in the
# validation or test split is absent from training - confirm that this
# evaluation protocol is intended.
train_df, holdout_df = train_test_split(
    full_df,
    test_size=0.3,
    random_state=SEED
)

val_df, test_df = train_test_split(
    holdout_df,
    test_size=0.5,
    random_state=SEED
)

# Replace the string labels with their encoded values. `.assign` returns a
# new frame, which avoids writing into the slices handed back by the split.
train_df = train_df.assign(label=train_df['label_encoded'])
val_df = val_df.assign(label=val_df['label_encoded'])
test_df = test_df.assign(label=test_df['label_encoded'])

# ======================
# 4. Define Transforms
# ======================
def _eval_steps():
    """Return fresh resize -> normalize -> tensor steps shared by every split."""
    return [
        albu.Resize(IMG_SIZE, IMG_SIZE),
        # Per-channel mean/std (the values commonly used with
        # ImageNet-pretrained backbones).
        albu.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
        ToTensorV2(),
    ]


# Training adds a random horizontal flip ahead of the shared steps.
train_transform = albu.Compose([albu.HorizontalFlip(p=0.5), *_eval_steps()])

# Validation and test use the shared steps only (no augmentation).
val_transform = albu.Compose(_eval_steps())
test_transform = albu.Compose(_eval_steps())


# ======================
# 5. Create Datasets & Dataloaders
# ======================
# All three splits read their images from the same directory.
train_data_dir = data_dir


def _make_split(frame, transform, shuffle):
    """Build the (dataset, dataloader) pair for one split."""
    dataset = CustomDataset(
        df=frame,
        data_dir=train_data_dir,
        transform=transform,
    )
    loader = DataLoader(
        dataset,
        batch_size=BATCH_SIZE,
        shuffle=shuffle,
        num_workers=0,  # Set to 0 to avoid multiprocessing issues in Jupyter notebooks
    )
    return dataset, loader


# Only the training loader shuffles; validation and test keep DataFrame order.
train_dataset, train_dataloader = _make_split(train_df, train_transform, shuffle=True)
val_dataset, val_dataloader = _make_split(val_df, val_transform, shuffle=False)
test_dataset, test_dataloader = _make_split(test_df, test_transform, shuffle=False)

# ======================
# 6. Quick Sanity Check
# ======================
def _show_first_batch(title, loader):
    """Print `title`, then the image-tensor shape and labels of the first batch.

    Args:
        title (str): Heading printed before the batch details.
        loader: Iterable yielding (images, labels) batches.

    The batch is kept in local names so the module-level `labels` list built
    from the file names is not overwritten by a tensor.
    """
    print(title)
    first_batch = next(iter(loader), None)
    if first_batch is None:
        # Nothing to show; mirror the original loop, which printed nothing more.
        return
    batch_images, batch_labels = first_batch
    print(" Images shape:", batch_images.shape)
    print(" Labels:", batch_labels)


_show_first_batch("Train Batch:", train_dataloader)
_show_first_batch("\nValidation Batch:", val_dataloader)
# Was "\\Test Batch:", which printed a literal backslash instead of a blank line.
_show_first_batch("\nTest Batch:", test_dataloader)

In [None]:
import timm

class ViTClassifier(nn.Module):
    """
    Wrapper around a timm model whose `head` layer is swapped for a new
    linear classifier.

    Args:
        model_name (str): Architecture name handed to `timm.create_model`.
        num_classes (int): Output width of the replacement head.
        pretrained (bool): Forwarded to `timm.create_model`.

    The created model must expose a `head` attribute with `in_features`;
    that width is reused as the input size of the new head.
    """
    def __init__(self, model_name="vit_base_patch16_224", num_classes=2, pretrained=True):
        super().__init__()
        backbone = timm.create_model(model_name, pretrained=pretrained)
        # Keep the backbone's feature width, change only the number of outputs.
        backbone.head = nn.Linear(backbone.head.in_features, num_classes)
        self.model = backbone

    def forward(self, x):
        """Return the wrapped model's output for input batch `x`."""
        logits = self.model(x)
        return logits

# Architecture passed to timm; num_classes comes from the label encoder
# fitted in the data cell.
model_name = "vit_base_patch16_224"

# Use CUDA when PyTorch reports it as available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the classifier and move its parameters to the chosen device.
model = ViTClassifier(
    model_name=model_name,
    num_classes=num_classes,
    pretrained=True,
).to(device)

print("Model loaded on:", device)
print(f"Number of classes: {num_classes}")

In [None]:
# Cross-entropy between the model outputs and the integer class labels.
criterion = nn.CrossEntropyLoss()

# AdamW over every model parameter.
optimizer = optim.AdamW(
    params=model.parameters(),
    lr=1e-4,
    weight_decay=1e-4,
)

# Cosine annealing of the learning rate, stepped once per epoch by the
# training loop. NOTE(review): T_max=20 equals EPOCHS in the training cell;
# keep the two in sync if either changes.
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer=optimizer, T_max=20)

In [None]:
def train_one_epoch(model, loader, criterion, optimizer, device):
    """Run one optimisation pass over `loader` and report its metrics.

    Args:
        model: Module to train; switched to train mode here.
        loader: Iterable of (images, labels) batches.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: Optimizer whose `zero_grad`/`step` are called per batch.
        device: Device every batch is moved to before the forward pass.

    Returns:
        (epoch_loss, epoch_acc): Sample-weighted mean loss and the fraction
        of samples whose arg-max prediction equals the label.

    Raises:
        ValueError: If `loader` yields no samples (previously this surfaced
            as a ZeroDivisionError at the final division).
    """
    model.train()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    for images, labels in tqdm(loader, desc="Training", leave=False):
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Weight the batch loss by its size so a smaller final batch does
        # not skew the epoch average.
        batch_size = labels.size(0)
        loss_sum += loss.item() * batch_size
        n_correct += (outputs.argmax(dim=1) == labels).sum().item()
        n_seen += batch_size

    if n_seen == 0:
        raise ValueError("train_one_epoch: the loader yielded no samples")

    return loss_sum / n_seen, n_correct / n_seen

def validate_one_epoch(model, loader, criterion, device):
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for images, labels in tqdm(loader, desc="Validating", leave=False):
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)

            running_loss += loss.item() * images.size(0)
            _, preds = torch.max(outputs, 1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    epoch_loss = running_loss / total
    epoch_acc = correct / total
    return epoch_loss, epoch_acc

def test_one_epoch(model, loader, criterion, device):

    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for images, labels in tqdm(loader, desc="Testing", leave=False):
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)

            running_loss += loss.item() * images.size(0)
            _, preds = torch.max(outputs, 1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    epoch_loss = running_loss / total
    epoch_acc = correct / total
    return epoch_loss, epoch_acc

In [None]:
EPOCHS = 20
patience = 4   # epochs without a validation-accuracy improvement before stopping
wait = 0       # consecutive epochs without improvement so far

# Start below any reachable accuracy so the first epoch always writes a
# checkpoint. With 0.0 and the strict '>' below, a run whose validation
# accuracy never rose above zero saved nothing, and loading
# f"VIT-{best_val_acc:.4f}.pth" afterwards failed.
best_val_acc = float("-inf")
best_model_path = None  # path of the most recently saved best checkpoint


for epoch in range(EPOCHS):
    print(f"\nEpoch {epoch+1}/{EPOCHS}")

    train_loss, train_acc = train_one_epoch(model, train_dataloader, criterion, optimizer, device)

    val_loss, val_acc = validate_one_epoch(model, val_dataloader, criterion, device)

    # The learning-rate schedule advances once per epoch.
    scheduler.step()

    print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}")
    print(f"Val Loss:   {val_loss:.4f} | Val Acc:   {val_acc:.4f}")

    if val_acc > best_val_acc:
        best_val_acc = val_acc
        # The file name embeds the accuracy, so it can be rebuilt later from
        # best_val_acc alone.
        best_model_path = f"VIT-{val_acc:.4f}.pth"
        torch.save(model.state_dict(), best_model_path)
        print(" --> Best model saved!")
        wait = 0
    else:
        wait += 1
        print(f" --> No improvement. Patience counter: {wait}/{patience}")

        if wait >= patience:
            print("Early stopping triggered! Training stopped.")
            break

    # Periodic test-set readout for monitoring only; it does not influence
    # checkpointing or early stopping.
    if (epoch + 1) % 5 == 0:
        test_loss, test_acc = test_one_epoch(model, test_dataloader, criterion, device)
        print(f" [Test @ Epoch {epoch+1}] Loss: {test_loss:.4f} | Acc: {test_acc:.4f}")

In [None]:
# Nothing new is built here: test_df, test_dataset and test_dataloader all
# come from the data-preparation cell.

print("Test DataFrame head:\n", test_df.head())

print(f"\nNumber of test samples: {len(test_dataset)}")

# Peek at the first test batch only (skipped silently if the loader is empty).
first_batch = next(iter(test_dataloader), None)
if first_batch is not None:
    images, labels = first_batch
    print("Test Batch - Images shape:", images.shape)
    print("Test Batch - Labels:", labels[:10])  # Show first 10 labels

In [None]:
# Rebuild the checkpoint name from the best validation accuracy; the
# training loop saves its checkpoints under this same pattern.
best_model_path = f"VIT-{best_val_acc:.4f}.pth"
if not os.path.isfile(best_model_path):
    raise FileNotFoundError(
        f"Checkpoint {best_model_path!r} does not exist - was a best model saved during training?"
    )

# The file holds a plain state_dict, so restrict unpickling to tensors and
# basic containers rather than allowing arbitrary pickled objects.
state_dict = torch.load(best_model_path, map_location=device, weights_only=True)
model.load_state_dict(state_dict)

model.eval()
print(f"Loaded best model weights from: {best_model_path}")

# Predicted class index for every test image, in dataloader order.
all_predictions = []

with torch.no_grad():
    for images, _ in tqdm(test_dataloader, desc="Predicting on Test Set"):
        outputs = model(images.to(device))
        # The arg-max over the class dimension is the predicted label.
        all_predictions.extend(outputs.argmax(dim=1).cpu().numpy())


In [None]:
# Map the integer class ids back to Gardiner numbers with the fitted encoder.
predicted_gardiner_nums = label_encoder.inverse_transform(all_predictions)

# One row per test image; the predictions are attached positionally and the
# row index is inherited from test_df.
submission_df = test_df[["file_name"]].assign(
    predicted_gardiner_num=predicted_gardiner_nums,
    predicted_label=all_predictions,
)

submission_df

In [None]:
# @title label distribution

from matplotlib import pyplot as plt

# Histogram of the predicted class indices, capped at 50 bins.
ax = submission_df['predicted_label'].plot(
    kind='hist',
    bins=min(50, num_classes),
    title='Predicted Label Distribution',
)
# Hide the top and right frame lines.
ax.spines[['top', 'right']].set_visible(False)
ax.set_xlabel('Class Label')
ax.set_ylabel('Frequency')

In [None]:
# Write the prediction table to disk without the DataFrame index column.
output_csv = 'hieroglyph_predictions.csv'
submission_df.to_csv(output_csv, index=False)
print(f"Predictions saved to {output_csv}")