In [1]:
!wget http://www.ee.surrey.ac.uk/CVSSP/demos/chars74k/EnglishImg.tgz
!wget http://www.ee.surrey.ac.uk/CVSSP/demos/chars74k/EnglishHnd.tgz
!wget http://www.ee.surrey.ac.uk/CVSSP/demos/chars74k/EnglishFnt.tgz

!tar -xzf EnglishImg.tgz
!tar -xzf EnglishHnd.tgz
!tar -xzf EnglishFnt.tgz


!pip uninstall -y torch torchvision torchaudio clip-by-openai
!pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cu118
!pip install git+https://github.com/openai/CLIP.git idx2numpy scikit-learn

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import clip
from PIL import Image
import numpy as np
import pandas as pd
import idx2numpy
from google.colab import drive
import os

drive.mount('/content/drive')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

--2025-12-25 09:09:23--  http://www.ee.surrey.ac.uk/CVSSP/demos/chars74k/EnglishImg.tgz
Resolving www.ee.surrey.ac.uk (www.ee.surrey.ac.uk)... 131.227.80.48
Connecting to www.ee.surrey.ac.uk (www.ee.surrey.ac.uk)|131.227.80.48|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://info-ee.surrey.ac.uk/CVSSP/demos/chars74k/EnglishImg.tgz [following]
--2025-12-25 09:09:23--  https://info-ee.surrey.ac.uk/CVSSP/demos/chars74k/EnglishImg.tgz
Resolving info-ee.surrey.ac.uk (info-ee.surrey.ac.uk)... 51.132.210.128
Connecting to info-ee.surrey.ac.uk (info-ee.surrey.ac.uk)|51.132.210.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Cookie coming from info-ee.surrey.ac.uk attempted to set domain to rp-web-iee-01.azurewebsites.net
Cookie coming from info-ee.surrey.ac.uk attempted to set domain to rp-web-iee-01.azurewebsites.net
Length: 133975105 (128M) [application/octet-stream]
Saving to: ‘EnglishImg.tgz’


2025-12-25 09:09:31 (18.0

In [2]:
clip_model, clip_preprocess = clip.load("ViT-B/32", device=device)
clip_model.eval()

100%|████████████████████████████████████████| 338M/338M [00:01<00:00, 201MiB/s]


CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): Sequential(
        (0): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (1): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          

In [3]:
class Chars74K_CLIP(Dataset):
    def __init__(self, root, preprocess):
        self.samples = []
        self.preprocess = preprocess

        # 0–9 + A–Z  → Sample001 … Sample036
        for i in range(1, 37):
            label = i - 1
            folder = f"Sample{str(i).zfill(3)}"
            folder_path = os.path.join(root, folder)

            for fname in os.listdir(folder_path):
                if fname.lower().endswith((".png", ".jpg")):
                    self.samples.append(
                        (os.path.join(folder_path, fname), label)
                    )

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        path, label = self.samples[idx]
        img = Image.open(path).convert("RGB")
        img = self.preprocess(img)
        return img, torch.tensor(label, dtype=torch.long)


In [4]:
dataset = Chars74K_CLIP(
    root="English/Fnt",
    preprocess=clip_preprocess
)

In [5]:
from sklearn.model_selection import train_test_split

indices = list(range(len(dataset)))
train_idx, temp_idx = train_test_split(indices, test_size=0.2, random_state=42)
val_idx, test_idx = train_test_split(temp_idx, test_size=0.5, random_state=42)


train_ds = torch.utils.data.Subset(dataset, train_idx)
val_ds   = torch.utils.data.Subset(dataset, val_idx)
test_ds  = torch.utils.data.Subset(dataset, test_idx)


train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
val_loader   = DataLoader(val_ds, batch_size=64, shuffle=False)
test_loader  = DataLoader(test_ds, batch_size=64, shuffle=False)

In [6]:
class CLIPClassifier(nn.Module):
    def __init__(self, num_classes=36):
        super().__init__()
        self.clip_model = clip_model
        self.fc = nn.Linear(512, num_classes)

    def forward(self, x):
        with torch.no_grad():
            feats = self.clip_model.encode_image(x)
        feats = feats / feats.norm(dim=-1, keepdim=True)
        return self.fc(feats.float())

In [7]:
model = CLIPClassifier().to(device)
optimizer = torch.optim.Adam(model.fc.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

In [8]:
EPOCHS = 15

for epoch in range(1, EPOCHS + 1):
    # TRAIN
    model.train()
    train_correct, train_total, train_loss_sum = 0, 0, 0.0

    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)

        optimizer.zero_grad()
        logits = model(xb)
        loss = criterion(logits, yb)
        loss.backward()
        optimizer.step()

        train_loss_sum += loss.item() * xb.size(0)
        train_correct += (logits.argmax(1) == yb).sum().item()
        train_total += xb.size(0)

    train_loss = train_loss_sum / train_total
    train_acc = train_correct / train_total

    # VALIDATION
    model.eval()
    val_correct, val_total, val_loss_sum = 0, 0, 0.0

    with torch.no_grad():
        for xb, yb in val_loader:
            xb, yb = xb.to(device), yb.to(device)
            logits = model(xb)
            loss = criterion(logits, yb)

            val_loss_sum += loss.item() * xb.size(0)
            val_correct += (logits.argmax(1) == yb).sum().item()
            val_total += xb.size(0)

    val_loss = val_loss_sum / val_total
    val_acc = val_correct / val_total


    # EVALUATION
    model.eval()
    test_correct, test_total, test_loss_sum = 0, 0, 0.0

    with torch.no_grad():
        for xb, yb in test_loader:
            xb, yb = xb.to(device), yb.to(device)
            logits = model(xb)
            loss = criterion(logits, yb)

            test_loss_sum += loss.item() * xb.size(0)
            test_correct += (logits.argmax(1) == yb).sum().item()
            test_total += xb.size(0)

    test_loss = test_loss_sum / test_total
    test_acc = test_correct / test_total

    print(
    f"Epoch {epoch} | "
    f"Train accuracy = {train_acc*100:.2f}% | "
    f"Validation accuracy = {val_acc*100:.2f}% | "
    f"Test accuracy = {test_acc*100:.2f}%"
)


Epoch 1 | Train accuracy = 80.39% | Validation accuracy = 91.03% | Test accuracy = 91.31%
Epoch 2 | Train accuracy = 91.31% | Validation accuracy = 91.25% | Test accuracy = 91.85%
Epoch 3 | Train accuracy = 91.95% | Validation accuracy = 91.36% | Test accuracy = 91.91%
Epoch 4 | Train accuracy = 92.23% | Validation accuracy = 91.94% | Test accuracy = 92.24%
Epoch 5 | Train accuracy = 92.53% | Validation accuracy = 92.04% | Test accuracy = 92.24%
Epoch 6 | Train accuracy = 92.76% | Validation accuracy = 92.24% | Test accuracy = 92.70%
Epoch 7 | Train accuracy = 92.98% | Validation accuracy = 92.67% | Test accuracy = 92.81%
Epoch 8 | Train accuracy = 93.11% | Validation accuracy = 92.89% | Test accuracy = 92.97%
Epoch 9 | Train accuracy = 93.40% | Validation accuracy = 92.97% | Test accuracy = 93.14%
Epoch 10 | Train accuracy = 93.57% | Validation accuracy = 93.00% | Test accuracy = 93.25%
Epoch 11 | Train accuracy = 93.70% | Validation accuracy = 93.22% | Test accuracy = 93.41%
Epoch 12

In [9]:
MODEL_PATH = "/content/drive/MyDrive/AIProject/clip_chars74k_36cls.pth"
torch.save(model.state_dict(), MODEL_PATH)
print("Saved model to:", MODEL_PATH)

Saved model to: /content/drive/MyDrive/AIProject/clip_chars74k_36cls.pth
