In [2]:
pip install pandas

Collecting pandas
  Downloading pandas-2.3.1-cp313-cp313-win_amd64.whl.metadata (19 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.3.1-cp313-cp313-win_amd64.whl (11.0 MB)
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   -- ------------------------------------- 0.8/11.0 MB 3.2 MB/s eta 0:00:04
   --- ------------------------------------ 1.0/11.0 MB 2.5 MB/s eta 0:00:04
   ---- ----------------------------------- 1.3/11.0 MB 2.1 MB/s eta 0:00:05
   ------ --------------------------------- 1.8/11.0 MB 2.1 MB/s eta 0:00:05
   -------- ------------------------------- 2.4/11.0 MB 2.1 MB/s eta 0:00:05
   ----------- ---------------------------- 3.1


[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
# Environment & imports
import os, sys, json, random, math, time
from pathlib import Path
from collections import Counter, defaultdict

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from PIL import Image

try:
    import torchvision
    from torchvision import transforms
except Exception as e:
    print("torchvision not found; attempting to continue with PIL-only transforms")
    torchvision = None
    transforms = None

ASSETS = Path('Assets')
assert ASSETS.exists(), f"Assets folder not found at {ASSETS.resolve()}"

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)


Using device: cpu


In [2]:
# Discover datasets
from typing import List, Tuple

IMG_EXTS = {'.png', '.jpg', '.jpeg', '.bmp', '.gif'}

def list_images_in_dir_of_dirs(root: Path) -> List[Tuple[Path, str]]:
    samples = []
    if not root.exists():
        return samples
    for class_dir in sorted([p for p in root.iterdir() if p.is_dir()]):
        label = class_dir.name
        for p in class_dir.rglob('*'):
            if p.suffix.lower() in IMG_EXTS and p.is_file():
                samples.append((p, label))
    return samples

# 1) augmented_images
aug_dir = ASSETS / 'augmented_images'
aug_samples = list_images_in_dir_of_dirs(aug_dir)
print(f"augmented_images: {len(aug_samples)} samples")

# 2) handwritten-english-characters-and-digits/train and test
hed_root = ASSETS / 'handwritten-english-characters-and-digits'
hed_train = list_images_in_dir_of_dirs(hed_root / 'train')
hed_test = list_images_in_dir_of_dirs(hed_root / 'test')
print(f"HED train: {len(hed_train)} samples, HED test: {len(hed_test)} samples")

# 3) image_labels.csv: filename,label
csv_path = ASSETS / 'image_labels.csv'
if not csv_path.exists():
    alt_csv_path = ASSETS / 'image_label.csv'  # support alternate name
    if alt_csv_path.exists():
        csv_path = alt_csv_path

csv_samples = []
if csv_path.exists():
    df = pd.read_csv(csv_path)
    assert {'filename','label'}.issubset(df.columns)
    # Index all images under Assets for filename lookup
    all_imgs = {p.name: p for p in ASSETS.rglob('*') if p.suffix.lower() in IMG_EXTS}
    missing = 0
    for _, row in df.iterrows():
        fname = str(row['filename'])
        label = str(row['label'])
        if fname in all_imgs:
            csv_samples.append((all_imgs[fname], label))
        else:
            # Try to search by suffix match if duplicates are unlikely
            matches = [p for n,p in all_imgs.items() if n.endswith(fname)]
            if len(matches) == 1:
                csv_samples.append((matches[0], label))
            else:
                missing += 1
    print(f"CSV samples resolved from {csv_path.name}: {len(csv_samples)} (missing: {missing})")
else:
    print("image_labels.csv not found; skipping CSV source")

# Merge all; if duplicates appear, prefer explicit CSV labels > hed > aug
# Use image absolute path as key
merged = {}
for p,l in aug_samples:
    merged[str(p.resolve())] = (p, l)
for p,l in hed_train + hed_test:
    merged[str(p.resolve())] = (p, l)
for p,l in csv_samples:
    merged[str(p.resolve())] = (p, l)

all_samples = list(merged.values())
print(f"Total unique samples: {len(all_samples)}")

# Map labels to indices
labels = sorted(sorted({l for _, l in all_samples}))
label_to_idx = {l:i for i,l in enumerate(labels)}
idx_to_label = {i:l for l,i in label_to_idx.items()}
print(f"Num classes: {len(labels)}")

# Class distribution
counts = Counter([l for _,l in all_samples])
print("Class distribution (top 20):", counts.most_common(20))


augmented_images: 13640 samples
HED train: 2728 samples, HED test: 682 samples
CSV samples resolved from image_labels.csv: 0 (missing: 13640)
Total unique samples: 17050
Num classes: 62
Class distribution (top 20): [('0', 275), ('1', 275), ('2', 275), ('3', 275), ('4', 275), ('5', 275), ('6', 275), ('7', 275), ('8', 275), ('9', 275), ('a', 275), ('A_caps', 275), ('b', 275), ('B_caps', 275), ('c', 275), ('C_caps', 275), ('d', 275), ('D_caps', 275), ('e', 275), ('E_caps', 275)]


In [3]:
# Train/Val/Test split (stratified)
from sklearn.model_selection import train_test_split

random_seed = 42
rng = np.random.RandomState(random_seed)

paths = np.array([str(p) for p,_ in all_samples])
labels_arr = np.array([label_to_idx[l] for _,l in all_samples])

# If HED has an explicit test set, we already included it. We'll still split overall
# into train/val/test=0.8/0.1/0.1 stratified.
X_temp, X_test, y_temp, y_test = train_test_split(
    paths, labels_arr, test_size=0.1, random_state=random_seed, stratify=labels_arr
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.1111, random_state=random_seed, stratify=y_temp
)  # 0.8 * 0.1111 ≈ 0.0889 so final ≈ 80/10/10

print(len(X_train), len(X_val), len(X_test))


13640 1705 1705


In [4]:
# Transforms and Dataset

IMG_SIZE = 64  # can be tuned

if transforms is None:
    # Minimal transforms using PIL + numpy
    class ToTensor:
        def __call__(self, img):
            arr = np.array(img, dtype=np.float32) / 255.0
            if arr.ndim == 2:
                arr = arr[..., None]
            arr = arr.transpose(2,0,1)
            return torch.from_numpy(arr)
    basic_train_tfms = None
    basic_val_tfms = None
else:
    basic_train_tfms = transforms.Compose([
        transforms.Grayscale(num_output_channels=1),
        transforms.Resize((IMG_SIZE, IMG_SIZE)),
        transforms.RandomAffine(degrees=5, translate=(0.02,0.02), scale=(0.95,1.05)),
        transforms.RandomPerspective(distortion_scale=0.2, p=0.3),
        transforms.ToTensor(),
        transforms.Normalize([0.5], [0.5]),
    ])
    basic_val_tfms = transforms.Compose([
        transforms.Grayscale(num_output_channels=1),
        transforms.Resize((IMG_SIZE, IMG_SIZE)),
        transforms.ToTensor(),
        transforms.Normalize([0.5], [0.5]),
    ])

class HandwritingDataset(Dataset):
    def __init__(self, paths: np.ndarray, labels: np.ndarray, transform=None):
        self.paths = paths
        self.labels = labels
        self.transform = transform
    def __len__(self):
        return len(self.paths)
    def __getitem__(self, idx):
        p = Path(self.paths[idx])
        y = int(self.labels[idx])
        with Image.open(p) as img:
            img = img.convert('L')  # ensure single-channel
            if basic_train_tfms is None and transforms is None:
                # fallback: resize via PIL then tensor
                img = img.resize((IMG_SIZE, IMG_SIZE))
                arr = np.array(img, dtype=np.float32)/255.0
                arr = arr[None, ...]
                x = torch.from_numpy(arr)
            else:
                x = (self.transform or basic_val_tfms)(img)
        return x, y


In [5]:
# DataLoaders
batch_size = 128
num_workers = 2 if os.name != 'nt' else 0  # Windows pytorch dataloader workers

train_ds = HandwritingDataset(X_train, y_train, transform=basic_train_tfms)
val_ds   = HandwritingDataset(X_val, y_val, transform=basic_val_tfms)
test_ds  = HandwritingDataset(X_test, y_test, transform=basic_val_tfms)

train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=num_workers, pin_memory=True)
val_loader   = DataLoader(val_ds, batch_size=batch_size, shuffle=False, num_workers=num_workers)
test_loader  = DataLoader(test_ds, batch_size=batch_size, shuffle=False, num_workers=num_workers)

len(train_ds), len(val_ds), len(test_ds)


(13640, 1705, 1705)

In [6]:
# Model: Small CNN
class SmallCNN(nn.Module):
    def __init__(self, num_classes: int):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),  # 32x32

            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),  # 16x16

            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),  # 8x8
        )
        self.classifier = nn.Sequential(
            nn.Dropout(0.3),
            nn.Linear(128 * (IMG_SIZE//8) * (IMG_SIZE//8), 256),
            nn.ReLU(inplace=True),
            nn.Dropout(0.3),
            nn.Linear(256, num_classes)
        )
    def forward(self, x):
        x = self.features(x)
        x = torch.flatten(x, 1)
        x = self.classifier(x)
        return x

num_classes = len(labels)
model = SmallCNN(num_classes).to(device)

sum(p.numel() for p in model.parameters() if p.requires_grad), model


(2206462,
 SmallCNN(
   (features): Sequential(
     (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
     (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
     (2): ReLU(inplace=True)
     (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
     (4): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
     (5): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
     (6): ReLU(inplace=True)
     (7): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
     (8): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
     (9): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
     (10): ReLU(inplace=True)
     (11): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
   )
   (classifier): Sequential(
     (0): Dropout(p=0.3, inplace=False)
     (1): Linear(in_features=8192, 

In [7]:
# Training utilities
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

def evaluate(model, loader, device):
    model.eval()
    ys, ys_pred = [], []
    with torch.no_grad():
        for xb, yb in loader:
            xb = xb.to(device)
            yb = yb.to(device)
            logits = model(xb)
            preds = logits.argmax(dim=1)
            ys.extend(yb.cpu().numpy().tolist())
            ys_pred.extend(preds.cpu().numpy().tolist())
    acc = accuracy_score(ys, ys_pred)
    return acc, np.array(ys), np.array(ys_pred)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', patience=2, factor=0.5)

epochs = 15
best_val_acc = 0.0
save_dir = Path('artifacts')
save_dir.mkdir(exist_ok=True)
model_path = save_dir / 'handwriting_cnn.pt'
labels_path = save_dir / 'labels.json'

with open(labels_path, 'w') as f:
    json.dump(idx_to_label, f, indent=2)

for epoch in range(1, epochs+1):
    model.train()
    running_loss = 0.0
    num_batches = 0
    for xb, yb in train_loader:
        xb = xb.to(device)
        yb = yb.to(device)
        optimizer.zero_grad(set_to_none=True)
        logits = model(xb)
        loss = criterion(logits, yb)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        num_batches += 1
    train_loss = running_loss / max(1, num_batches)

    val_acc, _, _ = evaluate(model, val_loader, device)
    scheduler.step(val_acc)

    print(f"Epoch {epoch:02d}/{epochs} - loss: {train_loss:.4f} - val_acc: {val_acc:.4f}")

    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save({'model_state': model.state_dict(), 'config': {'img_size': IMG_SIZE, 'num_classes': num_classes}}, model_path)
        print(f"Saved new best model to {model_path} (val_acc={best_val_acc:.4f})")

print('Best val acc:', best_val_acc)




Epoch 01/15 - loss: 4.1574 - val_acc: 0.0223
Saved new best model to artifacts\handwriting_cnn.pt (val_acc=0.0223)




Epoch 02/15 - loss: 3.9826 - val_acc: 0.0581
Saved new best model to artifacts\handwriting_cnn.pt (val_acc=0.0581)




Epoch 03/15 - loss: 3.7718 - val_acc: 0.0933
Saved new best model to artifacts\handwriting_cnn.pt (val_acc=0.0933)




Epoch 04/15 - loss: 3.5829 - val_acc: 0.1097
Saved new best model to artifacts\handwriting_cnn.pt (val_acc=0.1097)




Epoch 05/15 - loss: 3.4512 - val_acc: 0.0962




Epoch 06/15 - loss: 3.3581 - val_acc: 0.1443
Saved new best model to artifacts\handwriting_cnn.pt (val_acc=0.1443)




Epoch 07/15 - loss: 3.2953 - val_acc: 0.1830
Saved new best model to artifacts\handwriting_cnn.pt (val_acc=0.1830)




Epoch 08/15 - loss: 3.2372 - val_acc: 0.2094
Saved new best model to artifacts\handwriting_cnn.pt (val_acc=0.2094)




Epoch 09/15 - loss: 3.1941 - val_acc: 0.2475
Saved new best model to artifacts\handwriting_cnn.pt (val_acc=0.2475)




Epoch 10/15 - loss: 3.1567 - val_acc: 0.2311




Epoch 11/15 - loss: 3.1078 - val_acc: 0.2780
Saved new best model to artifacts\handwriting_cnn.pt (val_acc=0.2780)




Epoch 12/15 - loss: 3.0871 - val_acc: 0.3202
Saved new best model to artifacts\handwriting_cnn.pt (val_acc=0.3202)




Epoch 13/15 - loss: 3.0733 - val_acc: 0.2481




Epoch 14/15 - loss: 3.0548 - val_acc: 0.2545




Epoch 15/15 - loss: 3.0345 - val_acc: 0.3267
Saved new best model to artifacts\handwriting_cnn.pt (val_acc=0.3267)
Best val acc: 0.32668621700879763


In [8]:
# Evaluation on test set
# Load best model (if saved during training)
if model_path.exists():
    ckpt = torch.load(model_path, map_location=device)
    model.load_state_dict(ckpt['model_state'])

test_acc, y_true, y_pred = evaluate(model, test_loader, device)
print(f"Test accuracy: {test_acc:.4f}")

print("Classification report:")
print(classification_report(y_true, y_pred, target_names=[idx_to_label[i] for i in range(num_classes)]))

cm = confusion_matrix(y_true, y_pred)
print("Confusion matrix shape:", cm.shape)

# Save per-image predictions
preds_path = save_dir / 'test_predictions.csv'
pd.DataFrame({
    'path': list(X_test),
    'true_label': [idx_to_label[int(i)] for i in y_true],
    'pred_label': [idx_to_label[int(i)] for i in y_pred],
}).to_csv(preds_path, index=False)
print(f"Saved predictions to {preds_path}")


Test accuracy: 0.3273
Classification report:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


              precision    recall  f1-score   support

           0       0.00      0.00      0.00        27
           1       0.25      0.25      0.25        28
           2       0.00      0.00      0.00        28
           3       0.36      1.00      0.53        27
           4       0.15      0.11      0.13        27
           5       0.43      0.44      0.44        27
           6       0.43      0.56      0.48        27
           7       0.86      0.21      0.34        28
           8       0.24      0.56      0.34        27
           9       0.22      0.52      0.30        27
      A_caps       0.09      0.07      0.08        28
      B_caps       0.36      0.61      0.45        28
      C_caps       0.45      0.93      0.60        27
      D_caps       0.51      0.70      0.59        27
      E_caps       0.67      0.29      0.40        28
      F_caps       1.00      0.15      0.26        27
      G_caps       0.00      0.00      0.00        27
      H_caps       0.00    

In [9]:
# Inference helper
class InferenceModel:
    def __init__(self, model_path: Path, labels_path: Path):
        with open(labels_path, 'r') as f:
            self.idx_to_label = {int(k):v for k,v in json.load(f).items()}
        ckpt = torch.load(model_path, map_location=device)
        cfg = ckpt.get('config', {})
        self.img_size = cfg.get('img_size', IMG_SIZE)
        num_classes = cfg.get('num_classes', len(self.idx_to_label))
        self.model = SmallCNN(num_classes).to(device)
        self.model.load_state_dict(ckpt['model_state'])
        self.model.eval()
        self.tfms = basic_val_tfms
    @torch.no_grad()
    def predict(self, image_path: Path):
        with Image.open(image_path) as img:
            img = img.convert('L')
            x = self.tfms(img).unsqueeze(0).to(device)
        logits = self.model(x)
        pred = logits.argmax(dim=1).item()
        return self.idx_to_label[pred]

# Example (uncomment and set a path)
# infer = InferenceModel(model_path, labels_path)
# print(infer.predict(Path('Assets/some_image.png')))
